diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 0000000000..47376addc6
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,176 @@
+name: AI Evals
+
+on:
+  workflow_dispatch: # Manual trigger only
+    inputs:
+      model:
+        description: 'Model to use for evaluation'
+        required: false
+        default: 'claude-sonnet-4-5-20250929'
+        type: string
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write # For posting PR comments
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # No requirements.txt needed for check_eval_scores.py (uses stdlib only)
+
+      - name: Run Evaluations
+        env:
+          LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
+          LLM_AUTH_TOKEN: ${{ secrets.LLM_AUTH_TOKEN }}
+          LLM_MODEL: ${{ github.event.inputs.model || 'claude-sonnet-4-5-20250929' }}
+        run: |
+          chmod +x ./evals/scripts/run-promptfoo-eval.sh
+          ./evals/scripts/run-promptfoo-eval.sh --json
+
+      - name: Check Quality Thresholds
+        id: check_thresholds
+        run: |
+          python3 evals/scripts/check_eval_scores.py \
+            --results eval-results.json \
+            --min-score 0.70 \
+            --min-pass-rate 0.70 \
+            --verbose || echo "threshold_failed=true" >> $GITHUB_OUTPUT
+
+      - name: Generate Summary
+        if: always()
+        id: summary
+        run: |
+          if [ -f eval-results.json ]; then
+            python3 << 'EOF'
+          import json
+          import os
+
+          with open('eval-results.json', 'r') as f:
+              data = json.load(f)
+
+          results = data.get('results', {})
+          stats = results.get('stats', {})
+
+          total = stats.get('successes', 0) + stats.get('failures', 0)
+          passed = stats.get('successes', 0)
+          failed = stats.get('failures', 0)
+          pass_rate = (passed / total * 100) if total > 0 else 0
+
+          # Token usage
+          tokens = stats.get('tokenUsage', {})
+          total_tokens = tokens.get('total', 0)
+          cached_tokens = tokens.get('cached', 0)
+
+          summary = f"""## 📊 Eval Results
+
+          **Overall:** {passed}/{total} tests passed ({pass_rate:.0f}%)
+
+          | Metric | Value |
+          |--------|-------|
+          | ✅ Passed | {passed} |
+          | ❌ Failed | {failed} |
+          | 📈 Pass Rate | {pass_rate:.0f}% |
+          | 🪙 Total Tokens | {total_tokens:,} |
+          | 💾 Cached Tokens | {cached_tokens:,} |
+
+          """
+
+          # List failed tests
+          if failed > 0:
+              summary += "\n### ❌ Failed Tests\n\n"
+              for result in results.get('results', []):
+                  if not result.get('success', False):
+                      test_name = result.get('description', 'Unknown')
+                      score = result.get('score', 0)
+                      summary += f"- {test_name} (score: {score:.2f})\n"
+
+          # Success message
+          if pass_rate >= 70:
+              summary += "\n✅ **Quality thresholds met!**"
+          else:
+              summary += "\n⚠️ **Quality thresholds not met.** Please review failures."
+
+          # Write to output file for PR comment
+          with open('eval_summary.txt', 'w') as f:
+              f.write(summary)
+
+          print(summary)
+          EOF
+          else
+            echo "⚠️ No evaluation results found" > eval_summary.txt
+          fi
+
+      - name: Comment PR with Results
+        if: github.event_name == 'pull_request' && always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+
+            let summary = '## 📊 Eval Results\n\n⚠️ Evaluation failed to complete.';
+
+            if (fs.existsSync('eval_summary.txt')) {
+              summary = fs.readFileSync('eval_summary.txt', 'utf8');
+            }
+
+            // Find existing comment
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes('📊 Eval Results')
+            );
+
+            if (botComment) {
+              // Update existing comment
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: summary
+              });
+            } else {
+              // Create new comment
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: summary
+              });
+            }
+
+      - name: Upload Results Artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: eval-results
+          path: |
+            eval-results*.json
+            eval_summary.txt
+          retention-days: 30
+
+      - name: Fail if thresholds not met
+        if: steps.check_thresholds.outputs.threshold_failed == 'true'
+        run: |
+          echo "❌ Quality thresholds not met"
+          exit 1
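+
+# Manual trigger example (a sketch; assumes the GitHub CLI and that this workflow exists on the default branch):
+#   gh workflow run eval.yml -f model=claude-sonnet-4-5-20250929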
diff --git a/.github/workflows/scripts/create-github-release.sh b/.github/workflows/scripts/create-github-release.sh
index 1030bbef4c..64973aeff3 100644
--- a/.github/workflows/scripts/create-github-release.sh
+++ b/.github/workflows/scripts/create-github-release.sh
@@ -12,43 +12,27 @@ fi
VERSION="$1"
-# Remove 'v' prefix from version for release title
+# Tag prefix must remain in sync with get-next-version
+TAG_PREFIX="agentic-sdlc-v"
+
+# Remove prefix from version for release title
-VERSION_NO_V=${VERSION#v}
+VERSION_NO_V=${VERSION#${TAG_PREFIX}}
-gh release create "$VERSION" \
- .genreleases/spec-kit-template-copilot-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-copilot-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-claude-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-claude-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-gemini-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-gemini-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-cursor-agent-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-cursor-agent-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-opencode-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-opencode-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-qwen-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-qwen-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-codex-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-codex-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-kilocode-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-kilocode-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-auggie-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-auggie-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-roo-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-roo-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-codebuddy-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-codebuddy-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-qoder-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-qoder-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-amp-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-amp-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-shai-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-shai-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-q-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-q-ps-"$VERSION".zip \
- .genreleases/spec-kit-template-bob-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-bob-ps-"$VERSION".zip \
- --title "Spec Kit Templates - $VERSION_NO_V" \
+ASSETS=()
+AGENTS=(copilot claude gemini cursor-agent opencode qwen windsurf codex kilocode auggie roo codebuddy qoder amp shai q bob)
+SCRIPTS=(sh ps)
+
+for agent in "${AGENTS[@]}"; do
+ for script in "${SCRIPTS[@]}"; do
+ asset_path=".genreleases/agentic-sdlc-spec-kit-template-${agent}-${script}-${VERSION}.zip"
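+    # e.g. .genreleases/agentic-sdlc-spec-kit-template-claude-sh-agentic-sdlc-v0.1.0.zip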
+ if [[ ! -f "$asset_path" ]]; then
+ echo "Error: missing release asset $asset_path" >&2
+ exit 1
+ fi
+ ASSETS+=("$asset_path")
+ done
+done
+
+gh release create "$VERSION" "${ASSETS[@]}" \
+ --title "Agentic SDLC Spec Kit Templates - $VERSION_NO_V" \
--notes-file release_notes.md
diff --git a/.github/workflows/scripts/create-release-packages.sh b/.github/workflows/scripts/create-release-packages.sh
index 48678282e1..921eb007a9 100755
--- a/.github/workflows/scripts/create-release-packages.sh
+++ b/.github/workflows/scripts/create-release-packages.sh
@@ -18,8 +18,10 @@ if [[ $# -ne 1 ]]; then
exit 1
fi
NEW_VERSION="$1"
-if [[ ! $NEW_VERSION =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
- echo "Version must look like v0.0.0" >&2
+TAG_PREFIX="agentic-sdlc-v"
+
+if [[ ! $NEW_VERSION =~ ^${TAG_PREFIX}[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+ echo "Version must look like ${TAG_PREFIX}0.0.0" >&2
exit 1
fi
@@ -218,8 +220,8 @@ build_variant() {
mkdir -p "$base_dir/.bob/commands"
generate_commands bob md "\$ARGUMENTS" "$base_dir/.bob/commands" "$script" ;;
esac
- ( cd "$base_dir" && zip -r "../spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip" . )
- echo "Created $GENRELEASES_DIR/spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip"
+ ( cd "$base_dir" && zip -r "../agentic-sdlc-spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip" . )
+ echo "Created $GENRELEASES_DIR/agentic-sdlc-spec-kit-template-${agent}-${script}-${NEW_VERSION}.zip"
}
# Determine agent list
@@ -269,5 +271,4 @@ for agent in "${AGENT_LIST[@]}"; do
done
echo "Archives in $GENRELEASES_DIR:"
-ls -1 "$GENRELEASES_DIR"/spec-kit-template-*-"${NEW_VERSION}".zip
-
+ls -1 "$GENRELEASES_DIR"/agentic-sdlc-spec-kit-template-*-"${NEW_VERSION}".zip
diff --git a/.github/workflows/scripts/generate-release-notes.sh b/.github/workflows/scripts/generate-release-notes.sh
index d8f5dab1fc..0886c31a8a 100644
--- a/.github/workflows/scripts/generate-release-notes.sh
+++ b/.github/workflows/scripts/generate-release-notes.sh
@@ -10,11 +10,12 @@ if [[ $# -ne 2 ]]; then
exit 1
fi
+TAG_PREFIX="agentic-sdlc-v"
NEW_VERSION="$1"
LAST_TAG="$2"
# Get commits since last tag
-if [ "$LAST_TAG" = "v0.0.0" ]; then
+if [ "$LAST_TAG" = "${TAG_PREFIX}0.0.0" ]; then
# Check how many commits we have and use that as the limit
COMMIT_COUNT=$(git rev-list --count HEAD)
if [ "$COMMIT_COUNT" -gt 10 ]; then
diff --git a/.github/workflows/scripts/get-next-version.sh b/.github/workflows/scripts/get-next-version.sh
index 9770b9fdc3..c66437cc37 100644
--- a/.github/workflows/scripts/get-next-version.sh
+++ b/.github/workflows/scripts/get-next-version.sh
@@ -5,12 +5,18 @@ set -euo pipefail
# Calculate the next version based on the latest git tag and output GitHub Actions variables
# Usage: get-next-version.sh
-# Get the latest tag, or use v0.0.0 if no tags exist
-LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0")
+# Prefix all fork-specific tags to avoid upstream conflicts
+TAG_PREFIX="agentic-sdlc-v"
+
+# Get the latest prefixed tag, or fall back to the prefixed zero version
+LATEST_TAG=$(git tag --list "${TAG_PREFIX}*" --sort=-v:refname | head -n 1)
+if [[ -z "${LATEST_TAG}" ]]; then
+ LATEST_TAG="${TAG_PREFIX}0.0.0"
+fi
echo "latest_tag=$LATEST_TAG" >> $GITHUB_OUTPUT
# Extract version number and increment
-VERSION=$(echo $LATEST_TAG | sed 's/v//')
+VERSION=${LATEST_TAG#${TAG_PREFIX}}
IFS='.' read -ra VERSION_PARTS <<< "$VERSION"
MAJOR=${VERSION_PARTS[0]:-0}
MINOR=${VERSION_PARTS[1]:-0}
@@ -18,7 +24,7 @@ PATCH=${VERSION_PARTS[2]:-0}
# Increment patch version
PATCH=$((PATCH + 1))
-NEW_VERSION="v$MAJOR.$MINOR.$PATCH"
+NEW_VERSION="${TAG_PREFIX}$MAJOR.$MINOR.$PATCH"
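+# e.g. LATEST_TAG=agentic-sdlc-v0.4.7 -> VERSION=0.4.7 -> NEW_VERSION=agentic-sdlc-v0.4.8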
echo "new_version=$NEW_VERSION" >> $GITHUB_OUTPUT
echo "New version will be: $NEW_VERSION"
diff --git a/.github/workflows/scripts/update-version.sh b/.github/workflows/scripts/update-version.sh
index 12bd9cd1d6..de1a247f06 100644
--- a/.github/workflows/scripts/update-version.sh
+++ b/.github/workflows/scripts/update-version.sh
@@ -12,8 +12,10 @@ fi
VERSION="$1"
-# Remove 'v' prefix for Python versioning
-PYTHON_VERSION=${VERSION#v}
+TAG_PREFIX="agentic-sdlc-v"
+
+# Remove custom prefix for Python versioning
+PYTHON_VERSION=${VERSION#${TAG_PREFIX}}
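+# e.g. VERSION=agentic-sdlc-v1.2.3 -> PYTHON_VERSION=1.2.3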
if [ -f "pyproject.toml" ]; then
sed -i "s/version = \".*\"/version = \"$PYTHON_VERSION\"/" pyproject.toml
diff --git a/.gitignore b/.gitignore
index 1ed573622d..02dfff054d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,9 @@ env/
.DS_Store
*.tmp
+# GitHub
+.github/workflows/.secrets
+
# Project specific
*.log
.env
@@ -43,4 +46,26 @@ env/
# Spec Kit-specific files
.genreleases/
*.zip
-sdd-*/
+sdd-*
+
+# Evaluation artifacts
+eval-results*.json
+*.backup
+.promptfoo/
+evals/.venv/
+evals/notebooks/.venv/
+evals/notebooks/.ipynb_checkpoints/
+evals/datasets/analysis-results/*.csv
+evals/datasets/analysis-results/*.txt
+evals/datasets/analysis-results/automated-analysis-*.csv
+evals/datasets/analysis-results/plan-analysis-*.csv
+evals/datasets/analysis-results/plan-eval-analysis-*.txt
+evals/datasets/analysis-results/plan-summary-*.txt
+evals/datasets/analysis-results/promptfoo-analysis-*.csv
+evals/datasets/analysis-results/summary-*.txt
+
+# Annotation tool artifacts
+evals/annotation-tool/.venv/
+evals/annotation-tool/annotations.json
+evals/annotation-tool/annotations_export_*.json
+evals/annotation-tool/.sesskey
diff --git a/.markdownlint-cli2.jsonc b/.markdownlint-cli2.jsonc
index d6db02777c..941cd3236c 100644
--- a/.markdownlint-cli2.jsonc
+++ b/.markdownlint-cli2.jsonc
@@ -22,6 +22,7 @@
}
},
"ignores": [
- ".genreleases/"
+ ".genreleases/",
+ "evals/docs"
]
-}
\ No newline at end of file
+}
diff --git a/.markdownlintignore b/.markdownlintignore
new file mode 100644
index 0000000000..f86f0edcc5
--- /dev/null
+++ b/.markdownlintignore
@@ -0,0 +1 @@
+evals/docs
diff --git a/AGENTS.md b/AGENTS.md
index d7360487b8..bff55dbe9b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,10 +1,10 @@
# AGENTS.md
-## About Spec Kit and Specify
+## About Agentic SDLC Spec Kit and Specify
-**GitHub Spec Kit** is a comprehensive toolkit for implementing Spec-Driven Development (SDD) - a methodology that emphasizes creating clear specifications before implementation. The toolkit includes templates, scripts, and workflows that guide development teams through a structured approach to building software.
+**Agentic SDLC Spec Kit** is a comprehensive toolkit for implementing Spec-Driven Development (SDD) - a methodology that emphasizes creating clear specifications before implementation. The toolkit includes templates, scripts, and workflows that guide development teams through a structured approach to building software.
-**Specify CLI** is the command-line interface that bootstraps projects with the Spec Kit framework. It sets up the necessary directory structures, templates, and AI agent integrations to support the Spec-Driven Development workflow.
+**Specify CLI** is the command-line interface that bootstraps projects with the Agentic SDLC Spec Kit framework. It sets up the necessary directory structures, templates, and AI agent integrations to support the Spec-Driven Development workflow.
The toolkit supports multiple AI coding assistants, allowing teams to use their preferred tools while maintaining consistent project structure and development practices.
@@ -132,8 +132,8 @@ Modify `.github/workflows/scripts/create-github-release.sh` to include the new a
```bash
gh release create "$VERSION" \
# ... existing packages ...
- .genreleases/spec-kit-template-windsurf-sh-"$VERSION".zip \
- .genreleases/spec-kit-template-windsurf-ps-"$VERSION".zip \
+ .genreleases/agentic-sdlc-spec-kit-template-windsurf-sh-"$VERSION".zip \
+ .genreleases/agentic-sdlc-spec-kit-template-windsurf-ps-"$VERSION".zip \
# Add new agent packages here
```
@@ -364,6 +364,22 @@ Command content with {SCRIPT} and {{args}} placeholders.
"""
```
+## Spec-Code Synchronization
+
+The `--spec-sync` option enables automatic spec-code synchronization via git hooks. This feature is supported for both bash (Linux/macOS) and PowerShell (Windows) environments:
+
+### Bash Implementation
+
+- **Scripts**: `scripts/bash/spec-hooks-install.sh`, `spec-sync-pre-commit.sh`, `spec-sync-post-commit.sh`, `spec-sync-pre-push.sh`
+- **Hooks**: Creates bash-based git hooks that invoke the bash scripts directly
+
+### PowerShell Implementation
+
+- **Scripts**: `scripts/powershell/spec-hooks-install.ps1`, `spec-sync-pre-commit.ps1`, `spec-sync-post-commit.ps1`, `spec-sync-pre-push.ps1`
+- **Hooks**: Creates bash-based git hooks that call the PowerShell scripts using `powershell -ExecutionPolicy Bypass -File`
+
+The CLI automatically selects the appropriate implementation based on the `--script` parameter (sh/ps) or platform default.
+
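+For illustration, the installed hooks are thin shims that delegate to the matching sync script. A minimal sketch of a generated bash pre-commit hook (assuming the scripts land under `.specify/scripts/`; the real content is written by `spec-hooks-install.sh`):
+
+```bash
+#!/usr/bin/env bash
+# .git/hooks/pre-commit (generated shim) - run the spec-sync check before each commit
+exec "$(git rev-parse --show-toplevel)/.specify/scripts/bash/spec-sync-pre-commit.sh" "$@"
+```
+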
## Directory Conventions
- **CLI agents**: Usually `./commands/`
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2b42e8fd61..361e5e5d7b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
-# Contributing to Spec Kit
+# Contributing to Agentic SDLC Spec Kit
-Hi there! We're thrilled that you'd like to contribute to Spec Kit. Contributions to this project are [released](https://help.github.com/articles/github-terms-of-service/#6-contributions-under-repository-license) to the public under the [project's open source license](LICENSE).
+Hi there! We're thrilled that you'd like to contribute to Agentic SDLC Spec Kit. Contributions to this project are [released](https://help.github.com/articles/github-terms-of-service/#6-contributions-under-repository-license) to the public under the [project's open source license](LICENSE).
Please note that this project is released with a [Contributor Code of Conduct](CODE_OF_CONDUCT.md). By participating in this project you agree to abide by its terms.
@@ -30,6 +30,25 @@ On [GitHub Codespaces](https://github.com/features/codespaces) it's even simpler
+## Repository setup
+
+This repository is a fork of [github/spec-kit](https://github.com/github/spec-kit). To keep your fork in sync:
+
+```bash
+# Add upstream remote (one-time setup)
+git remote add upstream https://github.com/github/spec-kit.git
+
+# Fetch upstream changes
+git fetch upstream
+
+# Merge upstream changes into your main branch
+git checkout main
+git merge upstream/main
+
+# Push the merged changes
+git push origin main
+```
+
## Submitting a pull request
> [!NOTE]
@@ -55,7 +74,7 @@ Here are a few things you can do that will increase the likelihood of your pull
## Development workflow
-When working on spec-kit:
+When working on Agentic SDLC Spec Kit:
1. Test changes with the `specify` CLI commands (`/speckit.specify`, `/speckit.plan`, `/speckit.tasks`) in your coding agent of choice
2. Verify templates are working correctly in `templates/` directory
@@ -85,16 +104,16 @@ To test your templates, commands, and other changes locally, follow these steps:
Navigate to your test project folder and open the agent to verify your implementation.
-## AI contributions in Spec Kit
+## AI contributions in Agentic SDLC Spec Kit
> [!IMPORTANT]
>
-> If you are using **any kind of AI assistance** to contribute to Spec Kit,
+> If you are using **any kind of AI assistance** to contribute to Agentic SDLC Spec Kit,
> it must be disclosed in the pull request or issue.
-We welcome and encourage the use of AI tools to help improve Spec Kit! Many valuable contributions have been enhanced with AI assistance for code generation, issue detection, and feature definition.
+We welcome and encourage the use of AI tools to help improve Agentic SDLC Spec Kit! Many valuable contributions have been enhanced with AI assistance for code generation, issue detection, and feature definition.
-That being said, if you are using any kind of AI assistance (e.g., agents, ChatGPT) while contributing to Spec Kit,
+That being said, if you are using any kind of AI assistance (e.g., agents, ChatGPT) while contributing to Agentic SDLC Spec Kit,
**this must be disclosed in the pull request or issue**, along with the extent to which AI assistance was used (e.g., documentation comments vs. code generation).
If your PR responses or comments are being generated by an AI, disclose that as well.
@@ -122,7 +141,7 @@ When submitting AI-assisted contributions, please ensure they include:
- **Clear disclosure of AI use** - You are transparent about AI use and degree to which you're using it for the contribution
- **Human understanding and testing** - You've personally tested the changes and understand what they do
-- **Clear rationale** - You can explain why the change is needed and how it fits within Spec Kit's goals
+- **Clear rationale** - You can explain why the change is needed and how it fits within Agentic SDLC Spec Kit's goals
- **Concrete evidence** - Include test cases, scenarios, or examples that demonstrate the improvement
- **Your own analysis** - Share your thoughts on the end-to-end developer experience
@@ -131,7 +150,7 @@ When submitting AI-assisted contributions, please ensure they include:
We reserve the right to close contributions that appear to be:
- Untested changes submitted without verification
-- Generic suggestions that don't address specific Spec Kit needs
+- Generic suggestions that don't address specific Agentic SDLC Spec Kit needs
- Bulk submissions that show no human review or understanding
### Guidelines for success
diff --git a/README.md b/README.md
index 76149512f6..8d0cecbaf4 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
-
🌱 Spec Kit
+
+
🚀 Agentic SDLC Spec Kit
Build high-quality software faster.
@@ -8,10 +8,34 @@
An open source toolkit that allows you to focus on product scenarios and predictable outcomes instead of vibe coding every piece from scratch.
+## 🎯 Project Vision
+
+This fork combines the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology with Spec-Driven Development to create a comprehensive framework for AI-assisted software development. The 12 Factors provide the strategic foundation and operational principles, while Spec-Driven Development delivers the practical implementation workflow.
+
+### Why This Combination?
+
+**Agentic SDLC 12 Factors** establish the philosophical and strategic principles for building software with AI coding agents, covering aspects like strategic mindset, context scaffolding, dual execution loops, and team capability.
+
+**Spec-Driven Development** provides the concrete, actionable process for implementing these principles through structured specification, planning, task breakdown, and iterative implementation phases.
+
+Together, they form a complete methodology that transforms how organizations approach AI-assisted development, moving from ad-hoc AI usage to systematic, high-quality software production.
+
+### Why This Fork?
+
+The original [github/spec-kit](https://github.com/github/spec-kit) repository focused on the core Spec-Driven Development process. This fork extends that foundation by:
+
+- **Integrating the 12 Factors methodology** as the strategic layer above the tactical Spec-Driven process
+- **Adding enterprise-grade features** like team AI directives, MCP server integration, and gateway configuration
+- **Enhancing tooling** with advanced CLI options, async agent support, comprehensive issue tracker integration, and spec-code synchronization
+- **Implementing AI session context management** through the levelup command that creates reusable knowledge packets and analyzes contributions to team directives
+- **Providing team templates** and best practices for scaling AI-assisted development across teams
+
+This fork represents the evolution from a development process to a complete organizational methodology for AI-native software development, with sophisticated knowledge management and cross-project learning capabilities.
+
-
-
-
+
+
+
@@ -21,7 +45,6 @@
- [🤔 What is Spec-Driven Development?](#-what-is-spec-driven-development)
- [⚡ Get Started](#-get-started)
-- [📽️ Video Overview](#️-video-overview)
- [🤖 Supported AI Agents](#-supported-ai-agents)
- [🔧 Specify CLI Reference](#-specify-cli-reference)
- [📚 Core Philosophy](#-core-philosophy)
@@ -51,7 +74,7 @@ Choose your preferred installation method:
Install once and use everywhere:
```bash
-uv tool install specify-cli --from git+https://github.com/github/spec-kit.git
+uv tool install agentic-sdlc-specify-cli --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git
```
Then use the tool directly:
@@ -72,7 +95,7 @@ specify check
To upgrade Specify, see the [Upgrade Guide](./docs/upgrade.md) for detailed instructions. Quick upgrade:
```bash
-uv tool install specify-cli --force --from git+https://github.com/github/spec-kit.git
+uv tool install agentic-sdlc-specify-cli --force --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git
```
#### Option 2: One-time Usage
@@ -80,7 +103,7 @@ uv tool install specify-cli --force --from git+https://github.com/github/spec-ki
Run directly without installing:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init
+uvx --from git+https://github.com/tikalk/agentic-sdlc-spec-kit.git specify init
```
**Benefits of persistent installation:**
@@ -90,6 +113,207 @@ uvx --from git+https://github.com/github/spec-kit.git specify init
-| `<project-name>` | Argument | Name for your new project directory (optional if using `--here`, or use `.` for current directory) |
+| Argument/Option | Type | Description |
+|------------------------------|----------|-----------------------------------------------------------------------------|
+| `<project-name>` | Argument | Name for your new project directory (optional if using `--here`, or use `.` for current directory) |
| `--ai` | Option | AI assistant to use: `claude`, `gemini`, `copilot`, `cursor-agent`, `qwen`, `opencode`, `codex`, `windsurf`, `kilocode`, `auggie`, `roo`, `codebuddy`, `amp`, `shai`, `q`, `bob`, or `qoder` |
-| `--script` | Option | Script variant to use: `sh` (bash/zsh) or `ps` (PowerShell) |
-| `--ignore-agent-tools` | Flag | Skip checks for AI agent tools like Claude Code |
-| `--no-git` | Flag | Skip git repository initialization |
-| `--here` | Flag | Initialize project in the current directory instead of creating a new one |
-| `--force` | Flag | Force merge/overwrite when initializing in current directory (skip confirmation) |
-| `--skip-tls` | Flag | Skip SSL/TLS verification (not recommended) |
-| `--debug` | Flag | Enable detailed debug output for troubleshooting |
-| `--github-token` | Option | GitHub token for API requests (or set GH_TOKEN/GITHUB_TOKEN env variable) |
+| `--script` | Option | Script type: `sh` (POSIX) or `ps` (PowerShell) |
+| `--ignore-agent-tools` | Flag | Skip checks for AI agent tools like Claude Code |
+| `--no-git` | Flag | Skip git repository initialization |
+| `--here` | Flag | Initialize project in the current directory instead of creating a new one |
+| `--force` | Flag | Force merge/overwrite when initializing in current directory (skip confirmation) |
+| `--skip-tls` | Flag | Skip SSL/TLS verification (not recommended) |
+| `--debug` | Flag | Enable detailed debug output for troubleshooting |
+| `--github-token` | Option | GitHub token for API requests (or set GH_TOKEN/GITHUB_TOKEN env variable) |
+| `--team-ai-directives` | Option | Path or URL to team-ai-directives repository |
+| `--issue-tracker` | Option | Issue tracker MCP: `github`, `jira`, `linear`, `gitlab` |
+| `--async-agent` | Option | Async agent MCP: `jules`, `async-copilot`, `async-codex` |
+| `--git-platform` | Option | Git platform MCP for PR operations: `github`, `gitlab` |
+| `--gateway-url` | Option | Central LLM gateway URL |
+| `--gateway-token` | Option | Gateway authentication token |
+| `--gateway-suppress-warning` | Flag | Suppress gateway warning messages |
+| `--spec-sync` | Flag | Enable automatic spec-code synchronization (keeps specs/*.md files updated with code changes) |
+
+### `/mode` Arguments & Options
+
+| Argument/Option | Type | Description |
+|-----------------|----------|-----------------------------------------------------------------------------|
+| `<mode>` | Argument | Workflow mode: `build` (lightweight) or `spec` (comprehensive) - leave empty to show current mode |
+| `--tdd/--no-tdd` | Option | Enable/disable TDD (Test-Driven Development) |
+| `--contracts/--no-contracts` | Option | Enable/disable API contract generation |
+| `--data-models/--no-data-models` | Option | Enable/disable data model generation |
+| `--risk-tests/--no-risk-tests` | Option | Enable/disable risk-based test generation |
+| `--reset-opinions` | Option | Reset framework opinions to mode defaults |
+| `--info`, `-i` | Flag | Show detailed information about available modes |
### Examples
@@ -238,8 +485,26 @@ specify init my-project --ai claude --debug
# Use GitHub token for API requests (helpful for corporate environments)
specify init my-project --ai claude --github-token ghp_your_token_here
+# Initialize with team AI directives and issue tracker integration
+specify init my-project --ai claude --team-ai-directives https://github.com/your-org/team-ai-directives.git --issue-tracker github
+
+# Initialize with async agent support for autonomous task execution
+specify init my-project --ai claude --async-agent jules
+
+# Initialize with central LLM gateway configuration
+specify init my-project --ai claude --gateway-url https://proxy.internal --gateway-token $TOKEN
+
+# Complex example: Enterprise setup with all integrations
+specify init enterprise-app --ai claude --script sh --team-ai-directives https://github.com/company/team-ai-directives.git --issue-tracker jira --async-agent jules --gateway-url https://llm-gateway.company.com --gateway-token $GATEWAY_TOKEN --github-token $GH_TOKEN
+
# Check system requirements
specify check
+
+# Workflow mode management
+/mode # Show current mode
+/mode build # Switch to lightweight build mode
+/mode spec # Switch to comprehensive spec mode
+/mode --info # Show detailed mode information
```
### Available Slash Commands
@@ -250,13 +515,14 @@ After running `specify init`, your AI coding agent will have access to these sla
Essential commands for the Spec-Driven Development workflow:
-| Command | Description |
-| ----------------------- | ------------------------------------------------------------------------ |
-| `/speckit.constitution` | Create or update project governing principles and development guidelines |
-| `/speckit.specify` | Define what you want to build (requirements and user stories) |
-| `/speckit.plan` | Create technical implementation plans with your chosen tech stack |
-| `/speckit.tasks` | Generate actionable task lists for implementation |
-| `/speckit.implement` | Execute all tasks to build the feature according to the plan |
+| Command | Description |
+|--------------------------|-----------------------------------------------------------------------|
+| `/speckit.constitution` | Create or update project governing principles and development guidelines |
+| `/speckit.specify` | Define what you want to build (requirements and user stories) |
+| `/speckit.plan` | Create technical implementation plans with your chosen tech stack & SYNC/ASYNC triage |
+| `/speckit.tasks` | Generate actionable task lists for implementation |
+| `/speckit.implement` | Execute all tasks to build the feature according to the plan with dual execution loops (SYNC/ASYNC modes) |
+| `/speckit.levelup` | Capture learnings and contribute to team knowledge repository |
#### Optional Commands
@@ -283,6 +549,23 @@ Spec-Driven Development is a structured process that emphasizes:
- **Multi-step refinement** rather than one-shot code generation from prompts
- **Heavy reliance** on advanced AI model capabilities for specification interpretation
+### Alignment with Agentic SDLC 12 Factors
+
+This methodology aligns with the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) framework, which provides foundational principles for building software with AI coding agents. Key alignments include:
+
+- **Factor I: Strategic Mindset** - Intent-driven development with clear specifications
+- **Factor II: Context Scaffolding** - Rich organizational principles and guardrails
+- **Factor III: Mission Definition** - Structured specification creation process
+- **Factor IV: Structured Planning** - Multi-step refinement with technical planning
+- **Factor V: Dual Execution Loops** - SYNC/ASYNC execution modes for different development phases
+- **Factor VI: The Great Filter** - Quality gates and validation checkpoints
+- **Factor VII: Adaptive Quality Gates** - Flexible quality assurance based on project needs
+- **Factor VIII: AI-Augmented, Risk-Based Testing** - Intelligent testing strategies
+- **Factor IX: Traceability** - End-to-end artifact traceability
+- **Factor X: Strategic Tooling** - Purpose-built tools for AI-assisted development
+- **Factor XI: Directives as Code** - Team AI directives for consistent behavior
+- **Factor XII: Team Capability** - Knowledge sharing and continuous improvement
+
## 🌟 Development Phases
| Phase | Focus | Key Activities |
@@ -638,12 +921,18 @@ rm gcm-linux_amd64.2.6.1.deb
## 👥 Maintainers
+### Original Repository
+
- Den Delimarsky ([@localden](https://github.com/localden))
- John Lam ([@jflam](https://github.com/jflam))
+### Fork Maintainers (tikalk/agentic-sdlc-spec-kit)
+
+- Lior Kanfi ([@kanfil](https://github.com/kanfil))
+
## 💬 Support
-For support, please open a [GitHub issue](https://github.com/github/spec-kit/issues/new). We welcome bug reports, feature requests, and questions about using Spec-Driven Development.
+For support, please open a [GitHub issue](https://github.com/tikalk/agentic-sdlc-spec-kit/issues/new). We welcome bug reports, feature requests, and questions about using Spec-Driven Development.
## 🙏 Acknowledgements
diff --git a/SUPPORT.md b/SUPPORT.md
index c6acf76e05..24469f99aa 100644
--- a/SUPPORT.md
+++ b/SUPPORT.md
@@ -12,7 +12,7 @@ For help or questions about using this project, please:
## Project Status
-**Spec Kit** is under active development and maintained by GitHub staff **AND THE COMMUNITY**. We will do our best to respond to support, feature requests, and community questions in a timely manner.
+**Agentic SDLC Spec Kit** is under active development and maintained by GitHub staff **AND THE COMMUNITY**. We will do our best to respond to support, feature requests, and community questions in a timely manner.
## GitHub Support Policy
diff --git a/docs/README.md b/docs/README.md
index a9edc81dc9..ab9c84bef8 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,6 @@
# Documentation
-This folder contains the documentation source files for Spec Kit, built using [DocFX](https://dotnet.github.io/docfx/).
+This folder contains the documentation source files for Agentic SDLC Spec Kit, built using [DocFX](https://dotnet.github.io/docfx/). Agentic SDLC Spec Kit is built upon the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology for structured AI-assisted development.
## Building Locally
diff --git a/docs/docfx.json b/docs/docfx.json
index dca3f0f578..790f951d43 100644
--- a/docs/docfx.json
+++ b/docs/docfx.json
@@ -56,9 +56,9 @@
"cleanupCacheHistory": false,
"disableGitFeatures": false,
"globalMetadata": {
- "_appTitle": "Spec Kit Documentation",
- "_appName": "Spec Kit",
- "_appFooter": "Spec Kit - A specification-driven development toolkit",
+ "_appTitle": "Agentic SDLC Spec Kit Documentation",
+ "_appName": "Agentic SDLC Spec Kit",
+ "_appFooter": "Agentic SDLC Spec Kit - A specification-driven development toolkit",
"_enableSearch": true,
"_disableContribution": false,
"_gitContribute": {
diff --git a/docs/index.md b/docs/index.md
index a56fcc1761..3df4422667 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,4 +1,4 @@
-# Spec Kit
+# Agentic SDLC Spec Kit
*Build high-quality software faster.*
@@ -58,6 +58,25 @@ Our research and experimentation focus on:
- Provide robust iterative feature development workflows
- Extend processes to handle upgrades and modernization tasks
+## Alignment with Agentic SDLC 12 Factors
+
+Spec-Driven Development is built upon the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology, a comprehensive framework for software development with AI coding agents. The 12 factors provide the foundational principles that guide our approach:
+
+- [I. Strategic Mindset](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-mindset.html): Developer as Orchestrator, AI as Intern
+- [II. Context Scaffolding](https://tikalk.github.io/agentic-sdlc-12-factors/content/context-scaffolding.html): Treat Context as a Dependency
+- [III. Mission Definition](https://tikalk.github.io/agentic-sdlc-12-factors/content/mission-definition.html): From Intent to Specification
+- [IV. Structured Planning](https://tikalk.github.io/agentic-sdlc-12-factors/content/structured-planning.html): Decompose and Triage Tasks
+- [V. Dual Execution Loops](https://tikalk.github.io/agentic-sdlc-12-factors/content/dual-execution-loops.html): Pair Program or Delegate Toil
+- [VI. The Great Filter](https://tikalk.github.io/agentic-sdlc-12-factors/content/great-filter.html): Apply Irreplaceable Human Judgment
+- [VII. Adaptive Quality Gates](https://tikalk.github.io/agentic-sdlc-12-factors/content/adaptive-quality-gates.html): Review Appropriately for Each Workflow
+- [VIII. AI-Augmented, Risk-Based Testing](https://tikalk.github.io/agentic-sdlc-12-factors/content/ai-augmented-testing.html): AI-Augmented, Risk-Based Testing
+- [IX. Traceability](https://tikalk.github.io/agentic-sdlc-12-factors/content/traceability.html): Linking the 'Why' to the 'How'
+- [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html): Manage a Federated, Governed Stack
+- [XI. Directives as Code](https://tikalk.github.io/agentic-sdlc-12-factors/content/directives-as-code.html): Version and Share AI Behavior
+- [XII. Team Capability](https://tikalk.github.io/agentic-sdlc-12-factors/content/team-capability.html): Systematize Learning and Improvement
+
+This alignment ensures that Agentic SDLC Spec Kit provides a structured, high-velocity approach to AI-assisted development while maintaining quality and accountability.
+
## Contributing
Please see our [Contributing Guide](https://github.com/github/spec-kit/blob/main/CONTRIBUTING.md) for information on how to contribute to this project.
diff --git a/docs/installation.md b/docs/installation.md
index 6daff24315..e8bb108319 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,5 +1,7 @@
# Installation Guide
+> Agentic SDLC Spec Kit implements the [Agentic SDLC 12 Factors](https://tikalk.github.io/agentic-sdlc-12-factors/) methodology for structured AI-assisted development.
+
## Prerequisites
- **Linux/macOS** (or Windows; PowerShell scripts now supported without WSL)
@@ -15,15 +17,15 @@
The easiest way to get started is to initialize a new project:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init
```
Or initialize in the current directory:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init .
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init .
# or use the --here flag
-uvx --from git+https://github.com/github/spec-kit.git specify init --here
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --here
```
### Specify AI Agent
@@ -31,10 +33,10 @@ uvx --from git+https://github.com/github/spec-kit.git specify init --here
You can proactively specify your AI agent during initialization:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai claude
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai gemini
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai copilot
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai codebuddy
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai claude
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai gemini
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai copilot
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai codebuddy
```
### Specify Script Type (Shell vs PowerShell)
@@ -50,8 +52,8 @@ Auto behavior:
Force a specific script type:
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --script sh
-uvx --from git+https://github.com/github/spec-kit.git specify init --script ps
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script sh
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script ps
```
### Ignore Agent Tools Check
@@ -59,18 +61,69 @@
-uvx --from git+https://github.com/github/spec-kit.git specify init --ai claude --ignore-agent-tools
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --ai claude --ignore-agent-tools
+```
+
+### Configure Team AI Directives
+
+Connect to shared team knowledge and standards:
+
+```bash
+# Use local team-ai-directives directory
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --team-ai-directives ~/workspace/team-ai-directives
+
+# Clone from remote repository
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --team-ai-directives https://github.com/your-org/team-ai-directives.git
+```
+
+### Enable Issue Tracker Integration
+
+Configure MCP servers for project management integration:
+
+```bash
+# GitHub Issues
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker github
+
+# Jira, Linear, or GitLab
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker jira
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker linear
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --issue-tracker gitlab
+```
+
+### Enable Async Agent Support
+
+Configure autonomous coding agents for delegated task execution:
+
+```bash
+# Jules, Async Copilot, or Async Codex
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent jules
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent async-copilot
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --async-agent async-codex
+```
+
+### Configure Central LLM Gateway
+
+Set up proxy configuration for centralized AI model access:
+
+```bash
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --gateway-url https://proxy.internal --gateway-token $TOKEN
```
## Verification
After initialization, you should see the following commands available in your AI agent:
+- `/speckit.constitution` - Establish project principles and assemble constitution
- `/speckit.specify` - Create specifications
-- `/speckit.plan` - Generate implementation plans
+- `/speckit.plan` - Generate implementation plans
- `/speckit.tasks` - Break down into actionable tasks
+- `/speckit.implement` - Execute implementation with SYNC/ASYNC dual execution loops
+- `/speckit.levelup` - Capture learnings and contribute to team knowledge
+- `/speckit.analyze` - Cross-artifact consistency and alignment reports
+- `/speckit.checklist` - Generate quality checklists for requirements validation
+- `/speckit.clarify` - Structured questions to de-risk ambiguous areas
-The `.specify/scripts` directory will contain both `.sh` and `.ps1` scripts.
+The `.specify/scripts` directory will contain both `.sh` and `.ps1` scripts for automation, and `.mcp.json` will be configured for issue tracker and async agent integration if specified.
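+
+For reference, a generated `.mcp.json` entry might look roughly like the sketch below (illustrative only; the actual server command, package, and environment variables depend on the tracker you select):
+
+```json
+{
+  "mcpServers": {
+    "github": {
+      "command": "npx",
+      "args": ["-y", "@modelcontextprotocol/server-github"],
+      "env": { "GITHUB_PERSONAL_ACCESS_TOKEN": "${GH_TOKEN}" }
+    }
+  }
+}
+```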
## Troubleshooting
diff --git a/docs/local-development.md b/docs/local-development.md
index 7fac06adf4..814c455293 100644
--- a/docs/local-development.md
+++ b/docs/local-development.md
@@ -1,13 +1,13 @@
# Local Development Guide
-This guide shows how to iterate on the `specify` CLI locally without publishing a release or committing to `main` first.
+This guide shows how to iterate on the `specify` CLI locally without publishing a release or committing to `main` first. This process aligns with [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html), managing a governed stack of development tools for controlled and efficient workflows.
> Scripts now have both Bash (`.sh`) and PowerShell (`.ps1`) variants. The CLI auto-selects based on OS unless you pass `--script sh|ps`.
## 1. Clone and Switch Branches
```bash
-git clone https://github.com/github/spec-kit.git
+git clone https://github.com/github/agentic-sdlc-spec-kit.git
-cd spec-kit
+cd agentic-sdlc-spec-kit
# Work on a feature branch
git checkout -b your-feature-branch
@@ -60,7 +60,7 @@ You can also point uvx at a specific branch without merging:
```bash
# Push your working branch first
git push origin your-feature-branch
-uvx --from git+https://github.com/github/spec-kit.git@your-feature-branch specify init demo-branch-test --script ps
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git@your-feature-branch specify init demo-branch-test --script ps
```
### 4a. Absolute Path uvx (Run From Anywhere)
@@ -68,21 +68,21 @@ uvx --from git+https://github.com/github/spec-kit.git@your-feature-branch specif
If you're in another directory, use an absolute path instead of `.`:
```bash
-uvx --from /mnt/c/GitHub/spec-kit specify --help
-uvx --from /mnt/c/GitHub/spec-kit specify init demo-anywhere --ai copilot --ignore-agent-tools --script sh
+uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify --help
+uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify init demo-anywhere --ai copilot --ignore-agent-tools --script sh
```
Set an environment variable for convenience:
```bash
-export SPEC_KIT_SRC=/mnt/c/GitHub/spec-kit
+export SPEC_KIT_SRC=/mnt/c/GitHub/agentic-sdlc-spec-kit
uvx --from "$SPEC_KIT_SRC" specify init demo-env --ai copilot --ignore-agent-tools --script ps
```
(Optional) Define a shell function:
```bash
-specify-dev() { uvx --from /mnt/c/GitHub/spec-kit specify "$@"; }
+specify-dev() { uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify "$@"; }
# Then
specify-dev --help
```
@@ -146,7 +146,7 @@ specify init demo --skip-tls --ai gemini --ignore-agent-tools --script ps
| Run CLI directly | `python -m src.specify_cli --help` |
| Editable install | `uv pip install -e .` then `specify ...` |
| Local uvx run (repo root) | `uvx --from . specify ...` |
-| Local uvx run (abs path) | `uvx --from /mnt/c/GitHub/spec-kit specify ...` |
+| Local uvx run (abs path) | `uvx --from /mnt/c/GitHub/agentic-sdlc-spec-kit specify ...` |
| Git branch uvx | `uvx --from git+URL@branch specify ...` |
| Build wheel | `uv build` |
diff --git a/docs/quickstart.md b/docs/quickstart.md
index 4d3b863b35..1456b49fa3 100644
--- a/docs/quickstart.md
+++ b/docs/quickstart.md
@@ -1,10 +1,144 @@
# Quick Start Guide
-This guide will help you get started with Spec-Driven Development using Spec Kit.
+This guide will help you get started with Spec-Driven Development using Agentic SDLC Spec Kit.
> [!NOTE]
> All automation scripts now provide both Bash (`.sh`) and PowerShell (`.ps1`) variants. The `specify` CLI auto-selects based on OS unless you pass `--script sh|ps`.
+## Stage 0: Foundation & Setup
+
+**Goal:** Establish the foundational rules and configure the development environment so every later stage aligns with the project's architectural and security principles.
+**Note:** Run these steps in a standard terminal before opening the Intelligent IDE.
+**Alignment with 12 Factors:** This stage establishes the foundation guided by [I. Strategic Mindset](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-mindset.html) and [II. Context Scaffolding](https://tikalk.github.io/agentic-sdlc-12-factors/content/context-scaffolding.html), positioning the developer as orchestrator and assembling necessary context for AI collaboration.
+
+### Choose Your Workflow Mode
+
+Specify supports two workflow modes that control development complexity, plus configurable framework opinions:
+
+- **`spec` mode (default)**: Full structured development with comprehensive requirements, research, and validation
+- **`build` mode**: Lightweight approach focused on quick implementation and exploration
+
+**Framework Opinions** (configurable within each mode):
+
+- **TDD**: Test-Driven Development (tests before implementation)
+- **API Contracts**: Automatic contract generation during planning
+- **Data Models**: Automatic data model generation during planning
+
+```bash
+# Check current mode and opinions
+/mode
+
+# Switch to lightweight mode for prototyping
+/mode build
+
+# Switch to comprehensive mode for production features
+/mode spec
+
+# Customize framework opinions
+/mode --no-contracts # Disable API contracts
+/mode --tdd # Enable TDD
+/mode --risk-tests # Enable risk-based testing
+/mode --reset-opinions # Reset to mode defaults
+
+# Learn more about modes and opinions
+/mode --info
+```
+
+**Recommendation:** Start with `build` mode for exploration, switch to `spec` mode when features become complex or need thorough documentation.
+
+1. **Project Initialization (`/init`)**
+ **Action:** From the project root, run the Agentic SDLC Spec Kit `init` command (e.g., `specify init --team-ai-directives https://github.com/your-org/team-ai-directives.git`) to configure local settings and clone the shared `team-ai-directives` modules.
+ **Purpose:** Creates the handshake that brings the repository into the managed Agentic SDLC ecosystem, wiring credentials, endpoints, and shared knowledge needed for subsequent commands.
+2. **Establishing the Constitution (`/constitution`)**
+ **Action:** Within the IDE, execute `/constitution`, importing relevant modules from `team-ai-directives` and adding any project-specific principles.
+ **Purpose:** Generates `memory/constitution.md`, the immutable ruleset automatically injected into `/specify`, `/plan`, and other workflows so every response honors project standards.
+
+**Example Command:**
+
+```text
+/constitution "Assemble the constitution for this service. Import principles from @team/context_modules/principles/stateless_services.md and @team/context_modules/principles/zero_trust_security_model.md. Add the custom principle: 'All public APIs must be versioned.'"
+```
+
+**Outcome:** The IDE is fully integrated with the Orchestration Hub, and a committed `constitution.md` anchors all future automation.
+
+## Stage 1: Feature Specification
+
+**Goal:** Produce a committed `spec.md` that captures the feature's intent, constraints, and acceptance criteria.
+**Note:** From Stage 1 onward, all work happens inside the Intelligent IDE with the context automatically assembled by Agentic SDLC Spec Kit.
+**Alignment with 12 Factors:** This stage focuses on [III. Mission Definition](https://tikalk.github.io/agentic-sdlc-12-factors/content/mission-definition.html), translating intent into formal, version-controlled specifications.
+
+1. **Craft the Directive (`/specify`)**
+ **Action:** Author a single, comprehensive natural-language directive that blends the issue tracker mission, personas, constraints, and any clarifications.
+ **Purpose:** Front-load human judgment so the AI can draft an accurate `spec.md` aligned with the constitution.
+2. **Execute the Command**
+ **Action:** Run `/specify` in the IDE; Agentic SDLC Spec Kit loads `memory/constitution.md`, resolves `@team/...` references against the directives repo, and captures any `@issue-tracker ISSUE-###` reference in the prompt so the resulting spec links back to the originating ticket.
+   **Purpose:** Generates the structured specification artifact under `specs/<feature>/spec.md` with shared principles and traceability already in context.
+3. **Review and Commit**
+ **Action:** Perform a macro-review of the generated `spec.md`, refine if needed, then commit it.
+ **Purpose:** Locks in the requirements that all later stages will honor.
+
+**Example Command:**
+
+```text
+/specify "Generate the specification for the feature in @issue-tracker ISSUE-123. The target user is the @team/personas/data_analyst.md. The operation must be asynchronous to handle large dashboards. The PDF title must include the dashboard name and an export timestamp."
+```
+
+**Outcome:** A committed `spec.md` ready to drive planning in Stage 2.
+
+## Stage 2: Planning & Task Management
+
+**Goal:** Convert the committed `spec.md` into a human-approved `plan.md` and a synced task list that routes work through the issue tracker.
+**Note:** `/plan` and `/tasks` run inside the IDE, reusing the constitution and the locally cloned `team-ai-directives` modules.
+**Alignment with 12 Factors:** This stage implements [IV. Structured Planning](https://tikalk.github.io/agentic-sdlc-12-factors/content/structured-planning.html) and [V. Dual Execution Loops](https://tikalk.github.io/agentic-sdlc-12-factors/content/dual-execution-loops.html), decomposing tasks and triaging them for synchronous or asynchronous execution.
+
+1. **Generate the Plan (`/plan`)**
+ **Action:** Execute `/plan` with a directive that covers tech stack, risk considerations, testing focus, and any implementation preferences. Agentic SDLC Spec Kit loads `memory/constitution.md`, references in `team-ai-directives`, and copies the plan template before executing automation.
+   **Purpose:** Guides the AI toward a comprehensive, strategically sound first draft of `plan.md`. Front-loading human judgment yields more robust outputs: the AI produces technical steps with preliminary [SYNC]/[ASYNC] triage suggestions while emitting `plan.md`, `research.md`, `data-model.md`, `quickstart.md`, and contract stubs aligned with the constitution.
+2. **Macro-Review and Commit**
+ **Action:** Review the generated artifacts, adjust as needed, decide [SYNC]/[ASYNC] triage, then commit.
+ **Purpose:** Locks in an execution strategy that downstream stages must respect.
+3. **Sync Tasks (`/tasks`)**
+ **Action:** Run `/tasks` to transform the validated plan into numbered tasks, ensuring each contract, test, and implementation step is represented. The command requires the committed plan artifacts and will surface gaps if prerequisites are missing.
+ **Purpose:** Creates `tasks.md` and mirrors it to the issue tracker for execution visibility.
+
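+**Example Command** (illustrative; adapt the stack and risk notes to your feature):
+
+```text
+/plan "Create the implementation plan for the dashboard export feature from ISSUE-123. Use FastAPI with an async worker for PDF rendering. Suggest [ASYNC] triage for long-running render tasks and keep API schema changes [SYNC] for pair review."
+```
+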
+**Outcome:** A constitution-compliant `plan.md`, supporting design artifacts, and an actionable task list synchronized with project management.
+
+## Stage 3: Implementation
+
+**Goal:** Execute the validated plan, honoring the `[SYNC]/[ASYNC]` execution modes and completing every task in `tasks.md`.
+**Note:** Use `/implement` within the IDE; the command enforces the TDD order, dependency rules, and execution modes captured in Stages 1-2.
+**Alignment with 12 Factors:** This stage applies [VI. The Great Filter](https://tikalk.github.io/agentic-sdlc-12-factors/content/great-filter.html), [VII. Adaptive Quality Gates](https://tikalk.github.io/agentic-sdlc-12-factors/content/adaptive-quality-gates.html), and [VIII. AI-Augmented, Risk-Based Testing](https://tikalk.github.io/agentic-sdlc-12-factors/content/ai-augmented-testing.html), ensuring human judgment filters AI output with appropriate review processes and targeted testing.
+
+1. **Execute Tasks (`/implement`)**
+ **Action:** Run `/implement` to load `plan.md`, `tasks.md`, and supporting artifacts. Follow the phase-by-phase flow, completing risk-based tests before implementation and respecting `[SYNC]/[ASYNC]` modes and `[P]` parallel markers for efficient execution.
+ **Purpose:** Produces production-ready code with targeted testing based on identified risks, marks tasks as `[X]`, and preserves the execution trace for Stage 4.
+2. **Review & Validate**
+ **Action:** Ensure all `[SYNC]` tasks received micro-reviews, all `[ASYNC]` work underwent macro-review, and the test suite passes before moving on.
+ **Purpose:** Guarantees the feature matches the spec and plan with traceable quality gates.
+
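+**Example Command** (illustrative):
+
+```text
+/implement "Execute the tasks in tasks.md phase by phase. Pair with me on the [SYNC] tasks, delegate the [ASYNC] tasks, and run the risk-based tests before each implementation step."
+```
+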
+**Outcome:** A completed feature branch with passing tests and an updated `tasks.md` documenting execution status and modes.
+
+## Stage 4: Leveling Up
+
+**Goal:** Capture best practices from the completed feature, draft a reusable knowledge asset in `team-ai-directives`, and generate traceability notes for the original issue.
+**Note:** `/levelup` runs inside the IDE and relies on the locally cloned directives repository from Stage 0.
+**Alignment with 12 Factors:** This stage encompasses [IX. Traceability](https://tikalk.github.io/agentic-sdlc-12-factors/content/traceability.html), [X. Strategic Tooling](https://tikalk.github.io/agentic-sdlc-12-factors/content/strategic-tooling.html), [XI. Directives as Code](https://tikalk.github.io/agentic-sdlc-12-factors/content/directives-as-code.html), and [XII. Team Capability](https://tikalk.github.io/agentic-sdlc-12-factors/content/team-capability.html), linking artifacts, managing tools, versioning AI behavior, and systematizing learning.
+
+1. **Run Level-Up Workflow (`/levelup`)**
+ **Action:** Invoke `/levelup` with a strategic directive (e.g., highlight what should become reusable). Agentic SDLC Spec Kit gathers spec/plan/tasks metadata, validates the directives repo, and prompts you to synthesize a knowledge asset plus PR/issue summaries.
+ **Purpose:** Produces a draft markdown asset under `.specify/memory/team-ai-directives/drafts/`, along with a pull-request description and trace comment for review.
+2. **Review & Publish**
+ **Action:** Inspect the generated asset and summaries. When satisfied, confirm inside `/levelup` to let it create a `levelup/{slug}` branch, commit the asset, push (when remotes are configured), open a PR via `gh pr create` (or emit the command), and post the trace comment (or provide the text if automation is unavailable).
+ **Purpose:** Ensures lessons learned become part of the team's shared brain and closes the loop with traceability artifacts without manual branching overhead.
+
+**Example Command:**
+
+```text
+/levelup "Capture the FastAPI error-handling patterns we refined while closing ISSUE-123. Summarize why the retry strategy works, when to apply it, and provide links to the final implementation."
+```
+
+**Outcome:** A knowledge asset ready for PR, a drafted trace comment for the issue tracker, and clear next steps for team review.
+
## The 6-Step Process
> [!TIP]
@@ -14,19 +148,20 @@ This guide will help you get started with Spec-Driven Development using Spec Kit
**In your terminal**, run the `specify` CLI command to initialize your project:
+> **Note:** All slash commands adapt their behavior based on your current workflow mode. Use `/mode` to check or change modes.
+
```bash
# Create a new project directory
-uvx --from git+https://github.com/github/spec-kit.git specify init
-
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init
# OR initialize in the current directory
-uvx --from git+https://github.com/github/spec-kit.git specify init .
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init .
```
Pick script type explicitly (optional):
```bash
-uvx --from git+https://github.com/github/spec-kit.git specify init --script ps # Force PowerShell
-uvx --from git+https://github.com/github/spec-kit.git specify init --script sh # Force POSIX shell
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script ps # Force PowerShell
+uvx --from git+https://github.com/github/agentic-sdlc-spec-kit.git specify init --script sh # Force POSIX shell
```
### Step 2: Define Your Constitution
@@ -156,6 +291,37 @@ Finally, implement the solution:
- **Iterate and refine** your specifications before implementation
- **Validate** the plan before coding begins
- **Let the AI agent handle** the implementation details
+- **Choose your complexity level** with workflow modes (build for speed, spec for thoroughness)
+
+## Mode Transitions
+
+Your development needs may change as features evolve:
+
+### When to Switch from Build to Spec Mode
+
+```bash
+/mode spec
+```
+
+**Indicators:**
+
+- Feature scope is growing beyond initial expectations
+- Multiple stakeholders need detailed documentation
+- Production deployment requires comprehensive testing
+- Integration with existing systems becomes complex
+
+### When to Switch from Spec to Build Mode
+
+```bash
+/mode build
+```
+
+**Indicators:**
+
+- Shifting to exploratory prototyping
+- Need to quickly validate a technical approach
+- Working on throwaway proof-of-concepts
+- Time pressure requires a simplified process
+
## Next Steps
diff --git a/docs/triage-framework.md b/docs/triage-framework.md
new file mode 100644
index 0000000000..0d930c6fc5
--- /dev/null
+++ b/docs/triage-framework.md
@@ -0,0 +1,307 @@
+# Triage Framework: [SYNC] vs [ASYNC] Task Classification
+
+## Overview
+
+The Triage Framework provides structured guidance for classifying implementation tasks as [SYNC] (human-reviewed execution) or [ASYNC] (autonomous agent delegation). This classification optimizes the dual execution loop by routing tasks to the most appropriate execution method.
+
+## Core Principles
+
+1. **Quality First**: Complex tasks requiring human judgment are classified as [SYNC]
+2. **Efficiency Optimization**: Routine tasks are delegated to agents as [ASYNC]
+3. **Risk Mitigation**: Critical paths maintain human oversight
+4. **Continuous Learning**: Triage decisions improve over time through metrics
+
+## Decision Tree: Task Classification
+
+```text
+START: New Implementation Task
+    │
+    ├── Is this task security-critical?
+    │   ├── YES → [SYNC] (Security classification)
+    │   └── NO → Continue
+    │
+    ├── Does this task involve external integrations?
+    │   ├── YES → [SYNC] (Integration classification)
+    │   └── NO → Continue
+    │
+    ├── Is the requirement ambiguous or unclear?
+    │   ├── YES → [SYNC] (Clarity classification)
+    │   └── NO → Continue
+    │
+    ├── Does this task require architectural decisions?
+    │   ├── YES → [SYNC] (Architecture classification)
+    │   └── NO → Continue
+    │
+    ├── Is this complex business logic?
+    │   ├── YES → [SYNC] (Complexity classification)
+    │   └── NO → Continue
+    │
+    ├── Is this well-defined CRUD/standard pattern?
+    │   ├── YES → [ASYNC] (Standard pattern classification)
+    │   └── NO → Continue
+    │
+    ├── Does this have comprehensive test coverage?
+    │   ├── YES → [ASYNC] (Test coverage classification)
+    │   └── NO → [SYNC] (Insufficient testing safeguards)
+    │
+    └── [ASYNC] (Default classification)
+```
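+
+Teams that want to encode the tree in tooling can express the walk as a first-match-wins chain. The sketch below is illustrative, not part of the kit; the boolean flags on `Task` are hypothetical inputs you would fill in during Phase 2 assessment.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Task:
+    # Hypothetical flags derived during classification assessment
+    security_critical: bool = False
+    external_integration: bool = False
+    ambiguous_requirements: bool = False
+    architectural_decision: bool = False
+    complex_logic: bool = False
+    standard_pattern: bool = False
+    comprehensive_tests: bool = False
+
+def triage(task: Task) -> tuple[str, str]:
+    """Walk the decision tree top to bottom; the first match wins."""
+    if task.security_critical:
+        return "[SYNC]", "Security classification"
+    if task.external_integration:
+        return "[SYNC]", "Integration classification"
+    if task.ambiguous_requirements:
+        return "[SYNC]", "Clarity classification"
+    if task.architectural_decision:
+        return "[SYNC]", "Architecture classification"
+    if task.complex_logic:
+        return "[SYNC]", "Complexity classification"
+    if task.standard_pattern:
+        return "[ASYNC]", "Standard pattern classification"
+    if task.comprehensive_tests:
+        return "[ASYNC]", "Test coverage classification"
+    # Mirroring the tree strictly: the test-coverage question routes
+    # uncovered tasks to [SYNC] before the [ASYNC] default is reached.
+    return "[SYNC]", "Insufficient testing safeguards"
+```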
+
+## Detailed Classification Criteria
+
+### [SYNC] Classifications (Human Execution Required)
+
+#### 1. Security Classification
+
+**When to Apply**: Tasks involving authentication, authorization, encryption, data protection, or compliance requirements.
+
+**Examples**:
+
+- User authentication flows
+- API key management
+- Data encryption/decryption
+- Access control logic
+- GDPR/privacy compliance
+
+**Rationale**: Security-critical code requires human expertise and cannot be delegated due to liability and compliance risks.
+
+#### 2. Integration Classification
+
+**When to Apply**: Tasks involving external APIs, legacy systems, third-party services, or complex data transformations.
+
+**Examples**:
+
+- Payment gateway integration
+- External API consumption
+- Legacy system migration
+- Data import/export pipelines
+- Webhook implementations
+
+**Rationale**: Integration complexity often requires domain expertise and error handling that agents cannot reliably implement.
+
+#### 3. Clarity Classification
+
+**When to Apply**: Tasks with ambiguous requirements, unclear acceptance criteria, or multiple interpretation possibilities.
+
+**Examples**:
+
+- Vague user story requirements
+- Missing edge case specifications
+- Conflicting stakeholder expectations
+- Novel feature implementations
+- Research-dependent tasks
+
+**Rationale**: Ambiguous requirements need human clarification and interpretation before implementation.
+
+#### 4. Architecture Classification
+
+**When to Apply**: Tasks involving system design decisions, component boundaries, or architectural patterns.
+
+**Examples**:
+
+- Database schema design
+- API contract definition
+- Component architecture
+- Design pattern selection
+- Performance optimization strategies
+
+**Rationale**: Architectural decisions have long-term impact and require experienced design judgment.
+
+#### 5. Complexity Classification
+
+**When to Apply**: Tasks involving non-trivial algorithms, state machines, or complex business logic.
+
+**Examples**:
+
+- Custom algorithms
+- State machine implementations
+- Complex validation logic
+- Mathematical computations
+- Multi-step business processes
+
+**Rationale**: Complex logic requires deep understanding and careful implementation that benefits from human review.
+
+### [ASYNC] Classifications (Agent Delegation Suitable)
+
+#### 1. Standard Pattern Classification
+
+**When to Apply**: Tasks following well-established patterns, frameworks, or standard implementations.
+
+**Examples**:
+
+- RESTful API endpoints
+- Standard CRUD operations
+- Form validation
+- Basic error handling
+- Standard authentication flows
+
+**Rationale**: Standard patterns have predictable implementations that agents can reliably generate.
+
+#### 2. Test Coverage Classification
+
+**When to Apply**: Tasks with comprehensive automated test coverage providing execution safeguards.
+
+**Examples**:
+
+- Components with >80% test coverage
+- TDD-developed features
+- Well-tested utility functions
+- Components with integration tests
+
+**Rationale**: Comprehensive testing provides confidence that agent-generated code meets requirements.
+
+#### 3. Independent Component Classification
+
+**When to Apply**: Tasks implementing self-contained components with minimal external dependencies.
+
+**Examples**:
+
+- Utility libraries
+- Standalone services
+- Independent UI components
+- Pure functions
+- Data transformation helpers
+
+**Rationale**: Independent components have limited blast radius and can be safely delegated.
+
+## Triage Process Workflow
+
+### Phase 1: Task Identification
+
+1. Break down feature into discrete, implementable tasks
+2. Estimate effort and dependencies for each task
+3. Identify task boundaries and interfaces
+
+### Phase 2: Classification Assessment
+
+1. Apply decision tree to each task
+2. Document primary classification criteria
+3. Assess risk level of misclassification
+4. Record rationale for each decision
+
+### Phase 3: Review and Validation
+
+1. Peer review of triage decisions
+2. Validate classification consistency
+3. Confirm risk assessments
+4. Document any overrides or exceptions
+
+### Phase 4: Execution Planning
+
+1. Group tasks by classification
+2. Plan [SYNC] review checkpoints
+3. Configure [ASYNC] agent delegation
+4. Establish monitoring and rollback procedures
+
+## Triage Metrics and Improvement
+
+### Effectiveness Metrics
+
+#### Classification Accuracy
+
+- **Measurement**: Percentage of tasks correctly classified (validated post-implementation)
+- **Target**: >90% accuracy
+- **Tracking**: Monthly review of misclassifications
+
+#### Review Efficiency
+
+- **Measurement**: Time spent on [SYNC] reviews vs time saved by [ASYNC] delegation
+- **Target**: Net positive efficiency gain
+- **Tracking**: Per-feature analysis
+
+#### Quality Impact
+
+- **Measurement**: Defect rates by classification type
+- **Target**: [SYNC] defects <5%, [ASYNC] defects <15%
+- **Tracking**: Post-implementation defect analysis
+
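+One way these numbers might be computed from post-implementation records is sketched below; the record fields are hypothetical, not a format the framework defines.
+
+```python
+def triage_metrics(records: list[dict]) -> dict:
+    """Each record: {"predicted": "[SYNC]" or "[ASYNC]", "correct": bool, "defect": bool}."""
+    total = len(records)
+
+    def defect_rate(rows: list[dict]) -> float:
+        return sum(r["defect"] for r in rows) / len(rows) if rows else 0.0
+
+    sync_rows = [r for r in records if r["predicted"] == "[SYNC]"]
+    async_rows = [r for r in records if r["predicted"] == "[ASYNC]"]
+    return {
+        # Target: >0.90 classification accuracy
+        "classification_accuracy": sum(r["correct"] for r in records) / total if total else 0.0,
+        "sync_defect_rate": defect_rate(sync_rows),    # target < 0.05
+        "async_defect_rate": defect_rate(async_rows),  # target < 0.15
+    }
+```
+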
+### Continuous Improvement
+
+#### Learning Opportunities
+
+- **Common Patterns**: Identify frequently misclassified task types
+- **Training Updates**: Update decision trees based on lessons learned
+- **Tool Improvements**: Enhance triage guidance based on metrics
+
+#### Feedback Integration
+
+- **Developer Feedback**: Collect classification experience reports
+- **Review Feedback**: Analyze review findings for triage improvements
+- **Quality Metrics**: Use defect data to refine classification criteria
+
+## Training Module: Triage Decision Making
+
+### Module 1: Understanding Classifications
+
+**Objective**: Understand the difference between [SYNC] and [ASYNC] tasks
+**Content**:
+
+- Classification criteria with examples
+- Risk assessment frameworks
+- Common misclassification patterns
+
+### Module 2: Decision Tree Application
+
+**Objective**: Practice applying the triage decision tree
+**Content**:
+
+- Interactive decision tree walkthrough
+- Real-world task classification exercises
+- Peer review of classification decisions
+
+### Module 3: Risk Assessment
+
+**Objective**: Learn to assess misclassification risks
+**Content**:
+
+- Risk level determination
+- Impact analysis techniques
+- Mitigation strategy development
+
+### Module 4: Metrics and Improvement
+
+**Objective**: Understand triage effectiveness measurement
+**Content**:
+
+- Metrics definition and calculation
+- Improvement opportunity identification
+- Feedback integration processes
+
+## Implementation Checklist
+
+### For Each Feature
+
+- [ ] All tasks classified as [SYNC] or [ASYNC]
+- [ ] Classification rationale documented
+- [ ] Risk assessment completed
+- [ ] Peer review conducted
+- [ ] Triage decisions approved
+
+### For Each Sprint/Iteration
+
+- [ ] Classification accuracy measured
+- [ ] Review efficiency analyzed
+- [ ] Quality metrics collected
+- [ ] Improvement opportunities identified
+- [ ] Training modules updated
+
+## Appendix: Common Misclassification Patterns
+
+### False Positives ([SYNC] when should be [ASYNC])
+
+- Over-classifying standard CRUD operations
+- Treating well-tested components as high-risk
+- Misinterpreting "complex" as requiring human review
+
+### False Negatives ([ASYNC] when should be [SYNC])
+
+- Underestimating integration complexity
+- Ignoring security implications
+- Delegating ambiguous requirements
+
+### Mitigation Strategies
+
+- Regular calibration sessions
+- Peer review of classifications
+- Metrics-driven refinement
+- Clear escalation procedures
diff --git a/evals/.markdownlintignore b/evals/.markdownlintignore
new file mode 100644
index 0000000000..77f12ae2e5
--- /dev/null
+++ b/evals/.markdownlintignore
@@ -0,0 +1 @@
+docs/
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000000..375eabf29a
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,83 @@
+# Spec-Kit Evaluation Framework
+
+[![AI Evals](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml/badge.svg)](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml)
+
+Comprehensive evaluation infrastructure for testing spec-kit template quality using PromptFoo with Claude.
+
+## Current Evaluation Results (Updated: 2026-01-14)
+
+**Overall Pass Rate: 100% (10/10 tests passing)** ✅
+
+| Test Suite | Pass Rate | Status |
+|------------|-----------|--------|
+| **Spec Template** | 8/8 (100%) | ✅ |
+| **Plan Template** | 2/2 (100%) | ✅ |
+| **Total** | **10/10 (100%)** | ✅ |
+
+## Quick Start
+
+> **New to the eval framework?** Check out [docs/QUICK_REFERENCE.md](./docs/QUICK_REFERENCE.md) for a one-page overview of all commands, files, and workflows!
+
+### 1. Prerequisites
+
+```bash
+# Install Node.js (if not already installed)
+# macOS:
+brew install node
+
+# Verify installation
+node --version # Should be v18+
+npx --version # Comes with Node.js
+```
+
+### 2. Configure Environment Variables
+
+```bash
+# Add to ~/.bashrc or ~/.zshrc
+export LLM_BASE_URL="your-llm-base-url"
+export LLM_AUTH_TOKEN="your-api-key"
+export LLM_MODEL="claude-sonnet-4-5-20250929" # Optional, defaults to Sonnet 4.5
+
+# Reload shell
+source ~/.zshrc # or source ~/.bashrc
+```
+
+### 3. Run Evaluations
+
+```bash
+# From repo root - run all PromptFoo tests
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run and open web UI
+./evals/scripts/run-promptfoo-eval.sh --view
+
+# Use a specific model (overrides LLM_MODEL env var)
+./evals/scripts/run-promptfoo-eval.sh --model claude-opus-4-5-20251101
+```
+
+## Test Suite
+
+The evaluation includes **10 automated tests** covering:
+
+- **Spec Template (8 tests)**: Structure, clarity, security, completeness
+- **Plan Template (2 tests)**: Simplicity, constitution compliance
+
+For more details on the test suite and individual tests, see the `tests` array in the `promptfooconfig.js` files.
+
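+To run a single suite directly with the PromptFoo CLI rather than the wrapper script (a generic invocation; the wrapper may pass additional flags):
+
+```bash
+npx promptfoo@latest eval -c evals/configs/promptfooconfig-spec.js
+```
+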
+## Advanced Workflows
+
+For more advanced use cases, see our detailed workflow guides:
+
+- **[Error Analysis & Annotation](docs/WORKFLOWS.md)**: Deep dive into manual and automated error analysis, and how to use the annotation tool.
+- **[CI/CD & Local Testing](docs/GITHUB_ACTIONS_SETUP.md)**: Set up and run evaluations in GitHub Actions or test them locally with `act`.
+
+## Support
+
+For evaluation framework issues:
+
+- PromptFoo Discord:
+- PromptFoo GitHub: <https://github.com/promptfoo/promptfoo>
+
+For spec-kit specific questions:
+
+- Open issue: <https://github.com/tikalk/agentic-sdlc-spec-kit/issues>
diff --git a/evals/annotation-tool/README.md b/evals/annotation-tool/README.md
new file mode 100644
index 0000000000..d74c84f695
--- /dev/null
+++ b/evals/annotation-tool/README.md
@@ -0,0 +1,127 @@
+# Spec Annotation Tool
+
+A fast, keyboard-driven interface for reviewing generated specs, built with FastHTML: every review action is a single keystroke, and annotations save automatically.
+
+## Features
+
+- **Keyboard-driven navigation**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: See how many specs reviewed, passed, failed
+- **Notes**: Add notes for each spec
+- **Auto-save**: Annotations automatically saved to JSON
+- **Export**: Export all annotations with statistics
+- **Beautiful rendering**: Markdown specs rendered with syntax highlighting
+
+## Quick Start
+
+```bash
+# Run using the provided script (from project root)
+./evals/scripts/run-annotation-tool.sh
+```
+
+Or manually:
+
+```bash
+# Install dependencies with uv
+cd evals/annotation-tool
+uv venv
+source .venv/bin/activate # On Windows: .venv\Scripts\activate
+uv pip install python-fasthtml markdown
+
+# Run the app
+python app.py
+```
+
+Then open your browser to `http://localhost:5001` (or the port shown in the terminal).
+
+## Usage
+
+### Keyboard Shortcuts
+
+- **N** - Next spec
+- **P** - Previous spec
+- **1** - Mark as Pass (and advance to next)
+- **2** - Mark as Fail (and advance to next)
+
+### Workflow
+
+1. Review the spec content displayed
+2. Add notes in the text area (optional)
+3. Mark as Pass (1) or Fail (2)
+4. The tool automatically advances to the next spec
+5. Use Export button to save all annotations with timestamp
+
+### Output
+
+Annotations are saved to:
+
+- `annotations.json` - Current annotations (auto-saved)
+- `annotations_export_YYYYMMDD_HHMMSS.json` - Exported snapshots
+
+## Data Structure
+
+```json
+{
+ "exported_at": "2026-01-08T14:30:00",
+ "statistics": {
+ "total": 17,
+ "passed": 12,
+ "failed": 3,
+ "pending": 2,
+ "progress": 88.2
+ },
+ "annotations": {
+ "spec-001.md": {
+ "status": "pass",
+ "notes": "Good structure, all sections present",
+ "timestamp": "2026-01-08T14:25:00"
+ },
+ "spec-002.md": {
+ "status": "fail",
+ "notes": "Missing acceptance criteria",
+ "timestamp": "2026-01-08T14:26:00"
+ }
+ }
+}
+```
+
+## Customization
+
+### Change Specs Directory
+
+Edit `SPECS_DIR` in `app.py`:
+
+```python
+SPECS_DIR = Path("path/to/your/specs")
+```
+
+### Add Filtering
+
+The MVP doesn't include filtering by status yet. To add it (a sketch follows the list):
+
+1. Add filter buttons in the UI
+2. Modify `get_current_spec()` to filter specs list
+3. Reset index when filter changes
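+
+A minimal sketch of step 2, assuming the `AnnotationState` instance from `app.py` (the `status_filter` argument is new, not existing code):
+
+```python
+def filtered_specs(state, status_filter=None):
+    """Return spec paths whose annotation status matches the filter.
+
+    `state` is the AnnotationState from app.py; `status_filter` is one of
+    "pass", "fail", "pending", or None to disable filtering.
+    """
+    if status_filter is None:
+        return state.specs
+    return [
+        spec for spec in state.specs
+        if state.get_annotation(spec.name).get("status", "pending") == status_filter
+    ]
+```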
+
+### Add Semantic Search
+
+For advanced features, you can extend with:
+
+- Vector embeddings for similar spec clustering
+- AI-powered categorization suggestions
+- Bulk operations
+
+## Architecture
+
+- **FastHTML**: Lightweight web framework with HTMX
+- **Pico CSS**: Minimal, beautiful styling
+- **JSON storage**: Simple file-based persistence
+- **Markdown**: Renders spec content with code highlighting
+
+## Next Steps
+
+After using this tool for initial reviews:
+
+1. Export annotations for error analysis in Jupyter
+2. Use findings to extend PromptFoo tests
+3. Build LLM-as-Judge evaluators based on failure patterns
+4. Add discovered failure modes to CI/CD pipeline
diff --git a/evals/annotation-tool/app.py b/evals/annotation-tool/app.py
new file mode 100644
index 0000000000..438fbefc07
--- /dev/null
+++ b/evals/annotation-tool/app.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+"""
+FastHTML Annotation Tool for Spec Review
+Provides a fast, keyboard-driven interface for reviewing generated specs.
+"""
+
+from fasthtml.common import *
+from pathlib import Path
+import json
+from datetime import datetime
+import markdown
+
+# Initialize FastHTML app with Pico CSS
+app, rt = fast_app(pico=True)
+
+# Configuration
+SPECS_DIR = Path("../datasets/real-specs")
+ANNOTATIONS_FILE = Path("annotations.json")
+
+# Global state (in production, use proper state management)
+class AnnotationState:
+ def __init__(self):
+ self.load_specs()
+ self.load_annotations()
+ self.current_index = 0
+
+    def load_specs(self):
+        """Load all spec files (glob already yields an empty list when nothing matches)"""
+        self.specs = sorted(SPECS_DIR.glob("spec-*.md"))
+
+ def load_annotations(self):
+ """Load existing annotations"""
+ if ANNOTATIONS_FILE.exists():
+ with open(ANNOTATIONS_FILE, 'r') as f:
+ self.annotations = json.load(f)
+ else:
+ self.annotations = {}
+
+ def save_annotations(self):
+ """Save annotations to JSON"""
+ with open(ANNOTATIONS_FILE, 'w') as f:
+ json.dump(self.annotations, f, indent=2)
+
+ def get_current_spec(self):
+ """Get current spec content"""
+ if not self.specs or self.current_index >= len(self.specs):
+ return None, None
+
+ spec_path = self.specs[self.current_index]
+ with open(spec_path, 'r') as f:
+ content = f.read()
+
+ return spec_path.name, content
+
+ def get_annotation(self, spec_name):
+ """Get annotation for a spec"""
+ return self.annotations.get(spec_name, {
+ "status": "pending",
+ "notes": "",
+ "timestamp": None
+ })
+
+ def set_annotation(self, spec_name, status, notes=""):
+ """Set annotation for a spec"""
+ self.annotations[spec_name] = {
+ "status": status,
+ "notes": notes,
+ "timestamp": datetime.now().isoformat()
+ }
+ self.save_annotations()
+
+ def get_stats(self):
+ """Get annotation statistics"""
+ total = len(self.specs)
+ passed = sum(1 for a in self.annotations.values() if a["status"] == "pass")
+ failed = sum(1 for a in self.annotations.values() if a["status"] == "fail")
+ pending = total - passed - failed
+
+ return {
+ "total": total,
+ "passed": passed,
+ "failed": failed,
+ "pending": pending,
+ "progress": (passed + failed) / total * 100 if total > 0 else 0
+ }
+
+# Initialize state
+state = AnnotationState()
+
+@rt("/")
+def get():
+ """Main page"""
+ spec_name, content = state.get_current_spec()
+
+ if spec_name is None:
+ return Html(
+ Head(Title("Spec Annotation Tool")),
+ Body(
+ Main(
+ H1("No specs found"),
+ P(f"Please add spec files to {SPECS_DIR}"),
+ cls="container"
+ )
+ )
+ )
+
+ annotation = state.get_annotation(spec_name)
+ stats = state.get_stats()
+
+ # Convert markdown to HTML
+ html_content = markdown.markdown(content, extensions=['fenced_code', 'tables'])
+
+ # Status badge color
+ status_colors = {
+ "pass": "green",
+ "fail": "red",
+ "pending": "gray"
+ }
+
+ return Html(
+ Head(
+ Title("Spec Annotation Tool"),
+ Script(src="https://unpkg.com/htmx.org@1.9.10"),
+ Link(rel="stylesheet", href="https://cdn.jsdelivr.net/npm/@picocss/pico@2/css/pico.min.css"),
+ Style("""
+ .spec-content {
+ padding: 1rem;
+ background: var(--pico-background-color);
+ border-radius: 0.5rem;
+ margin: 1rem 0;
+ max-height: 60vh;
+ overflow-y: auto;
+ }
+ .controls {
+ display: flex;
+ gap: 1rem;
+ margin: 1rem 0;
+ flex-wrap: wrap;
+ }
+ .stats {
+ display: grid;
+ grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+ gap: 1rem;
+ margin: 1rem 0;
+ }
+ .stat-card {
+ padding: 1rem;
+ background: var(--pico-card-background-color);
+ border-radius: 0.5rem;
+ text-align: center;
+ }
+ .stat-value {
+ font-size: 2rem;
+ font-weight: bold;
+ }
+ .stat-label {
+ font-size: 0.9rem;
+ opacity: 0.8;
+ }
+ .progress-bar {
+ width: 100%;
+ height: 8px;
+ background: var(--pico-muted-color);
+ border-radius: 4px;
+ overflow: hidden;
+ margin: 1rem 0;
+ }
+ .progress-fill {
+ height: 100%;
+ background: var(--pico-primary);
+ transition: width 0.3s ease;
+ }
+ .keyboard-help {
+ font-size: 0.9rem;
+ opacity: 0.7;
+ padding: 1rem;
+ background: var(--pico-card-background-color);
+ border-radius: 0.5rem;
+ margin: 1rem 0;
+ }
+ .status-badge {
+ display: inline-block;
+ padding: 0.25rem 0.75rem;
+ border-radius: 1rem;
+ font-size: 0.9rem;
+ font-weight: bold;
+ }
+ .status-pass { background: #2ecc71; color: white; }
+ .status-fail { background: #e74c3c; color: white; }
+ .status-pending { background: #95a5a6; color: white; }
+ """)
+ ),
+ Body(
+ Main(
+ H1("Spec Annotation Tool"),
+
+ # Stats
+ Div(
+ Div(
+ Div(str(stats['total']), cls="stat-value"),
+ Div("Total", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['passed']), cls="stat-value"),
+ Div("Passed", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['failed']), cls="stat-value"),
+ Div("Failed", cls="stat-label"),
+ cls="stat-card"
+ ),
+ Div(
+ Div(str(stats['pending']), cls="stat-value"),
+ Div("Pending", cls="stat-label"),
+ cls="stat-card"
+ ),
+ cls="stats"
+ ),
+
+ # Progress bar
+ Div(
+ Div(style=f"width: {stats['progress']}%", cls="progress-fill"),
+ cls="progress-bar"
+ ),
+
+ # Current spec info
+ H2(f"Spec {state.current_index + 1} of {len(state.specs)}: {spec_name}"),
+ Span(annotation['status'].upper(), cls=f"status-badge status-{annotation['status']}"),
+
+ # Spec content
+ Article(
+ NotStr(html_content),
+ cls="spec-content"
+ ),
+
+ # Notes
+ Form(
+ Textarea(
+ annotation.get('notes', ''),
+ name="notes",
+ placeholder="Add notes about this spec...",
+ rows="3"
+ ),
+ Input(type="hidden", name="spec_name", value=spec_name),
+ Button("Save Notes", type="submit"),
+ hx_post="/save-notes",
+ hx_target="#message",
+ hx_swap="innerHTML"
+ ),
+ Div(id="message"),
+
+ # Controls
+ Div(
+                    Button("← Previous (P)",
+                           hx_post="/prev",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="p"),
+                    Button("Next (N) →",
+                           hx_post="/next",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="n"),
+                    Button("✓ Pass (1)",
+                           hx_post=f"/annotate/pass/{spec_name}",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="1",
+                           cls="contrast"),
+                    Button("✗ Fail (2)",
+                           hx_post=f"/annotate/fail/{spec_name}",
+                           hx_target="body",
+                           hx_swap="outerHTML",
+                           accesskey="2"),
+ Button("Export JSON",
+ hx_get="/export",
+ hx_target="#message"),
+ cls="controls"
+ ),
+
+ # Keyboard help
+ Details(
+ Summary("Keyboard Shortcuts"),
+ Div(
+ P(Strong("N"), " - Next spec"),
+ P(Strong("P"), " - Previous spec"),
+ P(Strong("1"), " - Mark as Pass"),
+ P(Strong("2"), " - Mark as Fail"),
+ cls="keyboard-help"
+ )
+ ),
+
+ cls="container"
+ ),
+ # JavaScript for additional keyboard shortcuts
+ Script("""
+ document.addEventListener('keydown', function(e) {
+ // Ignore if user is typing in textarea
+ if (e.target.tagName === 'TEXTAREA' || e.target.tagName === 'INPUT') {
+ return;
+ }
+
+ switch(e.key.toLowerCase()) {
+ case 'n':
+ document.querySelector('button[accesskey="n"]').click();
+ break;
+ case 'p':
+ document.querySelector('button[accesskey="p"]').click();
+ break;
+ case '1':
+ document.querySelector('button[accesskey="1"]').click();
+ break;
+ case '2':
+ document.querySelector('button[accesskey="2"]').click();
+ break;
+ }
+ });
+ """)
+ )
+ )
+
+@rt("/next")
+def post():
+ """Navigate to next spec"""
+ if state.current_index < len(state.specs) - 1:
+ state.current_index += 1
+ return get()
+
+@rt("/prev")
+def post():
+ """Navigate to previous spec"""
+ if state.current_index > 0:
+ state.current_index -= 1
+ return get()
+
+@rt("/annotate/{status}/{spec_name}")
+def post(status: str, spec_name: str):
+ """Annotate spec with pass/fail"""
+ annotation = state.get_annotation(spec_name)
+ state.set_annotation(spec_name, status, annotation.get('notes', ''))
+
+ # Auto-advance to next spec
+ if state.current_index < len(state.specs) - 1:
+ state.current_index += 1
+
+ # Return redirect header for HTMX
+ return Response(status_code=200, headers={'HX-Redirect': '/'})
+
+@rt("/save-notes")
+def post(spec_name: str, notes: str):
+ """Save notes for current spec"""
+ annotation = state.get_annotation(spec_name)
+ state.set_annotation(spec_name, annotation.get('status', 'pending'), notes)
+ return Div(
+ P("Notes saved!", style="color: green; margin-top: 0.5rem;"),
+ id="message"
+ )
+
+@rt("/export")
+def get():
+ """Export annotations to JSON"""
+ stats = state.get_stats()
+ export_data = {
+ "exported_at": datetime.now().isoformat(),
+ "statistics": stats,
+ "annotations": state.annotations
+ }
+
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ export_path = Path(f"annotations_export_{timestamp}.json")
+
+ with open(export_path, 'w') as f:
+ json.dump(export_data, f, indent=2)
+
+ return Div(
+ P(f"Exported to {export_path}", style="color: green; margin-top: 0.5rem;"),
+ id="message"
+ )
+
+if __name__ == "__main__":
+ serve()
diff --git a/evals/configs/promptfooconfig-plan.js b/evals/configs/promptfooconfig-plan.js
new file mode 100644
index 0000000000..3c34d1875e
--- /dev/null
+++ b/evals/configs/promptfooconfig-plan.js
@@ -0,0 +1,63 @@
+// PromptFoo configuration for Plan Template tests only
+module.exports = {
+ description: 'Plan Template Quality Evaluation',
+
+ // Plan prompt only
+ prompts: ['file://../prompts/plan-prompt.txt'],
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ tests: [
+ // Test 7: Plan Template - Simplicity Gate
+ {
+      description: 'Plan Template: Simple app should have ≤3 projects',
+ vars: {
+ user_input: 'Plan a simple todo app with Node.js and SQLite',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_simplicity_gate' },
+ // Note: Removed not-icontains checks - our custom grader is context-aware
+ // and handles "no microservices" vs "use microservices" correctly
+ ],
+ },
+
+ // Test 8: Plan Template - Constitution Compliance
+ {
+ description: 'Plan Template: No over-engineering for basic CRUD API',
+ vars: {
+ user_input: 'Plan a basic REST API for CRUD operations on a todo list',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_constitution_compliance' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Is the architecture appropriately simple for a basic CRUD API?\nCheck for:\n- No unnecessary complexity (service mesh, event sourcing, CQRS)\n- No over-engineered infrastructure (Kubernetes for simple app)\n- Direct framework usage (no unnecessary wrappers)\nReturn 1.0 if appropriately simple, 0.0 if over-engineered.',
+ threshold: 0.8,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/configs/promptfooconfig-spec.js b/evals/configs/promptfooconfig-spec.js
new file mode 100644
index 0000000000..4f3d1002f0
--- /dev/null
+++ b/evals/configs/promptfooconfig-spec.js
@@ -0,0 +1,162 @@
+// PromptFoo configuration for Spec Template tests only
+module.exports = {
+ description: 'Spec Template Quality Evaluation',
+
+ // Spec prompt only
+ prompts: ['file://../prompts/spec-prompt.txt'],
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ tests: [
+ // Test 1: Basic Spec Structure
+ {
+ description: 'Spec Template: Basic CRUD app - Structure validation',
+ vars: {
+ user_input:
+ 'Build a task management app where users can create, edit, delete, and view tasks. Each task has a title, description, due date, and priority (low/medium/high).',
+ },
+ assert: [
+ { type: 'icontains', value: 'overview' },
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ { type: 'icontains', value: 'non-functional' },
+ { type: 'icontains', value: 'edge case' },
+ ],
+ },
+
+ // Test 2: No Premature Tech Stack
+ {
+ description: 'Spec Template: Should not include tech stack details',
+ vars: {
+ user_input: 'Build a REST API for managing user profiles with CRUD operations',
+ },
+ assert: [
+ { type: 'not-icontains', value: 'React' },
+ { type: 'not-icontains', value: 'Node.js' },
+ { type: 'not-icontains', value: 'Express' },
+ { type: 'not-icontains', value: 'MongoDB' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if this specification avoids technical implementation details.\nIt should focus on WHAT needs to be built, not HOW to build it.\nReturn 1.0 if no tech stack is mentioned, 0.5 if some mentioned, 0.0 if heavy tech details.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // Test 3: Quality - User Stories
+ {
+ description: 'Spec Template: Has clear user stories with acceptance criteria',
+ vars: {
+ user_input: 'Create an authentication system with email/password login and social OAuth',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade the specification on user story quality (0-1):\n1. Are there 5+ user stories?\n2. Do stories follow "As a [role], I want [feature], so that [benefit]" format?\n3. Does each story have clear acceptance criteria?\n4. Are the criteria measurable and testable?\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // Test 4: Clarity - No Vague Terms
+ {
+ description: 'Spec Template: Flags vague requirements',
+ vars: {
+ user_input: 'Build a fast, scalable, user-friendly dashboard with good performance',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if vague terms like "fast", "scalable", "user-friendly", "good performance"\nare either:\n1. Quantified with specific metrics (e.g., "response time < 200ms")\n2. Marked with [NEEDS CLARIFICATION] or similar flags\n\nReturn 1.0 if all vague terms are handled properly, 0.0 if none are.',
+ threshold: 0.7,
+ },
+ ],
+ },
+
+ // Test 5: Security Requirements
+ {
+ description: 'Spec Template: Security-critical features include security requirements',
+ vars: {
+ user_input: 'Create a payment processing system with credit card handling and transaction history',
+ },
+ assert: [
+ { type: 'icontains', value: 'security' },
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_security_completeness' },
+ ],
+ },
+
+ // Test 6: Edge Cases Coverage
+ {
+ description: 'Spec Template: Includes edge cases and error scenarios',
+ vars: {
+ user_input: 'Build a file upload system supporting multiple file types up to 100MB',
+ },
+ assert: [
+ { type: 'icontains', value: 'edge case' },
+ // Using Python grader instead of LLM rubric for more reliable results
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_edge_cases_coverage' },
+ ],
+ },
+
+ // Test 9: Completeness Score
+ {
+ description: 'Spec Template: E-commerce checkout has comprehensive requirements',
+ vars: {
+ user_input: 'Build an e-commerce checkout flow with cart, payment, and order confirmation',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade completeness (0-1):\n1. Are functional requirements complete? (cart operations, payment, confirmation)\n2. Are user stories covering main flows?\n3. Are non-functional requirements specified? (performance, security)\n4. Are edge cases identified? (payment failures, session timeout)\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // Test 10: Regression Test - Basic Structure
+ {
+ description: 'Regression: Spec template maintains required sections',
+ vars: {
+ user_input: 'Simple feature: Add a search bar to existing app',
+ },
+ assert: [
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ {
+ type: 'javascript',
+ value: `
+ // Count major sections (## or #)
+ const sections = output.split(/^#{1,2} /gm).length - 1;
+ return sections >= 4;
+ `,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/configs/promptfooconfig.js b/evals/configs/promptfooconfig.js
new file mode 100644
index 0000000000..5ed518dc64
--- /dev/null
+++ b/evals/configs/promptfooconfig.js
@@ -0,0 +1,224 @@
+// PromptFoo configuration using JavaScript for environment variable support
+module.exports = {
+ description: 'Spec-Kit Quality Evaluation',
+
+ // Configure LLM provider using OpenAI-compatible endpoint
+ providers: [
+ {
+ id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ label: `Claude ${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+ config: {
+ // AI API Gateway exposes an OpenAI-compatible endpoint at /chat/completions
+ apiBaseUrl: process.env.LLM_BASE_URL,
+ apiKey: process.env.LLM_AUTH_TOKEN,
+ temperature: 0.7,
+ max_tokens: 4000,
+ },
+ // Also set the env vars that PromptFoo looks for
+ env: {
+ OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+ OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+ },
+ },
+ ],
+
+ // Default test configuration
+ defaultTest: {
+ options: {
+ provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+ },
+ },
+
+ // Test suite
+ tests: [
+ // ================================
+ // Test 1: Basic Spec Structure
+ // ================================
+ {
+ description: 'Spec Template: Basic CRUD app - Structure validation',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input:
+ 'Build a task management app where users can create, edit, delete, and view tasks. Each task has a title, description, due date, and priority (low/medium/high).',
+ },
+ assert: [
+ { type: 'icontains', value: 'overview' },
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' }, // Matches "User Stories" or "User Story"
+ { type: 'icontains', value: 'non-functional' },
+ { type: 'icontains', value: 'edge case' },
+ ],
+ },
+
+ // ================================
+ // Test 2: No Premature Tech Stack
+ // ================================
+ {
+ description: 'Spec Template: Should not include tech stack details',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a REST API for managing user profiles with CRUD operations',
+ },
+ assert: [
+ { type: 'not-icontains', value: 'React' },
+ { type: 'not-icontains', value: 'Node.js' },
+ { type: 'not-icontains', value: 'Express' },
+ { type: 'not-icontains', value: 'MongoDB' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if this specification avoids technical implementation details.\nIt should focus on WHAT needs to be built, not HOW to build it.\nReturn 1.0 if no tech stack is mentioned, 0.5 if some mentioned, 0.0 if heavy tech details.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 3: Quality - User Stories
+ // ================================
+ {
+ description: 'Spec Template: Has clear user stories with acceptance criteria',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Create an authentication system with email/password login and social OAuth',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade the specification on user story quality (0-1):\n1. Are there 5+ user stories?\n2. Do stories follow "As a [role], I want [feature], so that [benefit]" format?\n3. Does each story have clear acceptance criteria?\n4. Are the criteria measurable and testable?\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 4: Clarity - No Vague Terms
+ // ================================
+ {
+ description: 'Spec Template: Flags vague requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a fast, scalable, user-friendly dashboard with good performance',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Check if vague terms like "fast", "scalable", "user-friendly", "good performance"\nare either:\n1. Quantified with specific metrics (e.g., "response time < 200ms")\n2. Marked with [NEEDS CLARIFICATION] or similar flags\n\nReturn 1.0 if all vague terms are handled properly, 0.0 if none are.',
+ threshold: 0.7,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 5: Security Requirements
+ // ================================
+ {
+ description: 'Spec Template: Security-critical features include security requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Create a payment processing system with credit card handling and transaction history',
+ },
+ assert: [
+ { type: 'icontains', value: 'security' },
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_security_completeness' },
+ ],
+ },
+
+ // ================================
+ // Test 6: Edge Cases Coverage
+ // ================================
+ {
+ description: 'Spec Template: Includes edge cases and error scenarios',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build a file upload system supporting multiple file types up to 100MB',
+ },
+ assert: [
+ { type: 'icontains', value: 'edge case' },
+ // Using Python grader instead of LLM rubric for more reliable results
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_edge_cases_coverage' },
+ ],
+ },
+
+ // ================================
+ // Test 7: Plan Template - Simplicity Gate
+ // ================================
+ {
+      description: 'Plan Template: Simple app should have ≤3 projects',
+ prompt: 'file://../prompts/plan-prompt.txt',
+ vars: {
+ user_input: 'Plan a simple todo app with Node.js and SQLite',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_simplicity_gate' },
+ // Note: Removed not-icontains checks - our custom grader is context-aware
+ // and handles "no microservices" vs "use microservices" correctly
+ ],
+ },
+
+ // ================================
+ // Test 8: Plan Template - Constitution Compliance
+ // ================================
+ {
+ description: 'Plan Template: No over-engineering for basic CRUD API',
+ prompt: 'file://../prompts/plan-prompt.txt',
+ vars: {
+ user_input: 'Plan a basic REST API for CRUD operations on a todo list',
+ },
+ assert: [
+ { type: 'python', value: 'file://../graders/custom_graders.py:check_constitution_compliance' },
+ {
+ type: 'llm-rubric',
+ value:
+ 'Is the architecture appropriately simple for a basic CRUD API?\nCheck for:\n- No unnecessary complexity (service mesh, event sourcing, CQRS)\n- No over-engineered infrastructure (Kubernetes for simple app)\n- Direct framework usage (no unnecessary wrappers)\nReturn 1.0 if appropriately simple, 0.0 if over-engineered.',
+ threshold: 0.8,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 9: Completeness Score
+ // ================================
+ {
+ description: 'Spec Template: E-commerce checkout has comprehensive requirements',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Build an e-commerce checkout flow with cart, payment, and order confirmation',
+ },
+ assert: [
+ {
+ type: 'llm-rubric',
+ value:
+ 'Grade completeness (0-1):\n1. Are functional requirements complete? (cart operations, payment, confirmation)\n2. Are user stories covering main flows?\n3. Are non-functional requirements specified? (performance, security)\n4. Are edge cases identified? (payment failures, session timeout)\nReturn average score 0-1.',
+ threshold: 0.75,
+ },
+ ],
+ },
+
+ // ================================
+ // Test 10: Regression Test - Basic Structure
+ // ================================
+ {
+ description: 'Regression: Spec template maintains required sections',
+ prompt: 'file://../prompts/spec-prompt.txt',
+ vars: {
+ user_input: 'Simple feature: Add a search bar to existing app',
+ },
+ assert: [
+ { type: 'icontains', value: 'functional requirements' },
+ { type: 'icontains', value: 'user stor' },
+ // Even simple features should have structure
+ {
+ type: 'javascript',
+ value: `
+ // Count major sections (## or #)
+ const sections = output.split(/^#{1,2} /gm).length - 1;
+ return sections >= 4; // Lowered from 5 to be more flexible
+ `,
+ },
+ ],
+ },
+ ],
+};
diff --git a/evals/datasets/real-plans/plan-001.md b/evals/datasets/real-plans/plan-001.md
new file mode 100644
index 0000000000..a4eda957bd
--- /dev/null
+++ b/evals/datasets/real-plans/plan-001.md
@@ -0,0 +1,313 @@
+# Test Plan 1
+
+## Prompt
+
+Plan a simple todo app with Node.js and SQLite
+
+## Generated Plan
+
+## Implementation Plan: Simple Todo App with Node.js and SQLite
+
+### 1. Overview
+
+A straightforward todo application built with Node.js and SQLite that allows users to create, read, update, and delete todo items. The application will feature a web-based interface with a RESTful API backend, all integrated into a single cohesive project for simplicity.
+
+**Technology Stack:**
+
+- Backend: Node.js with Express.js
+- Database: SQLite3
+- Frontend: HTML, CSS, vanilla JavaScript (served by Express)
+- No authentication (can be added later if needed)
+
+## 2. Project Structure
+
+### Single Project: `todo-app`
+
+**Rationale:** This is a simple CRUD application that doesn't require separation of concerns. Combining frontend and backend into one project reduces complexity, simplifies deployment, and maintains all code in one repository.
+
+**Project Count: 1** ✅ (within limit)
+
+---
+
+## 3. Project Details
+
+### Project: `todo-app`
+
+**Scope:**
+
+- RESTful API endpoints for todo operations
+- SQLite database for data persistence
+- Static frontend interface served by Express
+- Basic error handling and validation
+
+**Boundaries:**
+
+- Single-user application (no multi-tenancy)
+- No authentication/authorization (Phase 1)
+- No external integrations
+- Local deployment only (initially)
+
+#### Key Tasks and Milestones
+
+#### Milestone 1: Project Setup (Day 1)
+
+- [ ] Initialize Node.js project with npm
+- [ ] Install dependencies: express, sqlite3, body-parser
+- [ ] Create folder structure:
+
+```text
+ todo-app/
+ ├── server.js          # Express server entry point
+ ├── database.js        # SQLite connection and queries
+ ├── routes/
+ │   └── todos.js       # Todo API routes
+ ├── public/
+ │   ├── index.html     # Frontend interface
+ │   ├── styles.css     # Styling
+ │   └── app.js         # Frontend JavaScript
+ ├── package.json
+ └── .gitignore
+```
+
+- [ ] Set up basic Express server
+- [ ] Configure SQLite database connection
+
+#### Milestone 2: Database Layer (Day 1-2)
+
+- [ ] Create todos table schema:
+
+```sql
+ CREATE TABLE todos (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title TEXT NOT NULL,
+ description TEXT,
+ completed BOOLEAN DEFAULT 0,
+ created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
+ updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
+ );
+```
+
+- [ ] Implement database initialization function
+- [ ] Create CRUD query functions:
+ - `getAllTodos()`
+ - `getTodoById(id)`
+ - `createTodo(title, description)`
+ - `updateTodo(id, updates)`
+ - `deleteTodo(id)`
+
+#### Milestone 3: API Endpoints (Day 2-3)
+
+- [ ] `GET /api/todos` - Retrieve all todos
+- [ ] `GET /api/todos/:id` - Retrieve single todo
+- [ ] `POST /api/todos` - Create new todo
+- [ ] `PUT /api/todos/:id` - Update existing todo
+- [ ] `DELETE /api/todos/:id` - Delete todo
+- [ ] Add input validation middleware
+- [ ] Implement error handling middleware
+
+#### Milestone 4: Frontend Interface (Day 3-4)
+
+- [ ] Create HTML structure with:
+ - Input form for new todos
+ - List display for existing todos
+ - Edit/Delete buttons per todo
+ - Checkbox for completion status
+- [ ] Style with CSS (clean, responsive design)
+- [ ] Implement JavaScript functionality:
+ - Fetch and display todos on load
+ - Add new todo via form submission
+ - Toggle completion status
+ - Edit todo inline
+ - Delete todo with confirmation
+ - Handle API errors gracefully
+
+#### Milestone 5: Testing & Polish (Day 4-5)
+
+- [ ] Manual testing of all CRUD operations
+- [ ] Test edge cases (empty inputs, special characters)
+- [ ] Add loading states in UI
+- [ ] Improve error messages
+- [ ] Add README with setup instructions
+- [ ] Code cleanup and comments
+
+#### Dependencies
+
+- None (standalone project)
+
+#### Testing Strategy
+
+**Manual Testing:**
+
+1. **Database Operations:**
+ - Verify table creation on first run
+ - Test each CRUD function directly
+ - Check data persistence across server restarts
+
+2. **API Testing:**
+ - Use Postman or curl to test each endpoint
+ - Verify correct status codes (200, 201, 404, 500)
+ - Test with valid and invalid inputs
+ - Check JSON response formats
+
+3. **Frontend Testing:**
+ - Test in Chrome, Firefox, Safari
+ - Verify responsive design on mobile
+ - Test all user interactions
+ - Check console for JavaScript errors
+
+**Optional: Automated Testing (if time permits):**
+
+- Unit tests for database functions using Jest
+- API endpoint tests using Supertest
+- Simple integration tests
+
+---
+
+## 4. Development Approach
+
+### Phase 1: Core Functionality (Days 1-5)
+
+- Complete all milestones listed above
+- Focus on working features over perfection
+- Keep code simple and readable
+
+### Phase 2: Enhancements (Optional, Future)
+
+- Add user authentication
+- Implement todo categories/tags
+- Add due dates and priorities
+- Search and filter functionality
+- Data export/import
+
+### Deployment Strategy
+
+**Local Development:**
+
+```bash
+npm install
+npm start
+# Access at http://localhost:3000
+```
+
+**Simple Production Deployment:**
+
+- Option 1: VPS with Node.js (DigitalOcean, Linode)
+- Option 2: Platform-as-a-Service (Heroku, Render.com)
+- Use environment variables for configuration
+- SQLite file persisted in mounted volume
+
+**Deployment Steps:**
+
+1. Set `NODE_ENV=production`
+2. Configure port via environment variable
+3. Ensure SQLite database directory is writable
+4. Use process manager (PM2) for production
+5. Set up basic logging
+
+---
+
+## 5. Verification Steps
+
+### Project Verification Checklist
+
+**✅ Setup Verification:**
+
+- [ ] `npm install` runs without errors
+- [ ] Server starts on configured port
+- [ ] Database file is created automatically
+- [ ] Frontend loads at root URL
+
+**✅ Database Verification:**
+
+- [ ] Todos table exists with correct schema
+- [ ] Can insert data directly via SQL
+- [ ] Data persists after server restart
+- [ ] No SQL injection vulnerabilities
+
+**✅ API Verification:**
+
+- [ ] All endpoints return correct status codes
+- [ ] POST creates todo and returns new ID
+- [ ] GET retrieves all todos correctly
+- [ ] PUT updates only specified fields
+- [ ] DELETE removes todo from database
+- [ ] Invalid IDs return 404
+- [ ] Malformed requests return 400
+
+**✅ Frontend Verification:**
+
+- [ ] Page loads without console errors
+- [ ] Can add new todo via form
+- [ ] Todos display immediately after creation
+- [ ] Can toggle completion status
+- [ ] Can edit todo title/description
+- [ ] Can delete todo with confirmation
+- [ ] UI updates reflect database state
+- [ ] Error messages display appropriately
+
+**✅ Integration Verification:**
+
+- [ ] Frontend and backend communicate correctly
+- [ ] CORS issues resolved (if applicable)
+- [ ] Data consistency between UI and database
+- [ ] No race conditions in concurrent operations
+
+**✅ Code Quality Verification:**
+
+- [ ] Code is properly commented
+- [ ] No hardcoded configuration values
+- [ ] Error handling covers edge cases
+- [ ] README includes setup instructions
+- [ ] Git repository has meaningful commits
+
+---
+
+## 6. Success Criteria
+
+The implementation is complete when:
+
+1. ✅ User can perform all CRUD operations via the web interface
+2. ✅ Data persists across application restarts
+3. ✅ Application runs without errors in console/logs
+4. ✅ Code is clean, commented, and maintainable
+5. ✅ README provides clear setup instructions
+6. ✅ All verification steps pass
+
+---
+
+## 7. Estimated Timeline
+
+- **Total Duration:** 4-5 days
+- **Lines of Code (estimated):** 500-700
+- **Complexity:** Low to Medium
+
+**Daily Breakdown:**
+
+- Day 1: Setup + Database (4-6 hours)
+- Day 2: API Development (4-6 hours)
+- Day 3: Frontend Development (4-6 hours)
+- Day 4: Integration + Testing (3-4 hours)
+- Day 5: Polish + Documentation (2-3 hours)
+
+---
+
+## 8. Risk Mitigation
+
+| Risk | Impact | Mitigation |
+|------|--------|------------|
+| SQLite file corruption | High | Regular backups, proper connection handling |
+| Concurrent write conflicts | Medium | SQLite handles via locking, keep operations simple |
+| Frontend-backend sync issues | Medium | Use proper HTTP status codes, implement retry logic |
+| Scope creep | Medium | Stick to core features, document future enhancements |
+
+---
+
+## Conclusion
+
+This plan delivers a fully functional todo application using a **single-project architecture** that maintains simplicity while providing all essential features. The monolithic approach is appropriate for this use case and allows for rapid development and easy maintenance. Future enhancements can be added incrementally without requiring architectural changes.
+
+## Review Notes
+
+- [ ] Pass/Fail: Pass (from automated eval)
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-plans/plan-002.md b/evals/datasets/real-plans/plan-002.md
new file mode 100644
index 0000000000..4808f0caaf
--- /dev/null
+++ b/evals/datasets/real-plans/plan-002.md
@@ -0,0 +1,433 @@
+# Test Plan 2
+
+## Prompt
+
+Plan a basic REST API for CRUD operations on a todo list
+
+## Generated Plan
+
+## Implementation Plan: Todo List REST API
+
+### 1. Overview
+
+A simple REST API service for managing a todo list with full CRUD (Create, Read, Update, Delete) operations. The API will allow users to create todos, retrieve them (individually or as a list), update their status and content, and delete them. This will be implemented as a single, straightforward backend service with persistent storage.
+
+**Technology Stack:**
+
+- **Runtime:** Node.js with Express.js (simple, well-documented, ideal for REST APIs)
+- **Database:** SQLite (file-based, zero configuration, perfect for todo apps)
+- **Validation:** Express-validator (built-in request validation)
+- **Testing:** Jest + Supertest (API endpoint testing)
+
+## 2. Project Structure
+
+### Project 1: Todo API Service
+
+**Single monolithic backend service** - All API logic, data access, and business rules in one deployable unit.
+
+**Rationale:** A todo list CRUD API is a straightforward application with no complex domain separation. Splitting into multiple services would introduce unnecessary complexity, deployment overhead, and inter-service communication without any real benefit.
+
+---
+
+## 3. Project Details
+
+### Project 1: Todo API Service
+
+#### Scope and Boundaries
+
+- RESTful API endpoints for todo management
+- Data persistence layer (SQLite database)
+- Input validation and error handling
+- Basic logging for debugging
+- API documentation (inline comments + README)
+
+#### File Structure
+
+```text
+todo-api/
+├── src/
+│   ├── index.js              # Entry point, server setup
+│   ├── routes/
+│   │   └── todos.js          # Todo route handlers
+│   ├── models/
+│   │   └── todo.js           # Data access layer
+│   ├── middleware/
+│   │   ├── validation.js     # Request validators
+│   │   └── errorHandler.js   # Global error handling
+│   └── db/
+│       └── database.js       # Database initialization
+├── tests/
+│   └── todos.test.js         # API endpoint tests
+├── package.json
+├── .env.example
+└── README.md
+```
+
+#### API Endpoints
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| GET | `/api/todos` | Get all todos (with optional filters: ?completed=true) |
+| GET | `/api/todos/:id` | Get a specific todo by ID |
+| POST | `/api/todos` | Create a new todo |
+| PUT | `/api/todos/:id` | Update an existing todo |
+| DELETE | `/api/todos/:id` | Delete a todo |
+
+#### Data Model
+
+```javascript
+Todo {
+ id: INTEGER (auto-increment, primary key)
+ title: TEXT (required, max 200 chars)
+ description: TEXT (optional, max 1000 chars)
+ completed: BOOLEAN (default: false)
+ createdAt: DATETIME (auto-generated)
+ updatedAt: DATETIME (auto-updated)
+}
+```
+
+#### Key Tasks and Milestones
+
+##### Phase 1: Foundation (Day 1)
+
+- [ ] Initialize Node.js project with Express
+- [ ] Set up SQLite database connection
+- [ ] Create todos table schema with migration
+- [ ] Implement basic server with health check endpoint (`/health`)
+- [ ] Configure environment variables (port, database path)
+
+##### Phase 2: Core CRUD Operations (Day 2)
+
+- [ ] Implement POST `/api/todos` - Create todo
+- [ ] Implement GET `/api/todos` - List all todos
+- [ ] Implement GET `/api/todos/:id` - Get single todo
+- [ ] Implement PUT `/api/todos/:id` - Update todo
+- [ ] Implement DELETE `/api/todos/:id` - Delete todo
+- [ ] Add data access layer for database operations
+
+##### Phase 3: Validation & Error Handling (Day 3)
+
+- [ ] Add input validation middleware (title required, length limits)
+- [ ] Implement global error handler
+- [ ] Add 404 handling for non-existent resources
+- [ ] Validate ID parameters (numeric, exists in DB)
+- [ ] Return appropriate HTTP status codes (200, 201, 204, 400, 404, 500)
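+
+A plausible shape for the validation middleware, using express-validator's chain API; the length limits mirror the data model above:
+
+```javascript
+// src/middleware/validation.js: request-validation sketch with express-validator
+const { body, param, validationResult } = require('express-validator');
+
+// Respond with 400 if any preceding validator in the chain failed
+const handleValidationErrors = (req, res, next) => {
+  const errors = validationResult(req);
+  if (!errors.isEmpty()) {
+    return res.status(400).json({ errors: errors.array() });
+  }
+  next();
+};
+
+const validateCreateTodo = [
+  body('title').isString().trim().notEmpty().isLength({ max: 200 }),
+  body('description').optional().isString().isLength({ max: 1000 }),
+  handleValidationErrors,
+];
+
+const validateIdParam = [
+  param('id').isInt({ min: 1 }).toInt(), // rejects "invalid", coerces "1" to 1
+  handleValidationErrors,
+];
+
+module.exports = { validateCreateTodo, validateIdParam };
+```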
+
+##### Phase 4: Testing & Documentation (Day 4)
+
+- [ ] Write unit tests for data access layer
+- [ ] Write integration tests for all API endpoints
+- [ ] Test error scenarios (invalid input, missing resources)
+- [ ] Document API endpoints in README
+- [ ] Add example requests/responses
+
+#### Dependencies
+
+- **External:** None (standalone service)
+- **NPM Packages:**
+ - `express` - Web framework
+ - `better-sqlite3` - Synchronous SQLite client
+ - `express-validator` - Request validation
+ - `dotenv` - Environment configuration
+ - `jest` & `supertest` - Testing (dev dependencies)
+ - `nodemon` - Auto-reload during development (dev dependency)
+
+#### Testing Strategy
+
+**Unit Tests:**
+
+- Database operations (CRUD functions in isolation)
+- Validation logic
+- Error handling utilities
+
+**Integration Tests:**
+
+- Full request/response cycle for each endpoint
+- Test successful operations (200, 201 responses)
+- Test error cases (400, 404 responses)
+- Test edge cases (empty list, updating non-existent todo)
+- Test query parameters (filtering by completed status)
+
+**Test Coverage Goals:**
+
+- Minimum 80% code coverage
+- 100% endpoint coverage
+- All error paths tested
+
+**Testing Tools:**
+
+```bash
+# Run all tests
+npm test
+
+# Run with coverage report
+npm run test:coverage
+
+# Run in watch mode during development
+npm run test:watch
+```
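+
+For reference, one possible shape for the endpoint tests; it assumes `src/index.js` exports the Express app (as in the entry-point sketch earlier) so Supertest can drive it without binding a port:
+
+```javascript
+// tests/todos.test.js: integration-test sketch with Jest + Supertest
+const request = require('supertest');
+const app = require('../src/index');
+
+describe('POST /api/todos', () => {
+  it('creates a new todo and returns 201', async () => {
+    const res = await request(app)
+      .post('/api/todos')
+      .send({ title: 'Test todo' })
+      .expect(201);
+    expect(res.body.title).toBe('Test todo');
+    expect(res.body.completed).toBe(false);
+  });
+
+  it('rejects a missing title with 400', async () => {
+    await request(app).post('/api/todos').send({}).expect(400);
+  });
+});
+```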
+
+---
+
+## 4. Development Approach
+
+### Phasing Strategy
+
+#### Phase 1: MVP (Minimum Viable Product) - Days 1-2
+
+- Basic CRUD operations working
+- Data persists in SQLite
+- Manual testing via Postman/curl
+- **Deliverable:** Functional API with all endpoints
+
+#### Phase 2: Hardening - Days 3-4
+
+- Add validation and error handling
+- Write comprehensive tests
+- Document API
+- **Deliverable:** Production-ready service with tests
+
+#### Phase 3: Enhancements (Optional/Future)
+
+- Add pagination for GET `/api/todos`
+- Add search/filter capabilities
+- Add due dates to todos
+- Add priority levels
+
+### Development Workflow
+
+1. Create feature branch from `main`
+2. Implement functionality
+3. Write tests (aim for test-first when possible)
+4. Run tests locally (`npm test`)
+5. Manual testing with REST client
+6. Merge to `main` when tests pass
+
+### Deployment Strategy
+
+**Local Development:**
+
+```bash
+npm install
+npm run dev # Uses nodemon for auto-reload
+```
+
+**Production Deployment:**
+
+```bash
+npm install --production
+npm start
+```
+
+**Environment Configuration:**
+
+- `PORT` - Server port (default: 3000)
+- `DATABASE_PATH` - SQLite file location (default: `./todos.db`)
+- `NODE_ENV` - Environment (development/production)
+
+**Deployment Options:**
+
+- **Simple:** Run directly on a VPS with the PM2 process manager
+- **Containerized:** Docker container (single Dockerfile)
+- **Platform:** Deploy to Heroku, Railway, or Render (all support Node.js natively)
+
+**Docker Setup (Optional):**
+
+```dockerfile
+FROM node:18-alpine
+WORKDIR /app
+COPY package*.json ./
+RUN npm ci --production
+COPY src/ ./src/
+EXPOSE 3000
+CMD ["node", "src/index.js"]
+```
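+
+With this Dockerfile, the standard `docker build -t todo-api .` and `docker run -p 3000:3000 todo-api` pair is enough to run the service. Note that the SQLite file then lives inside the container, so point `DATABASE_PATH` at a mounted volume if todos must survive container replacement.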
+
+---
+
+## 5. Verification Steps
+
+### Project 1: Todo API Service
+
+#### Functional Verification
+
+##### Step 1: Health Check
+
+```bash
+curl http://localhost:3000/health
+# Expected: {"status": "ok"}
+```
+
+##### Step 2: Create Todo
+
+```bash
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{"title": "Test todo", "description": "Test description"}'
+# Expected: 201 status, returns created todo with ID
+```
+
+##### Step 3: List Todos
+
+```bash
+curl http://localhost:3000/api/todos
+# Expected: 200 status, array containing created todo
+```
+
+##### Step 4: Get Single Todo
+
+```bash
+curl http://localhost:3000/api/todos/1
+# Expected: 200 status, returns todo with ID 1
+```
+
+##### Step 5: Update Todo
+
+```bash
+curl -X PUT http://localhost:3000/api/todos/1 \
+ -H "Content-Type: application/json" \
+ -d '{"completed": true}'
+# Expected: 200 status, returns updated todo
+```
+
+##### Step 6: Delete Todo
+
+```bash
+curl -X DELETE http://localhost:3000/api/todos/1
+# Expected: 204 status, no content
+```
+
+##### Step 7: Verify Deletion
+
+```bash
+curl http://localhost:3000/api/todos/1
+# Expected: 404 status, error message
+```
+
+#### Error Handling Verification
+
+**Test Invalid Input:**
+
+```bash
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{}'
+# Expected: 400 status, validation error for missing title
+```
+
+**Test Non-existent Resource:**
+
+```bash
+curl http://localhost:3000/api/todos/99999
+# Expected: 404 status, "Todo not found" message
+```
+
+**Test Invalid ID:**
+
+```bash
+curl http://localhost:3000/api/todos/invalid
+# Expected: 400 status, "Invalid ID" message
+```
+
+#### Automated Testing Verification
+
+```bash
+# Run test suite
+npm test
+
+# Expected output:
+# ✓ GET /api/todos returns empty array initially
+# ✓ POST /api/todos creates new todo
+# ✓ POST /api/todos validates required fields
+# ✓ GET /api/todos/:id returns todo
+# ✓ GET /api/todos/:id returns 404 for non-existent
+# ✓ PUT /api/todos/:id updates todo
+# ✓ DELETE /api/todos/:id removes todo
+# ... (all tests passing)
+```
+
+#### Performance Verification (Basic)
+
+```bash
+# Test response time for list endpoint
+time curl http://localhost:3000/api/todos
+# Expected: < 100ms for small datasets
+
+# Create 100 todos and test performance
+for i in {1..100}; do
+ curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d "{\"title\": \"Todo $i\"}" &
+done
+wait
+
+time curl http://localhost:3000/api/todos
+# Expected: < 500ms for 100 items
+```
+
+#### Data Persistence Verification
+
+```bash
+# Create a todo
+curl -X POST http://localhost:3000/api/todos \
+ -H "Content-Type: application/json" \
+ -d '{"title": "Persistence test"}'
+
+# Stop server
+# Restart server
+
+# Verify todo still exists
+curl http://localhost:3000/api/todos
+# Expected: Previously created todo is still in the list
+```
+
+---
+
+## 6. Success Criteria
+
+The implementation is considered complete when:
+
+- ✅ All 5 CRUD endpoints are functional
+- ✅ Data persists across server restarts
+- ✅ Input validation prevents invalid data
+- ✅ Appropriate HTTP status codes are returned
+- ✅ All automated tests pass (minimum 80% coverage)
+- ✅ API documentation is complete in README
+- ✅ Error handling covers common failure scenarios
+- ✅ Manual verification steps all pass
+- ✅ Service can be deployed and run in a production environment
+
+---
+
+## 7. Timeline Estimate
+
+**Total Duration:** 4 days (assuming 1 developer working full-time)
+
+- **Day 1:** Project setup, database, basic server (Phase 1: Foundation)
+- **Day 2:** Implement all CRUD endpoints (Phase 2: Core CRUD Operations)
+- **Day 3:** Add validation and error handling (Phase 3: Validation & Error Handling)
+- **Day 4:** Testing, documentation, verification (Phase 4: Testing & Documentation)
+
+**Note:** This is a conservative estimate. An experienced developer could complete this in 2-3 days.
+
+---
+
+## 8. Future Enhancements (Out of Scope)
+
+These are explicitly NOT part of the initial implementation but could be added later:
+
+- User authentication and authorization
+- Multiple todo lists per user
+- Tags/categories for todos
+- Due dates and reminders
+- File attachments
+- Real-time updates (WebSocket)
+- Frontend web application
+- Mobile app
+
+**Principle:** Ship the simple version first, validate usage, then enhance based on actual needs.
+
+## Review Notes
+
+- [x] Pass/Fail: Pass (from automated eval)
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-001.md b/evals/datasets/real-specs/spec-001.md
new file mode 100644
index 0000000000..c1289f882c
--- /dev/null
+++ b/evals/datasets/real-specs/spec-001.md
@@ -0,0 +1,231 @@
+# Test Spec 1
+
+## Prompt
+
+Build a simple todo list web application with task creation, editing, and deletion
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements provided]
+
+**Feature Name:** [Awaiting Input]
+
+**Description:**
+This section will contain a brief description of the feature once user requirements are provided. The overview should explain:
+
+- What the feature does
+- Who will use it
+- Why it's needed
+- High-level scope and boundaries
+
+**Business Value:**
+[To be defined based on user input]
+
+**Stakeholders:**
+[To be identified based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-002: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-003: [Awaiting User Input]**
+
+- **As a** [role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action occurs]
+ **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [Awaiting specification]
+
+**FR-002:** [Awaiting specification]
+
+**FR-003:** [Awaiting specification]
+
+**FR-004:** [Awaiting specification]
+
+**FR-005:** [Awaiting specification]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-003)
+
+**NFR-001:** [Performance requirement - response time, throughput, etc.]
+
+**NFR-002:** [Capacity requirement - concurrent users, data volume, etc.]
+
+**NFR-003:** [Resource utilization requirement]
+
+---
+
+### Security (NFR-004 to NFR-006)
+
+**NFR-004:** [Authentication requirement]
+
+**NFR-005:** [Authorization requirement]
+
+**NFR-006:** [Data protection requirement]
+
+---
+
+### Scalability (NFR-007 to NFR-008)
+
+**NFR-007:** [Horizontal/vertical scaling requirement]
+
+**NFR-008:** [Growth accommodation requirement]
+
+---
+
+### Usability (NFR-009 to NFR-010)
+
+**NFR-009:** [User experience requirement]
+
+**NFR-010:** [Accessibility requirement]
+
+---
+
+### Reliability (NFR-011 to NFR-012)
+
+**NFR-011:** [Uptime/availability requirement]
+
+**NFR-012:** [Error handling requirement]
+
+---
+
+## 5. Edge Cases
+
+### Boundary Conditions
+
+**EC-001:** [Edge case description and expected behavior]
+
+**EC-002:** [Edge case description and expected behavior]
+
+**EC-003:** [Edge case description and expected behavior]
+
+---
+
+### Error Scenarios
+
+**ES-001:** [Error scenario and expected system response]
+
+**ES-002:** [Error scenario and expected system response]
+
+**ES-003:** [Error scenario and expected system response]
+
+---
+
+### Data Validation
+
+**DV-001:** [Invalid input scenario and expected handling]
+
+**DV-002:** [Invalid input scenario and expected handling]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [Measurable success metric with target value]
+
+**SC-002:** [Measurable success metric with target value]
+
+**SC-003:** [Measurable success metric with target value]
+
+**SC-004:** [Measurable success metric with target value]
+
+**SC-005:** [Measurable success metric with target value]
+
+---
+
+## 7. Out of Scope
+
+[Items explicitly excluded from this feature]
+
+---
+
+## 8. Dependencies
+
+[Prerequisites, integrations, or external dependencies]
+
+---
+
+## 9. Assumptions
+
+[Assumptions made during specification]
+
+---
+
+## 10. Open Questions
+
+[NEEDS CLARIFICATION: User requirements needed]
+
+1. What is the feature being requested?
+2. Who are the target users?
+3. What problem does this solve?
+4. Are there any existing systems this integrates with?
+5. What are the key constraints or limitations?
+
+---
+
+**Document Status:** Draft - Awaiting User Requirements
+
+**Last Updated:** [Current Date]
+
+**Version:** 0.1
+
+---
+
+**Note:** Please provide your user requirements so I can create a comprehensive feature specification document tailored to your specific needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-002.md b/evals/datasets/real-specs/spec-002.md
new file mode 100644
index 0000000000..d5a441ea38
--- /dev/null
+++ b/evals/datasets/real-specs/spec-002.md
@@ -0,0 +1,378 @@
+# Test Spec 2
+
+## Prompt
+
+Create a personal blog platform with posts, comments, and basic authentication
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. Please provide the feature requirements you would like documented.]
+
+**Purpose:**
+[NEEDS CLARIFICATION - Business objective and target users need to be specified]
+
+**Scope:**
+
+- In Scope: [NEEDS CLARIFICATION]
+- Out of Scope: [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+#### US-001: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION - initial state]
+ **When** [NEEDS CLARIFICATION - action taken]
+ **Then** [NEEDS CLARIFICATION - expected outcome]
+
+**Priority:** P1
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+#### US-002: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P1
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+#### US-003: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P2
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+#### US-004: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P2
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+#### US-005: [NEEDS CLARIFICATION]
+
+**As a** [role not specified]
+**I want** [feature not specified]
+**So that** [benefit not specified]
+
+**Acceptance Criteria:**
+
+- **Given** [NEEDS CLARIFICATION]
+ **When** [NEEDS CLARIFICATION]
+ **Then** [NEEDS CLARIFICATION]
+
+**Priority:** P3
+**Story Points:** [NEEDS CLARIFICATION]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Core functional requirement not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**FR-002:** [NEEDS CLARIFICATION - Core functional requirement not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Data Requirements
+
+**FR-003:** [NEEDS CLARIFICATION - Data handling requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**FR-004:** [NEEDS CLARIFICATION - Data validation requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### User Interface Requirements
+
+**FR-005:** [NEEDS CLARIFICATION - UI/UX requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Integration Requirements
+
+**FR-006:** [NEEDS CLARIFICATION - Integration points not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [NEEDS CLARIFICATION - Response time requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., Response time < X seconds for Y% of requests]
+
+**NFR-P-002:** [NEEDS CLARIFICATION - Throughput requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., System must handle X concurrent users]
+
+**NFR-P-003:** [NEEDS CLARIFICATION - Load time requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., Initial load time < X seconds]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [NEEDS CLARIFICATION - Authentication requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-002:** [NEEDS CLARIFICATION - Authorization requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-003:** [NEEDS CLARIFICATION - Data encryption requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-S-004:** [NEEDS CLARIFICATION - Audit logging requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [NEEDS CLARIFICATION - User growth requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., System must scale to X users within Y timeframe]
+
+**NFR-SC-002:** [NEEDS CLARIFICATION - Data volume requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Availability & Reliability (NFR-A)
+
+**NFR-A-001:** [NEEDS CLARIFICATION - Uptime requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., 99.9% uptime]
+
+**NFR-A-002:** [NEEDS CLARIFICATION - Recovery time objective not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., RTO < X hours]
+
+**NFR-A-003:** [NEEDS CLARIFICATION - Backup requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [NEEDS CLARIFICATION - Accessibility requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION - e.g., WCAG 2.1 Level AA compliance]
+
+**NFR-U-002:** [NEEDS CLARIFICATION - Browser/device support not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-U-003:** [NEEDS CLARIFICATION - Internationalization requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** [NEEDS CLARIFICATION - Documentation requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+**NFR-M-002:** [NEEDS CLARIFICATION - Monitoring requirements not specified]
+**Measurement:** [NEEDS CLARIFICATION]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Invalid input handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-002:** [NEEDS CLARIFICATION - Missing required data handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-003:** [NEEDS CLARIFICATION - Data format validation not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### Boundary Conditions
+
+**EC-004:** [NEEDS CLARIFICATION - Maximum capacity scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-005:** [NEEDS CLARIFICATION - Minimum threshold scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-006:** [NEEDS CLARIFICATION - Empty state handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### System Failures
+
+**EC-007:** [NEEDS CLARIFICATION - Network failure scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-008:** [NEEDS CLARIFICATION - Service unavailability handling not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-009:** [NEEDS CLARIFICATION - Timeout scenarios not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+### Concurrency Issues
+
+**EC-010:** [NEEDS CLARIFICATION - Simultaneous user actions not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+**EC-011:** [NEEDS CLARIFICATION - Data conflict resolution not specified]
+**Expected Behavior:** [NEEDS CLARIFICATION]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - User adoption target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., X% of target users actively using feature within Y days]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-002:** [NEEDS CLARIFICATION - User engagement target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Performance Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - Performance benchmark not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-004:** [NEEDS CLARIFICATION - Error rate target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., < X% error rate]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Business Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - Business impact target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-006:** [NEEDS CLARIFICATION - ROI or cost savings target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+### Quality Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - User satisfaction target not specified]
+**Target:** [NEEDS CLARIFICATION - e.g., X+ satisfaction score]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+**SC-008:** [NEEDS CLARIFICATION - Support ticket reduction target not specified]
+**Target:** [NEEDS CLARIFICATION]
+**Measurement Method:** [NEEDS CLARIFICATION]
+
+---
+
+## 7. Dependencies & Assumptions
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - External dependencies not specified]
+- [NEEDS CLARIFICATION - Internal dependencies not specified]
+- [NEEDS CLARIFICATION - Third-party service dependencies not specified]
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - Assumptions about user behavior not specified]
+- [NEEDS CLARIFICATION - Assumptions about system environment not specified]
+- [NEEDS CLARIFICATION - Assumptions about data availability not specified]
+
+---
+
+## 8. Risks & Mitigation
+
+| Risk ID | Risk Description | Impact | Likelihood | Mitigation Strategy |
+|---------|-----------------|--------|------------|-------------------|
+| R-001 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+| R-002 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+| R-003 | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] | [NEEDS CLARIFICATION] |
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being requested?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the business objectives?]
+5. [NEEDS CLARIFICATION - What are the timeline and budget constraints?]
+6. [NEEDS CLARIFICATION - Are there any regulatory or compliance requirements?]
+7. [NEEDS CLARIFICATION - What are the integration points with existing systems?]
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Signature | Date |
+|------|------|-----------|------|
+| Product Owner | [NEEDS CLARIFICATION] | | |
+| Engineering Lead | [NEEDS CLARIFICATION] | | |
+| Design Lead | [NEEDS CLARIFICATION] | | |
+| QA Lead | [NEEDS CLARIFICATION] | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Current Date]
+**Status:** DRAFT - AWAITING USER REQUIREMENTS
+
+---
+
+## Next Steps
+
+To complete this feature specification, please provide:
+
+1. Detailed description of the feature you want to build
+2. Target users and use cases
+3. Business objectives and success criteria
+4. Any specific requirements or constraints
+5. Integration points or dependencies
+
+Once this information is provided, I can create a comprehensive, actionable feature specification document.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-003.md b/evals/datasets/real-specs/spec-003.md
new file mode 100644
index 0000000000..8747e2f2c5
--- /dev/null
+++ b/evals/datasets/real-specs/spec-003.md
@@ -0,0 +1,531 @@
+# Test Spec 3
+
+## Prompt
+
+Develop a simple calculator web app with basic arithmetic operations
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature described in user requirements]
+
+**Description:**
+No user requirements were provided in the input. This template demonstrates the structure for a comprehensive feature specification document.
+
+**Purpose:**
+[NEEDS CLARIFICATION - Awaiting user requirements to define feature purpose and value proposition]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User personas and roles to be defined based on requirements]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and ROI to be defined]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [NEEDS CLARIFICATION - Core user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Core user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [NEEDS CLARIFICATION - Secondary user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Secondary user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION - Enhancement user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+**US-006: [NEEDS CLARIFICATION - Enhancement user story needed]**
+
+- **As a** [role]
+- **I want** [feature/capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger]
+- **Then** [expected outcome]
+
+**Testability:** [Define how this story can be independently tested]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-002:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-003:** [NEEDS CLARIFICATION - Define primary functional requirement]
+
+- **Description:** [What the system must do]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### Data Requirements
+
+**FR-004:** [NEEDS CLARIFICATION - Define data handling requirements]
+
+- **Description:** [Data that must be captured, stored, or processed]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-005:** [NEEDS CLARIFICATION - Define data validation requirements]
+
+- **Description:** [Data validation rules and constraints]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### User Interface Requirements
+
+**FR-006:** [NEEDS CLARIFICATION - Define UI/UX requirements]
+
+- **Description:** [User interface elements and interactions needed]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+**FR-007:** [NEEDS CLARIFICATION - Define accessibility requirements]
+
+- **Description:** [Accessibility standards and requirements]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+### Integration Requirements
+
+**FR-008:** [NEEDS CLARIFICATION - Define system integration needs]
+
+- **Description:** [External systems or services that must integrate]
+- **Measurement Criteria:** [How to verify this requirement is met]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** Response Time
+
+- **Requirement:** [NEEDS CLARIFICATION - Define acceptable response times]
+- **Measurement:** [How to measure: e.g., "95th percentile response time under X seconds"]
+
+**NFR-002:** Throughput
+
+- **Requirement:** [NEEDS CLARIFICATION - Define transaction volume requirements]
+- **Measurement:** [How to measure: e.g., "System must handle X transactions per second"]
+
+**NFR-003:** Concurrent Users
+
+- **Requirement:** [NEEDS CLARIFICATION - Define concurrent user capacity]
+- **Measurement:** [How to measure: e.g., "Support X concurrent users without degradation"]
+
+---
+
+### Security Requirements
+
+**NFR-004:** Authentication
+
+- **Requirement:** [NEEDS CLARIFICATION - Define authentication requirements]
+- **Measurement:** [How to verify security controls are implemented]
+
+**NFR-005:** Authorization
+
+- **Requirement:** [NEEDS CLARIFICATION - Define access control requirements]
+- **Measurement:** [How to verify authorization mechanisms]
+
+**NFR-006:** Data Protection
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data encryption and protection needs]
+- **Measurement:** [How to verify data protection compliance]
+
+**NFR-007:** Audit Logging
+
+- **Requirement:** [NEEDS CLARIFICATION - Define audit trail requirements]
+- **Measurement:** [How to verify logging completeness]
+
+---
+
+### Scalability Requirements
+
+**NFR-008:** Horizontal Scalability
+
+- **Requirement:** [NEEDS CLARIFICATION - Define scaling requirements]
+- **Measurement:** [How to measure scalability capacity]
+
+**NFR-009:** Data Growth
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data volume projections]
+- **Measurement:** [How to verify system handles projected growth]
+
+---
+
+### Reliability Requirements
+
+**NFR-010:** Availability
+
+- **Requirement:** [NEEDS CLARIFICATION - Define uptime requirements (e.g., 99.9% availability)]
+- **Measurement:** [How to measure: "System uptime percentage over 30-day period"]
+
+**NFR-011:** Fault Tolerance
+
+- **Requirement:** [NEEDS CLARIFICATION - Define error recovery requirements]
+- **Measurement:** [How to verify system recovers from failures]
+
+**NFR-012:** Data Integrity
+
+- **Requirement:** [NEEDS CLARIFICATION - Define data consistency requirements]
+- **Measurement:** [How to verify data remains accurate and consistent]
+
+---
+
+### Usability Requirements
+
+**NFR-013:** Learning Curve
+
+- **Requirement:** [NEEDS CLARIFICATION - Define ease-of-use expectations]
+- **Measurement:** [How to measure: "New users can complete core tasks within X minutes"]
+
+**NFR-014:** Accessibility Standards
+
+- **Requirement:** [NEEDS CLARIFICATION - Define accessibility compliance needs (e.g., WCAG 2.1 Level AA)]
+- **Measurement:** [How to verify accessibility compliance]
+
+---
+
+### Maintainability Requirements
+
+**NFR-015:** Documentation
+
+- **Requirement:** [NEEDS CLARIFICATION - Define documentation requirements]
+- **Measurement:** [How to verify documentation completeness]
+
+**NFR-016:** Monitoring
+
+- **Requirement:** [NEEDS CLARIFICATION - Define system monitoring needs]
+- **Measurement:** [How to verify monitoring coverage]
+
+---
+
+### Compliance Requirements
+
+**NFR-017:** Regulatory Compliance
+
+- **Requirement:** [NEEDS CLARIFICATION - Define applicable regulations (e.g., GDPR, HIPAA, SOC2)]
+- **Measurement:** [How to verify compliance]
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Define minimum input boundaries]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+**EC-002:** [NEEDS CLARIFICATION - Define maximum input boundaries]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+**EC-003:** [NEEDS CLARIFICATION - Define empty/null state handling]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Error Handling:** [What feedback user receives]
+
+---
+
+### Error Scenarios
+
+**EC-004:** [NEEDS CLARIFICATION - Define invalid input handling]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can correct the error]
+
+**EC-005:** [NEEDS CLARIFICATION - Define network failure handling]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can retry or recover]
+
+**EC-006:** [NEEDS CLARIFICATION - Define timeout scenarios]
+
+- **Scenario:** [Describe error condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Recovery Path:** [How user can proceed]
+
+**EC-007:** [NEEDS CLARIFICATION - Define concurrent access conflicts]
+
+- **Scenario:** [Describe conflict condition]
+- **Expected Behavior:** [How system should respond]
+- **Error Message:** [User-friendly error message]
+- **Resolution:** [How conflict is resolved]
+
+---
+
+### Data Integrity Edge Cases
+
+**EC-008:** [NEEDS CLARIFICATION - Define duplicate data handling]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Prevention:** [How system prevents data issues]
+
+**EC-009:** [NEEDS CLARIFICATION - Define data migration scenarios]
+
+- **Scenario:** [Describe edge case]
+- **Expected Behavior:** [How system should respond]
+- **Rollback Plan:** [How to revert if issues occur]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** Feature Completeness
+
+- **Metric:** [NEEDS CLARIFICATION - Define completion criteria]
+- **Target:** [e.g., "100% of P1 user stories completed and accepted"]
+- **Measurement Method:** [How to verify]
+
+**SC-002:** Acceptance Testing
+
+- **Metric:** [NEEDS CLARIFICATION - Define acceptance test pass rate]
+- **Target:** [e.g., "100% of acceptance criteria passing"]
+- **Measurement Method:** [How to verify]
+
+---
+
+### Performance Success Metrics
+
+**SC-003:** Response Time Achievement
+
+- **Metric:** [NEEDS CLARIFICATION - Define performance target]
+- **Target:** [e.g., "95th percentile response time < X seconds"]
+- **Measurement Method:** [Performance testing methodology]
+
+**SC-004:** Load Capacity
+
+- **Metric:** [NEEDS CLARIFICATION - Define load handling target]
+- **Target:** [e.g., "Successfully handle X concurrent users"]
+- **Measurement Method:** [Load testing methodology]
+
+---
+
+### Quality Success Metrics
+
+**SC-005:** Defect Rate
+
+- **Metric:** [NEEDS CLARIFICATION - Define acceptable defect rate]
+- **Target:** [e.g., "Less than X critical defects in production after 30 days"]
+- **Measurement Method:** [Defect tracking methodology]
+
+**SC-006:** Test Coverage
+
+- **Metric:** [NEEDS CLARIFICATION - Define test coverage requirements]
+- **Target:** [e.g., "Automated test coverage of X% for critical paths"]
+- **Measurement Method:** [Coverage analysis tools]
+
+---
+
+### User Adoption Success Metrics
+
+**SC-007:** User Engagement
+
+- **Metric:** [NEEDS CLARIFICATION - Define engagement metrics]
+- **Target:** [e.g., "X% of target users actively using feature within 30 days"]
+- **Measurement Method:** [Analytics tracking]
+
+**SC-008:** User Satisfaction
+
+- **Metric:** [NEEDS CLARIFICATION - Define satisfaction measurement]
+- **Target:** [e.g., "Average satisfaction score of X/10 or higher"]
+- **Measurement Method:** [User feedback surveys]
+
+**SC-009:** Task Completion Rate
+
+- **Metric:** [NEEDS CLARIFICATION - Define task success rate]
+- **Target:** [e.g., "X% of users successfully complete primary workflow"]
+- **Measurement Method:** [User analytics and session tracking]
+
+---
+
+### Business Success Metrics
+
+**SC-010:** Business Impact
+
+- **Metric:** [NEEDS CLARIFICATION - Define business value metrics]
+- **Target:** [e.g., "Achieve X% increase in [specific business metric]"]
+- **Measurement Method:** [Business analytics]
+
+**SC-011:** ROI Achievement
+
+- **Metric:** [NEEDS CLARIFICATION - Define ROI expectations]
+- **Target:** [e.g., "Achieve positive ROI within X months"]
+- **Measurement Method:** [Financial analysis]
+
+---
+
+## 7. Dependencies and Assumptions
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies that must be available]
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions made during specification]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints affecting the feature]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [NEEDS CLARIFICATION - Define what is NOT included]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - List unresolved questions requiring stakeholder input]
+
+---
+
+## 10. Approval and Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | [TBD] | [TBD] | [TBD] |
+| Business Stakeholder | [TBD] | [TBD] | [TBD] |
+| Engineering Lead | [TBD] | [TBD] | [TBD] |
+| QA Lead | [TBD] | [TBD] | [TBD] |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Next Review Date:** [Date]
+
+---
+
+## Instructions for Use
+
+**To complete this specification, please provide:**
+
+1. **Clear feature description** - What problem are you solving?
+2. **Target users** - Who will use this feature?
+3. **User workflows** - What tasks should users be able to complete?
+4. **Business objectives** - What business goals does this support?
+5. **Performance expectations** - What are acceptable response times, capacity, etc.?
+6. **Security and compliance needs** - What standards must be met?
+7. **Success metrics** - How will you measure success?
+
+Once requirements are provided, all sections marked with [NEEDS CLARIFICATION] will be completed with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-004.md b/evals/datasets/real-specs/spec-004.md
new file mode 100644
index 0000000000..0736c7d5d7
--- /dev/null
+++ b/evals/datasets/real-specs/spec-004.md
@@ -0,0 +1,301 @@
+# Test Spec 4
+
+## Prompt
+
+Build an e-commerce platform with product catalog, shopping cart, and checkout flow
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+### Feature Name
+
+[NEEDS CLARIFICATION: No user requirements provided]
+
+### Description
+
+This feature specification template is ready to be populated once user requirements are provided. The document will detail the complete functional and non-functional requirements, user stories, acceptance criteria, edge cases, and success metrics for the requested feature.
+
+### Business Justification
+
+[NEEDS CLARIFICATION: Business value and justification pending user input]
+
+### Scope
+
+**In Scope:**
+
+- [To be defined based on user requirements]
+
+**Out of Scope:**
+
+- [To be defined based on user requirements]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (Critical - Must Have)
+
+#### US-001: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+### Priority 2 (Important - Should Have)
+
+#### US-002: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+### Priority 3 (Nice to Have - Could Have)
+
+#### US-003: [Title Pending]
+
+**As a** [role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+**Priority:** P3
+**Story Points:** [TBD]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION: Requirement description pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-002:** [NEEDS CLARIFICATION: Requirement description pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### Data Requirements
+
+**FR-003:** [NEEDS CLARIFICATION: Data handling requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### User Interface Requirements
+
+**FR-004:** [NEEDS CLARIFICATION: UI/UX requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+### Integration Requirements
+
+**FR-005:** [NEEDS CLARIFICATION: System integration requirements pending]
+
+- **Testability Criteria:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION: Performance benchmarks pending]
+
+- **Measurement Method:** [How performance will be measured]
+
+**NFR-002:** [NEEDS CLARIFICATION: Response time requirements pending]
+
+- **Measurement Method:** [How response times will be measured]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION: Security and authentication requirements pending]
+
+- **Verification Method:** [How security will be verified]
+
+**NFR-004:** [NEEDS CLARIFICATION: Data protection requirements pending]
+
+- **Verification Method:** [How data protection will be verified]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION: Scalability requirements pending]
+
+- **Measurement Method:** [How scalability will be measured]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION: Usability standards pending]
+
+- **Measurement Method:** [How usability will be measured]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION: Uptime and reliability requirements pending]
+
+- **Measurement Method:** [How reliability will be measured]
+
+### Accessibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION: Accessibility standards pending]
+
+- **Verification Method:** [How accessibility compliance will be verified]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION: Minimum/maximum value handling]
+
+- **Expected Behavior:** [How system should respond]
+
+**EC-002:** [NEEDS CLARIFICATION: Empty or null input handling]
+
+- **Expected Behavior:** [How system should respond]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION: Invalid input handling]
+
+- **Expected Behavior:** [Error message and recovery process]
+
+**ES-002:** [NEEDS CLARIFICATION: System unavailability]
+
+- **Expected Behavior:** [Fallback mechanism]
+
+**ES-003:** [NEEDS CLARIFICATION: Concurrent access conflicts]
+
+- **Expected Behavior:** [Conflict resolution approach]
+
+### Network & Connectivity
+
+**ES-004:** [NEEDS CLARIFICATION: Network failure handling]
+
+- **Expected Behavior:** [Offline capability or error handling]
+
+**ES-005:** [NEEDS CLARIFICATION: Timeout scenarios]
+
+- **Expected Behavior:** [Timeout handling and user notification]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** [NEEDS CLARIFICATION: Feature completion criteria]
+
+- **Target:** [Measurable target value]
+- **Measurement Method:** [How success will be measured]
+
+**SC-002:** [NEEDS CLARIFICATION: User adoption metrics]
+
+- **Target:** [Measurable target value]
+- **Measurement Method:** [How success will be measured]
+
+### Performance Success Metrics
+
+**SC-003:** [NEEDS CLARIFICATION: Performance benchmarks]
+
+- **Target:** [Specific numeric target]
+- **Measurement Method:** [How performance will be tracked]
+
+### Business Success Metrics
+
+**SC-004:** [NEEDS CLARIFICATION: Business impact metrics]
+
+- **Target:** [Measurable business outcome]
+- **Measurement Method:** [How business impact will be measured]
+
+**SC-005:** [NEEDS CLARIFICATION: User satisfaction metrics]
+
+- **Target:** [Satisfaction score or rating]
+- **Measurement Method:** [Survey or feedback mechanism]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION: List assumptions about user environment, behavior, or system state]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION: External systems, data sources, or third-party services required]
+
+### Constraints
+
+- [NEEDS CLARIFICATION: Known limitations or restrictions]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION: What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION: Who are the target users/personas?]
+3. [NEEDS CLARIFICATION: What problem does this feature solve?]
+4. [NEEDS CLARIFICATION: What are the expected usage volumes?]
+5. [NEEDS CLARIFICATION: Are there any regulatory or compliance requirements?]
+
+---
+
+## 9. Appendix
+
+### Glossary
+
+- **Term:** Definition
+
+### Related Documents
+
+- [Link to related specifications or requirements documents]
+
+---
+
+## Document Control
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | [Date] | [Author] | Initial template created |
+
+---
+
+**Note:** This specification document requires user input to be completed. Please provide detailed requirements including the feature description, target users, business goals, and any specific functionality needed. Once provided, all sections marked with [NEEDS CLARIFICATION] will be populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-005.md b/evals/datasets/real-specs/spec-005.md
new file mode 100644
index 0000000000..d2b88214b5
--- /dev/null
+++ b/evals/datasets/real-specs/spec-005.md
@@ -0,0 +1,315 @@
+# Test Spec 5
+
+## Prompt
+
+Create a customer relationship management (CRM) system with contact management and sales pipeline
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements were provided in the input]
+
+**Feature Name:** [Pending User Input]
+
+**Description:**
+This section will contain a brief description of the feature once user requirements are provided. The overview should explain the purpose of the feature, the problem it solves, and the value it delivers to users and the business.
+
+**Target Users:** [To be defined based on user input]
+
+**Business Value:** [To be defined based on user input]
+
+---
+
+## 2. User Stories
+
+### High Priority (P1)
+
+**US-001: [Story Title - Pending]**
+
+- **Priority:** P1
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-002: [Story Title - Pending]**
+
+- **Priority:** P1
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Medium Priority (P2)
+
+**US-003: [Story Title - Pending]**
+
+- **Priority:** P2
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [Story Title - Pending]**
+
+- **Priority:** P2
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Low Priority (P3)
+
+**US-005: [Story Title - Pending]**
+
+- **Priority:** P3
+- **As a** [user role],
+- **I want** [feature capability],
+- **So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-002:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-003:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-004:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+**FR-005:** [Requirement description - Pending user input]
+
+- **Testability:** [How this will be verified]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-003)
+
+**NFR-001:** [Performance requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+**NFR-002:** [Response time requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+**NFR-003:** [Throughput requirement - Pending user input]
+
+- **Measurement:** [Specific metrics and thresholds]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Security (NFR-004 to NFR-006)
+
+**NFR-004:** [Security requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+**NFR-005:** [Data protection requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+**NFR-006:** [Access control requirement - Pending user input]
+
+- **Measurement:** [Specific security controls and verification methods]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Scalability (NFR-007 to NFR-008)
+
+**NFR-007:** [Scalability requirement - Pending user input]
+
+- **Measurement:** [Specific capacity metrics]
+- **Priority:** [P1/P2/P3]
+
+**NFR-008:** [Growth handling requirement - Pending user input]
+
+- **Measurement:** [Specific capacity metrics]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Usability (NFR-009 to NFR-010)
+
+**NFR-009:** [Usability requirement - Pending user input]
+
+- **Measurement:** [Specific usability metrics]
+- **Priority:** [P1/P2/P3]
+
+**NFR-010:** [Accessibility requirement - Pending user input]
+
+- **Measurement:** [Specific accessibility standards]
+- **Priority:** [P1/P2/P3]
+
+---
+
+### Reliability (NFR-011 to NFR-012)
+
+**NFR-011:** [Availability requirement - Pending user input]
+
+- **Measurement:** [Uptime percentage and measurement period]
+- **Priority:** [P1/P2/P3]
+
+**NFR-012:** [Error handling requirement - Pending user input]
+
+- **Measurement:** [Error rate thresholds]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 5. Edge Cases
+
+**EC-001: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of boundary condition]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-002: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of boundary condition]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-003: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of error scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-004: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of error scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+**EC-005: [Edge case title - Pending user input]**
+
+- **Scenario:** [Description of concurrent operation scenario]
+- **Expected Behavior:** [How the system should respond]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-002:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-003:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-004:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+**SC-005:** [Success metric - Pending user input]
+
+- **Measurement Method:** [How this will be measured]
+- **Target:** [Specific numerical target]
+- **Timeline:** [When this should be achieved]
+
+---
+
+## Appendix
+
+### Assumptions
+
+[NEEDS CLARIFICATION: Assumptions will be documented once user requirements are provided]
+
+### Dependencies
+
+[NEEDS CLARIFICATION: Dependencies will be documented once user requirements are provided]
+
+### Out of Scope
+
+[NEEDS CLARIFICATION: Out of scope items will be documented once user requirements are provided]
+
+### Open Questions
+
+1. [NEEDS CLARIFICATION: User requirements input is required to complete this specification]
+2. [Additional questions will be added based on user input]
+
+---
+
+**Note:** Please provide the user requirements to generate a complete and detailed feature specification document. The template above demonstrates the structure and level of detail that will be included once requirements are specified.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-006.md b/evals/datasets/real-specs/spec-006.md
new file mode 100644
index 0000000000..aec54f5cad
--- /dev/null
+++ b/evals/datasets/real-specs/spec-006.md
@@ -0,0 +1,418 @@
+# Test Spec 6
+
+## Prompt
+
+Develop a project management dashboard with task tracking, team collaboration, and reporting
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+### Feature Name
+
+[NEEDS CLARIFICATION - Feature name not provided]
+
+### Description
+
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.]
+
+### Business Value
+
+[NEEDS CLARIFICATION - Business objectives and value proposition need to be defined]
+
+### Target Users
+
+[NEEDS CLARIFICATION - User personas and target audience need to be identified]
+
+### Scope
+
+**In Scope:**
+
+- [NEEDS CLARIFICATION]
+
+**Out of Scope:**
+
+- [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### User Story Template (Ready for Population)
+
+#### **US-001: [Story Title] [Priority: P1/P2/P3]**
+
+**As a** [user role]
+**I want** [capability]
+**So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action performed]
+ **Then** [expected outcome]
+
+- **Given** [initial context]
+ **When** [action performed]
+ **Then** [expected outcome]
+
+**Definition of Done:**
+
+- [ ] All acceptance criteria met
+- [ ] Edge cases handled
+- [ ] Validation rules implemented
+- [ ] User feedback mechanism in place
+- [ ] Accessible to users with disabilities
+- [ ] Testable independently
+
+---
+
+[NEEDS CLARIFICATION - Please provide user requirements to generate specific user stories. The template will include 5+ prioritized stories covering the areas below; a hypothetical filled-in example follows this list:]
+
+- Primary user workflows (P1)
+- Secondary features (P2)
+- Nice-to-have enhancements (P3)
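+
+To illustrate the intended level of detail, a hypothetical populated story for this prompt (a project management dashboard) might read as follows; the role, task states, and outcomes are assumptions, not confirmed requirements:
+
+```gherkin
+# Hypothetical example; actual roles and workflows await user input
+Scenario: Team member completes an assigned task
+  Given a team member is viewing a task assigned to them
+  When they change the task status to "Done"
+  Then the task moves to the completed list
+  And the project's completion percentage is updated in the reporting view
+```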
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION]
+**Description:** System must [specific, measurable requirement]
+**Priority:** [P1/P2/P3]
+**Dependencies:** None
+
+**FR-002:** [NEEDS CLARIFICATION]
+**Description:** System must [specific, measurable requirement]
+**Priority:** [P1/P2/P3]
+**Dependencies:** FR-001
+
+### Data Management
+
+**FR-003:** [NEEDS CLARIFICATION]
+**Description:** System must support [data requirements]
+**Priority:** [P1/P2/P3]
+
+### User Interface
+
+**FR-004:** [NEEDS CLARIFICATION]
+**Description:** Interface must provide [UI requirement]
+**Priority:** [P1/P2/P3]
+
+### Integration
+
+**FR-005:** [NEEDS CLARIFICATION]
+**Description:** System must integrate with [integration requirement]
+**Priority:** [P1/P2/P3]
+
+### Validation & Error Handling
+
+**FR-006:** [NEEDS CLARIFICATION]
+**Description:** System must validate [validation requirement]
+**Priority:** [P1/P2/P3]
+
+**FR-007:** [NEEDS CLARIFICATION]
+**Description:** System must display error messages when [condition]
+**Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-001 to NFR-005)
+
+**NFR-001:** Response Time
+**Requirement:** System must respond to user actions within [X] seconds under normal load conditions
+**Measurement:** 95th percentile response time < [X] seconds
+
+**NFR-002:** Throughput
+**Requirement:** System must support [X] concurrent users without performance degradation
+**Measurement:** Load testing with [X] concurrent users maintaining response times per NFR-001
+
+**NFR-003:** Data Processing
+**Requirement:** System must process [X] records/transactions per [time unit]
+**Measurement:** Batch processing completion time < [X] minutes for [Y] records
+
+**NFR-004:** Availability
+**Requirement:** System must maintain [X]% uptime during business hours
+**Measurement:** Monthly uptime monitoring excluding planned maintenance
+
+**NFR-005:** Capacity
+**Requirement:** System must handle [X] volume of data/transactions
+**Measurement:** Storage and processing capacity tests
+
+### Security (NFR-006 to NFR-010)
+
+**NFR-006:** Authentication
+**Requirement:** System must authenticate all users before granting access
+**Measurement:** 100% of access attempts require valid authentication
+
+**NFR-007:** Authorization
+**Requirement:** System must enforce role-based access controls
+**Measurement:** Users can only access features permitted by their role
+
+**NFR-008:** Data Protection
+**Requirement:** System must protect sensitive data at rest and in transit
+**Measurement:** Security audit confirms encryption standards are met
+
+**NFR-009:** Audit Logging
+**Requirement:** System must log all user actions and system events
+**Measurement:** 100% of critical actions are logged with timestamp, user, and action details
+
+**NFR-010:** Session Management
+**Requirement:** System must terminate inactive sessions after [X] minutes
+**Measurement:** Automated logout occurs at specified time interval
+
+### Usability (NFR-011 to NFR-014)
+
+**NFR-011:** Accessibility
+**Requirement:** System must comply with WCAG 2.1 Level AA standards
+**Measurement:** Accessibility audit confirms compliance
+
+**NFR-012:** User Interface Consistency
+**Requirement:** System must maintain consistent UI patterns across all screens
+**Measurement:** UI review confirms adherence to design standards
+
+**NFR-013:** Error Messages
+**Requirement:** System must provide clear, actionable error messages
+**Measurement:** User testing confirms 90% of users understand error messages
+
+**NFR-014:** Learning Curve
+**Requirement:** New users must be able to complete core tasks within [X] minutes of first use
+**Measurement:** User testing with [Y] participants completing [Z] core tasks
+
+### Compatibility (NFR-015 to NFR-017)
+
+**NFR-015:** Browser Support
+**Requirement:** System must function correctly on supported browsers
+**Measurement:** Testing confirms functionality on specified browser versions
+
+**NFR-016:** Device Compatibility
+**Requirement:** System must be accessible from [desktop/mobile/tablet] devices
+**Measurement:** Testing confirms responsive behavior across device types
+
+**NFR-017:** Data Format Support
+**Requirement:** System must support [specified file formats/data types]
+**Measurement:** Import/export testing with all supported formats
+
+### Scalability (NFR-018 to NFR-019)
+
+**NFR-018:** User Growth
+**Requirement:** System architecture must support [X]% growth in user base without redesign
+**Measurement:** Capacity planning confirms headroom for growth
+
+**NFR-019:** Data Growth
+**Requirement:** System must handle [X]% annual data growth
+**Measurement:** Storage and query performance maintained with increased data volume
+
+### Reliability (NFR-020 to NFR-021)
+
+**NFR-020:** Data Integrity
+**Requirement:** System must maintain data accuracy and consistency
+**Measurement:** Data validation tests confirm 100% accuracy
+
+**NFR-021:** Recovery
+**Requirement:** System must recover from failures within [X] minutes
+**Measurement:** Disaster recovery testing confirms recovery time objective
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION]
+**Scenario:** User enters [boundary condition]
+**Expected Behavior:** System should [expected response]
+
+**EC-002:** [NEEDS CLARIFICATION]
+**Scenario:** User submits empty/null values
+**Expected Behavior:** System should [expected response]
+
+**EC-003:** [NEEDS CLARIFICATION]
+**Scenario:** User enters special characters/invalid format
+**Expected Behavior:** System should [expected response]
+
+**EC-004:** [NEEDS CLARIFICATION]
+**Scenario:** User exceeds maximum input length/size
+**Expected Behavior:** System should [expected response]
+
+### Concurrency Edge Cases
+
+**EC-005:** [NEEDS CLARIFICATION]
+**Scenario:** Multiple users attempt to modify the same record simultaneously
+**Expected Behavior:** System should [expected response]
+
+**EC-006:** [NEEDS CLARIFICATION]
+**Scenario:** User performs rapid repeated actions
+**Expected Behavior:** System should [expected response]
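+
+As a sketch only, EC-005 could eventually be expressed as a concrete scenario like the one below; the conflict-handling behavior shown is an assumption pending requirements:
+
+```gherkin
+# Assumed conflict-handling strategy; to be confirmed with stakeholders
+Scenario: Two users edit the same task simultaneously
+  Given two users have the same task open for editing
+  When both users submit changes
+  Then the first save succeeds
+  And the second user sees a conflict notice with options to reload or merge
+```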
+
+### System State Edge Cases
+
+**EC-007:** [NEEDS CLARIFICATION]
+**Scenario:** System reaches maximum capacity
+**Expected Behavior:** System should [expected response]
+
+**EC-008:** [NEEDS CLARIFICATION]
+**Scenario:** External dependency is unavailable
+**Expected Behavior:** System should [expected response]
+
+**EC-009:** [NEEDS CLARIFICATION]
+**Scenario:** User session expires during operation
+**Expected Behavior:** System should [expected response]
+
+### Data Edge Cases
+
+**EC-010:** [NEEDS CLARIFICATION]
+**Scenario:** Data set is empty/contains zero records
+**Expected Behavior:** System should [expected response]
+
+**EC-011:** [NEEDS CLARIFICATION]
+**Scenario:** Data contains extreme values (min/max boundaries)
+**Expected Behavior:** System should [expected response]
+
+**EC-012:** [NEEDS CLARIFICATION]
+**Scenario:** Data contains duplicates
+**Expected Behavior:** System should [expected response]
+
+### Network & Connectivity Edge Cases
+
+**EC-013:** [NEEDS CLARIFICATION]
+**Scenario:** Network connection is lost during operation
+**Expected Behavior:** System should [expected response]
+
+**EC-014:** [NEEDS CLARIFICATION]
+**Scenario:** Request times out
+**Expected Behavior:** System should [expected response]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** User Engagement
+**Criteria:** [X]% of target users actively using the feature within [Y] days of launch
+**Measurement Method:** Analytics tracking active users
+
+**SC-002:** Task Completion Rate
+**Criteria:** [X]% of users successfully complete primary workflow without assistance
+**Measurement Method:** User session analysis and funnel tracking
+
+**SC-003:** User Satisfaction
+**Criteria:** Average user satisfaction score of [X] out of [Y] in post-launch survey
+**Measurement Method:** User satisfaction survey (NPS, CSAT, or similar)
+
+### Performance Metrics
+
+**SC-004:** Response Time
+**Criteria:** 95th percentile response time remains below [X] seconds
+**Measurement Method:** Performance monitoring tools
+
+**SC-005:** Error Rate
+**Criteria:** Error rate remains below [X]% of all transactions
+**Measurement Method:** Error logging and monitoring
+
+**SC-006:** System Uptime
+**Criteria:** System maintains [X]% uptime in first [Y] months post-launch
+**Measurement Method:** Uptime monitoring service
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - Business KPI]
+**Criteria:** Achieve [specific business outcome] within [timeframe]
+**Measurement Method:** [Measurement approach]
+
+**SC-008:** Cost Efficiency [if applicable]
+**Criteria:** Feature reduces [process/cost] by [X]%
+**Measurement Method:** Before/after comparison analysis
+
+### Quality Metrics
+
+**SC-009:** Defect Rate
+**Criteria:** Post-launch critical defects < [X] per [time period]
+**Measurement Method:** Defect tracking system
+
+**SC-010:** Accessibility Compliance
+**Criteria:** Zero critical accessibility violations in audit
+**Measurement Method:** Accessibility testing tools and manual audit
+
+### Adoption Milestones
+
+**SC-011:** Initial Adoption
+**Criteria:** [X] users/transactions in first week
+**Measurement Method:** Usage analytics
+
+**SC-012:** Sustained Adoption
+**Criteria:** [X]% month-over-month growth for first [Y] months
+**Measurement Method:** Monthly usage reports
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, system environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other features, systems, or external factors]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List any known constraints (budget, time, resources)]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION - What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the key workflows users need to complete?]
+5. [NEEDS CLARIFICATION - Are there existing systems this needs to work with?]
+6. [NEEDS CLARIFICATION - What are the success metrics from a business perspective?]
+7. [NEEDS CLARIFICATION - What is the expected launch timeline?]
+8. [NEEDS CLARIFICATION - What are the priority levels for different aspects?]
+
+---
+
+## 9. Approval & Sign-off
+
+**Document Version:** 0.1 (Template - Awaiting Requirements)
+**Last Updated:** [Current Date]
+**Status:** Draft - Awaiting User Input
+
+**Stakeholder Approval:**
+
+- [ ] Product Owner
+- [ ] Business Stakeholder
+- [ ] User Experience Lead
+- [ ] Quality Assurance Lead
+
+---
+
+## Instructions for Completing This Specification
+
+To complete this feature specification document, please provide:
+
+1. **Feature description** - What is being built and why?
+2. **User roles** - Who will use this feature?
+3. **Core workflows** - What tasks do users need to accomplish?
+4. **Business requirements** - What business problems does this solve?
+5. **Integration needs** - What other systems are involved?
+6. **Success metrics** - How will success be measured?
+7. **Constraints** - Any known limitations or requirements?
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications following the structure above.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-007.md b/evals/datasets/real-specs/spec-007.md
new file mode 100644
index 0000000000..78b4c99446
--- /dev/null
+++ b/evals/datasets/real-specs/spec-007.md
@@ -0,0 +1,272 @@
+# Test Spec 7
+
+## Prompt
+
+Build a distributed microservices architecture for a real-time analytics platform
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Objective:** [NEEDS CLARIFICATION]
+
+**Scope:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
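+
+Purely for illustration, a populated P1 scenario for the stated prompt (a real-time analytics platform) might resemble the following; the workflow and freshness expectation are invented placeholders:
+
+```gherkin
+# Invented workflow for illustration only
+Scenario: Analyst opens a live metrics dashboard
+  Given events are streaming into the analytics platform
+  When an analyst opens the live dashboard
+  Then the displayed aggregates include events received in the last few seconds
+```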
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [feature capability]
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/trigger occurs]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-002:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-003:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-004:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+**FR-005:** [NEEDS CLARIFICATION - Specific functional behavior required]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirements]
+
+- Example: System shall respond to user actions within X seconds under normal load
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirements]
+
+- Example: System shall support X concurrent users
+
+**NFR-003:** [NEEDS CLARIFICATION - Load capacity]
+
+- Example: System shall handle X transactions per second
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirements]
+
+**NFR-005:** [NEEDS CLARIFICATION - Authorization requirements]
+
+**NFR-006:** [NEEDS CLARIFICATION - Data protection requirements]
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Growth capacity]
+
+**NFR-008:** [NEEDS CLARIFICATION - Resource scaling]
+
+### Reliability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Uptime requirements]
+
+- Example: System shall maintain 99.X% uptime
+
+**NFR-010:** [NEEDS CLARIFICATION - Data integrity requirements]
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - User experience standards]
+
+**NFR-012:** [NEEDS CLARIFICATION - Accessibility requirements]
+
+### Maintainability Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION - Support and maintenance needs]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Invalid input handling]
+
+**EC-002:** [NEEDS CLARIFICATION - Missing required data]
+
+**EC-003:** [NEEDS CLARIFICATION - Data format errors]
+
+### System Boundaries
+
+**EC-004:** [NEEDS CLARIFICATION - Maximum capacity scenarios]
+
+**EC-005:** [NEEDS CLARIFICATION - Minimum threshold scenarios]
+
+**EC-006:** [NEEDS CLARIFICATION - Timeout scenarios]
+
+### Concurrent Operations
+
+**EC-007:** [NEEDS CLARIFICATION - Simultaneous user actions]
+
+**EC-008:** [NEEDS CLARIFICATION - Race conditions]
+
+### Integration Points
+
+**EC-009:** [NEEDS CLARIFICATION - External system failures]
+
+**EC-010:** [NEEDS CLARIFICATION - Network connectivity issues]
+
+### Data Scenarios
+
+**EC-011:** [NEEDS CLARIFICATION - Empty state handling]
+
+**EC-012:** [NEEDS CLARIFICATION - Duplicate data handling]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Measurable adoption metric]
+
+- Example: X% of target users complete primary workflow within first week
+
+**SC-002:** [NEEDS CLARIFICATION - Measurable performance metric]
+
+- Example: X% reduction in time to complete task Y
+
+**SC-003:** [NEEDS CLARIFICATION - Measurable quality metric]
+
+- Example: Error rate below X%
+
+**SC-004:** [NEEDS CLARIFICATION - Measurable satisfaction metric]
+
+- Example: User satisfaction score of X or higher
+
+**SC-005:** [NEEDS CLARIFICATION - Measurable business metric]
+
+- Example: X% increase in conversion/completion rate
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, or context]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other systems, teams, or features]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known limitations or constraints]
+
+---
+
+## 8. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the feature being specified?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What problem does this solve?]
+4. [NEEDS CLARIFICATION - What are the key workflows?]
+5. [NEEDS CLARIFICATION - What are the priority requirements?]
+
+---
+
+## Next Steps
+
+**To complete this specification, please provide:**
+
+1. Detailed description of the feature or capability to be built
+2. Target user roles and personas
+3. Key user workflows and scenarios
+4. Business objectives and success metrics
+5. Any known constraints or requirements
+
+Once user requirements are provided, this document will be fully populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-008.md b/evals/datasets/real-specs/spec-008.md
new file mode 100644
index 0000000000..e5c5fc400b
--- /dev/null
+++ b/evals/datasets/real-specs/spec-008.md
@@ -0,0 +1,221 @@
+# Test Spec 8
+
+## Prompt
+
+Create an enterprise inventory management system with multi-warehouse support and compliance tracking
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+No user requirements were provided in the input. This specification document template is ready to be populated once feature requirements are submitted.
+
+**Purpose:**
+[NEEDS CLARIFICATION - Awaiting user input to define feature purpose and business objectives]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User roles and personas to be defined]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and value proposition to be defined]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
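+
+For illustration, a hypothetical populated scenario for the inventory prompt could look like this; the warehouse names and audit behavior are assumptions:
+
+```gherkin
+# Warehouse names and audit behavior are placeholders
+Scenario: Manager transfers stock between warehouses
+  Given an item has available quantity in warehouse A
+  When the manager records a transfer of ten units to warehouse B
+  Then the on-hand quantities of both warehouses are updated
+  And the transfer is recorded in the compliance audit trail
+```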
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+- **Given** [context to be defined]
+- **When** [action to be defined]
+- **Then** [expected outcome to be defined]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality to be defined]
+
+**FR-002:** [NEEDS CLARIFICATION - User interaction requirements to be defined]
+
+**FR-003:** [NEEDS CLARIFICATION - Data handling requirements to be defined]
+
+**FR-004:** [NEEDS CLARIFICATION - Validation rules to be defined]
+
+**FR-005:** [NEEDS CLARIFICATION - Business logic requirements to be defined]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [NEEDS CLARIFICATION - Response time requirements to be defined]
+
+**NFR-P-002:** [NEEDS CLARIFICATION - Throughput requirements to be defined]
+
+**NFR-P-003:** [NEEDS CLARIFICATION - Resource utilization limits to be defined]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [NEEDS CLARIFICATION - Authentication requirements to be defined]
+
+**NFR-S-002:** [NEEDS CLARIFICATION - Authorization requirements to be defined]
+
+**NFR-S-003:** [NEEDS CLARIFICATION - Data protection requirements to be defined]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [NEEDS CLARIFICATION - Concurrent user capacity to be defined]
+
+**NFR-SC-002:** [NEEDS CLARIFICATION - Data volume handling to be defined]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [NEEDS CLARIFICATION - Accessibility standards to be defined]
+
+**NFR-U-002:** [NEEDS CLARIFICATION - User experience requirements to be defined]
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** [NEEDS CLARIFICATION - Availability requirements to be defined]
+
+**NFR-R-002:** [NEEDS CLARIFICATION - Error handling requirements to be defined]
+
+---
+
+## 5. Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION - Boundary conditions to be defined]
+
+**EC-002:** [NEEDS CLARIFICATION - Invalid input scenarios to be defined]
+
+**EC-003:** [NEEDS CLARIFICATION - Concurrent operation conflicts to be defined]
+
+**EC-004:** [NEEDS CLARIFICATION - System unavailability scenarios to be defined]
+
+**EC-005:** [NEEDS CLARIFICATION - Data inconsistency scenarios to be defined]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - User adoption metrics to be defined]
+
+**SC-002:** [NEEDS CLARIFICATION - Performance benchmarks to be defined]
+
+**SC-003:** [NEEDS CLARIFICATION - Quality metrics to be defined]
+
+**SC-004:** [NEEDS CLARIFICATION - Business outcome measurements to be defined]
+
+**SC-005:** [NEEDS CLARIFICATION - User satisfaction targets to be defined]
+
+---
+
+## 7. Assumptions
+
+[NEEDS CLARIFICATION - List assumptions about user behavior, system context, and constraints]
+
+---
+
+## 8. Dependencies
+
+[NEEDS CLARIFICATION - Identify dependencies on other systems, features, or external services]
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Explicitly define what is NOT included in this feature]
+
+---
+
+## Next Steps
+
+To complete this specification, please provide:
+
+1. **Feature description**: What problem does this feature solve?
+2. **Target users**: Who will use this feature?
+3. **Key capabilities**: What should users be able to do?
+4. **Constraints**: Any limitations, compliance requirements, or specific needs?
+5. **Success metrics**: How will we measure if this feature is successful?
+
+Once these details are provided, this specification can be fully populated with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-009.md b/evals/datasets/real-specs/spec-009.md
new file mode 100644
index 0000000000..134a218b4a
--- /dev/null
+++ b/evals/datasets/real-specs/spec-009.md
@@ -0,0 +1,355 @@
+# Test Spec 9
+
+## Prompt
+
+Develop a multi-tenant SaaS application with role-based access control and API integrations
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+[NEEDS CLARIFICATION - Please provide the user requirements/feature description to create a comprehensive specification document]
+
+**Purpose:**
+[To be defined based on user input]
+
+**Target Users:**
+[To be defined based on user input]
+
+**Business Value:**
+[To be defined based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Critical
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
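+
+A hypothetical populated version of US-001 for the multi-tenant RBAC prompt is sketched below; the role and permission names are placeholders, not confirmed requirements:
+
+```gherkin
+# Role and permission names are placeholders
+Scenario: Tenant admin restricts exports to managers
+  Given a tenant admin is editing role permissions for their tenant
+  When they remove the "export data" permission from the Member role
+  Then members of that tenant can no longer start exports
+  And users in all other tenants are unaffected
+```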
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+### Priority 2 (P2) - Important
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+### Priority 3 (P3) - Nice to Have
+
+**US-006: [NEEDS CLARIFICATION]**
+
+- **As a** [user role],
+- **I want** [feature/capability],
+- **So that** [business value/benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context/state]
+- **When** [action/event occurs]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-002:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-003:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-004:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+**FR-005:** [NEEDS CLARIFICATION - Specific functional requirement]
+
+- **Description:** [Detailed description]
+- **Priority:** [P1/P2/P3]
+- **Dependencies:** [Related requirements]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Performance metric]
+
+- **Description:** [Specific measurable performance requirement]
+- **Target:** [Numeric threshold]
+- **Measurement Method:** [How to measure]
+
+**NFR-002:** [NEEDS CLARIFICATION - Response time requirement]
+
+- **Description:** [Specific measurable requirement]
+- **Target:** [Numeric threshold]
+- **Measurement Method:** [How to measure]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION - Security requirement]
+
+- **Description:** [Specific security control or measure]
+- **Compliance:** [Relevant standards if applicable]
+
+**NFR-004:** [NEEDS CLARIFICATION - Data protection requirement]
+
+- **Description:** [Specific data security requirement]
+- **Compliance:** [Relevant standards if applicable]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION - Scalability requirement]
+
+- **Description:** [Specific scalability metric]
+- **Target:** [Numeric threshold or growth capacity]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION - Usability requirement]
+
+- **Description:** [Specific usability standard]
+- **Target:** [Measurable usability metric]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Availability requirement]
+
+- **Description:** [Uptime or availability requirement]
+- **Target:** [Percentage or time-based metric]
+
+### Accessibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION - Accessibility standard]
+
+- **Description:** [Specific accessibility compliance requirement]
+- **Standard:** [e.g., WCAG 2.1 Level AA]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Boundary condition]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum capacity scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-003:** [NEEDS CLARIFICATION - Minimum value scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+### Error Scenarios
+
+**EC-004:** [NEEDS CLARIFICATION - Invalid input scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+**EC-005:** [NEEDS CLARIFICATION - System unavailability scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+**EC-006:** [NEEDS CLARIFICATION - Timeout scenario]
+
+- **Scenario:** [Description of error case]
+- **Expected Behavior:** [Error handling approach]
+- **User Feedback:** [What user should see/experience]
+
+### Data Integrity Scenarios
+
+**EC-007:** [NEEDS CLARIFICATION - Concurrent access scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+**EC-008:** [NEEDS CLARIFICATION - Data corruption scenario]
+
+- **Scenario:** [Description of edge case]
+- **Expected Behavior:** [How system should handle this case]
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Adoption metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-002:** [NEEDS CLARIFICATION - User satisfaction metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+### Business Impact Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - Business outcome metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-004:** [NEEDS CLARIFICATION - Efficiency metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+### Quality Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - Error rate metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+**SC-006:** [NEEDS CLARIFICATION - Performance metric]
+
+- **Metric:** [Specific measurable indicator]
+- **Target:** [Numeric goal]
+- **Timeline:** [When to achieve]
+- **Measurement Method:** [How to track]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List key assumptions]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [NEEDS CLARIFICATION - Items not included in this release]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+2. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+3. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+
+---
+
+## Document Control
+
+**Version:** 0.1 (Draft - Awaiting User Requirements)
+**Last Updated:** [Current Date]
+**Status:** Incomplete - Requires user input
+**Next Review Date:** [To be scheduled after requirements provided]
+
+---
+
+**Note:** This document is a template awaiting your specific feature requirements. Please provide the details of the feature you want specified, and I will create a complete, detailed specification document tailored to your needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-010.md b/evals/datasets/real-specs/spec-010.md
new file mode 100644
index 0000000000..b27cdd7520
--- /dev/null
+++ b/evals/datasets/real-specs/spec-010.md
@@ -0,0 +1,199 @@
+# Test Spec 10
+
+## Prompt
+
+Design a RESTful API for a social media platform with user profiles, posts, and messaging
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature specified]
+
+**Description:**
+No user requirements were provided in the input. This specification template is ready to be populated once feature requirements are submitted.
+
+**Business Value:**
+[NEEDS CLARIFICATION - Awaiting feature description]
+
+**Target Users:**
+[NEEDS CLARIFICATION - Awaiting user persona details]
+
+**Scope:**
+[NEEDS CLARIFICATION - Awaiting feature boundaries]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (Must Have)
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
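+
+As an illustrative sketch for the social media API prompt, a populated scenario might read as follows; the endpoint behavior and response details are assumptions:
+
+```gherkin
+# Endpoint behavior shown is an assumption
+Scenario: User publishes a post through the API
+  Given an authenticated user with a complete profile
+  When the client submits a valid create-post request
+  Then the API returns the created post with a success status
+  And the post appears in the author's feed
+```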
+
+---
+
+### Priority 2 (Should Have)
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### Priority 3 (Nice to Have)
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [role]
+- **I want** [feature]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+**FR-002:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+**FR-003:** [NEEDS CLARIFICATION - Awaiting feature requirements]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance
+
+**NFR-001:** [NEEDS CLARIFICATION - Awaiting performance expectations]
+
+**NFR-002:** [NEEDS CLARIFICATION - Awaiting response time requirements]
+
+### Security
+
+**NFR-003:** [NEEDS CLARIFICATION - Awaiting security requirements]
+
+**NFR-004:** [NEEDS CLARIFICATION - Awaiting authentication/authorization requirements]
+
+### Scalability
+
+**NFR-005:** [NEEDS CLARIFICATION - Awaiting scalability requirements]
+
+### Usability
+
+**NFR-006:** [NEEDS CLARIFICATION - Awaiting usability requirements]
+
+### Accessibility
+
+**NFR-007:** [NEEDS CLARIFICATION - Awaiting accessibility standards]
+
+### Reliability
+
+**NFR-008:** [NEEDS CLARIFICATION - Awaiting uptime/availability requirements]
+
+---
+
+## 5. Edge Cases
+
+**EC-001:** [NEEDS CLARIFICATION - Awaiting boundary conditions]
+
+**EC-002:** [NEEDS CLARIFICATION - Awaiting error scenarios]
+
+**EC-003:** [NEEDS CLARIFICATION - Awaiting invalid input handling]
+
+**EC-004:** [NEEDS CLARIFICATION - Awaiting concurrent usage scenarios]
+
+**EC-005:** [NEEDS CLARIFICATION - Awaiting data limit scenarios]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Awaiting measurable success metrics]
+
+**SC-002:** [NEEDS CLARIFICATION - Awaiting adoption targets]
+
+**SC-003:** [NEEDS CLARIFICATION - Awaiting performance benchmarks]
+
+**SC-004:** [NEEDS CLARIFICATION - Awaiting user satisfaction metrics]
+
+---
+
+## 7. Assumptions
+
+[NEEDS CLARIFICATION - Awaiting feature context to identify assumptions]
+
+---
+
+## 8. Dependencies
+
+[NEEDS CLARIFICATION - Awaiting feature details to identify dependencies]
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Awaiting feature boundaries]
+
+---
+
+## 10. Open Questions
+
+1. What is the primary feature being requested?
+2. Who are the target users and what problems are they experiencing?
+3. What are the business goals for this feature?
+4. Are there any existing systems or processes this feature should integrate with?
+5. What are the expected usage patterns and volume?
+6. Are there any compliance or regulatory requirements?
+7. What is the timeline or release deadline?
+
+---
+
+## Next Steps
+
+To complete this specification, please provide:
+
+- Detailed description of the desired feature
+- Target user personas and their pain points
+- Business objectives and expected outcomes
+- Any existing workflows or systems to consider
+- Constraints (budget, timeline, resources)
+
+---
+
+**Document Status:** DRAFT - Awaiting Requirements
+**Last Updated:** [Current Date]
+**Author:** Product Specification Team
+
+---
+
+*Please provide your feature requirements, and I will generate a comprehensive specification document tailored to your needs.*
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-011.md b/evals/datasets/real-specs/spec-011.md
new file mode 100644
index 0000000000..719da907c2
--- /dev/null
+++ b/evals/datasets/real-specs/spec-011.md
@@ -0,0 +1,549 @@
+# Test Spec 11
+
+## Prompt
+
+Build a GraphQL API for a content management system with flexible querying
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No user input provided]
+
+**Description:**
+This feature specification document is a template awaiting user requirements. Once requirements are provided, this section will contain:
+
+- Brief summary of the feature's purpose
+- Primary problem it solves
+- Target user base
+- High-level scope boundaries
+
+**Document Version:** 1.0
+**Last Updated:** [Current Date]
+**Status:** Draft - Awaiting Requirements
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+
+Given [alternative context]
+When [action is taken]
+Then [expected outcome]
+```
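+
+For comparison, a hypothetical populated scenario for the CMS prompt might look like the sketch below; the content types and fields are assumed, not specified:
+
+```gherkin
+# Content types and fields are assumptions
+Scenario: Editor requests only the fields they need
+  Given published articles exist in the CMS
+  When an editor queries article titles and author names in a single request
+  Then the response contains exactly the requested fields for each article
+```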
+
+---
+
+**US-002: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+**US-004: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [Awaiting Requirements]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is taken]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-002:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+**FR-003:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+### Data Management
+
+**FR-004:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P1
+- **Testability Criteria:** [How to verify this requirement]
+
+### User Interface
+
+**FR-005:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+### Integration Points
+
+**FR-006:** [NEEDS CLARIFICATION - Awaiting user requirements]
+
+- **Description:** The system shall [specific behavior]
+- **Priority:** P2
+- **Testability Criteria:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** Response Time
+
+- **Description:** [NEEDS CLARIFICATION] The system shall respond to user actions within [X] seconds under normal load conditions
+- **Measurement:** Average response time measured across [specify measurement period]
+- **Priority:** P1
+
+**NFR-002:** Throughput
+
+- **Description:** [NEEDS CLARIFICATION] The system shall support [X] concurrent users performing [specific actions]
+- **Measurement:** System maintains acceptable performance with specified user load
+- **Priority:** P1
+
+**NFR-003:** Data Processing
+
+- **Description:** [NEEDS CLARIFICATION] The system shall process [X] transactions/records per [time unit]
+- **Measurement:** Processing rate during peak operations
+- **Priority:** P2
+
+### Scalability Requirements
+
+**NFR-004:** User Growth
+
+- **Description:** [NEEDS CLARIFICATION] The system shall support growth to [X] total users within [timeframe]
+- **Measurement:** System performance metrics remain within acceptable thresholds at specified scale
+- **Priority:** P1
+
+**NFR-005:** Data Volume
+
+- **Description:** [NEEDS CLARIFICATION] The system shall handle up to [X] amount of data without performance degradation
+- **Measurement:** Response times and throughput at maximum data capacity
+- **Priority:** P2
+
+### Security Requirements
+
+**NFR-006:** Authentication
+
+- **Description:** [NEEDS CLARIFICATION] The system shall require user authentication before accessing protected features
+- **Measurement:** All protected endpoints reject unauthenticated requests
+- **Priority:** P1
+
+**NFR-007:** Authorization
+
+- **Description:** [NEEDS CLARIFICATION] The system shall enforce role-based access controls for all user actions
+- **Measurement:** Users can only perform actions authorized for their role
+- **Priority:** P1
+
+**NFR-008:** Data Protection
+
+- **Description:** [NEEDS CLARIFICATION] The system shall protect sensitive data at rest and in transit
+- **Measurement:** Security audit confirms data protection standards are met
+- **Priority:** P1
+
+**NFR-009:** Audit Logging
+
+- **Description:** [NEEDS CLARIFICATION] The system shall log all security-relevant events
+- **Measurement:** Audit logs capture specified events with required information
+- **Priority:** P2
+
+### Reliability Requirements
+
+**NFR-010:** Availability
+
+- **Description:** [NEEDS CLARIFICATION] The system shall maintain [X]% uptime during business hours
+- **Measurement:** Actual uptime percentage over specified period
+- **Priority:** P1
+
+**NFR-011:** Data Integrity
+
+- **Description:** [NEEDS CLARIFICATION] The system shall ensure zero data loss during normal operations
+- **Measurement:** Data validation checks confirm no data corruption or loss
+- **Priority:** P1
+
+**NFR-012:** Error Recovery
+
+- **Description:** [NEEDS CLARIFICATION] The system shall recover from errors within [X] minutes
+- **Measurement:** Time from error detection to full service restoration
+- **Priority:** P2
+
+### Usability Requirements
+
+**NFR-013:** Accessibility
+
+- **Description:** [NEEDS CLARIFICATION] The system shall comply with [accessibility standard, e.g., WCAG 2.1 Level AA]
+- **Measurement:** Accessibility audit confirms compliance
+- **Priority:** P1
+
+**NFR-014:** User Training
+
+- **Description:** [NEEDS CLARIFICATION] New users shall be able to complete core tasks within [X] minutes of initial use
+- **Measurement:** User testing demonstrates task completion within timeframe
+- **Priority:** P2
+
+**NFR-015:** Error Messaging
+
+- **Description:** The system shall provide clear, actionable error messages to users
+- **Measurement:** Error messages include explanation and next steps
+- **Priority:** P2
+
+### Maintainability Requirements
+
+**NFR-016:** Documentation
+
+- **Description:** [NEEDS CLARIFICATION] The system shall include user documentation for all features
+- **Measurement:** Documentation exists and covers specified feature set
+- **Priority:** P2
+
+**NFR-017:** Monitoring
+
+- **Description:** [NEEDS CLARIFICATION] The system shall provide operational monitoring capabilities
+- **Measurement:** Key system metrics are observable and measurable
+- **Priority:** P1
+
+### Compliance Requirements
+
+**NFR-018:** Regulatory Compliance
+
+- **Description:** [NEEDS CLARIFICATION] The system shall comply with [relevant regulations, e.g., GDPR, HIPAA, SOC 2]
+- **Measurement:** Compliance audit confirms adherence to specified regulations
+- **Priority:** P1
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Input Validation
+
+**EC-001:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides invalid input format
+- **Expected Behavior:** System displays clear error message and does not process invalid data
+- **Recovery:** User can correct input and resubmit
+
+**EC-002:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides input exceeding maximum allowed size/length
+- **Expected Behavior:** System rejects input with descriptive error
+- **Recovery:** User is informed of limits and can adjust input
+
+**EC-003:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User provides empty/null required fields
+- **Expected Behavior:** System identifies missing required fields
+- **Recovery:** User is prompted to provide required information
+
+### Boundary Conditions
+
+**EC-004:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** System reaches maximum capacity (users, data, transactions)
+- **Expected Behavior:** System gracefully handles limit
+- **Recovery:** Clear messaging about limitations and alternatives
+
+**EC-005:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Operation times out due to duration
+- **Expected Behavior:** System cancels operation and notifies user
+- **Recovery:** User can retry or choose alternative action
+
+**EC-006:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Minimum boundary values are used
+- **Expected Behavior:** System processes minimum valid values correctly
+- **Recovery:** N/A - normal operation
+
+### Concurrent Operations
+
+**EC-007:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Multiple users attempt to modify the same resource simultaneously
+- **Expected Behavior:** System handles conflicts appropriately
+- **Recovery:** Users are notified of conflicts with options to resolve
+
+**EC-008:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User initiates same operation multiple times rapidly
+- **Expected Behavior:** System prevents duplicate processing
+- **Recovery:** User receives confirmation of single operation
+
+### System Resource Constraints
+
+**EC-009:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** System experiences temporary resource constraints
+- **Expected Behavior:** System queues or delays non-critical operations
+- **Recovery:** Operations complete when resources available
+
+**EC-010:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** External dependency is unavailable
+- **Expected Behavior:** System continues operating in degraded mode or provides clear error
+- **Recovery:** System resumes full functionality when dependency restored
+
+### Data Consistency
+
+**EC-011:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Data synchronization fails between components
+- **Expected Behavior:** System detects inconsistency and alerts operators
+- **Recovery:** System provides reconciliation mechanism
+
+**EC-012:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** User session expires during operation
+- **Expected Behavior:** System safely terminates operation or saves state
+- **Recovery:** User can re-authenticate and resume or restart
+
+### Network & Connectivity
+
+**EC-013:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Network connection is lost during operation
+- **Expected Behavior:** System detects connection loss and handles gracefully
+- **Recovery:** System retries operation or notifies user to retry
+
+**EC-014:** [NEEDS CLARIFICATION - Awaiting requirements]
+
+- **Scenario:** Partial data transmission occurs
+- **Expected Behavior:** System validates data completeness
+- **Recovery:** System requests retransmission of incomplete data
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** User Engagement
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of target users actively use the feature within [timeframe]
+- **Measurement Method:** Usage analytics tracking active users
+- **Target:** [Specific percentage and timeframe]
+
+**SC-002:** Task Completion Rate
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of users successfully complete primary user flows
+- **Measurement Method:** Analytics tracking successful task completions
+- **Target:** [Specific percentage]
+
+**SC-003:** User Satisfaction
+
+- **Metric:** [NEEDS CLARIFICATION] Average user satisfaction score of [X] out of [Y]
+- **Measurement Method:** User surveys and feedback collection
+- **Target:** [Specific score]
+
+### Performance Metrics
+
+**SC-004:** System Performance
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of requests complete within target response time
+- **Measurement Method:** Performance monitoring tools
+- **Target:** [Specific percentage and response time]
+
+**SC-005:** System Reliability
+
+- **Metric:** [NEEDS CLARIFICATION] System maintains [X]% uptime
+- **Measurement Method:** Uptime monitoring over specified period
+- **Target:** [Specific uptime percentage]
+
+**SC-006:** Error Rate
+
+- **Metric:** [NEEDS CLARIFICATION] Error rate below [X]% of total operations
+- **Measurement Method:** Error logging and monitoring
+- **Target:** [Specific error rate threshold]
+
+### Business Metrics
+
+**SC-007:** Business Value
+
+- **Metric:** [NEEDS CLARIFICATION] Feature generates/saves [specific business outcome]
+- **Measurement Method:** [Business metric tracking method]
+- **Target:** [Specific measurable outcome]
+
+**SC-008:** Efficiency Gain
+
+- **Metric:** [NEEDS CLARIFICATION] Reduces time for [specific task] by [X]%
+- **Measurement Method:** Time tracking before and after implementation
+- **Target:** [Specific percentage improvement]
+
+### Quality Metrics
+
+**SC-009:** Defect Rate
+
+- **Metric:** [NEEDS CLARIFICATION] Fewer than [X] critical defects in first [timeframe] post-launch
+- **Measurement Method:** Defect tracking system
+- **Target:** [Specific number of defects]
+
+**SC-010:** Accessibility Compliance
+
+- **Metric:** [NEEDS CLARIFICATION] Achieve [X]% accessibility compliance score
+- **Measurement Method:** Automated and manual accessibility testing
+- **Target:** [Specific compliance percentage]
+
+### Operational Metrics
+
+**SC-011:** Support Tickets
+
+- **Metric:** [NEEDS CLARIFICATION] Support tickets related to feature remain below [X] per [time period]
+- **Measurement Method:** Support ticket tracking system
+- **Target:** [Specific number of tickets]
+
+**SC-012:** Training Effectiveness
+
+- **Metric:** [NEEDS CLARIFICATION] [X]% of users require no additional support after initial training
+- **Measurement Method:** Support request tracking correlated with user training
+- **Target:** [Specific percentage]
+
+---
+
+## 7. Dependencies & Assumptions
+
+### Dependencies
+
+[NEEDS CLARIFICATION - Awaiting requirements to identify:]
+
+- External systems or services required
+- Data sources needed
+- Third-party integrations
+- Organizational resources required
+
+### Assumptions
+
+[NEEDS CLARIFICATION - Awaiting requirements to document:]
+
+- Assumed user behaviors
+- Assumed technical capabilities available
+- Assumed business constraints
+- Assumed timeline and resource availability
+
+---
+
+## 8. Out of Scope
+
+[NEEDS CLARIFICATION - To be defined based on user requirements]
+
+The following items are explicitly excluded from this feature:
+
+- [Item 1]
+- [Item 2]
+- [Item 3]
+
+---
+
+## 9. Open Questions
+
+**OQ-001:** [NEEDS CLARIFICATION] What are the specific user requirements for this feature?
+
+**OQ-002:** [NEEDS CLARIFICATION] Who are the target users and what are their roles?
+
+**OQ-003:** [NEEDS CLARIFICATION] What problem is this feature intended to solve?
+
+**OQ-004:** [NEEDS CLARIFICATION] What are the primary user workflows?
+
+**OQ-005:** [NEEDS CLARIFICATION] What are the regulatory or compliance requirements?
+
+**OQ-006:** [NEEDS CLARIFICATION] What are the performance expectations?
+
+**OQ-007:** [NEEDS CLARIFICATION] What are the integration requirements?
+
+**OQ-008:** [NEEDS CLARIFICATION] What is the expected timeline for delivery?
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Signature | Date |
+|------|------|-----------|------|
+| Product Owner | [Name] | | |
+| Technical Lead | [Name] | | |
+| QA Lead | [Name] | | |
+| UX Lead | [Name] | | |
+| Stakeholder | [Name] | | |
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-012.md b/evals/datasets/real-specs/spec-012.md
new file mode 100644
index 0000000000..784322b313
--- /dev/null
+++ b/evals/datasets/real-specs/spec-012.md
@@ -0,0 +1,360 @@
+# Test Spec 12
+
+## Prompt
+
+Create a .NET Core web application for employee onboarding with document management
+
+## Generated Spec
+
+## Feature Specification Document
+
+## 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature described in user input]
+
+**Feature Description:**
+No user requirements were provided in the input. This template demonstrates the structure and format for a comprehensive feature specification document.
+
+**Business Value:**
+[NEEDS CLARIFICATION - Business objectives and value proposition undefined]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User personas and roles undefined]
+
+**Scope:**
+[NEEDS CLARIFICATION - Feature boundaries and limitations undefined]
+
+---
+
+## 2. User Stories
+
+### P1 (Must Have) Stories
+
+**US-001: [NEEDS CLARIFICATION - Core user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+### P2 (Should Have) Stories
+
+**US-002: [NEEDS CLARIFICATION - Secondary user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+### P3 (Nice to Have) Stories
+
+**US-003: [NEEDS CLARIFICATION - Enhancement user story undefined]**
+
+- **As a** [role undefined]
+- **I want** [feature undefined]
+- **So that** [benefit undefined]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context undefined]
+- **When** [action undefined]
+- **Then** [expected outcome undefined]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality undefined]
+
+- **Priority:** P1
+- **Testability:** [Test criteria undefined]
+
+**FR-002:** [NEEDS CLARIFICATION - Data handling requirements undefined]
+
+- **Priority:** P1
+- **Testability:** [Test criteria undefined]
+
+**FR-003:** [NEEDS CLARIFICATION - User interaction requirements undefined]
+
+- **Priority:** P2
+- **Testability:** [Test criteria undefined]
+
+**FR-004:** [NEEDS CLARIFICATION - Integration requirements undefined]
+
+- **Priority:** P2
+- **Testability:** [Test criteria undefined]
+
+**FR-005:** [NEEDS CLARIFICATION - Notification/feedback requirements undefined]
+
+- **Priority:** P3
+- **Testability:** [Test criteria undefined]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirements undefined]
+
+- **Metric:** [Undefined - e.g., "95% of requests complete within X seconds"]
+- **Priority:** P1
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System handles X concurrent users"]
+- **Priority:** P1
+
+**NFR-003:** [NEEDS CLARIFICATION - Load time requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Initial load completes within X seconds"]
+- **Priority:** P2
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirements undefined]
+
+- **Metric:** [Undefined - e.g., "All sensitive operations require authentication"]
+- **Priority:** P1
+
+**NFR-005:** [NEEDS CLARIFICATION - Data protection requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Sensitive data encrypted at rest and in transit"]
+- **Priority:** P1
+
+**NFR-006:** [NEEDS CLARIFICATION - Authorization requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Role-based access control implemented"]
+- **Priority:** P1
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - User growth requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System scales to X users within Y months"]
+- **Priority:** P2
+
+**NFR-008:** [NEEDS CLARIFICATION - Data volume requirements undefined]
+
+- **Metric:** [Undefined - e.g., "System handles X records without degradation"]
+- **Priority:** P2
+
+### Usability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Accessibility requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Meets WCAG 2.1 Level AA standards"]
+- **Priority:** P2
+
+**NFR-010:** [NEEDS CLARIFICATION - Device compatibility requirements undefined]
+
+- **Metric:** [Undefined - e.g., "Functions on devices with screen sizes X to Y"]
+- **Priority:** P1
+
+### Reliability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - Availability requirements undefined]
+
+- **Metric:** [Undefined - e.g., "99.9% uptime during business hours"]
+- **Priority:** P1
+
+**NFR-012:** [NEEDS CLARIFICATION - Error handling requirements undefined]
+
+- **Metric:** [Undefined - e.g., "All errors logged with user-friendly messages"]
+- **Priority:** P1
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Minimum input scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User enters minimum allowed value"]
+- **Expected Behavior:** [Undefined]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum input scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User enters maximum allowed value"]
+- **Expected Behavior:** [Undefined]
+
+**EC-003:** [NEEDS CLARIFICATION - Empty state scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "User accesses feature with no data"]
+- **Expected Behavior:** [Undefined]
+
+**EC-004:** [NEEDS CLARIFICATION - Concurrent operation scenarios undefined]
+
+- **Scenario:** [Undefined - e.g., "Multiple users modify same resource"]
+- **Expected Behavior:** [Undefined]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION - Invalid input handling undefined]
+
+- **Trigger:** [Undefined - e.g., "User submits invalid data format"]
+- **Expected Response:** [Undefined - e.g., "Clear error message with guidance"]
+
+**ES-002:** [NEEDS CLARIFICATION - Network failure handling undefined]
+
+- **Trigger:** [Undefined - e.g., "Connection lost during operation"]
+- **Expected Response:** [Undefined - e.g., "Graceful degradation with retry option"]
+
+**ES-003:** [NEEDS CLARIFICATION - Permission denial handling undefined]
+
+- **Trigger:** [Undefined - e.g., "User attempts unauthorized action"]
+- **Expected Response:** [Undefined - e.g., "Access denied message"]
+
+**ES-004:** [NEEDS CLARIFICATION - Timeout handling undefined]
+
+- **Trigger:** [Undefined - e.g., "Operation exceeds time limit"]
+- **Expected Response:** [Undefined - e.g., "Timeout notification with options"]
+
+**ES-005:** [NEEDS CLARIFICATION - Resource exhaustion handling undefined]
+
+- **Trigger:** [Undefined - e.g., "System reaches capacity limits"]
+- **Expected Response:** [Undefined - e.g., "Capacity message with alternatives"]
+
+---
+
+## 6. Success Criteria
+
+### Quantitative Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Adoption metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% of target users adopt feature within Y days"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-002:** [NEEDS CLARIFICATION - Usage metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Average of X sessions per user per week"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-003:** [NEEDS CLARIFICATION - Performance metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Average task completion time under X minutes"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-004:** [NEEDS CLARIFICATION - Error rate metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Error rate below X%"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+### Qualitative Metrics
+
+**SC-005:** [NEEDS CLARIFICATION - User satisfaction metrics undefined]
+
+- **Metric:** [Undefined - e.g., "User satisfaction score of X/10 or higher"]
+- **Measurement Method:** [Undefined - e.g., "Post-feature survey"]
+- **Target:** [Undefined]
+
+**SC-006:** [NEEDS CLARIFICATION - Task completion metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% task completion rate without assistance"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION - Business impact metrics undefined]
+
+- **Metric:** [Undefined - e.g., "X% reduction in support tickets"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+**SC-008:** [NEEDS CLARIFICATION - ROI metrics undefined]
+
+- **Metric:** [Undefined - e.g., "Positive ROI within X months"]
+- **Measurement Method:** [Undefined]
+- **Target:** [Undefined]
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - Project assumptions undefined]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - External dependencies undefined]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - Project constraints undefined]
+
+---
+
+## 8. Out of Scope
+
+- [NEEDS CLARIFICATION - Explicitly excluded features undefined]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being requested?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem does this feature solve?]
+4. [NEEDS CLARIFICATION - What are the key user workflows?]
+5. [NEEDS CLARIFICATION - What are the integration requirements?]
+6. [NEEDS CLARIFICATION - What are the business objectives?]
+7. [NEEDS CLARIFICATION - What is the expected timeline?]
+8. [NEEDS CLARIFICATION - What are the resource constraints?]
+
+---
+
+## 10. Approval and Sign-off
+
+**Document Version:** 1.0 (Template)
+
+**Prepared By:** [Name/Role]
+
+**Date:** [Date]
+
+**Stakeholder Approvals:**
+
+- Product Owner: _________________ Date: _______
+- Engineering Lead: _________________ Date: _______
+- Design Lead: _________________ Date: _______
+- QA Lead: _________________ Date: _______
+
+---
+
+## Notes
+
+**This is a template document.** Please provide specific user requirements to generate a complete and actionable feature specification. The actual specification should include:
+
+- Detailed descriptions of desired functionality
+- User roles and personas
+- Specific workflows and interactions
+- Business objectives and constraints
+- Integration requirements
+- Any existing system context
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-013.md b/evals/datasets/real-specs/spec-013.md
new file mode 100644
index 0000000000..79918e8b3d
--- /dev/null
+++ b/evals/datasets/real-specs/spec-013.md
@@ -0,0 +1,443 @@
+# Test Spec 13
+
+## Prompt
+
+Build a Python Flask API for data processing and machine learning model serving
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No feature description provided]
+
+**Description:**
+This section requires user input to define the feature being specified. The feature specification document framework is ready to document:
+
+- Core functionality and purpose
+- Target users and use cases
+- Business value and objectives
+- High-level scope and boundaries
+
+[NEEDS CLARIFICATION - Please provide details about the feature you want specified, including:
+
+- What problem does this feature solve?
+- Who are the primary users?
+- What are the main capabilities needed?
+- What business goals does this support?]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Critical
+
+**US-001: [NEEDS CLARIFICATION - Core user story]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+And [additional outcome]
+```
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Essential functionality]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+### Priority 2 (P2) - Important
+
+**US-003: [NEEDS CLARIFICATION - Important enhancement]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Supporting functionality]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+### Priority 3 (P3) - Nice to Have
+
+**US-005: [NEEDS CLARIFICATION - Additional feature]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+**US-006: [NEEDS CLARIFICATION - Optional enhancement]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit/value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context/precondition]
+When [action/trigger]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Core functionality requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P1
+- **Related User Stories:** US-001
+
+**FR-002:** [NEEDS CLARIFICATION - Data handling requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P1
+- **Related User Stories:** US-001, US-002
+
+**FR-003:** [NEEDS CLARIFICATION - User interaction requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P2
+- **Related User Stories:** US-003
+
+**FR-004:** [NEEDS CLARIFICATION - Validation requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P2
+- **Related User Stories:** US-004
+
+**FR-005:** [NEEDS CLARIFICATION - Reporting/feedback requirement]
+
+- **Description:** The system shall [specific capability]
+- **Priority:** P3
+- **Related User Stories:** US-005
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** Response Time
+
+- The system shall respond to user actions within [X] seconds under normal load conditions
+- **Priority:** P1
+- **Measurement:** 95th percentile response time
+
+**NFR-P-002:** Throughput
+
+- The system shall support [X] concurrent users without performance degradation
+- **Priority:** P1
+- **Measurement:** Load testing results
+
+**NFR-P-003:** Data Processing
+
+- The system shall process [X] transactions per second
+- **Priority:** P2
+- **Measurement:** Transaction processing metrics
+
+---
+
+### Security (NFR-S)
+
+**NFR-S-001:** Authentication
+
+- The system shall require user authentication for all protected operations
+- **Priority:** P1
+- **Measurement:** Security audit compliance
+
+**NFR-S-002:** Authorization
+
+- The system shall enforce role-based access controls for all sensitive operations
+- **Priority:** P1
+- **Measurement:** Access control testing results
+
+**NFR-S-003:** Data Protection
+
+- The system shall protect sensitive data at rest and in transit
+- **Priority:** P1
+- **Measurement:** Security scanning results
+
+**NFR-S-004:** Audit Logging
+
+- The system shall log all security-relevant events with timestamp, user, and action details
+- **Priority:** P2
+- **Measurement:** Audit log completeness
+
+---
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** Horizontal Scaling
+
+- The system shall support scaling to handle [X]% increase in load without architectural changes
+- **Priority:** P2
+- **Measurement:** Load testing at scale
+
+**NFR-SC-002:** Data Volume
+
+- The system shall handle up to [X] records without performance degradation
+- **Priority:** P2
+- **Measurement:** Benchmark testing results
+
+---
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** Availability
+
+- The system shall maintain [X]% uptime during business hours
+- **Priority:** P1
+- **Measurement:** Uptime monitoring
+
+**NFR-R-002:** Error Handling
+
+- The system shall handle error conditions gracefully without data loss
+- **Priority:** P1
+- **Measurement:** Error recovery testing
+
+**NFR-R-003:** Data Integrity
+
+- The system shall maintain data consistency across all operations
+- **Priority:** P1
+- **Measurement:** Data validation testing
+
+---
+
+### Usability (NFR-U)
+
+**NFR-U-001:** Accessibility
+
+- The system shall comply with WCAG 2.1 Level AA accessibility standards
+- **Priority:** P1
+- **Measurement:** Accessibility audit results
+
+**NFR-U-002:** Learnability
+
+- New users shall be able to complete core tasks within [X] minutes of first use
+- **Priority:** P2
+- **Measurement:** User testing observations
+
+**NFR-U-003:** Error Messages
+
+- The system shall provide clear, actionable error messages for all user-facing errors
+- **Priority:** P2
+- **Measurement:** User comprehension testing
+
+---
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** Logging
+
+- The system shall log sufficient information to diagnose issues in production
+- **Priority:** P1
+- **Measurement:** Incident resolution time
+
+**NFR-M-002:** Monitoring
+
+- The system shall expose health check endpoints and operational metrics
+- **Priority:** P1
+- **Measurement:** Monitoring coverage
+
+---
+
+## 5. Edge Cases
+
+### EC-001: [NEEDS CLARIFICATION - Boundary condition]
+
+**Scenario:** [Description of edge case]
+**Expected Behavior:** The system shall [specific handling]
+**Priority:** P1
+
+### EC-002: [NEEDS CLARIFICATION - Invalid input]
+
+**Scenario:** User provides invalid or malformed input
+**Expected Behavior:** The system shall reject the input and provide clear feedback
+**Priority:** P1
+
+### EC-003: [NEEDS CLARIFICATION - Resource limits]
+
+**Scenario:** System reaches maximum capacity/limits
+**Expected Behavior:** The system shall [specific handling]
+**Priority:** P2
+
+### EC-004: [NEEDS CLARIFICATION - Concurrent operations]
+
+**Scenario:** Multiple users attempt to modify the same resource simultaneously
+**Expected Behavior:** The system shall [specific handling] to prevent conflicts
+**Priority:** P2
+
+### EC-005: [NEEDS CLARIFICATION - Empty/null states]
+
+**Scenario:** No data available or null values encountered
+**Expected Behavior:** The system shall display appropriate empty state messaging
+**Priority:** P2
+
+### EC-006: [NEEDS CLARIFICATION - Network/service failure]
+
+**Scenario:** External dependency becomes unavailable
+**Expected Behavior:** The system shall degrade gracefully and notify users appropriately
+**Priority:** P1
+
+### EC-007: [NEEDS CLARIFICATION - Session timeout]
+
+**Scenario:** User session expires during active use
+**Expected Behavior:** The system shall preserve user work and prompt for re-authentication
+**Priority:** P2
+
+---
+
+## 6. Success Criteria
+
+**SC-001: User Adoption** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of target users actively using the feature within [timeframe]
+- **Measurement Method:** User analytics
+- **Target:** [Specific number/percentage]
+
+**SC-002: Task Completion Rate** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of users successfully complete primary workflows
+- **Measurement Method:** User behavior tracking
+- **Target:** [Specific percentage]
+
+**SC-003: Performance Benchmarks** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [X]% of operations complete within target time
+- **Measurement Method:** Performance monitoring
+- **Target:** [Specific percentage]
+
+**SC-004: Error Rate** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** System error rate below [X]%
+- **Measurement Method:** Error logging and monitoring
+- **Target:** [Specific percentage]
+
+**SC-005: User Satisfaction** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** User satisfaction score of [X]/10 or higher
+- **Measurement Method:** User surveys/feedback
+- **Target:** [Specific score]
+
+**SC-006: Business Impact** [NEEDS CLARIFICATION - Target metrics]
+
+- **Metric:** [Specific business metric improvement]
+- **Measurement Method:** Business analytics
+- **Target:** [Specific improvement percentage]
+
+---
+
+## 7. Dependencies and Constraints
+
+### Dependencies [NEEDS CLARIFICATION]
+
+- External system integration requirements
+- Data availability requirements
+- User authentication/authorization systems
+- Third-party service dependencies
+
+### Constraints [NEEDS CLARIFICATION]
+
+- Regulatory compliance requirements
+- Budget limitations
+- Timeline constraints
+- Resource availability
+
+---
+
+## 8. Assumptions [NEEDS CLARIFICATION]
+
+- User technical proficiency level
+- Available infrastructure
+- Data quality and availability
+- User access patterns
+
+---
+
+## 9. Out of Scope
+
+[NEEDS CLARIFICATION - Define what is explicitly not included in this feature]
+
+---
+
+## 10. Open Questions
+
+1. [NEEDS CLARIFICATION - What is the specific feature being built?]
+2. [NEEDS CLARIFICATION - Who are the target users?]
+3. [NEEDS CLARIFICATION - What are the specific business objectives?]
+4. [NEEDS CLARIFICATION - What are the success metrics?]
+5. [NEEDS CLARIFICATION - Are there any regulatory or compliance requirements?]
+6. [NEEDS CLARIFICATION - What is the expected timeline?]
+
+---
+
+## Document Control
+
+| Version | Date | Author | Changes |
+|---------|------|--------|---------|
+| 0.1 | [Date] | [Author] | Initial template created - awaiting requirements |
+
+---
+
+**Note:** This document requires substantial input to complete. Please provide detailed information about the feature you want specified, and I will populate all sections with specific, measurable, and testable requirements.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-014.md b/evals/datasets/real-specs/spec-014.md
new file mode 100644
index 0000000000..a7b018b4e7
--- /dev/null
+++ b/evals/datasets/real-specs/spec-014.md
@@ -0,0 +1,348 @@
+# Test Spec 14
+
+## Prompt
+
+Develop a Node.js/Express backend with real-time WebSocket communication
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification template is ready to be populated once user requirements are submitted.]
+
+**Purpose:**
+[NEEDS CLARIFICATION - Purpose and objectives need to be defined based on user requirements]
+
+**Target Users:**
+[NEEDS CLARIFICATION - User segments and personas need to be identified]
+
+**Business Value:**
+[NEEDS CLARIFICATION - Expected business impact and value proposition need to be defined]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+2. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+**US-002: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION - Story Title]**
+
+- **As a** [role to be defined]
+- **I want** [capability to be defined]
+- **So that** [benefit to be defined]
+
+**Acceptance Criteria:**
+
+1. **Given** [context]
+ **When** [action]
+ **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-002:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-003:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-004:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+**FR-005:** [NEEDS CLARIFICATION - Functional requirement description]
+
+- **Measurable Criteria:** [Specific testable condition]
+- **Priority:** [P1/P2/P3]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Performance requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+**NFR-002:** [NEEDS CLARIFICATION - Response time requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Security Requirements
+
+**NFR-003:** [NEEDS CLARIFICATION - Security requirement]
+
+- **Compliance Standard:** [Applicable standards]
+- **Verification Method:** [How compliance will be tested]
+
+**NFR-004:** [NEEDS CLARIFICATION - Data protection requirement]
+
+- **Compliance Standard:** [Applicable standards]
+- **Verification Method:** [How compliance will be tested]
+
+### Scalability Requirements
+
+**NFR-005:** [NEEDS CLARIFICATION - Scalability requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Usability Requirements
+
+**NFR-006:** [NEEDS CLARIFICATION - Usability requirement]
+
+- **Metric:** [Specific measurement]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Reliability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Availability requirement]
+
+- **Metric:** [Uptime percentage or MTBF]
+- **Target:** [Numerical target value]
+- **Measurement Method:** [How this will be verified]
+
+### Compatibility Requirements
+
+**NFR-008:** [NEEDS CLARIFICATION - Platform/device compatibility]
+
+- **Supported Platforms:** [List of platforms]
+- **Verification Method:** [How compatibility will be tested]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Edge case description]
+
+- **Scenario:** [Describe the boundary condition]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback is provided]
+
+**EC-002:** [NEEDS CLARIFICATION - Edge case description]
+
+- **Scenario:** [Describe the boundary condition]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback is provided]
+
+### Error Scenarios
+
+**EC-003:** [NEEDS CLARIFICATION - Error scenario]
+
+- **Trigger Condition:** [What causes this error]
+- **Expected Behavior:** [How the system should respond]
+- **Recovery Path:** [How the user can recover]
+- **User Feedback:** [Error message or guidance provided]
+
+**EC-004:** [NEEDS CLARIFICATION - Error scenario]
+
+- **Trigger Condition:** [What causes this error]
+- **Expected Behavior:** [How the system should respond]
+- **Recovery Path:** [How the user can recover]
+- **User Feedback:** [Error message or guidance provided]
+
+### Data Validation Edge Cases
+
+**EC-005:** [NEEDS CLARIFICATION - Validation edge case]
+
+- **Invalid Input Type:** [Description]
+- **System Response:** [Expected handling]
+- **User Guidance:** [Feedback provided]
+
+---
+
+## 6. Success Criteria
+
+**SC-001:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-002:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-003:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-004:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+**SC-005:** [NEEDS CLARIFICATION - Success metric]
+
+- **Baseline:** [Current state measurement]
+- **Target:** [Desired outcome with numerical value]
+- **Measurement Period:** [Timeframe for measurement]
+- **Measurement Method:** [How this will be tracked]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List external dependencies, prerequisites, or required integrations]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List any known constraints or limitations]
+
+---
+
+## 8. Out of Scope
+
+[NEEDS CLARIFICATION - Explicitly define what is NOT included in this feature]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+2. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+3. [NEEDS CLARIFICATION - Question requiring stakeholder input]
+
+---
+
+## 10. Approval & Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | | | |
+| Business Stakeholder | | | |
+| QA Lead | | | |
+| Development Lead | | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Created By:** [Author]
+**Status:** DRAFT - Awaiting User Requirements
+
+---
+
+## Instructions for Completion
+
+**To complete this specification, please provide:**
+
+1. **Feature description**: What problem are you solving? What capability should users have?
+2. **Target users**: Who will use this feature?
+3. **User goals**: What should users be able to accomplish?
+4. **Business objectives**: What business outcomes are expected?
+5. **Constraints**: Any budget, timeline, or regulatory constraints?
+6. **Integration needs**: Does this feature need to work with existing systems?
+7. **Success metrics**: How will you measure if this feature is successful?
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-015.md b/evals/datasets/real-specs/spec-015.md
new file mode 100644
index 0000000000..00bc981aca
--- /dev/null
+++ b/evals/datasets/real-specs/spec-015.md
@@ -0,0 +1,322 @@
+# Test Spec 15
+
+## Prompt
+
+Build a HIPAA-compliant healthcare appointment scheduling system
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - No user requirements provided]
+
+**Description:**
+No user input was provided for this feature specification. This document serves as a template structure that should be populated once user requirements are submitted.
+
+**Problem Statement:** [NEEDS CLARIFICATION]
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Value:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 - Critical Priority
+
+**US-001: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-002: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### P2 - High Priority
+
+**US-003: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+**US-004: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+### P3 - Medium Priority
+
+**US-005: [NEEDS CLARIFICATION]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+- **When** [action taken]
+- **Then** [expected outcome]
+
+---
+
+## 3. Functional Requirements
+
+**FR-001:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-002:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-003:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-004:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+**FR-005:** [NEEDS CLARIFICATION] - The system shall [specific functionality]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION] - The system shall respond to user actions within [X seconds/milliseconds] under normal load conditions
+
+**NFR-002:** [NEEDS CLARIFICATION] - The system shall support [X] concurrent users without performance degradation
+
+**NFR-003:** [NEEDS CLARIFICATION] - The system shall process [X] transactions per second
+
+---
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION] - The system shall authenticate all users before granting access
+
+**NFR-005:** [NEEDS CLARIFICATION] - The system shall encrypt sensitive data at rest and in transit
+
+**NFR-006:** [NEEDS CLARIFICATION] - The system shall maintain audit logs of all user actions for [X] duration
+
+---
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION] - The system shall scale to accommodate [X%] growth in user base over [time period]
+
+**NFR-008:** [NEEDS CLARIFICATION] - The system shall handle peak loads of [X times] normal traffic
+
+---
+
+### Availability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION] - The system shall maintain [X%] uptime during business hours
+
+**NFR-010:** [NEEDS CLARIFICATION] - The system shall recover from failures within [X] minutes
+
+---
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION] - The system shall be accessible to users with disabilities per [specific standard]
+
+**NFR-012:** [NEEDS CLARIFICATION] - New users shall be able to complete [core task] within [X] minutes without training
+
+---
+
+### Compatibility Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION] - The system shall function on [specified browsers/devices/platforms]
+
+**NFR-014:** [NEEDS CLARIFICATION] - The system shall integrate with [external systems] via standard protocols
+
+---
+
+## 5. Edge Cases
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION] - System behavior when [minimum input value] is provided
+
+**EC-002:** [NEEDS CLARIFICATION] - System behavior when [maximum input value] is provided
+
+**EC-003:** [NEEDS CLARIFICATION] - System behavior when [empty/null input] is provided
+
+**EC-004:** [NEEDS CLARIFICATION] - System behavior when [special characters/invalid format] is provided
+
+---
+
+### Error Scenarios
+
+**EC-005:** [NEEDS CLARIFICATION] - System behavior when [external service] is unavailable
+
+**EC-006:** [NEEDS CLARIFICATION] - System behavior when user session expires during operation
+
+**EC-007:** [NEEDS CLARIFICATION] - System behavior when [resource limit] is exceeded
+
+**EC-008:** [NEEDS CLARIFICATION] - System behavior during network interruption
+
+**EC-009:** [NEEDS CLARIFICATION] - System behavior with concurrent conflicting operations
+
+**EC-010:** [NEEDS CLARIFICATION] - System behavior when duplicate entries are attempted
+
+---
+
+### Data Integrity Scenarios
+
+**EC-011:** [NEEDS CLARIFICATION] - System behavior when data format changes mid-process
+
+**EC-012:** [NEEDS CLARIFICATION] - System behavior with missing required dependencies
+
+---
+
+## 6. Success Criteria
+
+### User Adoption Metrics
+
+**SC-001:** [NEEDS CLARIFICATION] - Achieve [X%] user adoption rate within [timeframe]
+
+**SC-002:** [NEEDS CLARIFICATION] - Achieve [X%] user satisfaction score (measured via survey/feedback)
+
+**SC-003:** [NEEDS CLARIFICATION] - Reduce user support tickets related to [area] by [X%]
+
+---
+
+### Performance Metrics
+
+**SC-004:** [NEEDS CLARIFICATION] - Reduce time to complete [task] from [current time] to [target time]
+
+**SC-005:** [NEEDS CLARIFICATION] - Process [X] transactions per time period successfully
+
+**SC-006:** [NEEDS CLARIFICATION] - Maintain page load time under [X] seconds for [Y%] of requests
+
+---
+
+### Business Metrics
+
+**SC-007:** [NEEDS CLARIFICATION] - Increase [business metric] by [X%] within [timeframe]
+
+**SC-008:** [NEEDS CLARIFICATION] - Reduce operational costs by [X amount/percentage]
+
+**SC-009:** [NEEDS CLARIFICATION] - Achieve ROI of [X%] within [timeframe]
+
+---
+
+### Quality Metrics
+
+**SC-010:** [NEEDS CLARIFICATION] - Maintain defect rate below [X] per [time period/release]
+
+**SC-011:** [NEEDS CLARIFICATION] - Achieve [X%] test coverage for critical paths
+
+**SC-012:** [NEEDS CLARIFICATION] - Zero critical security vulnerabilities at launch
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION] - Assumption about user environment
+- [NEEDS CLARIFICATION] - Assumption about data availability
+- [NEEDS CLARIFICATION] - Assumption about user capabilities
+
+### Dependencies
+
+- [NEEDS CLARIFICATION] - Dependency on external system/service
+- [NEEDS CLARIFICATION] - Dependency on internal team/resource
+- [NEEDS CLARIFICATION] - Dependency on third-party provider
+
+---
+
+## 8. Out of Scope
+
+- [NEEDS CLARIFICATION] - Feature/capability explicitly not included in this release
+- [NEEDS CLARIFICATION] - Future enhancement consideration
+- [NEEDS CLARIFICATION] - Related but separate initiative
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION] - Question about user requirements
+2. [NEEDS CLARIFICATION] - Question about business rules
+3. [NEEDS CLARIFICATION] - Question about integration points
+4. [NEEDS CLARIFICATION] - Question about data handling
+5. [NEEDS CLARIFICATION] - Question about user workflows
+
+---
+
+## 10. Approval and Sign-off
+
+| Role | Name | Date | Signature |
+|------|------|------|-----------|
+| Product Owner | [NEEDS CLARIFICATION] | | |
+| Business Stakeholder | [NEEDS CLARIFICATION] | | |
+| Technical Lead | [NEEDS CLARIFICATION] | | |
+| QA Lead | [NEEDS CLARIFICATION] | | |
+
+---
+
+**Document Version:** 1.0
+**Last Updated:** [Date]
+**Next Review Date:** [NEEDS CLARIFICATION]
+
+---
+
+## Instructions for Completion
+
+To complete this feature specification, please provide detailed user requirements including:
+
+1. **Feature description and purpose**
+2. **Target user personas and their needs**
+3. **Core functionality and capabilities required**
+4. **Business goals and expected outcomes**
+5. **Known constraints or limitations**
+6. **Integration requirements with existing systems**
+7. **Regulatory or compliance requirements**
+8. **Timeline and priority considerations**
+
+Once requirements are provided, this template will be populated with specific, measurable, and testable specifications.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-016.md b/evals/datasets/real-specs/spec-016.md
new file mode 100644
index 0000000000..38b98cbe66
--- /dev/null
+++ b/evals/datasets/real-specs/spec-016.md
@@ -0,0 +1,325 @@
+# Test Spec 16
+
+## Prompt
+
+Create a financial transaction processing system with audit trails and compliance reporting
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+[NEEDS CLARIFICATION: No user requirements have been provided in the input]
+
+**Feature Name:** [Pending User Input]
+
+**Description:** This specification document is ready to be populated with detailed requirements once the user provides their feature requirements. The template below demonstrates the structure and level of detail that will be included.
+
+**Target Users:** [To be determined based on user input]
+
+**Business Value:** [To be determined based on user input]
+
+---
+
+## 2. User Stories
+
+### Priority 1 (P1) - Must Have
+
+**US-001: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+
+Given [alternative context]
+When [action is performed]
+Then [expected outcome]
+```
+
+**US-002: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+### Priority 2 (P2) - Should Have
+
+**US-003: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+### Priority 3 (P3) - Nice to Have
+
+**US-004: [Story Title]**
+
+- **As a** [user role]
+- **I want** [capability]
+- **So that** [business value]
+
+**Acceptance Criteria:**
+
+```gherkin
+Given [initial context]
+When [action is performed]
+Then [expected outcome]
+```
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+**FR-002:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+### Data Management
+
+**FR-003:** [Requirement description]
+
+- **Priority:** P2
+- **Testability:** [How to verify this requirement]
+
+### User Interface
+
+**FR-004:** [Requirement description]
+
+- **Priority:** P1
+- **Testability:** [How to verify this requirement]
+
+### Integration Points
+
+**FR-005:** [Requirement description]
+
+- **Priority:** P2
+- **Testability:** [How to verify this requirement]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance (NFR-P)
+
+**NFR-P-001:** [Performance requirement with specific metrics]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+**NFR-P-002:** [Performance requirement with specific metrics]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Security (NFR-S)
+
+**NFR-S-001:** [Security requirement]
+
+- **Compliance:** [Relevant standards]
+- **Testing Method:** [How to verify]
+
+**NFR-S-002:** [Security requirement]
+
+- **Compliance:** [Relevant standards]
+- **Testing Method:** [How to verify]
+
+### Scalability (NFR-SC)
+
+**NFR-SC-001:** [Scalability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Usability (NFR-U)
+
+**NFR-U-001:** [Usability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+### Reliability (NFR-R)
+
+**NFR-R-001:** [Reliability requirement]
+
+- **Metric:** [Measurable target (e.g., 99.9% uptime)]
+- **Testing Method:** [How to measure]
+
+### Maintainability (NFR-M)
+
+**NFR-M-001:** [Maintainability requirement]
+
+- **Metric:** [Measurable target]
+- **Testing Method:** [How to measure]
+
+---
+
+## 5. Edge Cases & Error Scenarios
+
+### Boundary Conditions
+
+**EC-001: [Edge case description]**
+
+- **Scenario:** [What happens at the boundary]
+- **Expected Behavior:** [How the system should respond]
+- **Test Case:** [How to reproduce]
+
+**EC-002: [Edge case description]**
+
+- **Scenario:** [What happens at the boundary]
+- **Expected Behavior:** [How the system should respond]
+- **Test Case:** [How to reproduce]
+
+### Error Scenarios
+
+**ES-001: [Error scenario]**
+
+- **Trigger:** [What causes this error]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback the user receives]
+
+**ES-002: [Error scenario]**
+
+- **Trigger:** [What causes this error]
+- **Expected Behavior:** [How the system should handle it]
+- **User Feedback:** [What message/feedback the user receives]
+
+### Data Validation
+
+**DV-001: [Validation scenario]**
+
+- **Invalid Input:** [Description of invalid data]
+- **Expected Behavior:** [How the system validates and responds]
+
+---
+
+## 6. Success Criteria
+
+### Quantitative Metrics
+
+**SC-001:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+**SC-002:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+**SC-003:** [Measurable outcome with specific target]
+
+- **Baseline:** [Current state]
+- **Target:** [Desired state]
+- **Measurement Method:** [How to track]
+
+### Qualitative Metrics
+
+**SC-004:** [User satisfaction or experience metric]
+
+- **Measurement Method:** [Survey, feedback, etc.]
+- **Target:** [Desired outcome]
+
+**SC-005:** [Business value metric]
+
+- **Measurement Method:** [How to evaluate]
+- **Target:** [Desired outcome]
+
+### Acceptance Criteria for Release
+
+**SC-006:** All P1 user stories must be completed and verified
+**SC-007:** All functional requirements marked as P1 must pass testing
+**SC-008:** All non-functional requirements must meet specified thresholds
+**SC-009:** No critical or high-priority defects remain unresolved
+**SC-010:** [Feature-specific acceptance criterion]
+
+---
+
+## 7. Assumptions & Dependencies
+
+### Assumptions
+
+- [Assumption 1]
+- [Assumption 2]
+- [Assumption 3]
+
+### Dependencies
+
+- [Dependency 1]
+- [Dependency 2]
+- [Dependency 3]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly out of scope for this feature:
+
+- [Out of scope item 1]
+- [Out of scope item 2]
+- [Out of scope item 3]
+
+---
+
+## 9. Open Questions
+
+**[NEEDS CLARIFICATION]**
+
+1. [Question requiring stakeholder input]
+2. [Question requiring stakeholder input]
+3. [Question requiring stakeholder input]
+
+---
+
+## Document Information
+
+| Field | Value |
+|-------|-------|
+| **Document Version** | 1.0 (Draft) |
+| **Last Updated** | [Date] |
+| **Author** | [Name] |
+| **Stakeholders** | [List] |
+| **Status** | Awaiting User Requirements |
+
+---
+
+**Note:** This is a template structure. Please provide your specific user requirements to generate a complete, detailed feature specification document tailored to your needs.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/datasets/real-specs/spec-017.md b/evals/datasets/real-specs/spec-017.md
new file mode 100644
index 0000000000..41aa5a3a07
--- /dev/null
+++ b/evals/datasets/real-specs/spec-017.md
@@ -0,0 +1,370 @@
+# Test Spec 17
+
+## Prompt
+
+Develop a legacy system integration middleware using REST and SOAP protocols
+
+## Generated Spec
+
+## Feature Specification Document
+
+### 1. Overview Section
+
+**Feature Name:** [NEEDS CLARIFICATION - Feature name not provided]
+
+**Description:**
+[NEEDS CLARIFICATION - No user requirements were provided in the input. This specification document template is ready to be populated once specific feature requirements are shared.]
+
+**Target Users:** [NEEDS CLARIFICATION]
+
+**Business Value:** [NEEDS CLARIFICATION]
+
+**Scope:** [NEEDS CLARIFICATION]
+
+---
+
+## 2. User Stories
+
+### P1 (Must Have) Stories
+
+#### US-001: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+#### US-002: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P1
+**Story Points:** [TBD]
+
+---
+
+### P2 (Should Have) Stories
+
+#### US-003: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+#### US-004: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P2
+**Story Points:** [TBD]
+
+---
+
+### P3 (Nice to Have) Stories
+
+#### US-005: [NEEDS CLARIFICATION]
+
+**As a** [user role]
+**I want** [feature capability]
+**So that** [business benefit]
+
+**Acceptance Criteria:**
+
+- **Given** [initial context]
+ **When** [action taken]
+ **Then** [expected outcome]
+
+**Priority:** P3
+**Story Points:** [TBD]
+
+---
+
+## 3. Functional Requirements
+
+### Core Functionality
+
+**FR-001:** [NEEDS CLARIFICATION - Core feature requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-002:** [NEEDS CLARIFICATION - Core feature requirement]
+**Testability:** [Define how this will be verified]
+
+### User Interface Requirements
+
+**FR-003:** [NEEDS CLARIFICATION - UI requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-004:** [NEEDS CLARIFICATION - UI requirement]
+**Testability:** [Define how this will be verified]
+
+### Data Requirements
+
+**FR-005:** [NEEDS CLARIFICATION - Data requirement]
+**Testability:** [Define how this will be verified]
+
+**FR-006:** [NEEDS CLARIFICATION - Data requirement]
+**Testability:** [Define how this will be verified]
+
+### Integration Requirements
+
+**FR-007:** [NEEDS CLARIFICATION - Integration requirement]
+**Testability:** [Define how this will be verified]
+
+---
+
+## 4. Non-Functional Requirements
+
+### Performance Requirements
+
+**NFR-001:** [NEEDS CLARIFICATION - Response time requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-002:** [NEEDS CLARIFICATION - Throughput requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-003:** [NEEDS CLARIFICATION - Load capacity requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Security Requirements
+
+**NFR-004:** [NEEDS CLARIFICATION - Authentication requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-005:** [NEEDS CLARIFICATION - Authorization requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-006:** [NEEDS CLARIFICATION - Data protection requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Scalability Requirements
+
+**NFR-007:** [NEEDS CLARIFICATION - Concurrent users requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-008:** [NEEDS CLARIFICATION - Data volume requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Reliability Requirements
+
+**NFR-009:** [NEEDS CLARIFICATION - Uptime requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-010:** [NEEDS CLARIFICATION - Error rate requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Usability Requirements
+
+**NFR-011:** [NEEDS CLARIFICATION - Accessibility requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+**NFR-012:** [NEEDS CLARIFICATION - User learning curve requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+### Maintainability Requirements
+
+**NFR-013:** [NEEDS CLARIFICATION - Support requirement]
+**Measurement:** [Define specific metrics and thresholds]
+
+---
+
+## 5. Edge Cases and Error Scenarios
+
+### Boundary Conditions
+
+**EC-001:** [NEEDS CLARIFICATION - Minimum input boundary]
+**Expected Behavior:** [Define system response]
+
+**EC-002:** [NEEDS CLARIFICATION - Maximum input boundary]
+**Expected Behavior:** [Define system response]
+
+**EC-003:** [NEEDS CLARIFICATION - Empty/null state]
+**Expected Behavior:** [Define system response]
+
+### Error Scenarios
+
+**ES-001:** [NEEDS CLARIFICATION - Invalid input error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-002:** [NEEDS CLARIFICATION - System unavailability error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-003:** [NEEDS CLARIFICATION - Timeout error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+**ES-004:** [NEEDS CLARIFICATION - Permission denied error]
+**Expected Behavior:** [Define error handling and user feedback]
+
+### Concurrent Operations
+
+**CO-001:** [NEEDS CLARIFICATION - Simultaneous access scenario]
+**Expected Behavior:** [Define system response]
+
+**CO-002:** [NEEDS CLARIFICATION - Race condition scenario]
+**Expected Behavior:** [Define system response]
+
+### Data Integrity
+
+**DI-001:** [NEEDS CLARIFICATION - Data validation failure]
+**Expected Behavior:** [Define system response]
+
+**DI-002:** [NEEDS CLARIFICATION - Data corruption scenario]
+**Expected Behavior:** [Define system response]
+
+---
+
+## 6. Success Criteria
+
+### Functional Success Metrics
+
+**SC-001:** [NEEDS CLARIFICATION - Feature adoption metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-002:** [NEEDS CLARIFICATION - Feature completion metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+### User Experience Metrics
+
+**SC-003:** [NEEDS CLARIFICATION - User satisfaction metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-004:** [NEEDS CLARIFICATION - Task completion rate metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+**SC-005:** [NEEDS CLARIFICATION - Time to complete task metric]
+**Target:** [Define specific, measurable target]
+**Measurement Method:** [Define how this will be measured]
+
+### Performance Metrics
+
+**SC-006:** [NEEDS CLARIFICATION - Response time metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+**SC-007:** [NEEDS CLARIFICATION - Error rate metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+### Business Metrics
+
+**SC-008:** [NEEDS CLARIFICATION - ROI or business value metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+**SC-009:** [NEEDS CLARIFICATION - Cost efficiency metric]
+
+- **Target:** [Define specific, measurable target]
+- **Measurement Method:** [Define how this will be measured]
+
+---
+
+## 7. Assumptions and Dependencies
+
+### Assumptions
+
+- [NEEDS CLARIFICATION - List assumptions about user behavior, environment, etc.]
+
+### Dependencies
+
+- [NEEDS CLARIFICATION - List dependencies on other systems, teams, or features]
+
+### Constraints
+
+- [NEEDS CLARIFICATION - List known constraints (budget, time, resources, etc.)]
+
+---
+
+## 8. Out of Scope
+
+The following items are explicitly **NOT** included in this feature:
+
+- [NEEDS CLARIFICATION - List items that are out of scope]
+
+---
+
+## 9. Open Questions
+
+1. [NEEDS CLARIFICATION - What are the specific user requirements for this feature?]
+2. [NEEDS CLARIFICATION - Who are the target users and their roles?]
+3. [NEEDS CLARIFICATION - What problem is this feature solving?]
+4. [NEEDS CLARIFICATION - What are the business goals and KPIs?]
+5. [NEEDS CLARIFICATION - Are there any existing systems this needs to integrate with?]
+6. [NEEDS CLARIFICATION - What are the compliance or regulatory requirements?]
+7. [NEEDS CLARIFICATION - What is the expected launch timeline?]
+
+---
+
+## Document Control
+
+**Version:** 0.1 (Draft - Awaiting Requirements)
+**Last Updated:** [Current Date]
+**Author:** [TBD]
+**Stakeholders:** [TBD]
+**Review Status:** Awaiting User Requirements
+
+---
+
+## Next Steps
+
+1. **Gather Requirements:** Conduct stakeholder interviews to define feature requirements
+2. **Populate Specification:** Fill in all [NEEDS CLARIFICATION] sections with specific details
+3. **Review & Validate:** Share with stakeholders for feedback and approval
+4. **Refine & Finalize:** Incorporate feedback and create final specification
+5. **Handoff:** Share approved specification with development team
+
+---
+
+**Note:** This is a template specification document. Please provide specific user requirements to generate a complete, actionable feature specification.
+
+## Review Notes
+
+- [ ] Pass/Fail:
+- [ ] Issues found:
+- [ ] Failure category:
diff --git a/evals/docs/AI-EVALS-WORKPLAN.md b/evals/docs/AI-EVALS-WORKPLAN.md
new file mode 100644
index 0000000000..a2593a0b35
--- /dev/null
+++ b/evals/docs/AI-EVALS-WORKPLAN.md
@@ -0,0 +1,147 @@
+# AI Evals Work Plan for Agentic SDLC Spec Kit
+
+**Purpose:** Apply proven AI evaluation practices to our Agentic SDLC Spec Kit project.
+
+---
+
+## Implementation Progress (MVP Approach)
+
+**Strategy:** Implement minimum viable features for each step, expand later as time permits.
+
+### Current Status Summary
+
+**Overall Progress:** 5/5 core weeks completed (100%) ✅
+
+| Phase | Status | Pass Rate |
+|-------|--------|-----------|
+| Week 1: Error Analysis Foundation | ✅ Complete | **Plan Analysis: 100% (2/2)** |
+| Week 2-3: Custom Annotation Tool | ✅ Complete | - |
+| Week 4: Extend PromptFoo | ✅ Complete | **100% (10/10 tests)** |
+| Week 5: GitHub Actions CI/CD | ✅ Complete | - |
+| Week 5-6: Production Monitoring | Optional | - |
+
+**Latest Evaluation Results:**
+- **Spec Template Tests:** 8/8 passed (100%)
+- **Plan Template Tests:** 2/2 passed (100%)
+- **Overall:** 10/10 tests passing (100%)
+
+### Week 1: Error Analysis Foundation ✅ **COMPLETED**
+- **Directory structure** created: `evals/notebooks/`, `evals/datasets/`
+- **Test data generation script** created: `evals/scripts/generate-test-data.sh`
+- **Error analysis notebook** created: `evals/notebooks/error-analysis.ipynb`
+- **Environment setup** using uv: `evals/scripts/run-error-analysis.sh`
+- **Automated error analysis for specs** created: `evals/scripts/run-auto-error-analysis.sh`
+- **Automated error analysis for plans** created: `evals/scripts/run-auto-plan-analysis.sh`
+- **Real test specs generated** - 17 specs in `evals/datasets/real-specs/`
+- **Real test plans generated** - 2 plans in `evals/datasets/real-plans/` (expandable)
+- **First error analysis session** - Completed through PromptFoo evaluation runs
+- **Plan error analysis results** - 100% pass rate on initial test cases
+- **Document findings** - Results documented in this file and README
+
+### Week 2-3: Custom Annotation Tool ✅ **COMPLETED**
+- Basic FastHTML annotation app (MVP)
+- Keyboard shortcuts (N, P, 1, 2)
+- Export to JSON
+- Progress tracking with statistics
+- Auto-save functionality
+- Beautiful markdown rendering
+- Notes support for each spec
+- Launch script in evals/scripts/
+
+### Week 4: Extend PromptFoo ✅ **COMPLETED**
+- **Ran comprehensive evaluation** - All 10 tests executed
+- **Identified failures** - 3 initial failures documented
+- **Fixed prompts iteratively** - 2 rounds of refinements
+- **Achieved 100% pass rate** - 10/10 tests passing
+
+### Week 5-6: Production Monitoring **TODO (NOT MANDATORY)**
+- [ ] Async evaluation script (vanilla Python)
+- [ ] Simple alerting mechanism
+
+### Week 5: GitHub Actions Integration ✅ **COMPLETED**
+- [x] Add GitHub Action for running evals on PR
+- [x] Add threshold checks to fail PR if quality drops
+- [x] Add automated reporting
+- [x] Create comprehensive setup documentation
+- [x] Add status badge to README
+- [x] Configure PR commenting for results
+
+---
+
+## Table of Contents
+
+1. [Tool Strategy & Current State](#1-tool-strategy--current-state) ← **START HERE**
+2. [Foundation & Strategy](#2-foundation--strategy)
+3. [Evaluation Infrastructure](#3-evaluation-infrastructure)
+4. [Data & Datasets](#4-data--datasets)
+5. [Error Analysis & Debugging](#5-error-analysis--debugging)
+6. [Evaluators & Metrics](#6-evaluators--metrics)
+7. [RAG & Retrieval Evaluation](#7-rag--retrieval-evaluation)
+8. [Agentic Workflows](#8-agentic-workflows)
+9. [CI/CD & Production Monitoring](#9-cicd--production-monitoring) ← **GitHub Actions + Optional Production**
+10. [Team & Process](#10-team--process)
+11. [Tools & Vendors](#11-tools--vendors)
+
+---
+
+## 1. Tool Strategy & Current State
+
+### Hybrid Tool Strategy
+
+Use **different tools for different purposes**:
+
+| Purpose | Tool | Why | Priority |
+|---------|------|-----|----------|
+| **Error Analysis** | Jupyter Notebooks | Flexible, exploratory, visualization | **CRITICAL** |
+| **Fast Annotation** | Custom FastHTML App | 10x faster than unassisted review | **HIGH** |
+| **CI/CD Template Testing** | PromptFoo (keep existing) | Fast, deterministic regression tests | ✅ **Keep** |
+| **Production Monitoring** | Vanilla Python + Vendor | Async, streaming, alerting | **HIGH** |
+| **LLM-as-Judge Building** | Notebooks → PromptFoo | Prototype → Productionize | **MEDIUM** |
+
+### What NOT to Do ❌
+
+Per the FAQ, **avoid these common mistakes**:
+
+1. ❌ **Don't rely only on PromptFoo**
+2. ❌ **Don't use generic eval metrics from PromptFoo**
+3. ❌ **Don't skip error analysis**
+4. ❌ **Don't build annotation UI in PromptFoo**
+
+### Quick Start: What to Do Now
+
+**Week 1 (START HERE):**
+```bash
+# 1. Generate REAL test data (not synthetic prompts)
+mkdir -p evals/datasets/real-specs
+for i in {1..100}; do
+ # Use diverse prompts
+ specify init "test-$i" --ai claude
+ cp test-$i/.specify/specs/*/spec.md evals/datasets/real-specs/spec-$i.md
+done
+
+# 2. Create error analysis notebook
+jupyter lab evals/notebooks/error-analysis.ipynb
+# - Load 100 real specs
+# - Domain expert notes failures (open coding)
+# - Categorize into taxonomy (axial coding)
+# - Count failure modes
+
+# 3. Keep running PromptFoo for CI
+npx promptfoo eval # This tests TEMPLATES, not real outputs
+```
+
+**Week 2-3:**
+```bash
+# Build custom annotation tool
+cd evals/annotation-tool
+uv venv
+uv pip install python-fasthtml  # PyPI package name for FastHTML
+python app.py # Start annotation server
+```
+
+**Week 4:**
+```bash
+# Extend PromptFoo based on discovered failures
+# Add new tests to evals/configs/promptfooconfig-spec.js or promptfooconfig-plan.js
+# Add new graders to evals/graders/custom_graders.py
+```
diff --git a/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf b/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf
new file mode 100644
index 0000000000..18a00b44fa
Binary files /dev/null and b/evals/docs/AI-Evals-Frequently-Asked-Questions.pdf differ
diff --git a/evals/docs/EVAL.md b/evals/docs/EVAL.md
new file mode 100644
index 0000000000..39a4054f88
--- /dev/null
+++ b/evals/docs/EVAL.md
@@ -0,0 +1,265 @@
+# Annotation Evaluation System
+
+This document explains the annotation evaluation framework for testing spec-kit template quality.
+
+## What are the Annotation Evals?
+
+The annotation evals are a **multi-layered evaluation framework** for testing the quality of spec-kit templates (specifications and implementation plans) that are generated by AI prompts. The system has three main components:
+
+## 1. Automated Testing (PromptFoo)
+
+**10 automated regression tests** that run against the spec and plan generation templates.
+
+**Current Status:** 100% pass rate (10/10 tests passing)
+
+### Test Categories
+
+#### Spec Template Tests (8 tests)
+- **Basic Structure**: Validates required sections (Overview, Requirements, User Stories, etc.)
+- **No Premature Tech Stack**: Ensures spec focuses on WHAT, not HOW
+- **Quality User Stories**: Checks for proper format and acceptance criteria
+- **Clarity & Vague Terms**: Flags unmeasurable requirements needing quantification
+- **Security Requirements**: Security-critical features include security considerations
+- **Edge Cases Coverage**: Validates error scenarios and boundary conditions
+- **Completeness**: Comprehensive requirements for complex features
+- **Regression**: Even simple features maintain proper structure
+
+#### Plan Template Tests (2 tests)
+- **Simplicity Gate**: Simple apps should have ≤3 projects (Constitution Article VII)
+- **Constitution Compliance**: No over-engineering or unnecessary abstractions
+
+### Running Automated Tests
+
+```bash
+# Run all tests
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run and open web UI
+./evals/scripts/run-promptfoo-eval.sh --view
+
+# Filter specific tests
+./evals/scripts/run-promptfoo-eval.sh --filter "Spec Template"
+```
+
+## 2. Custom Annotation Tool
+
+Located in `evals/annotation-tool/`, this is a **FastHTML-based web interface** for manual spec review, designed to be roughly 10x faster than reviewing raw files by hand.
+
+### Features
+- **Keyboard-driven interface**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: Visual statistics on review progress
+- **Note-taking**: Add observations for each spec
+- **Auto-save**: Annotations automatically saved to JSON
+- **Export**: Timestamped exports for analysis
+
+### Purpose
+- Human reviewers manually evaluate generated specs for quality
+- Binary pass/fail assessment (no Likert scales)
+- Identify failure patterns that automated tests miss
+- Build understanding of quality dimensions
+
+### Running the Annotation Tool
+
+```bash
+# Launch the web interface
+./evals/scripts/run-annotation-tool.sh
+
+# Open browser to http://localhost:5001
+```
+
+### Output
+Annotations are saved to:
+- `evals/annotation-tool/annotations.json` - Current state (auto-saved)
+- `evals/annotation-tool/annotations_export_YYYYMMDD_HHMMSS.json` - Timestamped exports
+
+## 3. Error Analysis Workflows
+
+Two approaches for deeper investigation of failures:
+
+### a) Automated Analysis (using Claude API)
+
+Uses Claude API to automatically evaluate specs/plans and categorize failures.
+
+```bash
+# For specs
+./evals/scripts/run-auto-error-analysis.sh
+
+# For plans
+./evals/scripts/run-auto-plan-analysis.sh
+```
+
+**Features:**
+- Evaluates all specs automatically using Claude
+- Binary pass/fail with reasoning
+- Categorizes failures automatically
+- Generates comprehensive CSV reports and summaries
+- Saves time on initial review
+
+**Output:**
+- `evals/datasets/analysis-results/automated-analysis-<timestamp>.csv`
+- `evals/datasets/analysis-results/summary-<timestamp>.txt`
+- `evals/datasets/analysis-results/plan-analysis-<timestamp>.csv`
+- `evals/datasets/analysis-results/plan-summary-<timestamp>.txt`
+
+### b) Manual Analysis (using Jupyter)
+
+Deep investigation following qualitative coding methodology.
+
+```bash
+# Launch Jupyter Lab
+./evals/scripts/run-error-analysis.sh
+```
+
+**Process:**
+1. **Open Coding** - Domain expert reviews 10-20 real specs/plans, notes issues without categorization
+2. **Axial Coding** - Group similar failures into categories, count frequency
+3. **Fix & Iterate** - Address high-frequency failures, add automated checks
+
+**Location:** `evals/notebooks/error-analysis.ipynb`
+
+## What Do They Test?
+
+The evaluations test the quality of **AI-generated specifications and implementation plans**.
+
+### Test Data
+
+- **17 diverse spec templates**: `evals/datasets/real-specs/spec-001.md` through `spec-017.md`
+- **2 plan templates**: `evals/datasets/real-plans/plan-001.md` and `plan-002.md` (expandable)
+
+### Quality Criteria
+
+#### For Specifications
+- Required sections present (Overview, Requirements, User Stories, etc.)
+- No premature technical decisions (focuses on WHAT, not HOW)
+- Proper user story format with acceptance criteria
+- Measurable requirements (flags vague terms)
+- Security considerations for security-critical features
+- Edge cases and error scenarios
+- Completeness for complex features
+- Proper structure even for simple features
+
+#### For Implementation Plans
+- **Simplicity gate**: ≤3 projects for simple apps (CRITICAL - Constitution Article VII)
+- No over-engineering or microservices for simple apps
+- Clear project boundaries and tasks
+- Testing strategy included
+- Appropriate architecture
+- No premature optimization
+- Testability and verification steps
+
+### Common Failure Categories
+
+**For Specs:**
+- Incomplete requirements
+- Ambiguous specifications
+- Missing acceptance criteria
+- Security considerations missing
+- Over-engineering indicators
+- Vague or unmeasurable terms
+- Missing edge cases
+
+**For Plans:**
+- Too many projects (>3)
+- Over-engineering
+- Unclear project boundaries
+- Missing verification steps
+- Microservices for simple app
+- Premature optimization
+- Missing testing strategy
+- Tech stack mismatch
+
+## The Complete Workflow
+
+```
+1. Generate Specs/Plans (using prompt templates)
+        ↓
+2. PromptFoo Tests (automated regression checks)
+        ↓
+3. Manual Annotation Tool (human review, pass/fail)
+        ↓
+4. Error Analysis (categorize failures, find patterns)
+        ↓
+5. Fix Templates (address high-frequency failures)
+        ↓
+6. Add Automated Checks (extend PromptFoo)
+        ↓
+7. Repeat (continuous improvement)
+```
+
+## The 80/20 Rule
+
+According to AI evaluation best practices:
+
+> "We spent 60-80% of our development time on error analysis and evaluation. Expect most of your effort to go toward understanding failures (i.e. looking at data) rather than building automated checks."
+
+**80% of value comes from:**
+1. Manual annotation tool for human review
+2. Error analysis (Jupyter notebooks + automated analysis)
+3. PromptFoo for CI/CD regression testing
+
+**20% of value comes from:**
+- Production monitoring (planned)
+- Advanced features (clustering, AI assistance)
+
+## Key Insight
+
+**Manual review via the annotation tool is the most valuable activity**, with automated tests serving as regression checks to prevent known failures from recurring. The workflow emphasizes:
+
+1. Human judgment first (annotation tool)
+2. Pattern discovery (error analysis)
+3. Automation second (PromptFoo tests)
+
+This mirrors industry best practices where teams spend 60-80% of evaluation time on manual review and understanding failures rather than building automated checks.
+
+## Directory Structure
+
+```
+evals/
+├── EVAL.md                          # This file - evaluation system overview
+├── README.md                        # Setup and usage instructions
+├── configs/                         # PromptFoo configuration files
+│   ├── promptfooconfig.js           # Main config (all 10 tests)
+│   ├── promptfooconfig-spec.js      # Spec template tests only
+│   └── promptfooconfig-plan.js      # Plan template tests only
+├── prompts/                         # Templates under test
+│   ├── spec-prompt.txt              # Specification generation template
+│   └── plan-prompt.txt              # Implementation plan template
+├── graders/                         # Custom evaluation logic
+│   └── custom_graders.py            # Python-based quality checks
+├── scripts/                         # Test execution utilities
+│   ├── run-promptfoo-eval.sh        # PromptFoo test runner
+│   ├── run-error-analysis.sh        # Jupyter error analysis launcher
+│   ├── run-auto-error-analysis.sh   # Automated spec analysis
+│   ├── run-auto-plan-analysis.sh    # Automated plan analysis
+│   └── run-annotation-tool.sh       # Annotation tool launcher
+├── annotation-tool/                 # FastHTML annotation interface
+│   ├── app.py                       # Web application
+│   ├── README.md                    # Tool documentation
+│   └── annotations.json             # Saved annotations
+├── notebooks/                       # Jupyter notebooks
+│   └── error-analysis.ipynb         # Manual review workflow
+└── datasets/                        # Test data and results
+    ├── real-specs/                  # Generated specs (17 templates)
+    ├── real-plans/                  # Generated plans (2 templates)
+    └── analysis-results/            # Analysis outputs (CSV, summaries)
+```
+
+## Quick Reference
+
+| Task | Command |
+|------|---------|
+| Run all automated tests | `./evals/scripts/run-promptfoo-eval.sh` |
+| Launch annotation tool | `./evals/scripts/run-annotation-tool.sh` |
+| Run automated spec analysis | `./evals/scripts/run-auto-error-analysis.sh` |
+| Run automated plan analysis | `./evals/scripts/run-auto-plan-analysis.sh` |
+| Manual error analysis | `./evals/scripts/run-error-analysis.sh` |
+| View test results in browser | `./evals/scripts/run-promptfoo-eval.sh --view` |
+
+## See Also
+
+- [README.md](../README.md) - Complete setup and configuration guide
+- [annotation-tool/README.md](../annotation-tool/README.md) - Annotation tool detailed documentation
+- [AI-EVALS-WORKPLAN.md](AI-EVALS-WORKPLAN.md) - Implementation roadmap
\ No newline at end of file
diff --git a/evals/docs/GITHUB_ACTIONS_SETUP.md b/evals/docs/GITHUB_ACTIONS_SETUP.md
new file mode 100644
index 0000000000..c644c1e8c6
--- /dev/null
+++ b/evals/docs/GITHUB_ACTIONS_SETUP.md
@@ -0,0 +1,57 @@
+# GitHub Actions Setup for AI Evaluations
+
+This guide explains how to set up GitHub Actions for manual evaluation runs.
+
+## Overview
+
+The GitHub Actions workflow (`.github/workflows/eval.yml`) provides:
+- **Manual execution** via GitHub Actions interface
+- Quality threshold checks (minimum 70% pass rate)
+- Detailed evaluation reports with pass/fail status
+- Result artifacts stored for 30 days
+- On-demand quality validation
+
+## Required Secrets
+
+The workflow requires two secrets to be configured in your GitHub repository:
+
+### 1. LLM_BASE_URL
+
+Your LiteLLM proxy URL or other LLM provider API base URL.
+
+### 2. LLM_AUTH_TOKEN
+
+Your authentication token for the API.
+
+## Setting Up Secrets
+
+1. Go to your repository on GitHub
+2. Click on **Settings** (top menu)
+3. In the left sidebar, click **Secrets and variables** ā **Actions**
+4. Click **New repository secret**
+5. Enter the secret name (e.g., `LLM_BASE_URL`)
+6. Paste the secret value
+7. Click **Add secret**
+
+Repeat for both `LLM_BASE_URL` and `LLM_AUTH_TOKEN`.
+
+## Running the Workflow
+
+The workflow is configured for **manual execution only**.
+
+1. Go to **Actions** tab in your repository
+2. Select **AI Evals** workflow from the left sidebar
+3. Click **Run workflow** button (top right)
+4. Select the branch to run against (usually `main`)
+5. (Optional) Enter the model name in the **Model** input field.
+6. Click the green **Run workflow** button
+
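+If you prefer to trigger runs from a script, the same dispatch can be sent through the GitHub REST API. A minimal sketch in Python; it assumes `requests` is installed and a `GITHUB_TOKEN` environment variable holding a token with the `workflow` scope, and the `model` input mirrors the workflow's default:
+
+```python
+# dispatch_eval.py -- trigger the AI Evals workflow without the UI.
+import os
+
+import requests
+
+REPO = "tikalk/agentic-sdlc-spec-kit"  # owner/repo
+WORKFLOW = "eval.yml"                  # workflow file name
+
+resp = requests.post(
+    f"https://api.github.com/repos/{REPO}/actions/workflows/{WORKFLOW}/dispatches",
+    headers={
+        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
+        "Accept": "application/vnd.github+json",
+    },
+    # "ref" is the branch to run against; "inputs" maps to workflow_dispatch inputs
+    json={"ref": "main", "inputs": {"model": "claude-sonnet-4-5-20250929"}},
+)
+resp.raise_for_status()  # GitHub returns 204 No Content on success
+print("Workflow dispatched")
+```
+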
+## Viewing Results
+
+After the workflow completes:
+
+1. Click on the completed workflow run
+2. Click on the **eval** job to see detailed logs
+3. Scroll to **Artifacts** section at the bottom
+4. Download `eval-results` to get detailed JSON reports
+5. View the summary in the workflow logs
diff --git a/evals/docs/LOCAL_TESTING.md b/evals/docs/LOCAL_TESTING.md
new file mode 100644
index 0000000000..94325b4bfa
--- /dev/null
+++ b/evals/docs/LOCAL_TESTING.md
@@ -0,0 +1,463 @@
+# Testing GitHub Actions Locally
+
+Guide for testing the AI Evals workflow locally before pushing to GitHub.
+
+## Quick Start (Easiest Method)
+
+We provide a helper script that handles everything for you:
+
+```bash
+# 1. Install act (if not already installed)
+brew install act # macOS
+# or: curl https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash
+
+# 2. Run the helper script
+./evals/scripts/test-workflow-locally.sh
+
+# The script will:
+# - Check prerequisites (act, Docker)
+# - Create secrets template if needed
+# - Run the workflow locally
+# - Display results
+
+# Options:
+./evals/scripts/test-workflow-locally.sh --list # Dry run (list steps)
+./evals/scripts/test-workflow-locally.sh --verbose # Show details
+./evals/scripts/test-workflow-locally.sh --reuse # Faster iterations
+./evals/scripts/test-workflow-locally.sh --skip-pr-comment # Skip PR comment step
+./evals/scripts/test-workflow-locally.sh --help # Show all options
+```
+
+That's it! The script handles secrets setup, Docker checks, and runs the workflow.
+
+---
+
+## Manual Setup (Advanced)
+
+If you prefer manual control or need to customize the setup:
+
+## Prerequisites
+
+1. **Docker** - Required for `act` to run workflows in containers
+ ```bash
+ # Verify Docker is installed and running
+ docker --version
+ docker ps
+ ```
+
+2. **act** - Tool for running GitHub Actions locally
+ ```bash
+ # macOS
+ brew install act
+
+ # Linux
+ curl https://raw.githubusercontent.com/nektos/act/master/install.sh | sudo bash
+
+ # Verify installation
+ act --version
+ ```
+
+## Setup
+
+### 1. Create Secrets File
+
+Create `.github/workflows/.secrets` (gitignored):
+
+```bash
+# Create the secrets file
+cat > .github/workflows/.secrets << 'EOF'
+LLM_BASE_URL=your-llm-base-url
+LLM_AUTH_TOKEN=your-api-key
+EOF
+
+# Secure the file
+chmod 600 .github/workflows/.secrets
+```
+
+**Important:** Never commit this file! It's already in `.gitignore`.
+
+### 2. Add to .gitignore
+
+Ensure `.github/workflows/.secrets` is in your `.gitignore`:
+
+```bash
+# Add to .gitignore if not already there
+echo ".github/workflows/.secrets" >> .gitignore
+```
+
+## Running Tests
+
+### Basic Commands
+
+```bash
+# List all jobs and steps (dry run)
+act pull_request --list
+
+# Run the full workflow
+act pull_request --secret-file .github/workflows/.secrets
+
+# Run with verbose output
+act pull_request --secret-file .github/workflows/.secrets -v
+
+# Run specific job
+act pull_request -j eval --secret-file .github/workflows/.secrets
+```
+
+### Simulating Different Events
+
+```bash
+# Simulate pull_request event (default)
+act pull_request --secret-file .github/workflows/.secrets
+
+# Simulate push to main
+act push --secret-file .github/workflows/.secrets
+
+# Simulate schedule (cron)
+act schedule --secret-file .github/workflows/.secrets
+
+# Simulate manual workflow_dispatch
+act workflow_dispatch --secret-file .github/workflows/.secrets
+```
+
+### Advanced Options
+
+```bash
+# Use smaller runner image (faster)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest
+
+# Run with specific event payload
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --eventpath .github/workflows/test-event.json
+
+# Skip steps that can't run locally (e.g., PR comments)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --job eval \
+ --skip-steps "Comment PR with Results"
+
+# Reuse containers (faster for repeated runs)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --reuse
+```
+
+### Using Environment Variables Instead
+
+```bash
+# Export secrets as environment variables
+export LLM_BASE_URL="your-url"
+export LLM_AUTH_TOKEN="your-token"
+
+# Run with -s flag for each secret
+act pull_request \
+ -s LLM_BASE_URL \
+ -s LLM_AUTH_TOKEN
+```
+
+## Testing Specific Scenarios
+
+### Test Only Evaluation Steps
+
+```bash
+# Skip setup, just run evals
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --matrix ubuntu-latest:catthehacker/ubuntu:act-latest \
+ --job eval
+```
+
+### Test Threshold Failures
+
+Temporarily modify `check_eval_scores.py` to fail:
+
+```bash
+# Edit the threshold to impossible value
+# In .github/workflows/eval.yml, change:
+# --min-pass-rate 0.70
+# To:
+# --min-pass-rate 0.99
+
+# Then test
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+### Test PR Comment Logic
+
+The PR comment step won't work locally (requires GitHub API), but you can verify the summary generation:
+
+```bash
+# Run up to summary generation
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ --skip-steps "Comment PR with Results"
+```
+
+## Limitations of Local Testing
+
+### What Works ✅
+
+- Job execution
+- Step execution
+- Environment variables and secrets
+- Docker container actions
+- Artifacts (stored in local directory)
+- Most shell commands
+- Python/Node.js setup
+
+### What Doesn't Work ❌
+
+- **GitHub API interactions**: PR comments, issue updates
+- **GitHub context**: Some `github.*` variables may be missing
+- **Exact GitHub runner environment**: Uses Docker images that approximate GitHub runners
+- **Caching between runs**: Less effective than GitHub's cache
+- **Concurrent job execution**: Runs sequentially locally
+
+### Workarounds
+
+1. **PR Comments**: Test summary generation, skip actual commenting
+ ```bash
+ act pull_request --secret-file .github/workflows/.secrets --skip-steps "Comment PR"
+ ```
+
+2. **GitHub Context**: Create mock event file
+ ```json
+ {
+ "pull_request": {
+ "number": 123,
+ "head": {
+ "ref": "test-branch"
+ }
+ }
+ }
+ ```
+
+3. **Artifacts**: Check local `.artifacts/` directory
+
+## Debugging
+
+### View Detailed Logs
+
+```bash
+# Maximum verbosity
+act pull_request --secret-file .github/workflows/.secrets -v -v
+
+# Show Docker commands
+act pull_request --secret-file .github/workflows/.secrets --verbose
+```
+
+### Interactive Debugging
+
+```bash
+# Start interactive shell in workflow container
+act pull_request --secret-file .github/workflows/.secrets --shell
+
+# Then manually run commands to debug
+cd /github/workspace
+./evals/scripts/run-promptfoo-eval.sh --json
+```
+
+### Check Container Logs
+
+```bash
+# Keep containers after run
+act pull_request --secret-file .github/workflows/.secrets --reuse
+
+# List containers
+docker ps -a | grep act-
+
+# View container logs
+docker logs <container-id>
+
+# Exec into container
+docker exec -it <container-id> /bin/bash
+```
+
+## Best Practices
+
+### 1. Use Smaller Test Suite Locally
+
+Create a minimal config for local testing:
+
+```bash
+# Create evals/configs/promptfooconfig-test.js
+# With just 1-2 tests for quick iteration
+
+# Modify workflow to use test config locally
+# Or pass --filter flag to run-promptfoo-eval.sh
+```
+
+### 2. Cache Dependencies
+
+```bash
+# Use --reuse flag for faster iterations
+act pull_request --secret-file .github/workflows/.secrets --reuse
+
+# This keeps containers running between tests
+```
+
+### 3. Test Incrementally
+
+```bash
+# Test just the setup steps first
+act pull_request --secret-file .github/workflows/.secrets --list
+
+# Then test evaluation steps
+act pull_request --secret-file .github/workflows/.secrets -j eval
+
+# Finally test full workflow
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+### 4. Use Smaller Runner Image
+
+The default `ubuntu-latest` image is large (~18GB). Use a smaller one:
+
+```bash
+# Create .actrc in repo root
+echo "-P ubuntu-latest=catthehacker/ubuntu:act-latest" > .actrc
+
+# Now act will use smaller image by default
+act pull_request --secret-file .github/workflows/.secrets
+```
+
+## Common Issues
+
+### Issue: Docker Not Running
+
+```
+Error: Cannot connect to Docker daemon
+```
+
+**Solution:**
+```bash
+# Start Docker Desktop (macOS)
+open -a Docker
+
+# Or start Docker service (Linux)
+sudo systemctl start docker
+
+# Verify
+docker ps
+```
+
+### Issue: Secrets Not Found
+
+```
+Error: LLM_BASE_URL not set
+```
+
+**Solution:**
+```bash
+# Verify secrets file exists and has correct format
+cat .github/workflows/.secrets
+
+# Use absolute path
+act pull_request --secret-file $(pwd)/.github/workflows/.secrets
+```
+
+### Issue: Node.js/Python Not Found
+
+```
+Error: node: command not found
+```
+
+**Solution:**
+```bash
+# Use official runner images (larger but more compatible)
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=ghcr.io/catthehacker/ubuntu:runner-latest
+```
+
+### Issue: Workflow Takes Too Long
+
+```
+# Runs forever or very slow
+```
+
+**Solution:**
+```bash
+# Use smaller image
+act pull_request --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest
+
+# Skip non-essential steps
+act pull_request --secret-file .github/workflows/.secrets \
+ --skip-steps "Upload Results Artifact"
+
+# Run specific job only
+act pull_request -j eval --secret-file .github/workflows/.secrets
+```
+
+## Quick Reference
+
+```bash
+# Complete local test workflow
+# 1. Ensure Docker is running
+docker ps
+
+# 2. Verify secrets file
+cat .github/workflows/.secrets
+
+# 3. List jobs (dry run)
+act pull_request --list
+
+# 4. Run workflow
+act pull_request --secret-file .github/workflows/.secrets
+
+# 5. Check results
+ls -la eval-results*.json
+
+# 6. View summary
+cat eval_summary.txt
+```
+
+## Integration with Development Workflow
+
+### Pre-commit Hook (Optional)
+
+Create `.git/hooks/pre-push`:
+
+```bash
+#!/bin/bash
+echo "Running local GitHub Actions tests..."
+
+act pull_request \
+ --secret-file .github/workflows/.secrets \
+ -P ubuntu-latest=catthehacker/ubuntu:act-latest \
+ --quiet
+
+if [ $? -ne 0 ]; then
+ echo "❌ Workflow tests failed. Fix issues before pushing."
+ exit 1
+fi
+
+echo "✅ Workflow tests passed."
+```
+
+Make it executable:
+```bash
+chmod +x .git/hooks/pre-push
+```
+
+## Resources
+
+- **act Documentation**: https://github.com/nektos/act
+- **act Runner Images**: https://github.com/catthehacker/docker_images
+- **GitHub Actions Docs**: https://docs.github.com/en/actions
+- **Docker Installation**: https://docs.docker.com/get-docker/
+
+## Next Steps
+
+After testing locally:
+
+1. ✅ Fix any issues found during local testing
+2. ✅ Push changes to GitHub
+3. ✅ Verify workflow runs successfully on GitHub
+4. ✅ Set up GitHub secrets (if not already done)
+5. ✅ Test with real PR
+
+See [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) for production deployment.
diff --git a/evals/docs/QUICK_REFERENCE.md b/evals/docs/QUICK_REFERENCE.md
new file mode 100644
index 0000000000..e363dbccc0
--- /dev/null
+++ b/evals/docs/QUICK_REFERENCE.md
@@ -0,0 +1,137 @@
+# GitHub Actions Eval Workflow - Quick Reference
+
+## Quick Commands
+
+### Local Testing
+
+```bash
+# Install act (first time only)
+brew install act # macOS
+
+# Test workflow locally (easiest)
+./evals/scripts/test-workflow-locally.sh
+
+# With options
+./evals/scripts/test-workflow-locally.sh --list # Dry run
+./evals/scripts/test-workflow-locally.sh --verbose # Detailed output
+./evals/scripts/test-workflow-locally.sh --reuse # Faster iterations
+```
+
+### Manual Testing
+
+```bash
+# Run all evaluations
+./evals/scripts/run-promptfoo-eval.sh
+
+# Run with JSON output
+./evals/scripts/run-promptfoo-eval.sh --json
+
+# Run specific tests
+./evals/scripts/run-promptfoo-eval.sh --filter "Spec"
+
+# Check scores
+python3 evals/scripts/check_eval_scores.py \
+ --results eval-results.json \
+ --min-pass-rate 0.70
+```
+
+## Important Files
+
+| File | Purpose |
+|------|---------|
+| `.github/workflows/eval.yml` | Main GitHub Actions workflow |
+| `.github/workflows/.secrets` | Local testing secrets (gitignored) |
+| `evals/configs/promptfooconfig.js` | All evaluation tests |
+| `evals/scripts/check_eval_scores.py` | Threshold validation |
+| `evals/scripts/run-promptfoo-eval.sh` | Evaluation runner |
+| `evals/scripts/test-workflow-locally.sh` | Local testing helper |
+
+## Setup Checklist
+
+### For GitHub Actions (Production)
+
+- [ ] Add `LLM_BASE_URL` secret to GitHub
+- [ ] Add `LLM_AUTH_TOKEN` secret to GitHub
+- [ ] Run workflow manually from Actions tab
+- [ ] (Optional) Set `LLM_MODEL` in the workflow dispatch UI.
+- [ ] Verify results in workflow logs
+- [ ] Download and review artifacts
+
+**Guide:** See [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md)
+
+### For Local Testing
+
+- [ ] Install `act`: `brew install act`
+- [ ] Ensure Docker is running
+- [ ] Run: `./evals/scripts/test-workflow-locally.sh`
+- [ ] Edit `.github/workflows/.secrets` with your values
+- [ ] Run again to test
+
+**Guide:** See [LOCAL_TESTING.md](LOCAL_TESTING.md)
+
+## Workflow Triggers
+
+| Trigger | When | Purpose |
+|---------|------|---------|
+| **workflow_dispatch** | Manual trigger only | On-demand quality validation |
+
+**Note:** The workflow does NOT run automatically on PRs, pushes, or schedules.
+Run manually from GitHub Actions tab when you need quality validation.
+
+## Quality Thresholds
+
+| Metric | Threshold | Action if Failed |
+|--------|-----------|------------------|
+| Average Score | ≥ 0.70 | ❌ Workflow fails |
+| Pass Rate | ≥ 70% | ❌ Workflow fails |
+| Overall | 10/10 tests pass | ✅ Currently passing |
+
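+The check itself is simple arithmetic over PromptFoo's stats block. A minimal sketch of the idea (field names follow the PromptFoo JSON the workflow consumes; this is not the exact `check_eval_scores.py` code):
+
+```python
+# threshold_check.py -- recompute the pass-rate gate from eval-results.json.
+import json
+
+with open("eval-results.json") as f:
+    stats = json.load(f)["results"]["stats"]
+
+passed, failed = stats["successes"], stats["failures"]
+total = passed + failed
+pass_rate = passed / total if total else 0.0
+
+print(f"Pass rate: {pass_rate:.0%} ({passed}/{total})")
+if pass_rate < 0.70:
+    raise SystemExit("Quality threshold not met (pass rate < 70%)")
+```
+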
+## Troubleshooting
+
+### Workflow Fails Locally
+
+```bash
+# Check Docker is running
+docker ps
+
+# Verify secrets
+cat .github/workflows/.secrets
+
+# Run with verbose output
+./evals/scripts/test-workflow-locally.sh --verbose
+
+# Check logs
+act pull_request --secret-file .github/workflows/.secrets -v
+```
+
+### Workflow Fails on GitHub
+
+- Check **Actions** tab for error logs
+- Verify secrets are set in repository settings
+- Test locally first with `act`
+- Ensure API credentials are valid
+
+## Learning Resources
+
+| Topic | Link |
+|-------|------|
+| **Full Setup Guide** | [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) |
+| **Local Testing** | [LOCAL_TESTING.md](LOCAL_TESTING.md) |
+| **Eval Framework** | [README.md](../README.md) |
+| **act Documentation** | https://github.com/nektos/act |
+| **GitHub Actions** | https://docs.github.com/en/actions |
+
+## Getting Help
+
+1. Check [LOCAL_TESTING.md](LOCAL_TESTING.md) troubleshooting section
+2. Check [GITHUB_ACTIONS_SETUP.md](GITHUB_ACTIONS_SETUP.md) common issues
+3. Review workflow logs in GitHub Actions tab
+4. Open an issue: https://github.com/tikalk/agentic-sdlc-spec-kit/issues
+
+## ✅ Current Status
+
+[![AI Evals](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml/badge.svg)](https://github.com/tikalk/agentic-sdlc-spec-kit/actions/workflows/eval.yml)
+
+- **Pass Rate:** 100% (10/10 tests)
+- **Status:** ✅ Production ready
+- **Last Updated:** 2026-01-14
diff --git a/evals/docs/WORKFLOWS.md b/evals/docs/WORKFLOWS.md
new file mode 100644
index 0000000000..8eda0d1f80
--- /dev/null
+++ b/evals/docs/WORKFLOWS.md
@@ -0,0 +1,258 @@
+# Advanced Workflows
+
+This document provides detailed information on the error analysis and annotation workflows used in the Spec-Kit Evaluation Framework.
+
+## Error Analysis Workflow
+
+PromptFoo provides automated regression testing, but **error analysis on real outputs** is the most important evaluation activity. According to AI evaluation best practices:
+
+> "We spent 60-80% of our development time on error analysis and evaluation. Expect most of your effort to go toward understanding failures (i.e. looking at data) rather than building automated checks."
+
+### Quick Start
+
+```bash
+# 1. Generate test data
+cd evals/scripts
+./generate-test-data.sh # Creates 17 diverse test case templates
+
+# 2. Run error analysis workflow (sets up environment + launches Jupyter)
+./run-error-analysis.sh
+
+# 3. Run error analysis session (30-60 minutes)
+# - Load specs
+# - Review and annotate (pass/fail, issues, categories)
+# - Categorize failures
+# - Prioritize fixes
+```
+
+### Directory Structure
+
+```text
+evals/
+├── notebooks/                          # Error analysis
+│   ├── error-analysis.ipynb            # Main analysis notebook (manual)
+│   └── .venv/                          # Virtual environment
+├── scripts/                            # Automation scripts
+│   ├── run-promptfoo-eval.sh           # PromptFoo test runner
+│   ├── run-error-analysis.sh           # Manual error analysis (Jupyter)
+│   ├── run-auto-error-analysis.sh      # Automated error analysis for specs (Claude API)
+│   ├── run-automated-error-analysis.py # Python script for spec automation
+│   ├── run-auto-plan-analysis.sh       # Automated error analysis for plans
+│   ├── run-automated-plan-analysis.py  # Python script for plan automation
+│   ├── generate-real-plans.py          # Generate plan test data
+│   └── check_eval_scores.py            # Score validation
+└── datasets/                           # Test data
+    ├── real-specs/                     # Generated specs for review (17 templates)
+    ├── real-plans/                     # Generated plans for review (2 templates, expandable)
+    └── analysis-results/               # Analysis output (CSV, summaries)
+        ├── automated-analysis-*.csv    # Automated spec eval results
+        ├── plan-analysis-*.csv         # Automated plan eval results
+        ├── plan-eval-analysis-*.txt    # Plan evaluation analysis reports
+        ├── summary-*.txt               # Summary reports
+        └── error-analysis-results.csv  # Manual review results
+```
+
+### The 80/20 Rule
+
+**80% of value** comes from:
+
+1. ✅ Jupyter notebooks for error analysis (manual review)
+2. ✅ Custom annotation tool (FastHTML app)
+3. ✅ PromptFoo for CI/CD (already working)
+
+**20% of value** comes from:
+
+- Production monitoring (planned - Week 5-6)
+- Advanced features (clustering, AI assistance)
+
+### Error Analysis Process
+
+You can run error analysis in two ways:
+
+#### Option 1: Automated Analysis (Using Claude API)
+
+Uses Claude API to automatically evaluate specs and categorize failures:
+
+```bash
+# Run automated error analysis (uses default model)
+./evals/scripts/run-auto-error-analysis.sh
+
+# Use a specific model
+./evals/scripts/run-auto-error-analysis.sh --model claude-opus-4-5-20251101
+
+# Requirements:
+# - ANTHROPIC_API_KEY environment variable set
+# - Generated specs in evals/datasets/real-specs/
+
+# Output:
+# - Detailed CSV: evals/datasets/analysis-results/automated-analysis-<timestamp>.csv
+# - Summary report: evals/datasets/analysis-results/summary-<timestamp>.txt
+```
+
+**Features:**
+
+- Evaluates all specs automatically using Claude
+- Binary pass/fail with reasoning
+- Categorizes failures (incomplete requirements, ambiguous specs, etc.)
+- Generates comprehensive reports
+- Saves time on initial review
+
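+Each spec receives a single binary judgment from Claude. A minimal sketch of that call using the Anthropic Python SDK -- the prompt wording and model name are illustrative, and the real logic lives in `run-automated-error-analysis.py`:
+
+```python
+# judge_spec.py -- sketch of the automated pass/fail evaluation of one spec.
+from pathlib import Path
+
+import anthropic
+
+client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment
+
+spec = Path("evals/datasets/real-specs/spec-001.md").read_text()
+message = client.messages.create(
+    model="claude-sonnet-4-5-20250929",
+    max_tokens=512,
+    messages=[{
+        "role": "user",
+        "content": (
+            "Evaluate this specification. Reply PASS or FAIL on the first "
+            "line, then one failure category and a short reason.\n\n" + spec
+        ),
+    }],
+)
+print(message.content[0].text)  # e.g. "FAIL\nAmbiguous specifications: ..."
+```
+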
+#### Option 2: Manual Analysis (Using Jupyter Notebook)
+
+Traditional error analysis workflow for deep investigation:
+
+```bash
+# Launch Jupyter Lab
+./evals/scripts/run-error-analysis.sh
+
+# In Jupyter:
+# 1. Load specs from datasets/real-specs/
+# 2. Review and annotate manually
+# 3. Categorize failures
+# 4. Export results
+```
+
+**Process:**
+
+1. **Open Coding** (Week 1)
+ - Domain expert reviews 10-20 real specs/plans
+ - Notes issues without categorization yet
+ - Binary pass/fail (no Likert scales)
+
+2. **Axial Coding** (Week 1-2)
+ - Group similar failures into categories
+ - Count frequency of each failure mode
+ - Prioritize by impact
+
+3. **Fix & Iterate** (Ongoing)
+ - Fix high-frequency failure modes
+ - Add automated checks to PromptFoo
+ - Re-run error analysis monthly
+
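+The axial-coding step above reduces to a frequency count once each annotation carries a category. A sketch using pandas -- the CSV name matches the manual review output listed in the directory structure, while the `status` and `failure_category` column names are assumptions about its layout:
+
+```python
+# failure_frequency.py -- count failure modes from manual review results.
+import pandas as pd
+
+df = pd.read_csv("evals/datasets/analysis-results/error-analysis-results.csv")
+
+# Tally categories among failed specs, most frequent first
+failures = df[df["status"] == "fail"]
+counts = failures["failure_category"].value_counts()
+print(counts)
+
+# Fix the top few categories first, then add PromptFoo checks for them
+print("Top priorities:", list(counts.head(3).index))
+```
+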
+### Plan Error Analysis (NEW)
+
+In addition to spec evaluation, we now support error analysis for **implementation plans**.
+
+#### Quick Start - Plan Analysis
+
+```bash
+# 1. Generate plan test data
+cd evals/scripts
+LLM_BASE_URL="your-url" \
+LLM_AUTH_TOKEN="your-key" \
+../.venv/bin/python generate-real-plans.py  # venv lives at evals/.venv
+
+# 2. Run automated plan analysis (uses default model)
+export ANTHROPIC_API_KEY="your-anthropic-key"
+./run-auto-plan-analysis.sh
+
+# Or use a specific model
+./run-auto-plan-analysis.sh --model claude-opus-4-5-20251101
+
+# Output:
+# - evals/datasets/analysis-results/plan-analysis-<timestamp>.csv
+# - evals/datasets/analysis-results/plan-summary-<timestamp>.txt
+```
+
+#### Plan Analysis Features
+
+**Evaluation Criteria:**
+
+- **Simplicity Gate**: ≤3 projects (CRITICAL - Constitution compliance)
+- **Completeness**: All necessary components and phases defined
+- **Clarity**: Project boundaries, tasks, and milestones clear
+- **Appropriateness**: Simple architecture, no over-engineering
+- **Constitution Compliance**: No microservices for simple apps, no premature optimization
+- **Testability**: Testing strategy and verification steps included
+
+**Common Failure Categories for Plans:**
+
+- Too many projects (>3)
+- Over-engineering
+- Missing verification steps
+- Unclear project boundaries
+- Microservices for simple app
+- Premature optimization
+- Missing testing strategy
+- Tech stack mismatch
+- Incomplete milestones
+
+**Current Status:**
+
+- Plan evaluation integrated with PromptFoo (100% pass rate on 2 test cases)
+- Error analysis infrastructure ready for expansion
+- Support for both automated (Claude API) and manual review workflows
+
+## Custom Annotation Tool
+
+A fast, keyboard-driven web interface for reviewing generated specs, designed to be roughly **10x faster than reviewing raw files by hand**.
+
+### Quick Start
+
+```bash
+# Run the annotation tool
+./evals/scripts/run-annotation-tool.sh
+```
+
+Open your browser to `http://localhost:5001` and start reviewing specs.
+
+### Features
+
+- **Keyboard shortcuts**: N (next), P (previous), 1 (pass), 2 (fail)
+- **Progress tracking**: Visual progress bar with statistics
+- **Notes**: Add observations for each spec
+- **Auto-save**: Annotations saved automatically to JSON
+- **Export**: Export all annotations with timestamps
+
+### Workflow
+
+1. Review the spec displayed on the page
+2. Evaluate quality (structure, completeness, clarity)
+3. Add notes about any issues (optional)
+4. Press **1** for Pass or **2** for Fail
+5. Tool automatically advances to next spec
+6. Click "Export JSON" when done
+
+### Output
+
+Annotations are saved to:
+
+- `evals/annotation-tool/annotations.json` - Auto-saved current state
+- `evals/annotation-tool/annotations_export_YYYYMMDD_HHMMSS.json` - Timestamped exports
+
+Example output structure:
+
+```json
+{
+ "exported_at": "2026-01-08T14:30:00",
+ "statistics": {
+ "total": 17,
+ "passed": 12,
+ "failed": 3,
+ "pending": 2,
+ "progress": 88.2
+ },
+ "annotations": {
+ "spec-001.md": {
+ "status": "pass",
+ "notes": "Good structure, all sections present",
+ "timestamp": "2026-01-08T14:25:00"
+ }
+ }
+}
+```
+
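+Because the export is plain JSON, post-processing takes only a few lines. A minimal sketch for summarizing an export with the structure shown above (the filename is illustrative):
+
+```python
+# summarize_annotations.py -- roll up an annotation export.
+import json
+
+with open("annotations_export_20260108_143000.json") as f:
+    export = json.load(f)
+
+stats = export["statistics"]
+print(f"Reviewed {stats['total']} specs: {stats['passed']} passed, "
+      f"{stats['failed']} failed, {stats['pending']} pending")
+
+# List failing specs with reviewer notes for follow-up
+for name, ann in export["annotations"].items():
+    if ann["status"] == "fail":
+        print(f"- {name}: {ann.get('notes') or '(no notes)'}")
+```
+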
+### What to Look For
+
+Common failure patterns to note during review:
+
+- Missing required sections
+- Vague or unmeasurable requirements
+- Premature technical decisions
+- Missing acceptance criteria
+- Incomplete user stories
+- Security considerations missing
+- Over-engineering indicators
+
+See [annotation-tool/README.md](../annotation-tool/README.md) for detailed documentation.
diff --git a/evals/graders/custom_graders.py b/evals/graders/custom_graders.py
new file mode 100644
index 0000000000..31e90628b1
--- /dev/null
+++ b/evals/graders/custom_graders.py
@@ -0,0 +1,383 @@
+"""
+Custom graders for Spec-Kit evaluation.
+
+These graders check specific quality criteria for specifications and plans
+generated by the spec-kit templates.
+"""
+
+
+def check_security_completeness(output: str, context: dict) -> dict:
+ """
+ Check if security-critical features include proper security requirements.
+
+ Args:
+ output: The generated specification or plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ required_security_topics = [
+ 'authentication',
+ 'authorization',
+ 'encryption',
+ 'session management',
+ 'data protection',
+ 'input validation'
+ ]
+
+ output_lower = output.lower()
+ found_topics = [
+ topic for topic in required_security_topics
+ if topic in output_lower
+ ]
+
+ score = len(found_topics) / len(required_security_topics)
+
+ return {
+ 'pass': score >= 0.5, # At least 50% of security topics
+ 'score': score,
+ 'reason': f'Found {len(found_topics)}/{len(required_security_topics)} security topics: {", ".join(found_topics) if found_topics else "none"}'
+ }
+
+
+def check_simplicity_gate(output: str, context: dict) -> dict:
+ """
+ Check if plan follows simplicity gate (Article VII: ≤3 projects).
+
+ Args:
+ output: The generated plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ # Count unique projects in the plan
+ # Look for patterns like "Project 1:", "## Project", etc.
+ import re
+
+ # Extract project numbers to avoid counting duplicates
+ # Pattern: "Project" followed by a number
+ project_number_pattern = re.compile(r'Project\s+(\d+)', re.IGNORECASE)
+ project_numbers = project_number_pattern.findall(output)
+
+ # Get unique project numbers
+ unique_projects = set(project_numbers)
+ project_count = len(unique_projects)
+
+ # If no numbered projects found, look for "Project Structure" section
+ # and try to extract count from table or list
+ if project_count == 0:
+ # Look for table format: "| Project 1" or "| **Project 1"
+ table_project_pattern = re.compile(r'\|\s*\*?\*?Project\s+(\d+)', re.IGNORECASE)
+ table_numbers = table_project_pattern.findall(output)
+ if table_numbers:
+ unique_projects = set(table_numbers)
+ project_count = len(unique_projects)
+
+ # If still no projects found, look for explicit project count in text
+ if project_count == 0:
+ count_pattern = re.compile(r'(\d+)\s+projects?', re.IGNORECASE)
+ count_matches = count_pattern.findall(output)
+ if count_matches:
+ # Take the first explicit count mentioned
+ project_count = int(count_matches[0])
+ else:
+ # Assume single project if nothing found
+ project_count = 1
+
+ passed = project_count <= 3
+ score = 1.0 if passed else max(0, 1 - (project_count - 3) * 0.2)
+
+ return {
+ 'pass': passed,
+ 'score': score,
+ 'reason': f'Found {project_count} projects (expected ≤3 for simplicity)'
+ }
+
+
+def check_constitution_compliance(output: str, context: dict) -> dict:
+ """
+ Check if plan violates constitution principles.
+
+ Checks:
+ - Article VII: Simplicity (ā¤3 projects)
+ - Article VIII: Anti-Abstraction (no unnecessary wrappers)
+ - Over-engineering detection
+
+ Args:
+ output: The generated plan text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ violations = []
+ output_lower = output.lower()
+
+ # Check simplicity gate
+ simplicity_result = check_simplicity_gate(output, context)
+ if not simplicity_result['pass']:
+ violations.append(simplicity_result['reason'])
+
+ # Check for over-engineering patterns (context-aware)
+ # Only flag if NOT in a negative context (e.g., "no microservices", "avoiding kubernetes")
+ import re
+
+ over_engineering_terms = [
+ 'microservices',
+ 'kubernetes',
+ 'k8s',
+ 'service mesh',
+ 'event sourcing',
+ 'cqrs',
+ 'saga pattern',
+ 'message queue' if 'simple' in context.get('vars', {}).get('user_input', '').lower() else None
+ ]
+ over_engineering_terms = [t for t in over_engineering_terms if t] # Remove None
+
+ found_overengineering = []
+ for term in over_engineering_terms:
+ if term not in output_lower:
+ continue
+
+ # Check if term is in a negative context
+ # Look for patterns like "no X", "avoid X", "not X", "without X"
+ negative_patterns = [
+ rf'\b(no|avoid|avoiding|not|without)\s+\w*\s*{re.escape(term)}',
+ rf'{re.escape(term)}\s*\w*\s*(avoided|rejected|unnecessary)'
+ ]
+
+ is_negative = False
+ for pattern in negative_patterns:
+ if re.search(pattern, output_lower, re.IGNORECASE):
+ is_negative = True
+ break
+
+ # Only flag if NOT in negative context
+ if not is_negative:
+ found_overengineering.append(term)
+
+ if found_overengineering:
+ violations.append(f"Over-engineering detected: {', '.join(found_overengineering)}")
+
+ # Check for unnecessary abstractions/wrappers
+ abstraction_terms = [
+ 'wrapper',
+ 'facade',
+ 'adapter layer',
+ 'abstraction layer'
+ ]
+
+ found_abstractions = [
+ term for term in abstraction_terms
+ if term in output_lower
+ ]
+
+ if found_abstractions:
+ violations.append(f"Unnecessary abstractions: {', '.join(found_abstractions)}")
+
+ # Calculate score
+ if not violations:
+ score = 1.0
+ else:
+ # Deduct 0.3 per violation, minimum 0
+ score = max(0, 1.0 - len(violations) * 0.3)
+
+ return {
+ 'pass': score >= 0.7,
+ 'score': score,
+ 'reason': '; '.join(violations) if violations else 'Constitution compliant'
+ }
+
+
+def check_vague_terms(output: str, context: dict) -> dict:
+ """
+ Check for vague, unmeasurable terms that need clarification.
+
+ Args:
+ output: The generated specification text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ import re
+
+ vague_terms = [
+ 'fast',
+ 'quick',
+ 'scalable',
+ 'secure',
+ 'intuitive',
+ 'robust',
+ 'performant',
+ 'user-friendly',
+ 'easy',
+ 'simple',
+ 'good performance',
+ 'high availability'
+ ]
+
+ output_lower = output.lower()
+ vague_found = [term for term in vague_terms if term in output_lower]
+
+ if not vague_found:
+ return {
+ 'pass': True,
+ 'score': 1.0,
+ 'reason': 'No vague terms found'
+ }
+
+ # Check if vague terms are quantified or flagged
+ quantified_count = 0
+ for term in vague_found:
+ # Look for the term followed by quantification or clarification markers
+ term_index = output_lower.find(term)
+ if term_index == -1:
+ continue
+
+ # Check 200 chars after the term
+ context_window = output_lower[term_index:term_index + 200]
+
+ # Check for quantification patterns
+ quantification_patterns = [
+ r'\d+\s*(ms|milliseconds|seconds|minutes)', # time
+ r'\d+\s*(mb|gb|requests|users)', # size/count
+ r'<\s*\d+', # less than X
+ r'>\s*\d+', # greater than X
+ r'\[needs clarification\]',
+ r'\[tbd\]',
+ r'\[todo\]'
+ ]
+
+ if any(re.search(pattern, context_window) for pattern in quantification_patterns):
+ quantified_count += 1
+
+ quantified_ratio = quantified_count / len(vague_found) if vague_found else 1.0
+
+ return {
+ 'pass': quantified_ratio >= 0.7,
+ 'score': quantified_ratio,
+ 'reason': f'Found {len(vague_found)} vague terms, {quantified_count} properly quantified/flagged'
+ }
+
+
+def check_edge_cases_coverage(output: str, context: dict) -> dict:
+ """
+ Check if edge cases section has comprehensive coverage.
+
+ Args:
+ output: The generated specification text
+ context: Additional context with vars (user_input)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+
+ output_lower = output.lower()
+
+ # Define categories of edge cases to check
+ edge_case_categories = {
+ 'boundary_values': [
+ 'empty', 'min', 'max', 'limit', 'boundary', 'zero', 'negative',
+ 'very large', 'exceed'
+ ],
+ 'invalid_inputs': [
+ 'invalid', 'malformed', 'incorrect', 'wrong', 'unsupported',
+ 'malicious', 'corrupt'
+ ],
+ 'network_failures': [
+ 'network', 'timeout', 'connection', 'disconnect', 'offline',
+ 'latency', 'fail'
+ ],
+ 'concurrent_actions': [
+ 'concurrent', 'simultaneous', 'parallel', 'race condition',
+ 'multiple users'
+ ],
+ 'state_issues': [
+ 'session', 'expire', 'recovery', 'rollback', 'partial',
+ 'inconsistent', 'state'
+ ]
+ }
+
+ # Count how many categories are covered
+ covered_categories = 0
+ found_terms = []
+
+ for category, terms in edge_case_categories.items():
+ for term in terms:
+ if term in output_lower:
+ covered_categories += 1
+ found_terms.append(f"{category}: {term}")
+ break # Count each category only once
+
+ # Calculate score based on coverage
+ total_categories = len(edge_case_categories)
+ score = covered_categories / total_categories
+
+ # Pass if at least 3 out of 5 categories covered
+ passed = covered_categories >= 3
+
+ return {
+ 'pass': passed,
+ 'score': score,
+ 'reason': f'Covered {covered_categories}/{total_categories} edge case categories ({", ".join(found_terms[:3])}{"..." if len(found_terms) > 3 else ""})'
+ }
+
+
+def check_testability(output: str, context: dict) -> dict:
+ """
+ Check if requirements are testable with clear acceptance criteria.
+
+ Args:
+ output: The generated specification text
+ context: Additional context (unused but required by PromptFoo)
+
+ Returns:
+ dict with 'pass', 'score', and 'reason' keys
+ """
+ # Look for acceptance criteria patterns
+ import re
+
+ # Find user stories (should have acceptance criteria)
+ user_story_pattern = re.compile(r'\*\*As a .+?\*\*', re.IGNORECASE)
+ user_stories = user_story_pattern.findall(output)
+
+ if not user_stories:
+ return {
+ 'pass': False,
+ 'score': 0.0,
+ 'reason': 'No user stories found'
+ }
+
+ # Look for acceptance criteria after each user story
+ acceptance_patterns = [
+ r'acceptance criteria',
+ r'given .+? when .+? then', # BDD format
+ r'should .+?',
+ r'must .+?',
+ r'verify that',
+ r'confirm that'
+ ]
+
+ stories_with_criteria = 0
+ for story in user_stories:
+ # Find the story position
+ story_index = output.lower().find(story.lower())
+ if story_index == -1:
+ continue
+
+ # Check next 500 chars for acceptance criteria
+ context_window = output.lower()[story_index:story_index + 500]
+
+ if any(re.search(pattern, context_window, re.IGNORECASE) for pattern in acceptance_patterns):
+ stories_with_criteria += 1
+
+ testability_ratio = stories_with_criteria / len(user_stories)
+
+ return {
+ 'pass': testability_ratio >= 0.7,
+ 'score': testability_ratio,
+ 'reason': f'{stories_with_criteria}/{len(user_stories)} user stories have testable acceptance criteria'
+ }
diff --git a/evals/notebooks/error-analysis.ipynb b/evals/notebooks/error-analysis.ipynb
new file mode 100644
index 0000000000..2bbb2d6e14
--- /dev/null
+++ b/evals/notebooks/error-analysis.ipynb
@@ -0,0 +1,998 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Error Analysis Notebook\n",
+ "\n",
+ "**Purpose:** Manual review and error analysis of generated specs/plans\n",
+ "\n",
+ "**Process:**\n",
+ "1. **Open Coding:** Review outputs and note issues\n",
+ "2. **Axial Coding:** Group similar failures into categories\n",
+ "3. **Prioritization:** Count frequency and prioritize fixes\n",
+ "\n",
+ "**Time:** 30-60 minutes per session\n",
+ "\n",
+ "---"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ā Setup complete\n",
+ "š Dataset directory: ../datasets/real-specs\n",
+ "š Results directory: ../datasets/analysis-results\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Setup\n",
+ "import os\n",
+ "import glob\n",
+ "import pandas as pd\n",
+ "from pathlib import Path\n",
+ "import re\n",
+ "\n",
+ "# Configuration\n",
+ "DATASET_DIR = Path('../datasets/real-specs')\n",
+ "RESULTS_DIR = Path('../datasets/analysis-results')\n",
+ "RESULTS_DIR.mkdir(exist_ok=True)\n",
+ "\n",
+ "print(f\"ā Setup complete\")\n",
+ "print(f\"š Dataset directory: {DATASET_DIR}\")\n",
+ "print(f\"š Results directory: {RESULTS_DIR}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Load Test Specs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "š Found 17 spec files\n",
+ "\n",
+ "ā Loaded 17 specs into dataframe\n",
+ "\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "