5 changes: 1 addition & 4 deletions apps/cli/package.json
@@ -14,10 +14,7 @@
"bin": {
"agentv": "./dist/cli.js"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun --watch src/index.ts",
"build": "tsup && bun run copy-readme",
4 changes: 2 additions & 2 deletions apps/cli/src/commands/generate/rubrics.ts
@@ -160,9 +160,9 @@ export async function generateRubricsCommand(options: GenerateRubricsOptions): P
caseNode.set(
'rubrics',
rubrics.map(
(r: { id: string; description: string; weight: number; required: boolean }) => ({
(r: { id: string; expected_outcome: string; weight: number; required: boolean }) => ({
id: r.id,
description: r.description,
expected_outcome: r.expected_outcome,
weight: r.weight,
required: r.required,
}),
68 changes: 63 additions & 5 deletions examples/features/rubric/evals/dataset.yaml
@@ -71,27 +71,27 @@ evalcases:
# Detailed rubric objects with weights and required flags
rubrics:
- id: structure
description: Has clear headings and organization
expected_outcome: Has clear headings and organization
weight: 1.0
required: true

- id: success-codes
description: Covers 2xx success codes with examples
expected_outcome: Covers 2xx success codes with examples
weight: 2.0
required: true

- id: client-errors
description: Explains 4xx client error codes
expected_outcome: Explains 4xx client error codes
weight: 2.0
required: true

- id: server-errors
description: Explains 5xx server error codes
expected_outcome: Explains 5xx server error codes
weight: 1.5
required: false

- id: practical-examples
description: Includes practical use case examples
expected_outcome: Includes practical use case examples
weight: 1.0
required: false

@@ -177,3 +177,61 @@ evalcases:

# No rubrics defined - will use default llm_judge evaluator
# To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml

# ==========================================
# Example 5: Multi-criteria score_ranges (PROPOSED)
# Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation.
# Real-world intent: grading a summary on both factual accuracy and brevity.
# Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented.
# ==========================================
- id: summary-multi-criteria-score-ranges-proposed

expected_outcome: |-
Provide an accurate summary in under 50 words.

input_messages:
- role: user
content: |-
Summarize this article in under 50 words:

Climate change is accelerating faster than predicted. Recent studies show
Arctic ice melting at unprecedented rates, sea levels rising, and extreme
weather events becoming more frequent. Scientists urge immediate action to
reduce carbon emissions and transition to renewable energy sources.

expected_messages:
- role: assistant
content: |-
Climate change is accelerating with rapid Arctic ice loss, rising seas, and
more extreme weather. Scientists urge urgent emissions cuts and a transition
to renewable energy.

rubrics:
- id: factual_accuracy
weight: 2.0
required_min_score: 8
score_ranges:
- score_range: [0, 2]
expected_outcome: Contains major factual errors or contradicts the article.
- score_range: [3, 5]
expected_outcome: Mostly on-topic but includes at least one clear factual error or misstates a key claim.
- score_range: [6, 7]
expected_outcome: Generally accurate but misses an important point or slightly distorts emphasis.
- score_range: [8, 9]
expected_outcome: Accurate and covers the key points with only minor omissions.
- score_range: [10, 10]
expected_outcome: Fully accurate, captures all key points with no distortions.

- id: brevity_and_clarity
weight: 1.0
score_ranges:
- score_range: [0, 2]
expected_outcome: Exceeds 50 words or is hard to understand.
- score_range: [3, 5]
expected_outcome: Under 50 words but unclear, repetitive, or poorly structured.
- score_range: [6, 7]
expected_outcome: Under 50 words and mostly clear, but could be more concise or better phrased.
- score_range: [8, 9]
expected_outcome: Under 50 words, clear and concise.
- score_range: [10, 10]
expected_outcome: Under 50 words, exceptionally clear, concise, and well phrased.
82 changes: 82 additions & 0 deletions openspec/changes/add-rubric-score-ranges/design.md
@@ -0,0 +1,82 @@
## Context
AgentV currently supports rubric-based evaluation by converting `rubrics` into `llm_judge` checklist items. The judge returns per-item `satisfied: boolean` checks and the runtime computes a weighted fraction score in 0..1.

External best practice (DeepEval/Confident AI) recommends a complementary pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, and the framework then normalizes to 0..1.

## Decision
Evolve to a **single rubric system** that supports both "DeepEval-style" banded scoring and multi-criterion weighted scoring by introducing **per-criterion score ranges**.

Each rubric criterion keeps an `id` (and optional `weight`), but can optionally include `score_ranges` that define non-overlapping 0–10 bands with concrete expected outcomes. The judge returns an integer score 0..10 per criterion; the runtime normalizes each to 0..1 and aggregates deterministically.

This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`.

The existing `required: boolean` is replaced (in the proposed primary shape) by `required_min_score: int` gating. `required` remains accepted as a deprecated alias during migration.

### Proposed YAML Shape
```yaml
evaluators:
- name: correctness
type: llm_judge
rubrics:
- id: correctness
weight: 1.0
required_min_score: 10
score_ranges:
- score_range: [0, 2]
expected_outcome: Factually incorrect.
- score_range: [3, 6]
expected_outcome: Mostly correct but includes notable errors or omissions.
- score_range: [7, 9]
expected_outcome: Correct with minor missing details.
- score_range: [10, 10]
expected_outcome: Fully correct and complete.
```

### Output Contract
- Judge returns a **per-criterion** `score` as an integer in `0..10` for each rubric `id`.
- AgentV normalizes each to `0..1` by dividing by 10 and aggregates deterministically (weighted average).
- If any criterion has `required_min_score` and the returned score is below it, the verdict is forced to `fail`.
- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail). A sketch of this aggregation appears below.
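
A minimal TypeScript sketch of this normalization, gating, and verdict logic (the type and function names here are illustrative assumptions, not the actual AgentV internals):

```typescript
// Illustrative sketch only; names are assumptions, not AgentV's real types.
interface CriterionConfig {
  id: string;
  weight?: number;            // defaults to 1.0
  requiredMinScore?: number;  // 0..10 gate, mirroring required_min_score
}

interface CriterionResult {
  id: string;
  score: number; // integer 0..10 returned by the judge
}

type Verdict = 'pass' | 'borderline' | 'fail';

function aggregate(
  configs: CriterionConfig[],
  results: CriterionResult[],
): { score: number; verdict: Verdict } {
  let weightedSum = 0;
  let totalWeight = 0;
  let gateFailed = false;

  for (const cfg of configs) {
    const raw = results.find((r) => r.id === cfg.id)?.score ?? 0; // missing criteria score as 0
    const weight = cfg.weight ?? 1.0;
    weightedSum += (raw / 10) * weight; // normalize 0..10 -> 0..1
    totalWeight += weight;
    if (cfg.requiredMinScore !== undefined && raw < cfg.requiredMinScore) {
      gateFailed = true; // required_min_score gating forces a fail
    }
  }

  const score = totalWeight > 0 ? weightedSum / totalWeight : 0;
  const verdict: Verdict = gateFailed
    ? 'fail'
    : score >= 0.8
      ? 'pass'
      : score >= 0.6
        ? 'borderline'
        : 'fail';
  return { score, verdict };
}
```

Under this logic, a criterion with `required_min_score: 10` fails the whole case whenever the judge returns anything below 10, regardless of how high the weighted average is.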

## Validation Rules
- Ranges are inclusive integer bounds.
- Bounds must be within 0..10.
- No overlap (within a given rubric criterion).
- Prefer full coverage of 0..10 inclusive (strict coverage recommended for determinism).
- Each range must have a non-empty `expected_outcome`. (A validation sketch follows this list.)
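
A sketch of how these rules could be checked (illustrative only; the actual range-validation helper under `packages/core/src/evaluation/validation/` may differ):

```typescript
// Illustrative validation sketch; names are hypothetical.
interface ScoreRange {
  scoreRange: [number, number]; // inclusive integer bounds, maps to score_range
  expectedOutcome: string;      // maps to expected_outcome
}

function validateScoreRanges(ranges: ScoreRange[]): string[] {
  const errors: string[] = [];
  const covered = new Array<boolean>(11).fill(false);

  for (const { scoreRange: [lo, hi], expectedOutcome } of ranges) {
    if (!Number.isInteger(lo) || !Number.isInteger(hi) || lo < 0 || hi > 10 || lo > hi) {
      errors.push(`bounds must be integers within 0..10: [${lo}, ${hi}]`);
      continue;
    }
    if (expectedOutcome.trim().length === 0) {
      errors.push(`range [${lo}, ${hi}] has an empty expected_outcome`);
    }
    for (let i = lo; i <= hi; i++) {
      if (covered[i]) errors.push(`score ${i} is covered by more than one range`); // overlap
      covered[i] = true;
    }
  }

  // Strict full coverage of 0..10 (the recommended default).
  const gaps: number[] = [];
  for (let i = 0; i <= 10; i++) if (!covered[i]) gaps.push(i);
  if (gaps.length > 0) errors.push(`scores not covered by any range: ${gaps.join(', ')}`);

  return errors;
}
```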

## Backwards Compatibility
- Existing checklist rubrics remain supported during migration.
- `required` is treated as a deprecated alias for `required_min_score: 10`.
- New rubric criteria may include `score_ranges` for banded 0–10 scoring.

### Migration
- Replace checklist rubric object field `description:` with `expected_outcome:`.
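
A hypothetical loader-side normalizer for this migration (field handling follows the alias rules above; the helper itself is an assumption, not existing AgentV code):

```typescript
// Hypothetical migration helper; mirrors the deprecation rules described above.
interface RawChecklistRubric {
  id: string;
  description?: string;        // legacy field
  expected_outcome?: string;   // new field
  weight?: number;
  required?: boolean;          // deprecated alias
  required_min_score?: number; // new gating field
}

function normalizeChecklistRubric(raw: RawChecklistRubric) {
  return {
    id: raw.id,
    // Prefer the new field; fall back to the deprecated `description`.
    expected_outcome: raw.expected_outcome ?? raw.description ?? '',
    weight: raw.weight ?? 1.0,
    // `required: true` is treated as `required_min_score: 10`.
    required_min_score:
      raw.required_min_score ?? (raw.required ? 10 : undefined),
  };
}
```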

## Open Questions
- Should AgentV allow gaps (e.g., reserve 0 for “unscorable”), or strictly require full coverage? (Proposal defaults to strict full coverage to match the cited best practice.)
- Should mixed `rubrics` (checklist + score-range) be allowed, and if so how to combine them? (Proposal: disallow mixing for simplicity and determinism.)

## Deterministic Mapping to Checklist (Weighted-Average) Rubrics

### Can score-range rubrics be deterministically mapped to the existing weighted-average system?
Not in a semantics-preserving way.

Holistic score-range rubrics define a *single ordinal grade* (an integer 0..10) with an expected outcome per interval.
Checklist rubrics define *multiple independent criteria* with per-criterion weights and gating, and compute a weighted fraction.

Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector.
Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input.

### Can checklist rubrics be deterministically mapped to score-range rubrics?
Only in a lossy, wrapper-style way.

Given checklist results, AgentV can deterministically compute a normalized score $s \in [0,1]$ and then map it to a raw integer $r = \mathrm{round}(10s)$ (or $\lfloor 10s \rfloor$, etc.).
But that does not recreate the score-range *rubric definition* (expected outcomes per bucket), and it does not provide the core value of range rubrics: constraining the judge with explicit outcome descriptions per range.

### Conclusion
The two rubric modes are not redundant:
- Checklist rubrics are best for requirement-driven grading (decomposable criteria, required flags, deterministic scoring).
- Score-range rubrics are best for holistic grading where the evaluator needs explicit outcome descriptions per band.

The most practical unification is at the interface level: treat both as rubric-driven evaluators that produce a normalized $[0,1]$ score and a verdict, but keep both scoring modes as first-class options rather than making one a wrapper for the other.
57 changes: 57 additions & 0 deletions openspec/changes/add-rubric-score-ranges/proposal.md
@@ -0,0 +1,57 @@
# Change: Add 0–10 score-range rubrics for LLM judging

## Why
AgentV’s current rubric support in `llm_judge` is a **binary checklist** (each rubric item is `satisfied: true|false` and the score is computed as a weighted fraction). This works well for requirements-style grading, but it does **not** support confining the judge to explicit score ranges, a pattern used by common LLM-evals tooling.

Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confident AI) recommends:
- A **0–10 integer scale** (more reliable than floats for LLMs)
- Explicit **non-overlapping** `score_range` definitions
- Clear **expected outcomes per range**, not vague labels
- **Normalization to 0–1** for downstream aggregation

Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact.

## What Changes
- Extend the existing `rubrics` concept to support **per-criterion score ranges** (analytic rubric scoring):
- Each rubric entry represents a criterion with an `id` and optional aggregation `weight`.
- Each criterion can include `score_ranges` (0–10 inclusive integer bands) with explicit `expected_outcome` text.
- The judge returns an integer score **0–10 per criterion**, which AgentV normalizes to **0–1** (divide by 10) and aggregates (weighted average).

- Replace `required: boolean` with `required_min_score: int` (0–10) for gating.
- If a criterion has `required_min_score`, the overall verdict MUST be `fail` when the criterion score is below that threshold.

- Add validation rules (for per-criterion score ranges):
- Ranges MUST be integers within **0..10**
- Ranges MUST NOT overlap within a criterion
- Ranges SHOULD cover **0..10** (inclusive) within a criterion (strict coverage is preferred for determinism)
- Each range MUST include a non-empty `expected_outcome`

- Backwards compatibility:
- Existing checklist rubrics remain supported during migration.
- `required` is treated as a deprecated alias for `required_min_score: 10`.

## Breaking Changes
- **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`.
- YAML before:
- `rubrics: [{ id: "x", description: "...", weight: 1, required: true }]`
- YAML after:
- `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]`
- CLI `generate rubrics` output changes accordingly.

- **BREAKING (proposed new primary shape)**: Prefer `required_min_score` over `required`.
- `required` remains accepted as a deprecated alias during migration.

## Impact
- Affected specs: `rubric-evaluator`, `yaml-schema`.
- Affected code (expected):
- `packages/core/src/evaluation/types.ts` (new config/type)
- `packages/core/src/evaluation/yaml-parser.ts` (parsing inline config)
- `packages/core/src/evaluation/loaders/evaluator-parser.ts` (validation)
- `packages/core/src/evaluation/evaluators/llm-judge.ts` (prompt + scoring)
- `packages/core/src/evaluation/validation/*` (range validation helper)
- Tests under `packages/core/test/**`

## Non-Goals
- Do not replace checklist rubrics.
- Do not change `EvaluationScore.score` away from 0–1.
- Do not add new CLI UX beyond schema support (future enhancement could generate range rubrics).
48 changes: 48 additions & 0 deletions openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
@@ -0,0 +1,48 @@
## MODIFIED Requirements

### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics
The evaluator SHALL support rubric-based grading using rubric criteria entries. Each criterion may be:

1) **Checklist-style** (legacy): boolean checks per criterion using `expected_outcome` text.
2) **Score-range per criterion** (new): each criterion contains `score_ranges` defining non-overlapping integer ranges over 0–10 inclusive, each with an explicit `expected_outcome`.

When `score_ranges` are present for a criterion, the evaluator SHALL instruct the judge to output an **integer score 0..10 for that criterion** and then normalize it to 0..1 for aggregation.

The evaluator SHALL support `required_min_score` gating: if a criterion specifies `required_min_score` and the returned score is below it, the overall verdict SHALL be `fail`.
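
For illustration only (not part of the requirement), a sketch of how the judge prompt might render one criterion's `score_ranges`; the function and shape names are assumptions about a possible implementation in `llm-judge.ts`:

```typescript
// Illustrative prompt fragment for one criterion with score_ranges.
interface ScoreRange {
  scoreRange: [number, number];
  expectedOutcome: string;
}

function renderCriterionPrompt(id: string, ranges: ScoreRange[]): string {
  const bands = ranges
    .map(({ scoreRange: [lo, hi], expectedOutcome }) => `  ${lo}-${hi}: ${expectedOutcome}`)
    .join('\n');
  return [
    `Criterion "${id}": assign an integer score from 0 to 10.`,
    `Choose the band whose description best matches the response:`,
    bands,
    `Return only an integer in 0..10 for this criterion.`,
  ].join('\n');
}
```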

#### Scenario: Checklist rubrics continue to work
- **GIVEN** an eval case with `rubrics` (id/description/weight/required)
- **WHEN** the rubric evaluator runs
- **THEN** it SHALL grade using per-item boolean checks
- **AND** the reported score SHALL be in 0..1

#### Scenario: Range rubrics constrain scoring
- **GIVEN** an eval case with `rubrics` where a criterion contains `score_ranges` entries and `expected_outcome` text
- **WHEN** the rubric evaluator runs
- **THEN** the judge SHALL be constrained to output an integer score in 0..10 for that criterion
- **AND** the system SHALL normalize each criterion score to 0..1 by dividing by 10

#### Scenario: Invalid range rubrics are rejected
- **GIVEN** a rubric criterion whose `score_ranges` overlap or do not fully cover 0..10
- **WHEN** the eval suite is loaded
- **THEN** validation SHALL fail
- **AND** the error message SHALL indicate the violated rule (overlap, bounds, or coverage)

### Requirement: Structured Grading MUST produce validated results
The evaluator SHALL validate judge output against a schema appropriate to the configured mode.

#### Scenario: Range rubric output schema
- **GIVEN** a range-rubric configuration
- **WHEN** the judge responds
- **THEN** the evaluator SHALL accept a JSON object matching:
```typescript
z.object({
checks: z.array(z.object({
id: z.string(),
score: z.number().int().min(0).max(10),
reasoning: z.string().optional(),
})),
overall_reasoning: z.string().optional(),
})
```
- **AND** AgentV SHALL normalize per-criterion `score / 10` into the standard 0..1 result and aggregate.
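
A sketch of validating and normalizing judge output with the schema above (the surrounding helper name is an assumption, not the actual AgentV function):

```typescript
import { z } from 'zod';

// Schema from the scenario above.
const rangeJudgeOutput = z.object({
  checks: z.array(
    z.object({
      id: z.string(),
      score: z.number().int().min(0).max(10),
      reasoning: z.string().optional(),
    }),
  ),
  overall_reasoning: z.string().optional(),
});

// Hypothetical helper: validate raw judge JSON and normalize each score to 0..1.
function parseRangeJudgeOutput(rawJson: string) {
  const parsed = rangeJudgeOutput.safeParse(JSON.parse(rawJson));
  if (!parsed.success) {
    throw new Error(`Judge output failed schema validation: ${parsed.error.message}`);
  }
  return parsed.data.checks.map((check) => ({
    id: check.id,
    normalizedScore: check.score / 10, // 0..10 -> 0..1
    reasoning: check.reasoning,
  }));
}
```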
72 changes: 72 additions & 0 deletions openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
@@ -0,0 +1,72 @@
## ADDED Requirements

### Requirement: Checklist rubric field name MUST be `expected_outcome`
The YAML schema SHALL accept checklist rubric objects using `expected_outcome` (replacing the legacy `description`).

#### Scenario: Checklist rubric uses expected_outcome
- **GIVEN** a YAML eval case with:
```yaml
rubrics:
- id: structure
expected_outcome: Has clear headings and organization
weight: 1.0
required_min_score: 10
```
- **WHEN** the YAML is parsed
- **THEN** schema validation succeeds

### Requirement: Rubric gating MUST support required_min_score
The YAML schema SHALL support `required_min_score` (0..10) on rubric criteria to enforce hard-gating.

#### Scenario: required_min_score gates rubric criteria
- **GIVEN** a YAML eval case with:
```yaml
rubrics:
- id: correctness
weight: 2.0
required_min_score: 10
expected_outcome: Must be fully correct.
```
- **WHEN** the YAML is parsed
- **THEN** schema validation succeeds

### Requirement: Per-criterion score_ranges rubrics MUST be supported for LLM judging
The YAML schema SHALL support configuring per-criterion `score_ranges` for `llm_judge` evaluators via the existing `rubrics` field.

#### Scenario: Configure per-criterion score_ranges
- **GIVEN** a YAML eval case with:
```yaml
evaluators:
- name: correctness
type: llm_judge
rubrics:
- id: correctness
weight: 1.0
required_min_score: 10
score_ranges:
- score_range: [0, 2]
expected_outcome: Factually incorrect.
- score_range: [3, 6]
expected_outcome: Mostly correct.
- score_range: [7, 9]
expected_outcome: Correct but missing minor details.
- score_range: [10, 10]
expected_outcome: Fully correct.
```
- **WHEN** the YAML is parsed
- **THEN** the evaluator configuration SHALL include the provided score ranges

#### Scenario: Reject overlapping score ranges
- **GIVEN** a YAML eval case with overlapping ranges
- **WHEN** the YAML is parsed
- **THEN** schema validation SHALL fail

#### Scenario: Reject incomplete 0..10 coverage
- **GIVEN** a YAML eval case where score ranges do not cover 0..10 inclusive
- **WHEN** the YAML is parsed
- **THEN** schema validation SHALL fail

#### Scenario: Reject empty expected_outcome
- **GIVEN** a YAML eval case where a range rubric entry has an empty `expected_outcome`
- **WHEN** the YAML is parsed
- **THEN** schema validation SHALL fail
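
Taken together, these scenarios suggest a rubric-entry schema along the following lines (a sketch assuming the same zod-style parsing shown in the rubric-evaluator spec delta; names are illustrative, not the actual AgentV schema):

```typescript
import { z } from 'zod';

// Illustrative sketch of a rubric criterion accepting both checklist and
// score-range shapes; range overlap/coverage rules would be enforced in a
// refinement or a dedicated loader-side validator.
const scoreRangeEntry = z.object({
  score_range: z.tuple([
    z.number().int().min(0).max(10),
    z.number().int().min(0).max(10),
  ]),
  expected_outcome: z.string().min(1), // reject empty expected_outcome
});

const rubricCriterion = z
  .object({
    id: z.string().min(1),
    weight: z.number().positive().optional(),
    expected_outcome: z.string().min(1).optional(),             // checklist-style
    score_ranges: z.array(scoreRangeEntry).min(1).optional(),   // banded 0..10 scoring
    required_min_score: z.number().int().min(0).max(10).optional(),
    required: z.boolean().optional(), // deprecated alias for required_min_score: 10
  })
  .refine(
    (r) => r.expected_outcome !== undefined || r.score_ranges !== undefined,
    { message: 'a rubric criterion needs expected_outcome or score_ranges' },
  );
```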