diff --git a/apps/cli/package.json b/apps/cli/package.json
index c689f8e4..3da5827e 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -14,10 +14,7 @@
   "bin": {
     "agentv": "./dist/cli.js"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "scripts": {
     "dev": "bun --watch src/index.ts",
     "build": "tsup && bun run copy-readme",
diff --git a/apps/cli/src/commands/generate/rubrics.ts b/apps/cli/src/commands/generate/rubrics.ts
index 3f2993f6..c2807195 100644
--- a/apps/cli/src/commands/generate/rubrics.ts
+++ b/apps/cli/src/commands/generate/rubrics.ts
@@ -160,9 +160,9 @@ export async function generateRubricsCommand(options: GenerateRubricsOptions): P
       caseNode.set(
         'rubrics',
         rubrics.map(
-          (r: { id: string; description: string; weight: number; required: boolean }) => ({
+          (r: { id: string; expected_outcome: string; weight: number; required: boolean }) => ({
             id: r.id,
-            description: r.description,
+            expected_outcome: r.expected_outcome,
             weight: r.weight,
             required: r.required,
           }),
diff --git a/examples/features/rubric/evals/dataset.yaml b/examples/features/rubric/evals/dataset.yaml
index b9b764f9..6b902412 100644
--- a/examples/features/rubric/evals/dataset.yaml
+++ b/examples/features/rubric/evals/dataset.yaml
@@ -71,27 +71,27 @@ evalcases:
     # Detailed rubric objects with weights and required flags
     rubrics:
       - id: structure
-        description: Has clear headings and organization
+        expected_outcome: Has clear headings and organization
         weight: 1.0
         required: true

       - id: success-codes
-        description: Covers 2xx success codes with examples
+        expected_outcome: Covers 2xx success codes with examples
         weight: 2.0
         required: true

       - id: client-errors
-        description: Explains 4xx client error codes
+        expected_outcome: Explains 4xx client error codes
         weight: 2.0
         required: true

       - id: server-errors
-        description: Explains 5xx server error codes
+        expected_outcome: Explains 5xx server error codes
         weight: 1.5
         required: false

       - id: practical-examples
-        description: Includes practical use case examples
+        expected_outcome: Includes practical use case examples
         weight: 1.0
         required: false
@@ -177,3 +177,61 @@ evalcases:
     # No rubrics defined - will use default llm_judge evaluator
     # To generate rubrics: agentv generate rubrics evals/rubric-examples.yaml
+
+  # ==========================================
+  # Example 5: Multi-criteria score_ranges (PROPOSED)
+  # Demonstrates: multiple rubric ids, each with 0–10 score_ranges, then weighted aggregation.
+  # Real-world intent: grading a summary on both factual accuracy and brevity.
+  # Status: Proposed only; not supported until `add-rubric-score-ranges` is implemented.
+  # ==========================================
+  - id: summary-multi-criteria-score-ranges-proposed
+
+    expected_outcome: |-
+      Provide an accurate summary in under 50 words.
+
+    input_messages:
+      - role: user
+        content: |-
+          Summarize this article in under 50 words:
+
+          Climate change is accelerating faster than predicted. Recent studies show
+          Arctic ice melting at unprecedented rates, sea levels rising, and extreme
+          weather events becoming more frequent. Scientists urge immediate action to
+          reduce carbon emissions and transition to renewable energy sources.
+
+    expected_messages:
+      - role: assistant
+        content: |-
+          Climate change is accelerating with rapid Arctic ice loss, rising seas, and
+          more extreme weather. Scientists urge urgent emissions cuts and a transition
+          to renewable energy.
+
+    rubrics:
+      - id: factual_accuracy
+        weight: 2.0
+        required_min_score: 8
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Contains major factual errors or contradicts the article.
+          - score_range: [3, 5]
+            expected_outcome: Mostly on-topic but includes at least one clear factual error or misstates a key claim.
+          - score_range: [6, 7]
+            expected_outcome: Generally accurate but misses an important point or slightly distorts emphasis.
+          - score_range: [8, 9]
+            expected_outcome: Accurate and covers the key points with only minor omissions.
+          - score_range: [10, 10]
+            expected_outcome: Fully accurate, captures all key points with no distortions.
+
+      - id: brevity_and_clarity
+        weight: 1.0
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Exceeds 50 words or is hard to understand.
+          - score_range: [3, 5]
+            expected_outcome: Under 50 words but unclear, repetitive, or poorly structured.
+          - score_range: [6, 7]
+            expected_outcome: Under 50 words and mostly clear, but could be more concise or better phrased.
+          - score_range: [8, 9]
+            expected_outcome: Under 50 words, clear and concise.
+          - score_range: [10, 10]
+            expected_outcome: Under 50 words, exceptionally clear, concise, and well phrased.
diff --git a/openspec/changes/add-rubric-score-ranges/design.md b/openspec/changes/add-rubric-score-ranges/design.md
new file mode 100644
index 00000000..5210937b
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/design.md
@@ -0,0 +1,82 @@
+## Context
+AgentV currently supports rubric-based evaluation by converting `rubrics` into `llm_judge` checklist items. The judge returns per-item `satisfied: boolean` checks and the runtime computes a weighted fraction score in 0..1.
+
+External best practice (DeepEval/Confident AI) adds an additional pattern: **score-range rubrics**, where the judge chooses an integer score in 0..10 constrained by explicit ranges with concrete expected outcomes, then the framework normalizes to 0..1.
+
+## Decision
+Evolve to a **single rubric system** that supports both "DeepEval-style" banded scoring and multi-criterion weighted scoring by introducing **per-criterion score ranges**.
+
+Each rubric criterion keeps an `id` (and optional `weight`), but can optionally include `score_ranges` that define non-overlapping 0–10 bands with concrete expected outcomes. The judge returns an integer score 0..10 per criterion; the runtime normalizes each to 0..1 and aggregates deterministically.
+
+This change also includes a **breaking rename** for checklist rubrics: `description` → `expected_outcome`.
+
+The existing `required: boolean` is replaced (in the proposed primary shape) by `required_min_score: int` gating. `required` remains accepted as a deprecated alias during migration.
+
+### Proposed YAML Shape
+```yaml
+evaluators:
+  - name: correctness
+    type: llm_judge
+    rubrics:
+      - id: correctness
+        weight: 1.0
+        required_min_score: 10
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Factually incorrect.
+          - score_range: [3, 6]
+            expected_outcome: Mostly correct but includes notable errors or omissions.
+          - score_range: [7, 9]
+            expected_outcome: Correct with minor missing details.
+          - score_range: [10, 10]
+            expected_outcome: Fully correct and complete.
+```
+
+### Output Contract
+- Judge returns a **per-criterion** `score` as an integer in `0..10` for each rubric `id`.
+- AgentV normalizes each to `0..1` by dividing by 10 and aggregates deterministically (weighted average).
+- If any criterion has `required_min_score` and the returned score is below it, the verdict is forced to `fail`.
+- Preserve existing verdict thresholds (`>=0.8 pass`, `>=0.6 borderline`, else fail).
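+
+A minimal aggregation sketch of this contract (illustrative only — names such as `RubricCriterion`, `CriterionCheck`, and `aggregateCriterionScores` are assumptions, not the final API):
+
+```typescript
+type RubricCriterion = { id: string; weight: number; requiredMinScore?: number };
+type CriterionCheck = { id: string; score: number }; // integer 0..10 returned by the judge
+
+function aggregateCriterionScores(
+  criteria: readonly RubricCriterion[],
+  checks: readonly CriterionCheck[],
+): { score: number; forcedFail: boolean } {
+  let weightedSum = 0;
+  let totalWeight = 0;
+  let forcedFail = false;
+  for (const criterion of criteria) {
+    const raw = checks.find((c) => c.id === criterion.id)?.score ?? 0; // missing criteria score 0
+    if (criterion.requiredMinScore !== undefined && raw < criterion.requiredMinScore) {
+      forcedFail = true; // a gated criterion below its threshold forces the verdict to fail
+    }
+    weightedSum += (raw / 10) * criterion.weight; // normalize 0..10 -> 0..1
+    totalWeight += criterion.weight;
+  }
+  // The existing verdict thresholds (>=0.8 pass, >=0.6 borderline) apply to this score afterwards.
+  return { score: totalWeight > 0 ? weightedSum / totalWeight : 0, forcedFail };
+}
+```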
+
+## Validation Rules
+- Ranges are inclusive integer bounds.
+- Bounds must be within 0..10.
+- No overlap (within a given rubric criterion).
+- Prefer full coverage of 0..10 inclusive (strict coverage recommended for determinism).
+- Each range must have a non-empty `expected_outcome`.
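+
+A sketch of how these rules could be enforced (illustrative; the helper name `validateScoreRanges` and its placement under `packages/core/src/evaluation/validation/` are assumptions):
+
+```typescript
+type ScoreRange = { score_range: [number, number]; expected_outcome: string };
+
+function validateScoreRanges(ranges: readonly ScoreRange[]): string[] {
+  const errors: string[] = [];
+  const sorted = [...ranges].sort((a, b) => a.score_range[0] - b.score_range[0]);
+  let expectedStart = 0; // strict coverage: the first range must start at 0
+  for (const { score_range: [lo, hi], expected_outcome } of sorted) {
+    if (!Number.isInteger(lo) || !Number.isInteger(hi) || lo < 0 || hi > 10 || lo > hi) {
+      errors.push(`bounds: [${lo}, ${hi}] must satisfy 0 <= lo <= hi <= 10 with integer bounds`);
+    }
+    if (lo < expectedStart) errors.push(`overlap: [${lo}, ${hi}] overlaps the previous range`);
+    if (lo > expectedStart) errors.push(`coverage: gap before [${lo}, ${hi}]`);
+    if (expected_outcome.trim().length === 0) errors.push(`empty expected_outcome for [${lo}, ${hi}]`);
+    expectedStart = hi + 1;
+  }
+  if (expectedStart !== 11) errors.push('coverage: ranges must cover 0..10 inclusive');
+  return errors;
+}
+```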
+
+## Backwards Compatibility
+- Existing checklist rubrics remain supported during migration.
+- `required` is treated as a deprecated alias for `required_min_score: 10`.
+- New rubric criteria may include `score_ranges` for banded 0–10 scoring.
+
+### Migration
+- Replace checklist rubric object field `description:` with `expected_outcome:`.
+
+## Open Questions
+- Should AgentV allow gaps (e.g., reserve 0 for “unscorable”), or strictly require full coverage? (Proposal defaults to strict full coverage to match the cited best practice.)
+- Should mixed `rubrics` (checklist + score-range) be allowed, and if so how to combine them? (Proposal: disallow mixing for simplicity and determinism.)
+
+## Deterministic Mapping to Checklist (Weighted-Average) Rubrics
+
+### Can score-range rubrics be deterministically mapped to the existing weighted-average system?
+Not in a semantics-preserving way.
+
+Holistic score-range rubrics define a *single ordinal grade* (an integer 0..10) with an expected outcome per interval.
+Checklist rubrics define *multiple independent criteria* with per-criterion weights and gating, and compute a weighted fraction.
+
+Because the score-range system does not provide per-criterion truth values (or even a breakdown of which expectations were met), there is no deterministic transformation from a range choice into a unique checklist satisfaction vector.
+Any mapping from range → checklist would require adding assumptions (e.g., “a 7 implies all requirements A/B/C are satisfied”), which is equivalent to inventing extra semantics not present in the input.
+
+### Can checklist rubrics be deterministically mapped to score-range rubrics?
+Only in a lossy, wrapper-style way.
+
+Given checklist results, AgentV can deterministically compute a normalized score $s \in [0,1]$ and then map it to a raw integer $r = \mathrm{round}(10s)$ (or $\lfloor 10s \rfloor$, etc.).
+But that does not recreate the score-range *rubric definition* (expected outcomes per bucket), and it does not provide the core value of range rubrics: constraining the judge with explicit outcome descriptions per range.
+
+### Conclusion
+The two rubric modes are not redundant:
+- Checklist rubrics are best for requirement-driven grading (decomposable criteria, required flags, deterministic scoring).
+- Score-range rubrics are best for holistic grading where the evaluator needs explicit outcome descriptions per band.
+
+The most practical unification is at the interface level: treat both as rubric-driven evaluators that produce a normalized $[0,1]$ score and a verdict, but keep both scoring modes as first-class options rather than making one a wrapper for the other.
diff --git a/openspec/changes/add-rubric-score-ranges/proposal.md b/openspec/changes/add-rubric-score-ranges/proposal.md
new file mode 100644
index 00000000..e66ebd81
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/proposal.md
@@ -0,0 +1,57 @@
+# Change: Add 0–10 score-range rubrics for LLM judging
+
+## Why
+AgentV’s current rubric support in `llm_judge` is a **binary checklist** (each rubric item is `satisfied: true|false` and the score is computed as a weighted fraction). This is great for requirements-style grading, but it does **not** support the “confine the judge into explicit score ranges” pattern used by common LLM-evals tooling.
+
+Best-practice literature for LLM-as-a-judge rubric scoring (e.g., DeepEval/Confident AI) recommends:
+- A **0–10 integer scale** (more reliable than floats for LLMs)
+- Explicit **non-overlapping** `score_range` definitions
+- Clear **expected outcomes per range**, not vague labels
+- **Normalization to 0–1** for downstream aggregation
+
+Adding this as an **optional, backwards-compatible** scoring mode gives AgentV users a deterministic way to express custom metrics while keeping existing rubrics intact.
+
+## What Changes
+- Extend the existing `rubrics` concept to support **per-criterion score ranges** (analytic rubric scoring):
+  - Each rubric entry represents a criterion with an `id` and optional aggregation `weight`.
+  - Each criterion can include `score_ranges` (0–10 inclusive integer bands) with explicit `expected_outcome` text.
+  - The judge returns an integer score **0–10 per criterion**, which AgentV normalizes to **0–1** (divide by 10) and aggregates (weighted average).
+
+- Replace `required: boolean` with `required_min_score: int` (0–10) for gating.
+  - If a criterion has `required_min_score`, the overall verdict MUST be `fail` when the criterion score is below that threshold.
+
+- Add validation rules (for per-criterion score ranges):
+  - Ranges MUST be integers within **0..10**
+  - Ranges MUST NOT overlap within a criterion
+  - Ranges SHOULD cover **0..10** (inclusive) within a criterion (strict coverage is preferred for determinism)
+  - Each range MUST include a non-empty `expected_outcome`
+
+- Backwards compatibility:
+  - Existing checklist rubrics remain supported during migration.
+  - `required` is treated as a deprecated alias for `required_min_score: 10`.
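+
+A sketch of the intended alias handling at parse time (illustrative; the helper name `resolveGating` is hypothetical — the real change lands in the parsers listed under Impact):
+
+```typescript
+// Map the deprecated `required` flag onto the new gating field.
+function resolveGating(rubric: { required?: boolean; required_min_score?: number }): number | undefined {
+  if (rubric.required_min_score !== undefined) return rubric.required_min_score;
+  if (rubric.required === true) return 10; // `required: true` behaves as `required_min_score: 10`
+  return undefined; // not gated
+}
+```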
+
+## Breaking Changes
+- **BREAKING**: Rename checklist rubric field `description` → `expected_outcome`.
+  - YAML before:
+    - `rubrics: [{ id: "x", description: "...", weight: 1, required: true }]`
+  - YAML after:
+    - `rubrics: [{ id: "x", expected_outcome: "...", weight: 1, required: true }]`
+  - CLI `generate rubrics` output changes accordingly.
+
+- **BREAKING (proposed new primary shape)**: Prefer `required_min_score` over `required`.
+  - `required` remains accepted as a deprecated alias during migration.
+
+## Impact
+- Affected specs: `rubric-evaluator`, `yaml-schema`.
+- Affected code (expected):
+  - `packages/core/src/evaluation/types.ts` (new config/type)
+  - `packages/core/src/evaluation/yaml-parser.ts` (parsing inline config)
+  - `packages/core/src/evaluation/loaders/evaluator-parser.ts` (validation)
+  - `packages/core/src/evaluation/evaluators/llm-judge.ts` (prompt + scoring)
+  - `packages/core/src/evaluation/validation/*` (range validation helper)
+  - Tests under `packages/core/test/**`
+
+## Non-Goals
+- Do not replace checklist rubrics.
+- Do not change `EvaluationScore.score` away from 0–1.
+- Do not add new CLI UX beyond schema support (future enhancement could generate range rubrics).
diff --git a/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
new file mode 100644
index 00000000..5afc100b
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/specs/rubric-evaluator/spec.md
@@ -0,0 +1,48 @@
+## MODIFIED Requirements
+
+### Requirement: Static Rubric Evaluation MUST support checklist and score-range rubrics
+The evaluator SHALL support rubric-based grading using rubric criteria entries. Each criterion may be:
+
+1) **Checklist-style** (legacy): boolean checks per criterion using `expected_outcome` text.
+2) **Score-range per criterion** (new): each criterion contains `score_ranges` defining non-overlapping integer ranges over 0–10 inclusive, each with an explicit `expected_outcome`.
+
+When score-ranges are present for a criterion, the evaluator SHALL instruct the judge to output an **integer score 0..10 for that criterion** and then normalize it to 0..1 for aggregation.
+
+The evaluator SHALL support `required_min_score` gating: if a criterion specifies `required_min_score` and the returned score is below it, the overall verdict SHALL be `fail`.
+
+#### Scenario: Checklist rubrics continue to work
+- **GIVEN** an eval case with `rubrics` (id/expected_outcome/weight/required)
+- **WHEN** the rubric evaluator runs
+- **THEN** it SHALL grade using per-item boolean checks
+- **AND** the reported score SHALL be in 0..1
+
+#### Scenario: Range rubrics constrain scoring
+- **GIVEN** an eval case with `rubrics` where a criterion contains `score_ranges` entries and `expected_outcome` text
+- **WHEN** the rubric evaluator runs
+- **THEN** the judge SHALL be constrained to output an integer score in 0..10 for that criterion
+- **AND** the system SHALL normalize each criterion score to 0..1 by dividing by 10
+
+#### Scenario: Invalid range rubrics are rejected
+- **GIVEN** a rubric criterion whose `score_ranges` have overlapping ranges or missing coverage of 0..10
+- **WHEN** the eval suite is loaded
+- **THEN** validation SHALL fail
+- **AND** the error message SHALL indicate the violated rule (overlap, bounds, or coverage)
+
+### Requirement: Structured Grading MUST produce validated results
+The evaluator SHALL validate judge output against a schema appropriate to the configured mode.
+
+#### Scenario: Range rubric output schema
+- **GIVEN** a range-rubric configuration
+- **WHEN** the judge responds
+- **THEN** the evaluator SHALL accept a JSON object matching:
+```typescript
+z.object({
+  checks: z.array(z.object({
+    id: z.string(),
+    score: z.number().int().min(0).max(10),
+    reasoning: z.string().optional(),
+  })),
+  overall_reasoning: z.string().optional(),
+})
+```
+- **AND** AgentV SHALL normalize per-criterion `score / 10` into the standard 0..1 result and aggregate.
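+
+For illustration, a judge response that would pass this schema (values are hypothetical):
+
+```typescript
+const exampleJudgeOutput = {
+  checks: [{ id: 'correctness', score: 8, reasoning: 'Accurate with only minor omissions.' }],
+  overall_reasoning: 'The response is correct overall; one secondary detail is missing.',
+};
+// Normalized result for this criterion: 8 / 10 = 0.8.
+```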
diff --git a/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
new file mode 100644
index 00000000..eb085666
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/specs/yaml-schema/spec.md
@@ -0,0 +1,72 @@
+## ADDED Requirements
+
+### Requirement: Checklist rubric field name MUST be `expected_outcome`
+The YAML schema SHALL accept checklist rubric objects using `expected_outcome` (replacing the legacy `description`).
+
+#### Scenario: Checklist rubric uses expected_outcome
+- **GIVEN** a YAML eval case with:
+```yaml
+rubrics:
+  - id: structure
+    expected_outcome: Has clear headings and organization
+    weight: 1.0
+    required_min_score: 10
+```
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL succeed
+
+### Requirement: Rubric gating MUST support required_min_score
+The YAML schema SHALL support `required_min_score` (0..10) on rubric criteria to enforce hard-gating.
+
+#### Scenario: required_min_score gates rubric criteria
+- **GIVEN** a YAML eval case with:
+```yaml
+rubrics:
+  - id: correctness
+    weight: 2.0
+    required_min_score: 10
+    expected_outcome: Must be fully correct.
+```
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL succeed
+
+### Requirement: Per-criterion score_ranges rubrics MUST be supported for LLM judging
+The YAML schema SHALL support configuring per-criterion `score_ranges` for `llm_judge` evaluators via the existing `rubrics` field.
+
+#### Scenario: Configure score_ranges
+- **GIVEN** a YAML eval case with:
+```yaml
+evaluators:
+  - name: correctness
+    type: llm_judge
+    rubrics:
+      - id: correctness
+        weight: 1.0
+        required_min_score: 10
+        score_ranges:
+          - score_range: [0, 2]
+            expected_outcome: Factually incorrect.
+          - score_range: [3, 6]
+            expected_outcome: Mostly correct.
+          - score_range: [7, 9]
+            expected_outcome: Correct but missing minor details.
+          - score_range: [10, 10]
+            expected_outcome: Fully correct.
+```
+- **WHEN** the YAML is parsed
+- **THEN** the evaluator configuration SHALL include the provided score ranges
+
+#### Scenario: Reject overlapping score ranges
+- **GIVEN** a YAML eval case with overlapping ranges
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
+
+#### Scenario: Reject incomplete 0..10 coverage
+- **GIVEN** a YAML eval case where score ranges do not cover 0..10 inclusive
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
+
+#### Scenario: Reject empty expected_outcome
+- **GIVEN** a YAML eval case where a range rubric entry has an empty `expected_outcome`
+- **WHEN** the YAML is parsed
+- **THEN** schema validation SHALL fail
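+
+A minimal zod sketch of the extended rubric criterion this schema implies (illustrative; the actual schema names in `packages/core` may differ):
+
+```typescript
+import { z } from 'zod';
+
+const scoreRangeSchema = z.object({
+  score_range: z.tuple([z.number().int().min(0).max(10), z.number().int().min(0).max(10)]),
+  expected_outcome: z.string().min(1),
+});
+
+const rubricCriterionSchema = z.object({
+  id: z.string(),
+  weight: z.number().default(1.0),
+  required_min_score: z.number().int().min(0).max(10).optional(),
+  required: z.boolean().optional(), // deprecated alias for required_min_score: 10
+  expected_outcome: z.string().optional(), // checklist-style criterion
+  score_ranges: z.array(scoreRangeSchema).optional(), // banded 0-10 criterion
+});
+```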
diff --git a/openspec/changes/add-rubric-score-ranges/tasks.md b/openspec/changes/add-rubric-score-ranges/tasks.md
new file mode 100644
index 00000000..258fe043
--- /dev/null
+++ b/openspec/changes/add-rubric-score-ranges/tasks.md
@@ -0,0 +1,28 @@
+## 1. Schema & Types
+- [ ] 1.1 Add `ScoreRange` and `RubricCriterion` types (per-criterion 0–10 integer ranges) to core evaluation types
+- [ ] 1.2 Extend rubric criteria to accept `score_ranges` and `required_min_score` (deprecate `required`)
+
+## 2. Validation
+- [ ] 2.1 Validate ranges are integers within 0..10 and start <= end
+- [ ] 2.2 Validate non-overlap within each criterion's ranges
+- [ ] 2.3 Validate (preferred) full coverage of 0..10 inclusive per criterion
+- [ ] 2.4 Validate each range has non-empty `expected_outcome`
+- [ ] 2.5 Validate `required_min_score` is an integer within 0..10
+
+## 3. LLM Judge Integration
+- [ ] 3.1 Add prompt template for per-criterion score-range scoring that requests integer `score` 0..10 per rubric `id`
+- [ ] 3.2 Normalize criterion scores to 0..1 (divide by 10) and aggregate deterministically (weighted average)
+- [ ] 3.3 Apply `required_min_score` gating (force fail when any gated criterion is below threshold)
+- [ ] 3.4 Store raw 0–10 scores in `details` (or `evaluatorRawRequest/Response`) for debugging
+
+## 4. YAML Support
+- [ ] 4.1 Support `score_ranges` nested under each rubric criterion in YAML
+- [ ] 4.2 Support `required_min_score` in YAML and treat legacy `required: true` as `required_min_score: 10`
+
+## 5. Tests
+- [ ] 5.1 Unit tests for validation (overlap, gaps, bounds)
+- [ ] 5.2 Unit/integration tests for llm_judge parsing + normalization + gating
+
+## 6. Docs
+- [ ] 6.1 Update rubric-evaluator skill/reference docs to include range rubrics
+- [ ] 6.2 Add examples of good/bad range definitions
diff --git a/package.json b/package.json
index 8a55e354..7f2d0cb2 100644
--- a/package.json
+++ b/package.json
@@ -4,10 +4,7 @@
   "private": true,
   "description": "AgentV monorepo workspace",
   "packageManager": "bun@1.3.3",
-  "workspaces": [
-    "apps/*",
-    "packages/*"
-  ],
+  "workspaces": ["apps/*", "packages/*"],
   "scripts": {
     "build": "bun --filter @agentv/core build && bun --filter @agentv/eval build && bun --filter agentv build",
     "verify": "bun run build && bun run typecheck && bun run lint && bun run test",
diff --git a/packages/core/package.json b/packages/core/package.json
index 5c4ff566..5b4c9c76 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -36,10 +36,7 @@
     "test:watch": "bun test --watch",
     "diagnostics:azure": "bun src/diagnostics/azure-deployment-diag.ts"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "@ai-sdk/anthropic": "^2.0.53",
     "@ai-sdk/azure": "^2.0.78",
diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts
index ab8a5dd5..fb643e8b 100644
--- a/packages/core/src/evaluation/evaluators/llm-judge.ts
+++ b/packages/core/src/evaluation/evaluators/llm-judge.ts
@@ -231,7 +231,7 @@ export class LlmJudgeEvaluator implements Evaluator {
     for (const rubric of rubrics) {
       const requiredLabel = rubric.required ? ' (REQUIRED)' : '';
       const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : '';
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.expected_outcome}`);
     }

     parts.push('', 'For each rubric, determine if it is satisfied and provide brief reasoning.');
@@ -353,9 +353,9 @@ function calculateRubricScore(
     if (check.satisfied) {
       earnedWeight += rubric.weight;
-      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      hits.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
     } else {
-      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      misses.push(`[${rubric.id}] ${rubric.expected_outcome}: ${check.reasoning}`);
       if (rubric.required) {
         failedRequired = true;
       }
diff --git a/packages/core/src/evaluation/generators/rubric-generator.ts b/packages/core/src/evaluation/generators/rubric-generator.ts
index 54ee01c3..44cb54e2 100644
--- a/packages/core/src/evaluation/generators/rubric-generator.ts
+++ b/packages/core/src/evaluation/generators/rubric-generator.ts
@@ -6,7 +6,7 @@ import type { RubricItem } from '../types.js';

 const rubricItemSchema = z.object({
   id: z.string().describe('Short identifier for this rubric (e.g., clarity, completeness)'),
-  description: z.string().describe('What this rubric checks for'),
+  expected_outcome: z.string().describe('Concrete expected outcome for this rubric item'),
   weight: z.number().default(1.0).describe('Relative importance (default 1.0)'),
   required: z.boolean().default(true).describe('Whether this is a mandatory requirement'),
 });
@@ -43,7 +43,7 @@ You must return a valid JSON object matching this schema:
   "rubrics": [
     {
       "id": "string (short identifier)",
-      "description": "string (what to check)",
+      "expected_outcome": "string (concrete expected outcome for this rubric item)",
       "weight": number (default 1.0),
       "required": boolean (default true)
     }
@@ -86,7 +86,7 @@ function buildPrompt(expectedOutcome: string, question?: string, referenceAnswer
     'Each rubric should:',
     '- Be specific and testable',
     '- Have a short, descriptive ID',
-    '- Include a clear description of what to check',
+    '- Include a clear expected outcome statement (what a good answer must demonstrate for this rubric)',
     '- Indicate if it is required (mandatory) or optional',
    '- Have an appropriate weight (default 1.0, use higher values for more important aspects)',
     '',
diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts
index 97de79a2..42c69a99 100644
--- a/packages/core/src/evaluation/loaders/evaluator-parser.ts
+++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts
@@ -549,11 +549,11 @@ export async function parseEvaluators(
           .filter((r): r is JsonObject => isJsonObject(r))
           .map((rubric, index) => ({
             id: asString(rubric.id) ?? `rubric-${index + 1}`,
-            description: asString(rubric.description) ?? '',
+            expected_outcome: asString(rubric.expected_outcome) ?? '',
             weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
             required: typeof rubric.required === 'boolean' ? rubric.required : true,
           }))
-          .filter((r) => r.description.length > 0)
+          .filter((r) => r.expected_outcome.length > 0)
       : undefined;

   if (typeValue === 'rubric') {
diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts
index c14826fc..4e432e4b 100644
--- a/packages/core/src/evaluation/yaml-parser.ts
+++ b/packages/core/src/evaluation/yaml-parser.ts
@@ -222,19 +222,19 @@ export async function loadEvalCases(
         if (typeof rubric === 'string') {
           return {
             id: `rubric-${index + 1}`,
-            description: rubric,
+            expected_outcome: rubric,
             weight: 1.0,
             required: true,
           };
         }
         return {
           id: asString(rubric.id) ?? `rubric-${index + 1}`,
-          description: asString(rubric.description) ?? '',
+          expected_outcome: asString(rubric.expected_outcome) ?? '',
           weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0,
           required: typeof rubric.required === 'boolean' ? rubric.required : true,
         };
       })
-      .filter((r) => r.description.length > 0);
+      .filter((r) => r.expected_outcome.length > 0);

     if (rubricItems.length > 0) {
       const rubricEvaluator: import('./types.js').LlmJudgeEvaluatorConfig = {
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index c7f571f3..d5e3c3b1 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -202,7 +202,7 @@ export type LlmJudgeEvaluatorConfig = {

 export type RubricItem = {
   readonly id: string;
-  readonly description: string;
+  readonly expected_outcome: string;
   readonly weight: number;
   readonly required: boolean;
 };
diff --git a/packages/eval/package.json b/packages/eval/package.json
index 53075bb9..87b3d186 100644
--- a/packages/eval/package.json
+++ b/packages/eval/package.json
@@ -29,10 +29,7 @@
     "fix": "biome check --write .",
     "test": "bun test"
   },
-  "files": [
-    "dist",
-    "README.md"
-  ],
+  "files": ["dist", "README.md"],
   "dependencies": {
     "zod": "^3.23.8"
   }