diff --git a/.claude/skills/agentv-eval-builder/SKILL.md b/.claude/skills/agentv-eval-builder/SKILL.md index 2f06a526..b793b564 100644 --- a/.claude/skills/agentv-eval-builder/SKILL.md +++ b/.claude/skills/agentv-eval-builder/SKILL.md @@ -7,7 +7,7 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age ## Schema Reference - Schema: `references/eval-schema.json` (JSON Schema for validation and tooling) -- Format: YAML with structured content arrays +- Format: YAML or JSONL (see below) - Examples: `references/example-evals.md` ## Feature Reference @@ -30,6 +30,28 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age - Attachments (type: `file`) should default to the `user` role - File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root) +## JSONL Format + +For large-scale evaluations, use JSONL (one eval case per line) instead of YAML: + +**dataset.jsonl:** +```jsonl +{"id": "test-1", "expected_outcome": "Correct answer", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Clear explanation", "input_messages": [{"role": "user", "content": [{"type": "text", "value": "Review this"}, {"type": "file", "value": "./code.py"}]}]} +``` + +**dataset.yaml (optional sidecar for defaults):** +```yaml +description: My dataset +dataset: my-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Git-friendly diffs, streaming-compatible, easy programmatic generation. +Per-case fields override sidecar defaults. See `examples/features/basic-jsonl/` for complete example. + ## Custom Evaluators Configure multiple evaluators per eval case via `execution.evaluators` array. diff --git a/README.md b/README.md index 833f87d0..962fef7b 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,27 @@ See [AGENTS.md](AGENTS.md) for development guidelines and design principles. ## Core Concepts -**Evaluation files** (`.yaml`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. +**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. + +### JSONL Format Support + +For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alternative to YAML: + +```jsonl +{"id": "test-1", "expected_outcome": "Calculates correctly", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Provides explanation", "input_messages": [{"role": "user", "content": "Explain variables"}]} +``` + +Optional sidecar YAML metadata file (`dataset.yaml` alongside `dataset.jsonl`): +```yaml +description: Math evaluation dataset +dataset: math-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face). ## Usage diff --git a/apps/cli/README.md b/apps/cli/README.md index 833f87d0..962fef7b 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -101,7 +101,27 @@ See [AGENTS.md](AGENTS.md) for development guidelines and design principles. ## Core Concepts -**Evaluation files** (`.yaml`) define test cases with expected outcomes. 
**Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. +**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. + +### JSONL Format Support + +For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alternative to YAML: + +```jsonl +{"id": "test-1", "expected_outcome": "Calculates correctly", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Provides explanation", "input_messages": [{"role": "user", "content": "Explain variables"}]} +``` + +Optional sidecar YAML metadata file (`dataset.yaml` alongside `dataset.jsonl`): +```yaml +description: Math evaluation dataset +dataset: math-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face). ## Usage diff --git a/examples/features/basic-jsonl/evals/dataset.jsonl b/examples/features/basic-jsonl/evals/dataset.jsonl new file mode 100644 index 00000000..f29ea364 --- /dev/null +++ b/examples/features/basic-jsonl/evals/dataset.jsonl @@ -0,0 +1,4 @@ +{"id": "code-review-javascript", "expected_outcome": "Assistant provides helpful code analysis and mentions SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT", "input_messages": [{"role": "system", "content": "You are an expert software developer who provides clear, concise code reviews."}, {"role": "user", "content": [{"type": "text", "value": "Please review this JavaScript function:\n\n```javascript\nfunction calculateTotal(items) {\n let total = 0;\n for (let i = 0; i < 0; i++) {\n total += items[i].price * items[i].quantity;\n }\n return total;\n}\n```"}, {"type": "file", "value": "../basic/evals/javascript.instructions.md"}]}], "expected_messages": [{"role": "assistant", "content": "The function has a critical bug in the loop condition. Here's my analysis (SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT):\n\n**Critical Issue:**\n- Loop condition `i < 0` means the loop never executes (should be `i < items.length`)\n\n**Suggestions:**\n- Fix the loop: `for (let i = 0; i < items.length; i++)`\n- Consider using `reduce()` for a more functional approach\n- Add input validation for edge cases"}]} +{"id": "code-gen-python", "conversation_id": "python-code-generation", "expected_outcome": "AI generates correct Python function with proper error handling, type hints, and mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON", "input_messages": [{"role": "system", "content": "You are a code generator that follows specifications exactly."}, {"role": "user", "content": [{"type": "text", "value": "Create a Python function that:\n1. Takes a list of integers\n2. Returns the second largest number\n3. Handles edge cases (empty list, single item, duplicates)\n4. Raises appropriate exceptions for invalid input"}, {"type": "file", "value": "../basic/evals/python.instructions.md"}]}], "execution": {"target": "azure_base"}} +{"id": "feature-proposal-brainstorm", "expected_outcome": "Assistant generates 3-5 creative feature ideas for a mobile fitness app. Each idea should:\n1. Address a specific user pain point\n2. Be technically feasible with current mobile technology\n3. Include a brief value proposition (1-2 sentences)\n4. 
Be distinct from the others (no duplicate concepts)", "input_messages": [{"role": "system", "content": "You are a product strategist specializing in mobile health and fitness applications."}, {"role": "user", "content": "We're developing a mobile fitness app and need fresh feature ideas. Please brainstorm 3-5 innovative features."}]} +{"id": "multiturn-debug-session", "expected_outcome": "Assistant conducts a multi-turn debugging session, correctly diagnosing the bug and proposing a clear fix.", "input_messages": [{"role": "system", "content": "You are an expert debugging assistant."}, {"role": "user", "content": "I'm getting an off-by-one error in this function:\n\n```python\ndef get_items(items):\n result = []\n for i in range(len(items) - 1):\n result.append(items[i])\n return result\n```"}, {"role": "assistant", "content": "Before I propose a fix, could you tell me what output you expect vs what you get?"}, {"role": "user", "content": "For `[1, 2, 3, 4]` I expect `[1, 2, 3, 4]`, but I get `[1, 2, 3]`."}], "expected_messages": [{"role": "assistant", "content": "You have an off-by-one error. Use `range(len(items))` or iterate directly: `for item in items:`"}]} diff --git a/examples/features/basic-jsonl/evals/dataset.yaml b/examples/features/basic-jsonl/evals/dataset.yaml new file mode 100644 index 00000000..73db3828 --- /dev/null +++ b/examples/features/basic-jsonl/evals/dataset.yaml @@ -0,0 +1,8 @@ +# Sidecar metadata for dataset.jsonl +# Provides default values for all eval cases + +description: JSONL version of the basic example - demonstrates file references, multi-turn, and per-case overrides +dataset: basic-jsonl +execution: + target: default +evaluator: llm_judge diff --git a/openspec/changes/add-jsonl-dataset-format/design.md b/openspec/changes/add-jsonl-dataset-format/design.md new file mode 100644 index 00000000..c2c186d2 --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/design.md @@ -0,0 +1,322 @@ +# Design: JSONL Dataset Format + +## Architecture Overview + +### Current State + +``` +User → agentv CLI → loadEvalCases() → yaml-parser.ts → parse(YAML) → EvalCase[] +``` + +### New State + +``` +User → agentv CLI → loadEvalCases() → Format Detector + ├→ YAML Parser (existing) + └→ JSONL Parser (new) + ├→ Parse JSONL lines + ├→ Load sidecar metadata + └→ Merge defaults → EvalCase[] +``` + +## Key Design Decisions + +### 1. Parser Architecture + +**Decision**: Create separate `jsonl-parser.ts` module alongside `yaml-parser.ts` + +**Rationale**: +- Separation of concerns (YAML vs JSONL logic) +- Easier to test independently +- Maintains clarity in codebase +- Allows future format additions without cluttering one file + +**Alternative considered**: Extend existing `yaml-parser.ts` +- **Rejected**: Would mix two different parsing strategies (document vs line-based) + +### 2. Format Detection Strategy + +**Decision**: Use file extension (`.jsonl` vs `.yaml`/`.yml`) + +**Rationale**: +- Simple and explicit +- Industry standard approach +- No magic or heuristics required +- Clear user intent + +**Implementation**: +```typescript +function detectFormat(filePath: string): 'yaml' | 'jsonl' { + const ext = path.extname(filePath).toLowerCase(); + if (ext === '.jsonl') return 'jsonl'; + if (ext === '.yaml' || ext === '.yml') return 'yaml'; + throw new Error(`Unsupported file format: ${ext}`); +} +``` + +### 3. 
Sidecar Metadata Pattern + +**Decision**: Optional companion YAML file with same base name + +**Example**: +``` +evals/ + dataset.jsonl # Data + dataset.yaml # Metadata (optional) +``` + +**Rationale**: +- Follows industry standard (Hugging Face, Vertex AI) +- Maintains JSONL purity (every line is data) +- Avoids repetition of defaults +- Metadata remains human-readable +- Clear separation of config vs data + +**Loading logic**: +1. Check for `.yaml` file +2. If found, parse metadata fields +3. If not found, use sensible defaults: + - `dataset`: basename of JSONL file + - `execution.target`: "default" + - `evaluator`: "llm_judge" + - `description`: empty + +### 4. Defaults & Override Precedence + +**Decision**: Sidecar provides defaults, per-line fields override + +**Precedence order** (highest to lowest): +1. Per-line field (e.g., `{"execution": {"target": "openai"}}`) +2. Sidecar YAML field +3. Hard-coded defaults + +**Example**: +```yaml +# dataset.yaml +execution: + target: azure_base +evaluator: llm_judge +``` + +```jsonl +{"id": "test-1", "input_messages": [...]} # Uses azure_base, llm_judge +{"id": "test-2", "input_messages": [...], "execution": {"target": "openai"}} # Uses openai, llm_judge +{"id": "test-3", "input_messages": [...], "evaluators": [{"type": "rubric"}]} # Uses azure_base, rubric +``` + +### 5. Line Parsing Strategy + +**Decision**: Strict line-by-line parsing with error recovery + +**Approach**: +```typescript +async function parseJsonlFile(filePath: string): Promise { + const content = await readFile(filePath, 'utf8'); + const lines = content.split('\n'); + const cases: RawEvalCase[] = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line === '') continue; // Skip empty lines + + try { + const parsed = JSON.parse(line); + if (!isJsonObject(parsed)) { + throw new Error('Expected JSON object'); + } + cases.push(parsed as RawEvalCase); + } catch (error) { + throw new Error( + `Line ${i + 1}: Invalid JSON - ${(error as Error).message}` + ); + } + } + + return cases; +} +``` + +**Error handling**: +- Report line number for failures +- Stop on first error (no partial loads) +- Clear error messages for common issues + +**Alternative considered**: Continue parsing after errors +- **Rejected**: Could lead to incomplete/inconsistent test runs + +### 6. Schema Compatibility + +**Decision**: Reuse existing `EvalCase` TypeScript type + +**Rationale**: +- Zero changes to downstream code +- Same validation rules +- Same evaluator logic +- JSONL is just a different serialization format + +**Field mapping**: +```typescript +// JSONL line +{ + "id": "test-1", + "expected_outcome": "Goal", + "input_messages": [...], + "expected_messages": [...], + "execution": {...}, + "evaluators": [...], + "rubrics": [...] +} + +// Maps directly to EvalCase type +type EvalCase = { + id: string; + conversationId?: string; + expectedOutcome: string; + inputMessages: TestMessage[]; + expectedMessages: TestMessage[]; + execution?: {...}; + evaluators?: [...]; + // ... rest of fields +} +``` + +### 7. 
File Reference Resolution + +**Decision**: Resolve paths relative to JSONL file location (same as YAML) + +**Example**: +``` +evals/ + subfolder/ + test.jsonl + attachments/ + code.py +``` + +```jsonl +{"id": "test", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "./attachments/code.py"}]}]} +``` + +**Resolution**: +- `./attachments/code.py` → `evals/subfolder/attachments/code.py` +- Same `searchRoots` logic as YAML parser +- Same guideline pattern matching + +### 8. Streaming vs Batch Loading + +**Decision**: Load all cases into memory first (like YAML), defer streaming to future + +**Rationale**: +- Maintains consistency with YAML behavior +- Simpler initial implementation +- Most datasets fit in memory +- Streaming can be added later without breaking changes + +**Future enhancement path**: +```typescript +// Future: Streaming API (non-breaking addition) +async function* streamEvalCases(filePath: string) { + // Yield cases one at a time +} +``` + +## Error Handling Strategy + +### Parse Errors +``` +Error: Failed to parse JSONL file: evals/test.jsonl + Line 42: Unexpected token } in JSON at position 23 + + Hint: Each line must be a complete JSON object +``` + +### Missing Required Fields +``` +Error: Invalid eval case at line 10 in evals/test.jsonl + Missing required field: 'expected_outcome' + + Required fields: id, expected_outcome, input_messages +``` + +### Invalid Field Types +``` +Error: Invalid eval case at line 5 in evals/test.jsonl + Field 'input_messages' must be an array, got string +``` + +### Sidecar Not Found (Warning, not error) +``` +Warning: Sidecar metadata file not found: evals/dataset.yaml + Using defaults: target=default, evaluator=llm_judge +``` + +## Testing Strategy + +### Unit Tests +- Parse valid JSONL (single line, multiple lines) +- Handle empty lines and whitespace +- Error on malformed JSON +- Error on missing required fields +- Load sidecar metadata +- Merge defaults correctly +- Override precedence + +### Integration Tests +- End-to-end eval run with JSONL +- File references resolve correctly +- Multiple evaluators work +- Per-case execution overrides +- Trace capture + +### Regression Tests +- YAML parsing unchanged +- Backward compatibility +- Mixed YAML + JSONL in repo + +## Performance Considerations + +### Memory +- Load entire JSONL file into string (same as YAML) +- Parse line-by-line (better than YAML's full parse) +- Each case processed independently + +### Speed +- JSON.parse() is typically faster than YAML parsing +- Line-by-line allows early error detection +- No significant performance concerns expected + +### File Size +- JSONL more compact than YAML (no indentation) +- Typical eval case: ~200-500 bytes per line +- 1000 cases ≈ 200-500 KB (negligible) + +## Migration Path + +### From YAML to JSONL + +**Option 1**: Manual conversion (for small datasets) +```bash +# Convert evalcases array to JSONL +cat dataset.yaml | yq '.evalcases[]' -o json > dataset.jsonl + +# Extract metadata to sidecar +cat dataset.yaml | yq 'del(.evalcases)' > dataset-meta.yaml +``` + +**Option 2**: Keep YAML (no migration needed) +- YAML continues to work +- No forced migration +- Users choose format per dataset + +## Future Enhancements (Out of Scope) + +1. **Streaming execution**: Process cases without loading all into memory +2. **JSONL export**: Convert YAML → JSONL +3. **Compressed JSONL**: Support `.jsonl.gz` files +4. **JSON schema validation**: Formal JSON schema for JSONL format +5. 
**Multi-file datasets**: Split large datasets across multiple JSONL files +6. **Incremental updates**: Append new cases without re-running all + +## Open Issues + +None. All design decisions finalized. diff --git a/openspec/changes/add-jsonl-dataset-format/proposal.md b/openspec/changes/add-jsonl-dataset-format/proposal.md new file mode 100644 index 00000000..52e8704f --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/proposal.md @@ -0,0 +1,235 @@ +# Proposal: Add JSONL Dataset Format Support + +## Summary + +Add support for JSONL (JSON Lines) format as an alternative to YAML for defining evaluation datasets, following industry standards observed in DeepEval, LangWatch, and other ML/AI frameworks. + +## Why + +JSONL support enables large-scale evaluation workflows that are currently impractical with YAML: + +1. **Streaming & Memory Efficiency**: JSONL allows line-by-line processing without loading entire datasets into memory, critical for datasets with thousands of test cases +2. **Git Workflow Improvements**: Line-based diffs clearly show which specific test cases changed, unlike nested YAML diffs +3. **Programmatic Generation**: Scripts can easily append new test cases to JSONL files without parsing/reformatting YAML +4. **Industry Alignment**: Follows established patterns from DeepEval, LangWatch, Hugging Face, and OpenAI fine-tuning datasets +5. **Tool Compatibility**: Standard JSONL tools (`jq`, `grep`, streaming parsers) work with AgentV datasets + +This addresses the "Align with Industry Standards" design principle from AGENTS.md and supports AgentV's goal of robust, large-scale AI agent evaluation. + +## Motivation + +### Current State +AgentV currently uses YAML exclusively for eval datasets. While YAML is human-readable and suitable for hand-authored test cases, it has limitations for large-scale evaluation: + +1. **Memory overhead**: Entire file must be parsed into memory +2. **Not streaming-friendly**: Cannot process eval cases incrementally +3. **Poor Git diffs**: Nested YAML changes produce unclear diffs +4. **Append-unfriendly**: Adding test cases requires careful YAML formatting + +### Industry Research + +Research of major ML/AI frameworks shows strong adoption of JSONL for datasets: + +- **DeepEval**: Explicit JSONL support with `save_as(file_type='jsonl')` +- **LangWatch**: Full JSONL support in UI and backend parsing +- **Hugging Face**: Pure JSONL data files with sidecar README.md metadata +- **OpenAI**: Pure JSONL for fine-tuning datasets with API-managed metadata + +**Key finding**: 100% of frameworks use **pure JSONL** (data only) with **separate metadata storage** (sidecar files or API-managed). Zero frameworks use first-line metadata approach. + +### Benefits of JSONL Support + +1. **Streaming**: Process eval cases line-by-line without loading entire file +2. **Memory efficiency**: Critical for datasets with hundreds/thousands of cases +3. **Git-friendly**: Line-based diffs clearly show which test cases changed +4. **Append-friendly**: Add cases with simple file append operations +5. **Tool compatibility**: Works with standard tools like `jq`, `grep`, streaming parsers +6. 
**Industry standard**: Aligns with established ML/AI framework patterns + +### Design Decision: Sidecar Metadata + +Following industry standard (Hugging Face, Vertex AI), metadata will be stored in a separate YAML file: + +``` +evals/ + dataset.yaml # Metadata: description, defaults + dataset.jsonl # Pure eval cases (one per line) +``` + +This approach: +- Maintains JSONL purity (every line is data) +- Avoids repetition of defaults across thousands of lines +- Keeps metadata human-readable +- Supports dataset-level configuration (description, target, evaluator) + +## Proposed Changes + +### 1. JSONL File Format + +**Pure data** - one eval case per line: + +```jsonl +{"id": "test-1", "expected_outcome": "Description", "input_messages": [{"role": "user", "content": "Query"}], "expected_messages": [{"role": "assistant", "content": "Response"}]} +{"id": "test-2", "expected_outcome": "Another test", "input_messages": [{"role": "user", "content": "Query 2"}]} +{"id": "test-3", "expected_outcome": "Override example", "input_messages": [...], "execution": {"target": "specific_target"}} +``` + +**Schema per line**: +- Required: `id`, `expected_outcome`, `input_messages` +- Optional: `conversation_id`, `expected_messages`, `execution`, `evaluators`, `rubrics` +- Same field structure as YAML `evalcases` array entries + +### 2. Sidecar YAML for Metadata + +**Optional companion file** with same base name: + +```yaml +# dataset.yaml (metadata only) +description: Cross-provider evaluation dataset +dataset: multi-target-test +execution: + target: azure_base # Default for all cases +evaluator: llm_judge # Default evaluator +``` + +### 3. Resolution Strategy + +1. **JSONL detection**: File extension `.jsonl` triggers JSONL parser +2. **Metadata loading**: Look for `.yaml` sidecar + - `dataset.jsonl` → check for `dataset.yaml` + - If not found, use sensible defaults +3. **Defaults + overrides**: Sidecar provides defaults, per-line fields override +4. **Backward compatibility**: YAML-only files work unchanged + +### 4. Implementation Scope + +**In scope**: +- JSONL parser for eval cases +- Sidecar YAML metadata loading +- File format detection (`.jsonl` extension) +- Same validation as YAML cases +- Same file reference resolution (relative paths) + +**Out of scope** (future enhancements): +- JSONL for config.yaml or targets.yaml +- Streaming execution (load all cases first, like YAML) +- Mixed formats in single file +- JSONL generation/export tools + +## User Impact + +### Breaking Changes +None. This is purely additive. + +### Migration Path +No migration required. YAML files continue to work unchanged. + +### New Capabilities + +1. **Large datasets**: Users can create evaluation suites with thousands of cases +2. **Programmatic generation**: Scripts can append to JSONL files easily +3. **Git workflows**: Clearer diffs when cases are added/modified +4. 
**Tool integration**: Standard JSONL tools work with AgentV datasets + +## Examples + +### Example 1: Basic JSONL Dataset + +**dataset.jsonl**: +```jsonl +{"id": "basic-test", "expected_outcome": "Agent provides helpful response", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "code-review", "expected_outcome": "Identifies bug", "input_messages": [{"role": "user", "content": "Review this code"}], "expected_messages": [{"role": "assistant", "content": "Found bug in line 5"}]} +``` + +**dataset.yaml** (optional): +```yaml +description: Basic math and code review tests +execution: + target: default +``` + +### Example 2: Per-Case Overrides + +**dataset.jsonl**: +```jsonl +{"id": "azure-test", "expected_outcome": "Uses Azure target", "input_messages": [...]} +{"id": "openai-test", "expected_outcome": "Uses OpenAI target", "input_messages": [...], "execution": {"target": "openai_gpt4"}} +{"id": "custom-eval", "expected_outcome": "Uses rubric evaluator", "input_messages": [...], "evaluators": [{"type": "rubric", "rubrics": ["Must be polite"]}]} +``` + +**dataset.yaml**: +```yaml +execution: + target: azure_base # Default, overridden by line 2 +evaluator: llm_judge # Default, overridden by line 3 +``` + +### Example 3: File References (Relative Paths) + +**dataset.jsonl**: +```jsonl +{"id": "with-attachments", "expected_outcome": "Reviews code", "input_messages": [{"role": "user", "content": [{"type": "text", "value": "Review this"}, {"type": "file", "value": "./code.py"}]}]} +``` + +File references resolve relative to the JSONL file location (same as YAML). + +## Alternatives Considered + +### 1. First-line metadata (REJECTED) +```jsonl +{"_meta": true, "description": "...", "dataset": "..."} +{"id": "test-1", ...} +``` + +**Why rejected**: +- Not used by any major ML/AI framework +- Breaks JSONL purity (special first line) +- Incompatible with standard JSONL tools +- Complicates concatenation and streaming + +### 2. Inline repetition (REJECTED) +```jsonl +{"id": "test-1", "execution": {"target": "azure_base"}, ...} +{"id": "test-2", "execution": {"target": "azure_base"}, ...} +``` + +**Why rejected**: +- Massive redundancy for datasets with thousands of cases +- Violates DRY principle +- Larger file sizes +- Harder to change defaults + +### 3. JSON array format (REJECTED) +```json +{ + "description": "...", + "evalcases": [...] +} +``` + +**Why rejected**: +- Not line-oriented (same limitations as YAML) +- Can't stream or incrementally process +- Same poor Git diff behavior +- Doesn't solve the problems JSONL addresses + +## Success Criteria + +1. ✅ JSONL files with `.jsonl` extension are parsed correctly +2. ✅ Sidecar YAML metadata is loaded when present +3. ✅ Per-line overrides work (execution, evaluators, rubrics) +4. ✅ File references resolve relative to JSONL file +5. ✅ Same validation rules as YAML eval cases +6. ✅ Backward compatibility: existing YAML files unchanged +7. ✅ Documentation updated with JSONL examples +8. ✅ Tests cover JSONL parsing and error cases + +## Open Questions + +None. All design decisions have been made based on industry research and established patterns. 
+ +## References + +- Industry research: DeepEval, LangWatch, Hugging Face, OpenAI Fine-tuning API +- Current parser: `packages/core/src/evaluation/yaml-parser.ts` +- Related specs: `yaml-schema`, `evaluation` diff --git a/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md b/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md new file mode 100644 index 00000000..7527d60c --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md @@ -0,0 +1,278 @@ +# Spec: JSONL Dataset Format + +## Purpose +Support JSONL (JSON Lines) format for evaluation datasets as an alternative to YAML, following industry standards for ML/AI frameworks. Enables large-scale evaluation with streaming-friendly, Git-friendly, and tool-compatible dataset files. + +## ADDED Requirements + +### Requirement: JSONL File Format Detection + +The system SHALL detect JSONL format by file extension and route to appropriate parser. + +#### Scenario: Detect JSONL file by extension +- **GIVEN** a file path ending in `.jsonl` +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL use the JSONL parser +- **AND** parse the file line-by-line as JSONL + +#### Scenario: Detect YAML file by extension +- **GIVEN** a file path ending in `.yaml` or `.yml` +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL use the existing YAML parser +- **AND** maintain backward compatibility + +#### Scenario: Reject unsupported file extensions +- **GIVEN** a file path ending in `.json`, `.txt`, or other unsupported extension +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL throw an error +- **AND** the error message SHALL list supported formats (`.yaml`, `.yml`, `.jsonl`) + +### Requirement: JSONL Line Parsing + +The system SHALL parse JSONL files line-by-line with strict JSON validation. 
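+
+A non-normative sketch of the parsing loop these scenarios exercise; it mirrors `parseJsonlContent` in the new `jsonl-parser.ts`, with the types simplified so the snippet stands alone:
+
+```typescript
+// Parse JSONL text into one raw object per non-empty line, reporting
+// 1-based line numbers on the first failure (no partial loads).
+function parseJsonlLines(content: string, filePath: string): Array<Record<string, unknown>> {
+  const cases: Array<Record<string, unknown>> = [];
+  const lines = content.split('\n');
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i].trim();
+    if (line === '') continue; // Skip empty and whitespace-only lines
+    try {
+      const parsed: unknown = JSON.parse(line);
+      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+        throw new Error('Expected JSON object');
+      }
+      cases.push(parsed as Record<string, unknown>);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      throw new Error(`Line ${i + 1}: Invalid JSON - ${message}\n  File: ${filePath}`);
+    }
+  }
+  return cases;
+}
+```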
+ +#### Scenario: Parse valid single-line JSONL +- **GIVEN** a JSONL file with one line containing valid JSON: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL return one eval case +- **AND** the eval case SHALL have `id: "test-1"`, `expectedOutcome: "Goal"`, and appropriate input messages + +#### Scenario: Parse multi-line JSONL +- **GIVEN** a JSONL file with multiple lines: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL return three eval cases +- **AND** each case SHALL have the correct id and expected_outcome + +#### Scenario: Skip empty lines +- **GIVEN** a JSONL file with empty lines or whitespace-only lines: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL skip empty/whitespace lines +- **AND** return three eval cases without errors + +#### Scenario: Error on malformed JSON +- **GIVEN** a JSONL file with invalid JSON on line 5: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + {"id": "test-4", "expected_outcome": "Goal 4", "input_messages": [...]} + {"id": "test-5", "expected_outcome": "Goal 5" "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL throw an error +- **AND** the error message SHALL include "Line 5: Invalid JSON" +- **AND** the error message SHALL include the JSON parse error details + +#### Scenario: Error on missing required fields +- **GIVEN** a JSONL file where line 3 is missing `expected_outcome`: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL skip the invalid case and log a warning +- **AND** the warning SHALL include "Line 3" and "missing expected_outcome" +- **AND** continue parsing remaining cases (same behavior as YAML) + +### Requirement: Sidecar Metadata File + +The system SHALL support optional sidecar YAML file for dataset-level metadata. 
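+
+A minimal sketch of the lookup-and-fallback behaviour described below, assuming the `yaml` package's `parse` export (which the new loader already imports); the defaults are the ones these scenarios require:
+
+```typescript
+import { readFile } from 'node:fs/promises';
+import path from 'node:path';
+import { parse as parseYaml } from 'yaml';
+
+// dataset.jsonl -> dataset.yaml in the same directory; a missing or
+// unreadable sidecar falls back to defaults instead of failing the run.
+async function loadSidecar(jsonlPath: string): Promise<Record<string, unknown>> {
+  const base = path.basename(jsonlPath, '.jsonl');
+  const sidecarPath = path.join(path.dirname(jsonlPath), `${base}.yaml`);
+  try {
+    const parsed: unknown = parseYaml(await readFile(sidecarPath, 'utf8'));
+    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
+      return parsed as Record<string, unknown>;
+    }
+  } catch {
+    // Sidecar not found: warn upstream, do not throw.
+  }
+  return { dataset: base, execution: { target: 'default' }, evaluator: 'llm_judge', description: '' };
+}
+```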
+ +#### Scenario: Load metadata from sidecar YAML +- **GIVEN** a JSONL file `dataset.jsonl` +- **AND** a companion file `dataset.yaml` with content: + ```yaml + description: Test dataset + dataset: my-tests + execution: + target: azure_base + evaluator: llm_judge + ``` +- **WHEN** `loadEvalCases('dataset.jsonl')` is called +- **THEN** the system SHALL load metadata from `dataset.yaml` +- **AND** apply `execution.target: "azure_base"` as default for all cases +- **AND** apply `evaluator: "llm_judge"` as default evaluator + +#### Scenario: Use defaults when sidecar not found +- **GIVEN** a JSONL file `dataset.jsonl` with no companion YAML +- **WHEN** `loadEvalCases('dataset.jsonl')` is called +- **THEN** the system SHALL use default values: + - `dataset`: basename of JSONL file ("dataset") + - `execution.target`: "default" + - `evaluator`: "llm_judge" + - `description`: empty string +- **AND** SHALL NOT throw an error + +#### Scenario: Look for companion YAML with same base name +- **GIVEN** a JSONL file at path `evals/subfolder/mytest.jsonl` +- **WHEN** loading eval cases +- **THEN** the system SHALL check for `evals/subfolder/mytest.yaml` +- **AND** SHALL NOT check for `dataset.yaml` or other names + +### Requirement: Per-Case Field Overrides + +The system SHALL support per-case overrides for execution, evaluators, and rubrics in JSONL lines. + +#### Scenario: Override execution target per case +- **GIVEN** a sidecar YAML with `execution.target: "azure_base"` +- **AND** a JSONL line: + ```jsonl + {"id": "openai-test", "expected_outcome": "Uses OpenAI", "input_messages": [...], "execution": {"target": "openai_gpt4"}} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use `target: "openai_gpt4"` +- **AND** the sidecar default SHALL be overridden for this case only + +#### Scenario: Override evaluators per case +- **GIVEN** a sidecar YAML with `evaluator: llm_judge` +- **AND** a JSONL line: + ```jsonl + {"id": "rubric-test", "expected_outcome": "Uses rubric", "input_messages": [...], "evaluators": [{"type": "rubric", "rubrics": ["Must be polite"]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use the rubric evaluator +- **AND** the sidecar default evaluator SHALL be overridden for this case only + +#### Scenario: Merge defaults with per-case fields +- **GIVEN** a sidecar YAML with: + ```yaml + execution: + target: azure_base + evaluator: llm_judge + ``` +- **AND** a JSONL line with only `execution` override: + ```jsonl + {"id": "test", "expected_outcome": "Goal", "input_messages": [...], "execution": {"target": "openai"}} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use `target: "openai"` (overridden) +- **AND** the case SHALL use `evaluator: "llm_judge"` (from sidecar) + +### Requirement: File Reference Resolution + +The system SHALL resolve file references in JSONL content relative to the JSONL file location. 
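+
+For illustration only, the core of the resolution rule (the real loader also applies search roots and guideline-pattern matching, which this sketch omits):
+
+```typescript
+import path from 'node:path';
+
+// Relative references resolve against the JSONL file's directory; a leading "/"
+// means repo-root relative, matching the YAML parser's convention.
+function resolveFileReference(jsonlPath: string, ref: string, repoRoot: string): string {
+  if (ref.startsWith('/')) {
+    return path.join(repoRoot, ref.slice(1)); // "/shared/code.py" -> <repoRoot>/shared/code.py
+  }
+  return path.resolve(path.dirname(jsonlPath), ref); // "./code.py" -> <jsonl dir>/code.py
+}
+```
+
+So for `evals/tests/dataset.jsonl` and a reference of `./code.py`, this yields `evals/tests/code.py`, which is what the first scenario below expects.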
+ +#### Scenario: Resolve relative file reference +- **GIVEN** a JSONL file at `evals/tests/dataset.jsonl` +- **AND** a line with file reference: + ```jsonl + {"id": "test", "expected_outcome": "Reviews code", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "./code.py"}]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the system SHALL resolve `./code.py` relative to `evals/tests/` +- **AND** load content from `evals/tests/code.py` + +#### Scenario: Resolve guideline files from JSONL +- **GIVEN** a JSONL file at `evals/dataset.jsonl` +- **AND** a config with `guideline_patterns: ["*.instructions.md"]` +- **AND** a line with guideline reference: + ```jsonl + {"id": "test", "expected_outcome": "Follows guidelines", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "python.instructions.md"}]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the system SHALL detect the guideline file +- **AND** process it as a guideline (prepend to prompt, wrap in guidelines block) + +### Requirement: Schema Compatibility + +The system SHALL produce identical `EvalCase` objects from JSONL and YAML formats. + +#### Scenario: JSONL and YAML produce same EvalCase +- **GIVEN** a YAML file with: + ```yaml + evalcases: + - id: test-1 + expected_outcome: Goal + input_messages: + - role: user + content: Query + ``` +- **AND** a JSONL file with: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]} + ``` +- **WHEN** both files are parsed +- **THEN** both SHALL produce identical `EvalCase` objects +- **AND** downstream code SHALL work identically with both + +#### Scenario: All eval case fields supported in JSONL +- **GIVEN** a JSONL line with all supported fields: + ```jsonl + { + "id": "full-test", + "conversation_id": "conv-1", + "expected_outcome": "Goal", + "input_messages": [...], + "expected_messages": [...], + "execution": {"target": "azure"}, + "evaluators": [...], + "rubrics": [...] + } + ``` +- **WHEN** the line is parsed +- **THEN** all fields SHALL be preserved in the `EvalCase` object +- **AND** the case SHALL validate and execute correctly + +### Requirement: Error Reporting + +The system SHALL provide clear, actionable error messages for JSONL parsing failures. + +#### Scenario: Line number in parse errors +- **GIVEN** a JSONL file with JSON syntax error on line 42 +- **WHEN** the file is parsed +- **THEN** the error message SHALL include "Line 42" +- **AND** SHALL include the specific JSON parse error + +#### Scenario: Field validation errors reference line +- **GIVEN** a JSONL file where line 10 has invalid field type (string instead of array for `input_messages`) +- **WHEN** the file is parsed +- **THEN** the error/warning message SHALL include "Line 10" +- **AND** SHALL indicate the field name and expected type + +#### Scenario: Sidecar not found is a warning, not error +- **GIVEN** a JSONL file without companion YAML +- **WHEN** the file is loaded with verbose logging enabled +- **THEN** the system SHALL log a warning about missing sidecar +- **AND** SHALL continue with defaults +- **AND** SHALL NOT throw an error + +### Requirement: Backward Compatibility + +The system SHALL maintain full backward compatibility with existing YAML-only workflows. 
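+
+One way to pin this requirement down is a format-parity regression test; a sketch in the same `bun:test` style as the new test suite (the fixture paths here are hypothetical):
+
+```typescript
+import { describe, expect, it } from 'bun:test';
+import { loadEvalCases } from '../../../src/evaluation/yaml-parser.js';
+
+// Hypothetical fixtures: the same logical case authored once as YAML and once
+// as JSONL should load into equivalent EvalCase objects.
+describe('format parity', () => {
+  it('loads the same case from .yaml and .jsonl', async () => {
+    const fromYaml = await loadEvalCases('fixtures/case.yaml', process.cwd());
+    const fromJsonl = await loadEvalCases('fixtures/case.jsonl', process.cwd());
+    expect(fromJsonl[0].id).toBe(fromYaml[0].id);
+    expect(fromJsonl[0].expected_outcome).toBe(fromYaml[0].expected_outcome);
+    expect(fromJsonl[0].input_messages).toEqual(fromYaml[0].input_messages);
+  });
+});
+```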
+ +#### Scenario: Existing YAML files work unchanged +- **GIVEN** an existing YAML eval file +- **WHEN** `loadEvalCases()` is called with the YAML file path +- **THEN** the system SHALL parse it with the YAML parser +- **AND** produce identical results as before JSONL support was added + +#### Scenario: Mixed YAML and JSONL in same repo +- **GIVEN** a repository with both: + - `evals/test1.yaml` + - `evals/test2.jsonl` +- **WHEN** running evals from both files +- **THEN** both SHALL work correctly +- **AND** YAML files SHALL use YAML parser +- **AND** JSONL files SHALL use JSONL parser + +#### Scenario: CLI works with both formats +- **GIVEN** the CLI command `agentv run evals/test.jsonl` +- **WHEN** executed +- **THEN** the CLI SHALL detect JSONL format and run the eval +- **AND** produce same output format as YAML evals diff --git a/openspec/changes/add-jsonl-dataset-format/tasks.md b/openspec/changes/add-jsonl-dataset-format/tasks.md new file mode 100644 index 00000000..9cfd131b --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/tasks.md @@ -0,0 +1,131 @@ +# Tasks: Add JSONL Dataset Format Support + +## Implementation Checklist + +### Phase 1: Core JSONL Parser +- [x] Create `jsonl-parser.ts` module for JSONL parsing + - [x] Implement line-by-line JSON parsing + - [x] Handle malformed lines with clear error messages + - [x] Validate each line matches eval case schema + - [x] Support UTF-8 encoding + - [x] Skip empty lines and whitespace-only lines +- [x] Create file format detector + - [x] Detect `.jsonl` extension → route to JSONL parser + - [x] Detect `.yaml` or `.yml` → route to existing YAML parser + - [x] Return clear error for unsupported extensions +- [x] Implement sidecar YAML metadata loader + - [x] Look for `.yaml` companion file + - [x] Parse metadata fields: `description`, `dataset`, `execution`, `evaluator` + - [x] Merge defaults with per-case overrides + - [x] Fall back to sensible defaults if no sidecar found +- [x] Update `loadEvalCases()` function + - [x] Add format detection logic + - [x] Route to appropriate parser (JSONL or YAML) + - [x] Maintain same function signature (backward compatible) + - [x] Preserve existing error handling patterns + +### Phase 2: Schema Validation +- [x] Extend TypeScript types for JSONL cases + - [x] Verify `EvalCase` type covers all JSONL fields + - [x] Add types for sidecar metadata structure + - [x] Ensure per-line overrides type-check correctly +- [x] Add validation for JSONL-specific scenarios + - [x] Validate line-level `execution` overrides + - [x] Validate line-level `evaluators` array + - [x] Validate line-level `rubrics` array + - [x] Ensure same file reference resolution as YAML +- [x] Add error reporting for invalid JSONL + - [x] Report line number for parse failures + - [x] Indicate which field is invalid + - [x] Suggest fixes for common errors + +### Phase 3: File Reference Resolution +- [x] Verify file reference resolution works with JSONL + - [x] Resolve paths relative to JSONL file location + - [x] Support `type: file` content blocks + - [x] Handle guideline files (`.instructions.md`) + - [x] Same search root logic as YAML +- [x] Test with nested directories + - [x] JSONL in `evals/subfolder/test.jsonl` + - [x] File references like `./data/input.txt` + - [x] Ensure correct path resolution + +### Phase 4: Testing +- [x] Unit tests for JSONL parser + - [x] Parse valid single-line JSONL + - [x] Parse multi-line JSONL dataset + - [x] Handle empty files gracefully + - [x] Skip empty lines and whitespace + - [x] Error 
on malformed JSON + - [x] Error on missing required fields (`id`, `expected_outcome`, `input_messages`) +- [x] Unit tests for sidecar metadata + - [x] Load metadata from companion YAML + - [x] Merge defaults with per-line overrides + - [x] Handle missing sidecar gracefully + - [x] Apply correct precedence (line overrides sidecar) +- [x] Integration tests + - [x] End-to-end eval run with JSONL dataset + - [x] Verify file references resolve correctly + - [x] Test with multiple evaluators + - [x] Test with per-case execution overrides + - [x] Verify trace capture works with JSONL +- [x] Regression tests + - [x] Ensure YAML parsing unchanged + - [x] Verify backward compatibility + - [x] Test mixed repos (YAML + JSONL) + +### Phase 5: Documentation +- [x] Update README with JSONL examples +- [x] Document JSONL in eval-builder skill (SKILL.md) + - [x] Basic JSONL structure + - [x] Sidecar metadata usage + - [x] Per-case overrides + - [x] File reference examples +- [x] Add JSONL examples to `examples/` directory + - [x] `examples/features/basic-jsonl/` - JSONL version of basic example + - [x] With sidecar metadata + - [x] With per-case execution overrides + - [x] With file references (points to basic example files) + +### Phase 6: Error Messages & DX +- [x] Improve error messages for JSONL + - [x] "Line 42: Invalid JSON syntax" + - [x] "Line 10: Missing required field 'id'" + - [x] "Sidecar file 'dataset.yaml' not found (using defaults)" +- [x] Add verbose logging for JSONL loading + - [x] Log sidecar metadata discovery + - [x] Log number of cases loaded + - [x] Log per-case override application +- [x] Validate with `openspec validate --strict` + +## Validation Steps + +After implementation: +1. Run `bun run build` - Ensure no compilation errors ✓ +2. Run `bun run typecheck` - Verify TypeScript types ✓ +3. Run `bun run lint` - Check code style ✓ +4. Run `bun test` - All tests pass ✓ +5. Run examples with JSONL datasets ✓ +6. 
Validate backward compatibility with existing YAML files ✓ + +## Dependencies + +- No new external dependencies required +- Uses existing Node.js `fs/promises` and `path` modules +- Reuses existing validation and file resolution logic + +## Parallelizable Work + +These can be done independently: +- JSONL parser implementation (Phase 1) and Testing setup (Phase 4) can start together +- Documentation (Phase 5) can be drafted while implementation is in progress +- Example files can be created early for testing + +## Estimated Effort + +- **Phase 1-3**: Core implementation - 2-3 hours +- **Phase 4**: Testing - 1-2 hours +- **Phase 5**: Documentation - 1 hour +- **Phase 6**: Polish - 30 minutes + +**Total**: ~5-7 hours for complete implementation diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts new file mode 100644 index 00000000..7e15c236 --- /dev/null +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -0,0 +1,373 @@ +import { readFile } from 'node:fs/promises'; +import path from 'node:path'; +import { parse as parseYaml } from 'yaml'; + +import type { EvalCase, JsonObject, JsonValue, TestMessage } from '../types.js'; +import { isJsonObject, isTestMessage } from '../types.js'; +import { loadConfig } from './config-loader.js'; +import { coerceEvaluator, parseEvaluators } from './evaluator-parser.js'; +import { buildSearchRoots, fileExists, resolveToAbsolutePath } from './file-resolver.js'; +import { processExpectedMessages, processMessages } from './message-processor.js'; + +const ANSI_YELLOW = '\u001b[33m'; +const ANSI_RED = '\u001b[31m'; +const ANSI_RESET = '\u001b[0m'; + +type LoadOptions = { + readonly verbose?: boolean; + readonly evalId?: string; +}; + +/** + * Sidecar metadata structure for JSONL datasets. + */ +type SidecarMetadata = { + readonly description?: string; + readonly dataset?: string; + readonly execution?: JsonObject; + readonly evaluator?: JsonValue; +}; + +/** + * Raw eval case from JSONL line. + */ +type RawJsonlEvalCase = JsonObject & { + readonly id?: JsonValue; + readonly conversation_id?: JsonValue; + readonly outcome?: JsonValue; + readonly expected_outcome?: JsonValue; + readonly input_messages?: JsonValue; + readonly expected_messages?: JsonValue; + readonly execution?: JsonValue; + readonly evaluators?: JsonValue; + readonly rubrics?: JsonValue; +}; + +/** + * Detect file format by extension. + */ +export function detectFormat(filePath: string): 'yaml' | 'jsonl' { + const ext = path.extname(filePath).toLowerCase(); + if (ext === '.jsonl') return 'jsonl'; + if (ext === '.yaml' || ext === '.yml') return 'yaml'; + throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`); +} + +/** + * Load sidecar YAML metadata file for a JSONL dataset. 
+ */ +async function loadSidecarMetadata(jsonlPath: string, verbose: boolean): Promise { + const dir = path.dirname(jsonlPath); + const base = path.basename(jsonlPath, '.jsonl'); + const sidecarPath = path.join(dir, `${base}.yaml`); + + if (!(await fileExists(sidecarPath))) { + if (verbose) { + logWarning(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`); + } + return {}; + } + + try { + const content = await readFile(sidecarPath, 'utf8'); + const parsed = parseYaml(content) as unknown; + + if (!isJsonObject(parsed)) { + logWarning(`Invalid sidecar metadata format in ${sidecarPath}`); + return {}; + } + + return { + description: asString(parsed.description), + dataset: asString(parsed.dataset), + execution: isJsonObject(parsed.execution) ? parsed.execution : undefined, + evaluator: parsed.evaluator, + }; + } catch (error) { + logWarning(`Could not read sidecar metadata from ${sidecarPath}: ${(error as Error).message}`); + return {}; + } +} + +/** + * Parse JSONL file content into raw eval cases. + */ +function parseJsonlContent(content: string, filePath: string): RawJsonlEvalCase[] { + const lines = content.split('\n'); + const cases: RawJsonlEvalCase[] = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line === '') continue; // Skip empty lines + + try { + const parsed = JSON.parse(line) as unknown; + if (!isJsonObject(parsed)) { + throw new Error('Expected JSON object'); + } + cases.push(parsed as RawJsonlEvalCase); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Line ${i + 1}: Invalid JSON - ${message}\n File: ${filePath}`); + } + } + + return cases; +} + +/** + * Load eval cases from a JSONL file with optional sidecar YAML metadata. + */ +export async function loadEvalCasesFromJsonl( + evalFilePath: string, + repoRoot: URL | string, + options?: LoadOptions, +): Promise { + const verbose = options?.verbose ?? false; + const evalIdFilter = options?.evalId; + const absoluteTestPath = path.resolve(evalFilePath); + + const repoRootPath = resolveToAbsolutePath(repoRoot); + const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath); + + // Load configuration (walks up directory tree to repo root) + const config = await loadConfig(absoluteTestPath, repoRootPath); + const guidelinePatterns = config?.guideline_patterns; + + // Load sidecar metadata + const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose); + + // Parse JSONL content + const rawFile = await readFile(absoluteTestPath, 'utf8'); + const rawCases = parseJsonlContent(rawFile, evalFilePath); + + // Derive dataset name: sidecar > filename + const fallbackDataset = path.basename(absoluteTestPath, '.jsonl') || 'eval'; + const datasetName = + sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset; + + // Global defaults from sidecar + const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 
'llm_judge'; + const globalExecution = sidecar.execution; + + if (verbose) { + console.log(`\n[JSONL Dataset: ${evalFilePath}]`); + console.log(` Cases: ${rawCases.length}`); + console.log(` Dataset name: ${datasetName}`); + if (sidecar.description) { + console.log(` Description: ${sidecar.description}`); + } + } + + const results: EvalCase[] = []; + + for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) { + const evalcase = rawCases[lineIndex]; + const lineNumber = lineIndex + 1; // 1-based for user-facing messages + const id = asString(evalcase.id); + + // Skip eval cases that don't match the filter + if (evalIdFilter && id !== evalIdFilter) { + continue; + } + + const conversationId = asString(evalcase.conversation_id); + // Support both expected_outcome and outcome (backward compatibility) + const outcome = asString(evalcase.expected_outcome) ?? asString(evalcase.outcome); + + const inputMessagesValue = evalcase.input_messages; + const expectedMessagesValue = evalcase.expected_messages; + + if (!id || !outcome || !Array.isArray(inputMessagesValue)) { + logError( + `Skipping incomplete eval case at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, expected_outcome, and/or input_messages`, + ); + continue; + } + + // expected_messages is optional - for outcome-only evaluation + const hasExpectedMessages = + Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0; + + // V2 format: input_messages vs expected_messages + const inputMessages = inputMessagesValue.filter((msg): msg is TestMessage => + isTestMessage(msg), + ); + const expectedMessages = hasExpectedMessages + ? expectedMessagesValue.filter((msg): msg is TestMessage => isTestMessage(msg)) + : []; + + if (hasExpectedMessages && expectedMessages.length === 0) { + logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`); + continue; + } + + const guidelinePaths: string[] = []; + const inputTextParts: string[] = []; + + // Process all input messages to extract files and guidelines + const inputSegments = await processMessages({ + messages: inputMessages, + searchRoots, + repoRootPath, + guidelinePatterns, + guidelinePaths, + textParts: inputTextParts, + messageType: 'input', + verbose, + }); + + // Process expected_messages into segments (only if provided) + // Preserve full message structure including role and tool_calls for expected_messages evaluator + const outputSegments = hasExpectedMessages + ? 
await processExpectedMessages({ + messages: expectedMessages, + searchRoots, + repoRootPath, + verbose, + }) + : []; + + // Build reference_answer: + // Extract the content from the last message in expected_messages (similar to candidate_answer) + let referenceAnswer = ''; + if (outputSegments.length > 0) { + // Get the last message + const lastMessage = outputSegments[outputSegments.length - 1]; + const content = lastMessage.content; + const toolCalls = lastMessage.tool_calls; + + if (typeof content === 'string') { + referenceAnswer = content; + } else if (content !== undefined && content !== null) { + // Serialize just the content, not the entire message + referenceAnswer = JSON.stringify(content, null, 2); + } else if (toolCalls !== undefined && toolCalls !== null) { + // Message with only tool_calls - serialize just the tool_calls + referenceAnswer = JSON.stringify(toolCalls, null, 2); + } + } + const question = inputTextParts + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join(' '); + + // Merge execution config: per-case overrides sidecar + const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined; + const mergedExecution = caseExecution ?? globalExecution; + + const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator; + let evaluators: Awaited>; + try { + evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? 'unknown'); + } catch (error) { + // Skip entire eval case if evaluator validation fails + const message = error instanceof Error ? error.message : String(error); + logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`); + continue; + } + + // Handle inline rubrics field (syntactic sugar) + const inlineRubrics = evalcase.rubrics; + if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { + const rubricItems = inlineRubrics + .filter((r): r is JsonObject | string => isJsonObject(r) || typeof r === 'string') + .map((rubric, index) => { + if (typeof rubric === 'string') { + return { + id: `rubric-${index + 1}`, + description: rubric, + weight: 1.0, + required: true, + }; + } + return { + id: asString(rubric.id) ?? `rubric-${index + 1}`, + description: asString(rubric.description) ?? '', + weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, + required: typeof rubric.required === 'boolean' ? rubric.required : true, + }; + }) + .filter((r) => r.description.length > 0); + + if (rubricItems.length > 0) { + const rubricEvaluator: import('../types.js').LlmJudgeEvaluatorConfig = { + name: 'rubric', + type: 'llm_judge', + rubrics: rubricItems, + }; + // Prepend rubric evaluator to existing evaluators + evaluators = evaluators ? 
[rubricEvaluator, ...evaluators] : [rubricEvaluator]; + } + } + + // Extract file paths from all input segments (non-guideline files) + const userFilePaths: string[] = []; + for (const segment of inputSegments) { + if (segment.type === 'file' && typeof segment.resolvedPath === 'string') { + userFilePaths.push(segment.resolvedPath); + } + } + + // Combine all file paths (guidelines + regular files) + const allFilePaths = [ + ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)), + ...userFilePaths, + ]; + + const testCase: EvalCase = { + id, + dataset: datasetName, + conversation_id: conversationId, + question: question, + input_messages: inputMessages, + input_segments: inputSegments, + expected_messages: outputSegments, + reference_answer: referenceAnswer, + guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)), + guideline_patterns: guidelinePatterns, + file_paths: allFilePaths, + expected_outcome: outcome, + evaluator: evalCaseEvaluatorKind, + evaluators, + }; + + if (verbose) { + console.log(`\n[Eval Case: ${id}]`); + if (testCase.guideline_paths.length > 0) { + console.log(` Guidelines used: ${testCase.guideline_paths.length}`); + for (const guidelinePath of testCase.guideline_paths) { + console.log(` - ${guidelinePath}`); + } + } else { + console.log(' No guidelines found'); + } + } + + results.push(testCase); + } + + return results; +} + +function asString(value: unknown): string | undefined { + return typeof value === 'string' ? value : undefined; +} + +function logWarning(message: string, details?: readonly string[]): void { + if (details && details.length > 0) { + const detailBlock = details.join('\n'); + console.warn(`${ANSI_YELLOW}Warning: ${message}\n${detailBlock}${ANSI_RESET}`); + } else { + console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`); + } +} + +function logError(message: string, details?: readonly string[]): void { + if (details && details.length > 0) { + const detailBlock = details.join('\n'); + console.error(`${ANSI_RED}Error: ${message}\n${detailBlock}${ANSI_RESET}`); + } else { + console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET}`); + } +} diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index c14826fc..d481f567 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -5,6 +5,7 @@ import { parse } from 'yaml'; import { extractTargetFromSuite, loadConfig } from './loaders/config-loader.js'; import { coerceEvaluator, parseEvaluators } from './loaders/evaluator-parser.js'; import { buildSearchRoots, resolveToAbsolutePath } from './loaders/file-resolver.js'; +import { detectFormat, loadEvalCasesFromJsonl } from './loaders/jsonl-parser.js'; import { processExpectedMessages, processMessages } from './loaders/message-processor.js'; import type { EvalCase, JsonObject, JsonValue, TestMessage } from './types.js'; import { isJsonObject, isTestMessage } from './types.js'; @@ -12,6 +13,7 @@ import { isJsonObject, isTestMessage } from './types.js'; // Re-export public APIs from modules export { buildPromptInputs, type PromptInputs } from './formatting/prompt-builder.js'; export { isGuidelineFile } from './loaders/config-loader.js'; +export { detectFormat } from './loaders/jsonl-parser.js'; const ANSI_YELLOW = '\u001b[33m'; const ANSI_RED = '\u001b[31m'; @@ -62,13 +64,21 @@ export async function readTestSuiteMetadata(testFilePath: string): Promise<{ tar } /** - * Load eval cases from a AgentV YAML 
+ * Load eval cases from an AgentV specification file (YAML or JSONL).
+ * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
+ */
 export async function loadEvalCases(
   evalFilePath: string,
   repoRoot: URL | string,
   options?: LoadOptions,
 ): Promise<EvalCase[]> {
+  // Detect format and route to appropriate parser
+  const format = detectFormat(evalFilePath);
+  if (format === 'jsonl') {
+    return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
+  }
+
+  // YAML parsing (existing implementation)
   const verbose = options?.verbose ?? false;
   const evalIdFilter = options?.evalId;
   const absoluteTestPath = path.resolve(evalFilePath);
diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
new file mode 100644
index 00000000..9107d8d8
--- /dev/null
+++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
@@ -0,0 +1,396 @@
+import { afterAll, beforeAll, describe, expect, it } from 'bun:test';
+import { mkdir, rm, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+
+import {
+  detectFormat,
+  loadEvalCasesFromJsonl,
+} from '../../../src/evaluation/loaders/jsonl-parser.js';
+import { loadEvalCases } from '../../../src/evaluation/yaml-parser.js';
+
+describe('detectFormat', () => {
+  it('returns jsonl for .jsonl extension', () => {
+    expect(detectFormat('test.jsonl')).toBe('jsonl');
+    expect(detectFormat('/path/to/dataset.jsonl')).toBe('jsonl');
+  });
+
+  it('returns yaml for .yaml extension', () => {
+    expect(detectFormat('test.yaml')).toBe('yaml');
+    expect(detectFormat('/path/to/config.yaml')).toBe('yaml');
+  });
+
+  it('returns yaml for .yml extension', () => {
+    expect(detectFormat('test.yml')).toBe('yaml');
+    expect(detectFormat('/path/to/config.yml')).toBe('yaml');
+  });
+
+  it('throws for unsupported extensions', () => {
+    expect(() => detectFormat('test.json')).toThrow('Unsupported file format');
+    expect(() => detectFormat('test.txt')).toThrow('Unsupported file format');
+    expect(() => detectFormat('test')).toThrow('Unsupported file format');
+  });
+});
+
+describe('loadEvalCasesFromJsonl', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-jsonl-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('parses valid single-line JSONL', async () => {
+    const jsonlPath = path.join(tempDir, 'single.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[0].expected_outcome).toBe('Goal');
+    expect(cases[0].input_messages).toHaveLength(1);
+    expect(cases[0].input_messages[0].role).toBe('user');
+    expect(cases[0].input_messages[0].content).toBe('Query');
+  });
+
+  it('parses multi-line JSONL', async () => {
+    const jsonlPath = path.join(tempDir, 'multi.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(3);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-2');
+    expect(cases[2].id).toBe('test-3');
+    expect(cases[0].expected_outcome).toBe('Goal 1');
+    expect(cases[1].expected_outcome).toBe('Goal 2');
+    expect(cases[2].expected_outcome).toBe('Goal 3');
+  });
+
+  it('skips empty lines and whitespace-only lines', async () => {
+    const jsonlPath = path.join(tempDir, 'empty-lines.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '   ',
+        '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}',
+        '',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(3);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-2');
+    expect(cases[2].id).toBe('test-3');
+  });
+
+  it('throws error on malformed JSON with line number', async () => {
+    const jsonlPath = path.join(tempDir, 'malformed.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '{"id": "test-3", "expected_outcome": "Goal 3" "input_messages": []}', // Missing comma
+      ].join('\n'),
+    );
+
+    await expect(loadEvalCasesFromJsonl(jsonlPath, tempDir)).rejects.toThrow(/Line 3/);
+  });
+
+  it('skips cases with missing required fields', async () => {
+    const jsonlPath = path.join(tempDir, 'missing-fields.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "input_messages": [{"role": "user", "content": "Query 2"}]}', // Missing expected_outcome
+        '{"expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}', // Missing id
+        '{"id": "test-4", "expected_outcome": "Goal 4"}', // Missing input_messages
+        '{"id": "test-5", "expected_outcome": "Goal 5", "input_messages": [{"role": "user", "content": "Query 5"}]}',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(2);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-5');
+  });
+
+  it('loads sidecar YAML metadata', async () => {
+    const jsonlPath = path.join(tempDir, 'with-sidecar.jsonl');
+    const sidecarPath = path.join(tempDir, 'with-sidecar.yaml');
+
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+    await writeFile(
+      sidecarPath,
+      'description: Test dataset\ndataset: my-tests\nevaluator: llm_judge\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].dataset).toBe('my-tests');
+    expect(cases[0].evaluator).toBe('llm_judge');
+  });
+
+  it('uses default dataset name from filename when no sidecar', async () => {
+    const jsonlPath = path.join(tempDir, 'my-dataset.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
"content": "Query"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].dataset).toBe('my-dataset'); + }); + + it('supports per-case evaluators override', async () => { + const jsonlPath = path.join(tempDir, 'with-evaluators.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "evaluators": [{"name": "rubric-check", "type": "llm_judge", "rubrics": [{"id": "r1", "description": "Must be polite", "weight": 1.0, "required": true}]}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].evaluators).toHaveLength(1); + expect(cases[0].evaluators?.[0].name).toBe('rubric-check'); + }); + + it('supports inline rubrics field', async () => { + const jsonlPath = path.join(tempDir, 'with-rubrics.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "rubrics": ["Must be polite", "Must be helpful"]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].evaluators).toHaveLength(1); + expect(cases[0].evaluators?.[0].type).toBe('llm_judge'); + const rubricEvaluator = cases[0].evaluators?.[0] as { type: string; rubrics?: unknown[] }; + expect(rubricEvaluator.rubrics).toHaveLength(2); + }); + + it('filters by evalId', async () => { + const jsonlPath = path.join(tempDir, 'filter.jsonl'); + await writeFile( + jsonlPath, + [ + '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}', + '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}', + '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}', + ].join('\n'), + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir, { evalId: 'test-2' }); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('test-2'); + }); + + it('supports conversation_id field', async () => { + const jsonlPath = path.join(tempDir, 'with-conv-id.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "conversation_id": "conv-123", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].conversation_id).toBe('conv-123'); + }); + + it('supports expected_messages field', async () => { + const jsonlPath = path.join(tempDir, 'with-expected.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "expected_messages": [{"role": "assistant", "content": "Response"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].expected_messages).toHaveLength(1); + expect(cases[0].reference_answer).toBe('Response'); + }); + + it('handles empty JSONL file', async () => { + const jsonlPath = path.join(tempDir, 'empty.jsonl'); + await writeFile(jsonlPath, ''); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(0); + }); + + it('supports backward-compatible outcome field', async () => { + const jsonlPath = path.join(tempDir, 'outcome-field.jsonl'); 
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].expected_outcome).toBe('Goal');
+  });
+});
+
+describe('loadEvalCases with format detection', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-loadEvalCases-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('routes .jsonl to JSONL parser', async () => {
+    const jsonlPath = path.join(tempDir, 'test.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "jsonl-test", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCases(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('jsonl-test');
+  });
+
+  it('routes .yaml to YAML parser', async () => {
+    const yamlPath = path.join(tempDir, 'test.yaml');
+    await writeFile(
+      yamlPath,
+      `evalcases:
+  - id: yaml-test
+    expected_outcome: Goal
+    input_messages:
+      - role: user
+        content: Query
+`,
+    );
+
+    const cases = await loadEvalCases(yamlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('yaml-test');
+  });
+
+  it('routes .yml to YAML parser', async () => {
+    const ymlPath = path.join(tempDir, 'test.yml');
+    await writeFile(
+      ymlPath,
+      `evalcases:
+  - id: yml-test
+    expected_outcome: Goal
+    input_messages:
+      - role: user
+        content: Query
+`,
+    );
+
+    const cases = await loadEvalCases(ymlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('yml-test');
+  });
+
+  it('throws for unsupported extensions via loadEvalCases', async () => {
+    const txtPath = path.join(tempDir, 'test.txt');
+    await writeFile(txtPath, '{}');
+
+    await expect(loadEvalCases(txtPath, tempDir)).rejects.toThrow('Unsupported file format');
+  });
+});
+
+describe('JSONL and YAML produce equivalent EvalCases', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-equivalence-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('produces identical EvalCase structure from both formats', async () => {
+    // Create equivalent YAML and JSONL files
+    const yamlPath = path.join(tempDir, 'equiv.yaml');
+    const jsonlPath = path.join(tempDir, 'equiv.jsonl');
+
+    await writeFile(
+      yamlPath,
+      `dataset: my-dataset
+evalcases:
+  - id: test-1
+    expected_outcome: "The agent should respond with a helpful answer"
+    input_messages:
+      - role: user
+        content: "What is 2+2?"
+`,
+    );
+
+    // JSONL with equivalent sidecar
+    const sidecarPath = path.join(tempDir, 'equiv-sidecar.yaml');
+    await writeFile(sidecarPath, 'dataset: my-dataset\n');
+
+    const jsonlPath2 = path.join(tempDir, 'equiv-sidecar.jsonl');
+    await writeFile(
+      jsonlPath2,
+      '{"id": "test-1", "expected_outcome": "The agent should respond with a helpful answer", "input_messages": [{"role": "user", "content": "What is 2+2?"}]}\n',
+    );
+
+    const yamlCases = await loadEvalCases(yamlPath, tempDir);
+    const jsonlCases = await loadEvalCases(jsonlPath2, tempDir);
+
+    expect(yamlCases).toHaveLength(1);
+    expect(jsonlCases).toHaveLength(1);
+
+    // Core fields should match
+    expect(jsonlCases[0].id).toBe(yamlCases[0].id);
+    expect(jsonlCases[0].expected_outcome).toBe(yamlCases[0].expected_outcome);
+    expect(jsonlCases[0].dataset).toBe(yamlCases[0].dataset);
+    expect(jsonlCases[0].input_messages.length).toBe(yamlCases[0].input_messages.length);
+    expect(jsonlCases[0].input_messages[0].role).toBe(yamlCases[0].input_messages[0].role);
+    expect(jsonlCases[0].input_messages[0].content).toBe(yamlCases[0].input_messages[0].content);
+  });
+});