diff --git a/.claude/skills/agentv-eval-builder/SKILL.md b/.claude/skills/agentv-eval-builder/SKILL.md index 2f06a526..b793b564 100644 --- a/.claude/skills/agentv-eval-builder/SKILL.md +++ b/.claude/skills/agentv-eval-builder/SKILL.md @@ -7,7 +7,7 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age ## Schema Reference - Schema: `references/eval-schema.json` (JSON Schema for validation and tooling) -- Format: YAML with structured content arrays +- Format: YAML or JSONL (see below) - Examples: `references/example-evals.md` ## Feature Reference @@ -30,6 +30,28 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age - Attachments (type: `file`) should default to the `user` role - File paths: Relative (from eval file dir) or absolute with "/" prefix (from repo root) +## JSONL Format + +For large-scale evaluations, use JSONL (one eval case per line) instead of YAML: + +**dataset.jsonl:** +```jsonl +{"id": "test-1", "expected_outcome": "Correct answer", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Clear explanation", "input_messages": [{"role": "user", "content": [{"type": "text", "value": "Review this"}, {"type": "file", "value": "./code.py"}]}]} +``` + +**dataset.yaml (optional sidecar for defaults):** +```yaml +description: My dataset +dataset: my-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Git-friendly diffs, streaming-compatible, easy programmatic generation. +Per-case fields override sidecar defaults. See `examples/features/basic-jsonl/` for complete example. + ## Custom Evaluators Configure multiple evaluators per eval case via `execution.evaluators` array. diff --git a/README.md b/README.md index 833f87d0..962fef7b 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,27 @@ See [AGENTS.md](AGENTS.md) for development guidelines and design principles. ## Core Concepts -**Evaluation files** (`.yaml`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. +**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. + +### JSONL Format Support + +For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alternative to YAML: + +```jsonl +{"id": "test-1", "expected_outcome": "Calculates correctly", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Provides explanation", "input_messages": [{"role": "user", "content": "Explain variables"}]} +``` + +Optional sidecar YAML metadata file (`dataset.yaml` alongside `dataset.jsonl`): +```yaml +description: Math evaluation dataset +dataset: math-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face). ## Usage diff --git a/apps/cli/README.md b/apps/cli/README.md index 833f87d0..962fef7b 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -101,7 +101,27 @@ See [AGENTS.md](AGENTS.md) for development guidelines and design principles. ## Core Concepts -**Evaluation files** (`.yaml`) define test cases with expected outcomes. 
**Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. +**Evaluation files** (`.yaml` or `.jsonl`) define test cases with expected outcomes. **Targets** specify which agent/provider to evaluate. **Judges** (code or LLM) score results. **Results** are written as JSONL/YAML for analysis and comparison. + +### JSONL Format Support + +For large-scale evaluations, AgentV supports JSONL (JSON Lines) format as an alternative to YAML: + +```jsonl +{"id": "test-1", "expected_outcome": "Calculates correctly", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "test-2", "expected_outcome": "Provides explanation", "input_messages": [{"role": "user", "content": "Explain variables"}]} +``` + +Optional sidecar YAML metadata file (`dataset.yaml` alongside `dataset.jsonl`): +```yaml +description: Math evaluation dataset +dataset: math-tests +execution: + target: azure_base +evaluator: llm_judge +``` + +Benefits: Streaming-friendly, Git-friendly diffs, programmatic generation, industry standard (DeepEval, LangWatch, Hugging Face). ## Usage diff --git a/examples/features/basic-jsonl/evals/dataset.jsonl b/examples/features/basic-jsonl/evals/dataset.jsonl new file mode 100644 index 00000000..f29ea364 --- /dev/null +++ b/examples/features/basic-jsonl/evals/dataset.jsonl @@ -0,0 +1,4 @@ +{"id": "code-review-javascript", "expected_outcome": "Assistant provides helpful code analysis and mentions SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT", "input_messages": [{"role": "system", "content": "You are an expert software developer who provides clear, concise code reviews."}, {"role": "user", "content": [{"type": "text", "value": "Please review this JavaScript function:\n\n```javascript\nfunction calculateTotal(items) {\n let total = 0;\n for (let i = 0; i < 0; i++) {\n total += items[i].price * items[i].quantity;\n }\n return total;\n}\n```"}, {"type": "file", "value": "../basic/evals/javascript.instructions.md"}]}], "expected_messages": [{"role": "assistant", "content": "The function has a critical bug in the loop condition. Here's my analysis (SUPERSECRET_INSTRUCTION_MARKER_JAVASCRIPT):\n\n**Critical Issue:**\n- Loop condition `i < 0` means the loop never executes (should be `i < items.length`)\n\n**Suggestions:**\n- Fix the loop: `for (let i = 0; i < items.length; i++)`\n- Consider using `reduce()` for a more functional approach\n- Add input validation for edge cases"}]} +{"id": "code-gen-python", "conversation_id": "python-code-generation", "expected_outcome": "AI generates correct Python function with proper error handling, type hints, and mentions SUPERSECRET_INSTRUCTION_MARKER_PYTHON", "input_messages": [{"role": "system", "content": "You are a code generator that follows specifications exactly."}, {"role": "user", "content": [{"type": "text", "value": "Create a Python function that:\n1. Takes a list of integers\n2. Returns the second largest number\n3. Handles edge cases (empty list, single item, duplicates)\n4. Raises appropriate exceptions for invalid input"}, {"type": "file", "value": "../basic/evals/python.instructions.md"}]}], "execution": {"target": "azure_base"}} +{"id": "feature-proposal-brainstorm", "expected_outcome": "Assistant generates 3-5 creative feature ideas for a mobile fitness app. Each idea should:\n1. Address a specific user pain point\n2. Be technically feasible with current mobile technology\n3. Include a brief value proposition (1-2 sentences)\n4. 
Be distinct from the others (no duplicate concepts)", "input_messages": [{"role": "system", "content": "You are a product strategist specializing in mobile health and fitness applications."}, {"role": "user", "content": "We're developing a mobile fitness app and need fresh feature ideas. Please brainstorm 3-5 innovative features."}]} +{"id": "multiturn-debug-session", "expected_outcome": "Assistant conducts a multi-turn debugging session, correctly diagnosing the bug and proposing a clear fix.", "input_messages": [{"role": "system", "content": "You are an expert debugging assistant."}, {"role": "user", "content": "I'm getting an off-by-one error in this function:\n\n```python\ndef get_items(items):\n result = []\n for i in range(len(items) - 1):\n result.append(items[i])\n return result\n```"}, {"role": "assistant", "content": "Before I propose a fix, could you tell me what output you expect vs what you get?"}, {"role": "user", "content": "For `[1, 2, 3, 4]` I expect `[1, 2, 3, 4]`, but I get `[1, 2, 3]`."}], "expected_messages": [{"role": "assistant", "content": "You have an off-by-one error. Use `range(len(items))` or iterate directly: `for item in items:`"}]} diff --git a/examples/features/basic-jsonl/evals/dataset.yaml b/examples/features/basic-jsonl/evals/dataset.yaml new file mode 100644 index 00000000..73db3828 --- /dev/null +++ b/examples/features/basic-jsonl/evals/dataset.yaml @@ -0,0 +1,8 @@ +# Sidecar metadata for dataset.jsonl +# Provides default values for all eval cases + +description: JSONL version of the basic example - demonstrates file references, multi-turn, and per-case overrides +dataset: basic-jsonl +execution: + target: default +evaluator: llm_judge diff --git a/openspec/changes/add-jsonl-dataset-format/design.md b/openspec/changes/add-jsonl-dataset-format/design.md new file mode 100644 index 00000000..c2c186d2 --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/design.md @@ -0,0 +1,322 @@ +# Design: JSONL Dataset Format + +## Architecture Overview + +### Current State + +``` +User → agentv CLI → loadEvalCases() → yaml-parser.ts → parse(YAML) → EvalCase[] +``` + +### New State + +``` +User → agentv CLI → loadEvalCases() → Format Detector + ├→ YAML Parser (existing) + └→ JSONL Parser (new) + ├→ Parse JSONL lines + ├→ Load sidecar metadata + └→ Merge defaults → EvalCase[] +``` + +## Key Design Decisions + +### 1. Parser Architecture + +**Decision**: Create separate `jsonl-parser.ts` module alongside `yaml-parser.ts` + +**Rationale**: +- Separation of concerns (YAML vs JSONL logic) +- Easier to test independently +- Maintains clarity in codebase +- Allows future format additions without cluttering one file + +**Alternative considered**: Extend existing `yaml-parser.ts` +- **Rejected**: Would mix two different parsing strategies (document vs line-based) + +### 2. Format Detection Strategy + +**Decision**: Use file extension (`.jsonl` vs `.yaml`/`.yml`) + +**Rationale**: +- Simple and explicit +- Industry standard approach +- No magic or heuristics required +- Clear user intent + +**Implementation**: +```typescript +function detectFormat(filePath: string): 'yaml' | 'jsonl' { + const ext = path.extname(filePath).toLowerCase(); + if (ext === '.jsonl') return 'jsonl'; + if (ext === '.yaml' || ext === '.yml') return 'yaml'; + throw new Error(`Unsupported file format: ${ext}`); +} +``` + +### 3. 
Sidecar Metadata Pattern + +**Decision**: Optional companion YAML file with same base name + +**Example**: +``` +evals/ + dataset.jsonl # Data + dataset.yaml # Metadata (optional) +``` + +**Rationale**: +- Follows industry standard (Hugging Face, Vertex AI) +- Maintains JSONL purity (every line is data) +- Avoids repetition of defaults +- Metadata remains human-readable +- Clear separation of config vs data + +**Loading logic**: +1. Check for `.yaml` file +2. If found, parse metadata fields +3. If not found, use sensible defaults: + - `dataset`: basename of JSONL file + - `execution.target`: "default" + - `evaluator`: "llm_judge" + - `description`: empty + +### 4. Defaults & Override Precedence + +**Decision**: Sidecar provides defaults, per-line fields override + +**Precedence order** (highest to lowest): +1. Per-line field (e.g., `{"execution": {"target": "openai"}}`) +2. Sidecar YAML field +3. Hard-coded defaults + +**Example**: +```yaml +# dataset.yaml +execution: + target: azure_base +evaluator: llm_judge +``` + +```jsonl +{"id": "test-1", "input_messages": [...]} # Uses azure_base, llm_judge +{"id": "test-2", "input_messages": [...], "execution": {"target": "openai"}} # Uses openai, llm_judge +{"id": "test-3", "input_messages": [...], "evaluators": [{"type": "rubric"}]} # Uses azure_base, rubric +``` + +### 5. Line Parsing Strategy + +**Decision**: Strict line-by-line parsing with error recovery + +**Approach**: +```typescript +async function parseJsonlFile(filePath: string): Promise { + const content = await readFile(filePath, 'utf8'); + const lines = content.split('\n'); + const cases: RawEvalCase[] = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line === '') continue; // Skip empty lines + + try { + const parsed = JSON.parse(line); + if (!isJsonObject(parsed)) { + throw new Error('Expected JSON object'); + } + cases.push(parsed as RawEvalCase); + } catch (error) { + throw new Error( + `Line ${i + 1}: Invalid JSON - ${(error as Error).message}` + ); + } + } + + return cases; +} +``` + +**Error handling**: +- Report line number for failures +- Stop on first error (no partial loads) +- Clear error messages for common issues + +**Alternative considered**: Continue parsing after errors +- **Rejected**: Could lead to incomplete/inconsistent test runs + +### 6. Schema Compatibility + +**Decision**: Reuse existing `EvalCase` TypeScript type + +**Rationale**: +- Zero changes to downstream code +- Same validation rules +- Same evaluator logic +- JSONL is just a different serialization format + +**Field mapping**: +```typescript +// JSONL line +{ + "id": "test-1", + "expected_outcome": "Goal", + "input_messages": [...], + "expected_messages": [...], + "execution": {...}, + "evaluators": [...], + "rubrics": [...] +} + +// Maps directly to EvalCase type +type EvalCase = { + id: string; + conversationId?: string; + expectedOutcome: string; + inputMessages: TestMessage[]; + expectedMessages: TestMessage[]; + execution?: {...}; + evaluators?: [...]; + // ... rest of fields +} +``` + +### 7. 
File Reference Resolution + +**Decision**: Resolve paths relative to JSONL file location (same as YAML) + +**Example**: +``` +evals/ + subfolder/ + test.jsonl + attachments/ + code.py +``` + +```jsonl +{"id": "test", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "./attachments/code.py"}]}]} +``` + +**Resolution**: +- `./attachments/code.py` → `evals/subfolder/attachments/code.py` +- Same `searchRoots` logic as YAML parser +- Same guideline pattern matching + +### 8. Streaming vs Batch Loading + +**Decision**: Load all cases into memory first (like YAML), defer streaming to future + +**Rationale**: +- Maintains consistency with YAML behavior +- Simpler initial implementation +- Most datasets fit in memory +- Streaming can be added later without breaking changes + +**Future enhancement path**: +```typescript +// Future: Streaming API (non-breaking addition) +async function* streamEvalCases(filePath: string) { + // Yield cases one at a time +} +``` + +## Error Handling Strategy + +### Parse Errors +``` +Error: Failed to parse JSONL file: evals/test.jsonl + Line 42: Unexpected token } in JSON at position 23 + + Hint: Each line must be a complete JSON object +``` + +### Missing Required Fields +``` +Error: Invalid eval case at line 10 in evals/test.jsonl + Missing required field: 'expected_outcome' + + Required fields: id, expected_outcome, input_messages +``` + +### Invalid Field Types +``` +Error: Invalid eval case at line 5 in evals/test.jsonl + Field 'input_messages' must be an array, got string +``` + +### Sidecar Not Found (Warning, not error) +``` +Warning: Sidecar metadata file not found: evals/dataset.yaml + Using defaults: target=default, evaluator=llm_judge +``` + +## Testing Strategy + +### Unit Tests +- Parse valid JSONL (single line, multiple lines) +- Handle empty lines and whitespace +- Error on malformed JSON +- Error on missing required fields +- Load sidecar metadata +- Merge defaults correctly +- Override precedence + +### Integration Tests +- End-to-end eval run with JSONL +- File references resolve correctly +- Multiple evaluators work +- Per-case execution overrides +- Trace capture + +### Regression Tests +- YAML parsing unchanged +- Backward compatibility +- Mixed YAML + JSONL in repo + +## Performance Considerations + +### Memory +- Load entire JSONL file into string (same as YAML) +- Parse line-by-line (better than YAML's full parse) +- Each case processed independently + +### Speed +- JSON.parse() is typically faster than YAML parsing +- Line-by-line allows early error detection +- No significant performance concerns expected + +### File Size +- JSONL more compact than YAML (no indentation) +- Typical eval case: ~200-500 bytes per line +- 1000 cases ≈ 200-500 KB (negligible) + +## Migration Path + +### From YAML to JSONL + +**Option 1**: Manual conversion (for small datasets) +```bash +# Convert evalcases array to JSONL +cat dataset.yaml | yq '.evalcases[]' -o json > dataset.jsonl + +# Extract metadata to sidecar +cat dataset.yaml | yq 'del(.evalcases)' > dataset-meta.yaml +``` + +**Option 2**: Keep YAML (no migration needed) +- YAML continues to work +- No forced migration +- Users choose format per dataset + +## Future Enhancements (Out of Scope) + +1. **Streaming execution**: Process cases without loading all into memory +2. **JSONL export**: Convert YAML → JSONL +3. **Compressed JSONL**: Support `.jsonl.gz` files +4. **JSON schema validation**: Formal JSON schema for JSONL format +5. 
**Multi-file datasets**: Split large datasets across multiple JSONL files +6. **Incremental updates**: Append new cases without re-running all + +## Open Issues + +None. All design decisions finalized. diff --git a/openspec/changes/add-jsonl-dataset-format/proposal.md b/openspec/changes/add-jsonl-dataset-format/proposal.md new file mode 100644 index 00000000..52e8704f --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/proposal.md @@ -0,0 +1,235 @@ +# Proposal: Add JSONL Dataset Format Support + +## Summary + +Add support for JSONL (JSON Lines) format as an alternative to YAML for defining evaluation datasets, following industry standards observed in DeepEval, LangWatch, and other ML/AI frameworks. + +## Why + +JSONL support enables large-scale evaluation workflows that are currently impractical with YAML: + +1. **Streaming & Memory Efficiency**: JSONL allows line-by-line processing without loading entire datasets into memory, critical for datasets with thousands of test cases +2. **Git Workflow Improvements**: Line-based diffs clearly show which specific test cases changed, unlike nested YAML diffs +3. **Programmatic Generation**: Scripts can easily append new test cases to JSONL files without parsing/reformatting YAML +4. **Industry Alignment**: Follows established patterns from DeepEval, LangWatch, Hugging Face, and OpenAI fine-tuning datasets +5. **Tool Compatibility**: Standard JSONL tools (`jq`, `grep`, streaming parsers) work with AgentV datasets + +This addresses the "Align with Industry Standards" design principle from AGENTS.md and supports AgentV's goal of robust, large-scale AI agent evaluation. + +## Motivation + +### Current State +AgentV currently uses YAML exclusively for eval datasets. While YAML is human-readable and suitable for hand-authored test cases, it has limitations for large-scale evaluation: + +1. **Memory overhead**: Entire file must be parsed into memory +2. **Not streaming-friendly**: Cannot process eval cases incrementally +3. **Poor Git diffs**: Nested YAML changes produce unclear diffs +4. **Append-unfriendly**: Adding test cases requires careful YAML formatting + +### Industry Research + +Research of major ML/AI frameworks shows strong adoption of JSONL for datasets: + +- **DeepEval**: Explicit JSONL support with `save_as(file_type='jsonl')` +- **LangWatch**: Full JSONL support in UI and backend parsing +- **Hugging Face**: Pure JSONL data files with sidecar README.md metadata +- **OpenAI**: Pure JSONL for fine-tuning datasets with API-managed metadata + +**Key finding**: 100% of frameworks use **pure JSONL** (data only) with **separate metadata storage** (sidecar files or API-managed). Zero frameworks use first-line metadata approach. + +### Benefits of JSONL Support + +1. **Streaming**: Process eval cases line-by-line without loading entire file +2. **Memory efficiency**: Critical for datasets with hundreds/thousands of cases +3. **Git-friendly**: Line-based diffs clearly show which test cases changed +4. **Append-friendly**: Add cases with simple file append operations +5. **Tool compatibility**: Works with standard tools like `jq`, `grep`, streaming parsers +6. 
**Industry standard**: Aligns with established ML/AI framework patterns + +### Design Decision: Sidecar Metadata + +Following industry standard (Hugging Face, Vertex AI), metadata will be stored in a separate YAML file: + +``` +evals/ + dataset.yaml # Metadata: description, defaults + dataset.jsonl # Pure eval cases (one per line) +``` + +This approach: +- Maintains JSONL purity (every line is data) +- Avoids repetition of defaults across thousands of lines +- Keeps metadata human-readable +- Supports dataset-level configuration (description, target, evaluator) + +## Proposed Changes + +### 1. JSONL File Format + +**Pure data** - one eval case per line: + +```jsonl +{"id": "test-1", "expected_outcome": "Description", "input_messages": [{"role": "user", "content": "Query"}], "expected_messages": [{"role": "assistant", "content": "Response"}]} +{"id": "test-2", "expected_outcome": "Another test", "input_messages": [{"role": "user", "content": "Query 2"}]} +{"id": "test-3", "expected_outcome": "Override example", "input_messages": [...], "execution": {"target": "specific_target"}} +``` + +**Schema per line**: +- Required: `id`, `expected_outcome`, `input_messages` +- Optional: `conversation_id`, `expected_messages`, `execution`, `evaluators`, `rubrics` +- Same field structure as YAML `evalcases` array entries + +### 2. Sidecar YAML for Metadata + +**Optional companion file** with same base name: + +```yaml +# dataset.yaml (metadata only) +description: Cross-provider evaluation dataset +dataset: multi-target-test +execution: + target: azure_base # Default for all cases +evaluator: llm_judge # Default evaluator +``` + +### 3. Resolution Strategy + +1. **JSONL detection**: File extension `.jsonl` triggers JSONL parser +2. **Metadata loading**: Look for `.yaml` sidecar + - `dataset.jsonl` → check for `dataset.yaml` + - If not found, use sensible defaults +3. **Defaults + overrides**: Sidecar provides defaults, per-line fields override +4. **Backward compatibility**: YAML-only files work unchanged + +### 4. Implementation Scope + +**In scope**: +- JSONL parser for eval cases +- Sidecar YAML metadata loading +- File format detection (`.jsonl` extension) +- Same validation as YAML cases +- Same file reference resolution (relative paths) + +**Out of scope** (future enhancements): +- JSONL for config.yaml or targets.yaml +- Streaming execution (load all cases first, like YAML) +- Mixed formats in single file +- JSONL generation/export tools + +## User Impact + +### Breaking Changes +None. This is purely additive. + +### Migration Path +No migration required. YAML files continue to work unchanged. + +### New Capabilities + +1. **Large datasets**: Users can create evaluation suites with thousands of cases +2. **Programmatic generation**: Scripts can append to JSONL files easily +3. **Git workflows**: Clearer diffs when cases are added/modified +4. 
**Tool integration**: Standard JSONL tools work with AgentV datasets + +## Examples + +### Example 1: Basic JSONL Dataset + +**dataset.jsonl**: +```jsonl +{"id": "basic-test", "expected_outcome": "Agent provides helpful response", "input_messages": [{"role": "user", "content": "What is 2+2?"}]} +{"id": "code-review", "expected_outcome": "Identifies bug", "input_messages": [{"role": "user", "content": "Review this code"}], "expected_messages": [{"role": "assistant", "content": "Found bug in line 5"}]} +``` + +**dataset.yaml** (optional): +```yaml +description: Basic math and code review tests +execution: + target: default +``` + +### Example 2: Per-Case Overrides + +**dataset.jsonl**: +```jsonl +{"id": "azure-test", "expected_outcome": "Uses Azure target", "input_messages": [...]} +{"id": "openai-test", "expected_outcome": "Uses OpenAI target", "input_messages": [...], "execution": {"target": "openai_gpt4"}} +{"id": "custom-eval", "expected_outcome": "Uses rubric evaluator", "input_messages": [...], "evaluators": [{"type": "rubric", "rubrics": ["Must be polite"]}]} +``` + +**dataset.yaml**: +```yaml +execution: + target: azure_base # Default, overridden by line 2 +evaluator: llm_judge # Default, overridden by line 3 +``` + +### Example 3: File References (Relative Paths) + +**dataset.jsonl**: +```jsonl +{"id": "with-attachments", "expected_outcome": "Reviews code", "input_messages": [{"role": "user", "content": [{"type": "text", "value": "Review this"}, {"type": "file", "value": "./code.py"}]}]} +``` + +File references resolve relative to the JSONL file location (same as YAML). + +## Alternatives Considered + +### 1. First-line metadata (REJECTED) +```jsonl +{"_meta": true, "description": "...", "dataset": "..."} +{"id": "test-1", ...} +``` + +**Why rejected**: +- Not used by any major ML/AI framework +- Breaks JSONL purity (special first line) +- Incompatible with standard JSONL tools +- Complicates concatenation and streaming + +### 2. Inline repetition (REJECTED) +```jsonl +{"id": "test-1", "execution": {"target": "azure_base"}, ...} +{"id": "test-2", "execution": {"target": "azure_base"}, ...} +``` + +**Why rejected**: +- Massive redundancy for datasets with thousands of cases +- Violates DRY principle +- Larger file sizes +- Harder to change defaults + +### 3. JSON array format (REJECTED) +```json +{ + "description": "...", + "evalcases": [...] +} +``` + +**Why rejected**: +- Not line-oriented (same limitations as YAML) +- Can't stream or incrementally process +- Same poor Git diff behavior +- Doesn't solve the problems JSONL addresses + +## Success Criteria + +1. ✅ JSONL files with `.jsonl` extension are parsed correctly +2. ✅ Sidecar YAML metadata is loaded when present +3. ✅ Per-line overrides work (execution, evaluators, rubrics) +4. ✅ File references resolve relative to JSONL file +5. ✅ Same validation rules as YAML eval cases +6. ✅ Backward compatibility: existing YAML files unchanged +7. ✅ Documentation updated with JSONL examples +8. ✅ Tests cover JSONL parsing and error cases + +## Open Questions + +None. All design decisions have been made based on industry research and established patterns. 
+ +## References + +- Industry research: DeepEval, LangWatch, Hugging Face, OpenAI Fine-tuning API +- Current parser: `packages/core/src/evaluation/yaml-parser.ts` +- Related specs: `yaml-schema`, `evaluation` diff --git a/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md b/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md new file mode 100644 index 00000000..7527d60c --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/specs/jsonl-dataset-format/spec.md @@ -0,0 +1,278 @@ +# Spec: JSONL Dataset Format + +## Purpose +Support JSONL (JSON Lines) format for evaluation datasets as an alternative to YAML, following industry standards for ML/AI frameworks. Enables large-scale evaluation with streaming-friendly, Git-friendly, and tool-compatible dataset files. + +## ADDED Requirements + +### Requirement: JSONL File Format Detection + +The system SHALL detect JSONL format by file extension and route to appropriate parser. + +#### Scenario: Detect JSONL file by extension +- **GIVEN** a file path ending in `.jsonl` +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL use the JSONL parser +- **AND** parse the file line-by-line as JSONL + +#### Scenario: Detect YAML file by extension +- **GIVEN** a file path ending in `.yaml` or `.yml` +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL use the existing YAML parser +- **AND** maintain backward compatibility + +#### Scenario: Reject unsupported file extensions +- **GIVEN** a file path ending in `.json`, `.txt`, or other unsupported extension +- **WHEN** `loadEvalCases()` is called with that path +- **THEN** the system SHALL throw an error +- **AND** the error message SHALL list supported formats (`.yaml`, `.yml`, `.jsonl`) + +### Requirement: JSONL Line Parsing + +The system SHALL parse JSONL files line-by-line with strict JSON validation. 
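+
+A non-normative sketch of the parsing loop these scenarios exercise; it mirrors `parseJsonlContent` in the new `jsonl-parser.ts`, with the types simplified so the snippet stands alone:
+
+```typescript
+// Parse JSONL text into one raw object per non-empty line, reporting
+// 1-based line numbers on the first failure (no partial loads).
+function parseJsonlLines(content: string, filePath: string): Array<Record<string, unknown>> {
+  const cases: Array<Record<string, unknown>> = [];
+  const lines = content.split('\n');
+  for (let i = 0; i < lines.length; i++) {
+    const line = lines[i].trim();
+    if (line === '') continue; // Skip empty and whitespace-only lines
+    try {
+      const parsed: unknown = JSON.parse(line);
+      if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
+        throw new Error('Expected JSON object');
+      }
+      cases.push(parsed as Record<string, unknown>);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      throw new Error(`Line ${i + 1}: Invalid JSON - ${message}\n  File: ${filePath}`);
+    }
+  }
+  return cases;
+}
+```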
+ +#### Scenario: Parse valid single-line JSONL +- **GIVEN** a JSONL file with one line containing valid JSON: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL return one eval case +- **AND** the eval case SHALL have `id: "test-1"`, `expectedOutcome: "Goal"`, and appropriate input messages + +#### Scenario: Parse multi-line JSONL +- **GIVEN** a JSONL file with multiple lines: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL return three eval cases +- **AND** each case SHALL have the correct id and expected_outcome + +#### Scenario: Skip empty lines +- **GIVEN** a JSONL file with empty lines or whitespace-only lines: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL skip empty/whitespace lines +- **AND** return three eval cases without errors + +#### Scenario: Error on malformed JSON +- **GIVEN** a JSONL file with invalid JSON on line 5: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [...]} + {"id": "test-4", "expected_outcome": "Goal 4", "input_messages": [...]} + {"id": "test-5", "expected_outcome": "Goal 5" "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL throw an error +- **AND** the error message SHALL include "Line 5: Invalid JSON" +- **AND** the error message SHALL include the JSON parse error details + +#### Scenario: Error on missing required fields +- **GIVEN** a JSONL file where line 3 is missing `expected_outcome`: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [...]} + {"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [...]} + {"id": "test-3", "input_messages": [...]} + ``` +- **WHEN** the file is parsed +- **THEN** the system SHALL skip the invalid case and log a warning +- **AND** the warning SHALL include "Line 3" and "missing expected_outcome" +- **AND** continue parsing remaining cases (same behavior as YAML) + +### Requirement: Sidecar Metadata File + +The system SHALL support optional sidecar YAML file for dataset-level metadata. 
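+
+A minimal sketch of the lookup-and-fallback behaviour described below, assuming the `yaml` package's `parse` export (which the new loader already imports); the defaults are the ones these scenarios require:
+
+```typescript
+import { readFile } from 'node:fs/promises';
+import path from 'node:path';
+import { parse as parseYaml } from 'yaml';
+
+// dataset.jsonl -> dataset.yaml in the same directory; a missing or
+// unreadable sidecar falls back to defaults instead of failing the run.
+async function loadSidecar(jsonlPath: string): Promise<Record<string, unknown>> {
+  const base = path.basename(jsonlPath, '.jsonl');
+  const sidecarPath = path.join(path.dirname(jsonlPath), `${base}.yaml`);
+  try {
+    const parsed: unknown = parseYaml(await readFile(sidecarPath, 'utf8'));
+    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
+      return parsed as Record<string, unknown>;
+    }
+  } catch {
+    // Sidecar not found: warn upstream, do not throw.
+  }
+  return { dataset: base, execution: { target: 'default' }, evaluator: 'llm_judge', description: '' };
+}
+```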
+ +#### Scenario: Load metadata from sidecar YAML +- **GIVEN** a JSONL file `dataset.jsonl` +- **AND** a companion file `dataset.yaml` with content: + ```yaml + description: Test dataset + dataset: my-tests + execution: + target: azure_base + evaluator: llm_judge + ``` +- **WHEN** `loadEvalCases('dataset.jsonl')` is called +- **THEN** the system SHALL load metadata from `dataset.yaml` +- **AND** apply `execution.target: "azure_base"` as default for all cases +- **AND** apply `evaluator: "llm_judge"` as default evaluator + +#### Scenario: Use defaults when sidecar not found +- **GIVEN** a JSONL file `dataset.jsonl` with no companion YAML +- **WHEN** `loadEvalCases('dataset.jsonl')` is called +- **THEN** the system SHALL use default values: + - `dataset`: basename of JSONL file ("dataset") + - `execution.target`: "default" + - `evaluator`: "llm_judge" + - `description`: empty string +- **AND** SHALL NOT throw an error + +#### Scenario: Look for companion YAML with same base name +- **GIVEN** a JSONL file at path `evals/subfolder/mytest.jsonl` +- **WHEN** loading eval cases +- **THEN** the system SHALL check for `evals/subfolder/mytest.yaml` +- **AND** SHALL NOT check for `dataset.yaml` or other names + +### Requirement: Per-Case Field Overrides + +The system SHALL support per-case overrides for execution, evaluators, and rubrics in JSONL lines. + +#### Scenario: Override execution target per case +- **GIVEN** a sidecar YAML with `execution.target: "azure_base"` +- **AND** a JSONL line: + ```jsonl + {"id": "openai-test", "expected_outcome": "Uses OpenAI", "input_messages": [...], "execution": {"target": "openai_gpt4"}} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use `target: "openai_gpt4"` +- **AND** the sidecar default SHALL be overridden for this case only + +#### Scenario: Override evaluators per case +- **GIVEN** a sidecar YAML with `evaluator: llm_judge` +- **AND** a JSONL line: + ```jsonl + {"id": "rubric-test", "expected_outcome": "Uses rubric", "input_messages": [...], "evaluators": [{"type": "rubric", "rubrics": ["Must be polite"]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use the rubric evaluator +- **AND** the sidecar default evaluator SHALL be overridden for this case only + +#### Scenario: Merge defaults with per-case fields +- **GIVEN** a sidecar YAML with: + ```yaml + execution: + target: azure_base + evaluator: llm_judge + ``` +- **AND** a JSONL line with only `execution` override: + ```jsonl + {"id": "test", "expected_outcome": "Goal", "input_messages": [...], "execution": {"target": "openai"}} + ``` +- **WHEN** the eval case is loaded +- **THEN** the case SHALL use `target: "openai"` (overridden) +- **AND** the case SHALL use `evaluator: "llm_judge"` (from sidecar) + +### Requirement: File Reference Resolution + +The system SHALL resolve file references in JSONL content relative to the JSONL file location. 
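+
+For illustration only, the core of the resolution rule (the real loader also applies search roots and guideline-pattern matching, which this sketch omits):
+
+```typescript
+import path from 'node:path';
+
+// Relative references resolve against the JSONL file's directory; a leading "/"
+// means repo-root relative, matching the YAML parser's convention.
+function resolveFileReference(jsonlPath: string, ref: string, repoRoot: string): string {
+  if (ref.startsWith('/')) {
+    return path.join(repoRoot, ref.slice(1)); // "/shared/code.py" -> <repoRoot>/shared/code.py
+  }
+  return path.resolve(path.dirname(jsonlPath), ref); // "./code.py" -> <jsonl dir>/code.py
+}
+```
+
+So for `evals/tests/dataset.jsonl` and a reference of `./code.py`, this yields `evals/tests/code.py`, which is what the first scenario below expects.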
+ +#### Scenario: Resolve relative file reference +- **GIVEN** a JSONL file at `evals/tests/dataset.jsonl` +- **AND** a line with file reference: + ```jsonl + {"id": "test", "expected_outcome": "Reviews code", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "./code.py"}]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the system SHALL resolve `./code.py` relative to `evals/tests/` +- **AND** load content from `evals/tests/code.py` + +#### Scenario: Resolve guideline files from JSONL +- **GIVEN** a JSONL file at `evals/dataset.jsonl` +- **AND** a config with `guideline_patterns: ["*.instructions.md"]` +- **AND** a line with guideline reference: + ```jsonl + {"id": "test", "expected_outcome": "Follows guidelines", "input_messages": [{"role": "user", "content": [{"type": "file", "value": "python.instructions.md"}]}]} + ``` +- **WHEN** the eval case is loaded +- **THEN** the system SHALL detect the guideline file +- **AND** process it as a guideline (prepend to prompt, wrap in guidelines block) + +### Requirement: Schema Compatibility + +The system SHALL produce identical `EvalCase` objects from JSONL and YAML formats. + +#### Scenario: JSONL and YAML produce same EvalCase +- **GIVEN** a YAML file with: + ```yaml + evalcases: + - id: test-1 + expected_outcome: Goal + input_messages: + - role: user + content: Query + ``` +- **AND** a JSONL file with: + ```jsonl + {"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]} + ``` +- **WHEN** both files are parsed +- **THEN** both SHALL produce identical `EvalCase` objects +- **AND** downstream code SHALL work identically with both + +#### Scenario: All eval case fields supported in JSONL +- **GIVEN** a JSONL line with all supported fields: + ```jsonl + { + "id": "full-test", + "conversation_id": "conv-1", + "expected_outcome": "Goal", + "input_messages": [...], + "expected_messages": [...], + "execution": {"target": "azure"}, + "evaluators": [...], + "rubrics": [...] + } + ``` +- **WHEN** the line is parsed +- **THEN** all fields SHALL be preserved in the `EvalCase` object +- **AND** the case SHALL validate and execute correctly + +### Requirement: Error Reporting + +The system SHALL provide clear, actionable error messages for JSONL parsing failures. + +#### Scenario: Line number in parse errors +- **GIVEN** a JSONL file with JSON syntax error on line 42 +- **WHEN** the file is parsed +- **THEN** the error message SHALL include "Line 42" +- **AND** SHALL include the specific JSON parse error + +#### Scenario: Field validation errors reference line +- **GIVEN** a JSONL file where line 10 has invalid field type (string instead of array for `input_messages`) +- **WHEN** the file is parsed +- **THEN** the error/warning message SHALL include "Line 10" +- **AND** SHALL indicate the field name and expected type + +#### Scenario: Sidecar not found is a warning, not error +- **GIVEN** a JSONL file without companion YAML +- **WHEN** the file is loaded with verbose logging enabled +- **THEN** the system SHALL log a warning about missing sidecar +- **AND** SHALL continue with defaults +- **AND** SHALL NOT throw an error + +### Requirement: Backward Compatibility + +The system SHALL maintain full backward compatibility with existing YAML-only workflows. 
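+
+One way to pin this requirement down is a format-parity regression test; a sketch in the same `bun:test` style as the new test suite (the fixture paths here are hypothetical):
+
+```typescript
+import { describe, expect, it } from 'bun:test';
+import { loadEvalCases } from '../../../src/evaluation/yaml-parser.js';
+
+// Hypothetical fixtures: the same logical case authored once as YAML and once
+// as JSONL should load into equivalent EvalCase objects.
+describe('format parity', () => {
+  it('loads the same case from .yaml and .jsonl', async () => {
+    const fromYaml = await loadEvalCases('fixtures/case.yaml', process.cwd());
+    const fromJsonl = await loadEvalCases('fixtures/case.jsonl', process.cwd());
+    expect(fromJsonl[0].id).toBe(fromYaml[0].id);
+    expect(fromJsonl[0].expected_outcome).toBe(fromYaml[0].expected_outcome);
+    expect(fromJsonl[0].input_messages).toEqual(fromYaml[0].input_messages);
+  });
+});
+```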
+ +#### Scenario: Existing YAML files work unchanged +- **GIVEN** an existing YAML eval file +- **WHEN** `loadEvalCases()` is called with the YAML file path +- **THEN** the system SHALL parse it with the YAML parser +- **AND** produce identical results as before JSONL support was added + +#### Scenario: Mixed YAML and JSONL in same repo +- **GIVEN** a repository with both: + - `evals/test1.yaml` + - `evals/test2.jsonl` +- **WHEN** running evals from both files +- **THEN** both SHALL work correctly +- **AND** YAML files SHALL use YAML parser +- **AND** JSONL files SHALL use JSONL parser + +#### Scenario: CLI works with both formats +- **GIVEN** the CLI command `agentv run evals/test.jsonl` +- **WHEN** executed +- **THEN** the CLI SHALL detect JSONL format and run the eval +- **AND** produce same output format as YAML evals diff --git a/openspec/changes/add-jsonl-dataset-format/tasks.md b/openspec/changes/add-jsonl-dataset-format/tasks.md new file mode 100644 index 00000000..9cfd131b --- /dev/null +++ b/openspec/changes/add-jsonl-dataset-format/tasks.md @@ -0,0 +1,131 @@ +# Tasks: Add JSONL Dataset Format Support + +## Implementation Checklist + +### Phase 1: Core JSONL Parser +- [x] Create `jsonl-parser.ts` module for JSONL parsing + - [x] Implement line-by-line JSON parsing + - [x] Handle malformed lines with clear error messages + - [x] Validate each line matches eval case schema + - [x] Support UTF-8 encoding + - [x] Skip empty lines and whitespace-only lines +- [x] Create file format detector + - [x] Detect `.jsonl` extension → route to JSONL parser + - [x] Detect `.yaml` or `.yml` → route to existing YAML parser + - [x] Return clear error for unsupported extensions +- [x] Implement sidecar YAML metadata loader + - [x] Look for `.yaml` companion file + - [x] Parse metadata fields: `description`, `dataset`, `execution`, `evaluator` + - [x] Merge defaults with per-case overrides + - [x] Fall back to sensible defaults if no sidecar found +- [x] Update `loadEvalCases()` function + - [x] Add format detection logic + - [x] Route to appropriate parser (JSONL or YAML) + - [x] Maintain same function signature (backward compatible) + - [x] Preserve existing error handling patterns + +### Phase 2: Schema Validation +- [x] Extend TypeScript types for JSONL cases + - [x] Verify `EvalCase` type covers all JSONL fields + - [x] Add types for sidecar metadata structure + - [x] Ensure per-line overrides type-check correctly +- [x] Add validation for JSONL-specific scenarios + - [x] Validate line-level `execution` overrides + - [x] Validate line-level `evaluators` array + - [x] Validate line-level `rubrics` array + - [x] Ensure same file reference resolution as YAML +- [x] Add error reporting for invalid JSONL + - [x] Report line number for parse failures + - [x] Indicate which field is invalid + - [x] Suggest fixes for common errors + +### Phase 3: File Reference Resolution +- [x] Verify file reference resolution works with JSONL + - [x] Resolve paths relative to JSONL file location + - [x] Support `type: file` content blocks + - [x] Handle guideline files (`.instructions.md`) + - [x] Same search root logic as YAML +- [x] Test with nested directories + - [x] JSONL in `evals/subfolder/test.jsonl` + - [x] File references like `./data/input.txt` + - [x] Ensure correct path resolution + +### Phase 4: Testing +- [x] Unit tests for JSONL parser + - [x] Parse valid single-line JSONL + - [x] Parse multi-line JSONL dataset + - [x] Handle empty files gracefully + - [x] Skip empty lines and whitespace + - [x] Error 
on malformed JSON + - [x] Error on missing required fields (`id`, `expected_outcome`, `input_messages`) +- [x] Unit tests for sidecar metadata + - [x] Load metadata from companion YAML + - [x] Merge defaults with per-line overrides + - [x] Handle missing sidecar gracefully + - [x] Apply correct precedence (line overrides sidecar) +- [x] Integration tests + - [x] End-to-end eval run with JSONL dataset + - [x] Verify file references resolve correctly + - [x] Test with multiple evaluators + - [x] Test with per-case execution overrides + - [x] Verify trace capture works with JSONL +- [x] Regression tests + - [x] Ensure YAML parsing unchanged + - [x] Verify backward compatibility + - [x] Test mixed repos (YAML + JSONL) + +### Phase 5: Documentation +- [x] Update README with JSONL examples +- [x] Document JSONL in eval-builder skill (SKILL.md) + - [x] Basic JSONL structure + - [x] Sidecar metadata usage + - [x] Per-case overrides + - [x] File reference examples +- [x] Add JSONL examples to `examples/` directory + - [x] `examples/features/basic-jsonl/` - JSONL version of basic example + - [x] With sidecar metadata + - [x] With per-case execution overrides + - [x] With file references (points to basic example files) + +### Phase 6: Error Messages & DX +- [x] Improve error messages for JSONL + - [x] "Line 42: Invalid JSON syntax" + - [x] "Line 10: Missing required field 'id'" + - [x] "Sidecar file 'dataset.yaml' not found (using defaults)" +- [x] Add verbose logging for JSONL loading + - [x] Log sidecar metadata discovery + - [x] Log number of cases loaded + - [x] Log per-case override application +- [x] Validate with `openspec validate --strict` + +## Validation Steps + +After implementation: +1. Run `bun run build` - Ensure no compilation errors ✓ +2. Run `bun run typecheck` - Verify TypeScript types ✓ +3. Run `bun run lint` - Check code style ✓ +4. Run `bun test` - All tests pass ✓ +5. Run examples with JSONL datasets ✓ +6. 
Validate backward compatibility with existing YAML files ✓ + +## Dependencies + +- No new external dependencies required +- Uses existing Node.js `fs/promises` and `path` modules +- Reuses existing validation and file resolution logic + +## Parallelizable Work + +These can be done independently: +- JSONL parser implementation (Phase 1) and Testing setup (Phase 4) can start together +- Documentation (Phase 5) can be drafted while implementation is in progress +- Example files can be created early for testing + +## Estimated Effort + +- **Phase 1-3**: Core implementation - 2-3 hours +- **Phase 4**: Testing - 1-2 hours +- **Phase 5**: Documentation - 1 hour +- **Phase 6**: Polish - 30 minutes + +**Total**: ~5-7 hours for complete implementation diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts new file mode 100644 index 00000000..7e15c236 --- /dev/null +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -0,0 +1,373 @@ +import { readFile } from 'node:fs/promises'; +import path from 'node:path'; +import { parse as parseYaml } from 'yaml'; + +import type { EvalCase, JsonObject, JsonValue, TestMessage } from '../types.js'; +import { isJsonObject, isTestMessage } from '../types.js'; +import { loadConfig } from './config-loader.js'; +import { coerceEvaluator, parseEvaluators } from './evaluator-parser.js'; +import { buildSearchRoots, fileExists, resolveToAbsolutePath } from './file-resolver.js'; +import { processExpectedMessages, processMessages } from './message-processor.js'; + +const ANSI_YELLOW = '\u001b[33m'; +const ANSI_RED = '\u001b[31m'; +const ANSI_RESET = '\u001b[0m'; + +type LoadOptions = { + readonly verbose?: boolean; + readonly evalId?: string; +}; + +/** + * Sidecar metadata structure for JSONL datasets. + */ +type SidecarMetadata = { + readonly description?: string; + readonly dataset?: string; + readonly execution?: JsonObject; + readonly evaluator?: JsonValue; +}; + +/** + * Raw eval case from JSONL line. + */ +type RawJsonlEvalCase = JsonObject & { + readonly id?: JsonValue; + readonly conversation_id?: JsonValue; + readonly outcome?: JsonValue; + readonly expected_outcome?: JsonValue; + readonly input_messages?: JsonValue; + readonly expected_messages?: JsonValue; + readonly execution?: JsonValue; + readonly evaluators?: JsonValue; + readonly rubrics?: JsonValue; +}; + +/** + * Detect file format by extension. + */ +export function detectFormat(filePath: string): 'yaml' | 'jsonl' { + const ext = path.extname(filePath).toLowerCase(); + if (ext === '.jsonl') return 'jsonl'; + if (ext === '.yaml' || ext === '.yml') return 'yaml'; + throw new Error(`Unsupported file format: '${ext}'. Supported formats: .yaml, .yml, .jsonl`); +} + +/** + * Load sidecar YAML metadata file for a JSONL dataset. 
+ */ +async function loadSidecarMetadata(jsonlPath: string, verbose: boolean): Promise { + const dir = path.dirname(jsonlPath); + const base = path.basename(jsonlPath, '.jsonl'); + const sidecarPath = path.join(dir, `${base}.yaml`); + + if (!(await fileExists(sidecarPath))) { + if (verbose) { + logWarning(`Sidecar metadata file not found: ${sidecarPath} (using defaults)`); + } + return {}; + } + + try { + const content = await readFile(sidecarPath, 'utf8'); + const parsed = parseYaml(content) as unknown; + + if (!isJsonObject(parsed)) { + logWarning(`Invalid sidecar metadata format in ${sidecarPath}`); + return {}; + } + + return { + description: asString(parsed.description), + dataset: asString(parsed.dataset), + execution: isJsonObject(parsed.execution) ? parsed.execution : undefined, + evaluator: parsed.evaluator, + }; + } catch (error) { + logWarning(`Could not read sidecar metadata from ${sidecarPath}: ${(error as Error).message}`); + return {}; + } +} + +/** + * Parse JSONL file content into raw eval cases. + */ +function parseJsonlContent(content: string, filePath: string): RawJsonlEvalCase[] { + const lines = content.split('\n'); + const cases: RawJsonlEvalCase[] = []; + + for (let i = 0; i < lines.length; i++) { + const line = lines[i].trim(); + if (line === '') continue; // Skip empty lines + + try { + const parsed = JSON.parse(line) as unknown; + if (!isJsonObject(parsed)) { + throw new Error('Expected JSON object'); + } + cases.push(parsed as RawJsonlEvalCase); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Line ${i + 1}: Invalid JSON - ${message}\n File: ${filePath}`); + } + } + + return cases; +} + +/** + * Load eval cases from a JSONL file with optional sidecar YAML metadata. + */ +export async function loadEvalCasesFromJsonl( + evalFilePath: string, + repoRoot: URL | string, + options?: LoadOptions, +): Promise { + const verbose = options?.verbose ?? false; + const evalIdFilter = options?.evalId; + const absoluteTestPath = path.resolve(evalFilePath); + + const repoRootPath = resolveToAbsolutePath(repoRoot); + const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath); + + // Load configuration (walks up directory tree to repo root) + const config = await loadConfig(absoluteTestPath, repoRootPath); + const guidelinePatterns = config?.guideline_patterns; + + // Load sidecar metadata + const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose); + + // Parse JSONL content + const rawFile = await readFile(absoluteTestPath, 'utf8'); + const rawCases = parseJsonlContent(rawFile, evalFilePath); + + // Derive dataset name: sidecar > filename + const fallbackDataset = path.basename(absoluteTestPath, '.jsonl') || 'eval'; + const datasetName = + sidecar.dataset && sidecar.dataset.trim().length > 0 ? sidecar.dataset : fallbackDataset; + + // Global defaults from sidecar + const globalEvaluator = coerceEvaluator(sidecar.evaluator, 'sidecar') ?? 
'llm_judge'; + const globalExecution = sidecar.execution; + + if (verbose) { + console.log(`\n[JSONL Dataset: ${evalFilePath}]`); + console.log(` Cases: ${rawCases.length}`); + console.log(` Dataset name: ${datasetName}`); + if (sidecar.description) { + console.log(` Description: ${sidecar.description}`); + } + } + + const results: EvalCase[] = []; + + for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) { + const evalcase = rawCases[lineIndex]; + const lineNumber = lineIndex + 1; // 1-based for user-facing messages + const id = asString(evalcase.id); + + // Skip eval cases that don't match the filter + if (evalIdFilter && id !== evalIdFilter) { + continue; + } + + const conversationId = asString(evalcase.conversation_id); + // Support both expected_outcome and outcome (backward compatibility) + const outcome = asString(evalcase.expected_outcome) ?? asString(evalcase.outcome); + + const inputMessagesValue = evalcase.input_messages; + const expectedMessagesValue = evalcase.expected_messages; + + if (!id || !outcome || !Array.isArray(inputMessagesValue)) { + logError( + `Skipping incomplete eval case at line ${lineNumber}: ${id ?? 'unknown'}. Missing required fields: id, expected_outcome, and/or input_messages`, + ); + continue; + } + + // expected_messages is optional - for outcome-only evaluation + const hasExpectedMessages = + Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0; + + // V2 format: input_messages vs expected_messages + const inputMessages = inputMessagesValue.filter((msg): msg is TestMessage => + isTestMessage(msg), + ); + const expectedMessages = hasExpectedMessages + ? expectedMessagesValue.filter((msg): msg is TestMessage => isTestMessage(msg)) + : []; + + if (hasExpectedMessages && expectedMessages.length === 0) { + logError(`Line ${lineNumber}: No valid expected message found for eval case: ${id}`); + continue; + } + + const guidelinePaths: string[] = []; + const inputTextParts: string[] = []; + + // Process all input messages to extract files and guidelines + const inputSegments = await processMessages({ + messages: inputMessages, + searchRoots, + repoRootPath, + guidelinePatterns, + guidelinePaths, + textParts: inputTextParts, + messageType: 'input', + verbose, + }); + + // Process expected_messages into segments (only if provided) + // Preserve full message structure including role and tool_calls for expected_messages evaluator + const outputSegments = hasExpectedMessages + ? 
await processExpectedMessages({ + messages: expectedMessages, + searchRoots, + repoRootPath, + verbose, + }) + : []; + + // Build reference_answer: + // Extract the content from the last message in expected_messages (similar to candidate_answer) + let referenceAnswer = ''; + if (outputSegments.length > 0) { + // Get the last message + const lastMessage = outputSegments[outputSegments.length - 1]; + const content = lastMessage.content; + const toolCalls = lastMessage.tool_calls; + + if (typeof content === 'string') { + referenceAnswer = content; + } else if (content !== undefined && content !== null) { + // Serialize just the content, not the entire message + referenceAnswer = JSON.stringify(content, null, 2); + } else if (toolCalls !== undefined && toolCalls !== null) { + // Message with only tool_calls - serialize just the tool_calls + referenceAnswer = JSON.stringify(toolCalls, null, 2); + } + } + const question = inputTextParts + .map((part) => part.trim()) + .filter((part) => part.length > 0) + .join(' '); + + // Merge execution config: per-case overrides sidecar + const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : undefined; + const mergedExecution = caseExecution ?? globalExecution; + + const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator; + let evaluators: Awaited>; + try { + evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? 'unknown'); + } catch (error) { + // Skip entire eval case if evaluator validation fails + const message = error instanceof Error ? error.message : String(error); + logError(`Skipping eval case '${id}' at line ${lineNumber}: ${message}`); + continue; + } + + // Handle inline rubrics field (syntactic sugar) + const inlineRubrics = evalcase.rubrics; + if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { + const rubricItems = inlineRubrics + .filter((r): r is JsonObject | string => isJsonObject(r) || typeof r === 'string') + .map((rubric, index) => { + if (typeof rubric === 'string') { + return { + id: `rubric-${index + 1}`, + description: rubric, + weight: 1.0, + required: true, + }; + } + return { + id: asString(rubric.id) ?? `rubric-${index + 1}`, + description: asString(rubric.description) ?? '', + weight: typeof rubric.weight === 'number' ? rubric.weight : 1.0, + required: typeof rubric.required === 'boolean' ? rubric.required : true, + }; + }) + .filter((r) => r.description.length > 0); + + if (rubricItems.length > 0) { + const rubricEvaluator: import('../types.js').LlmJudgeEvaluatorConfig = { + name: 'rubric', + type: 'llm_judge', + rubrics: rubricItems, + }; + // Prepend rubric evaluator to existing evaluators + evaluators = evaluators ? 
[rubricEvaluator, ...evaluators] : [rubricEvaluator]; + } + } + + // Extract file paths from all input segments (non-guideline files) + const userFilePaths: string[] = []; + for (const segment of inputSegments) { + if (segment.type === 'file' && typeof segment.resolvedPath === 'string') { + userFilePaths.push(segment.resolvedPath); + } + } + + // Combine all file paths (guidelines + regular files) + const allFilePaths = [ + ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)), + ...userFilePaths, + ]; + + const testCase: EvalCase = { + id, + dataset: datasetName, + conversation_id: conversationId, + question: question, + input_messages: inputMessages, + input_segments: inputSegments, + expected_messages: outputSegments, + reference_answer: referenceAnswer, + guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)), + guideline_patterns: guidelinePatterns, + file_paths: allFilePaths, + expected_outcome: outcome, + evaluator: evalCaseEvaluatorKind, + evaluators, + }; + + if (verbose) { + console.log(`\n[Eval Case: ${id}]`); + if (testCase.guideline_paths.length > 0) { + console.log(` Guidelines used: ${testCase.guideline_paths.length}`); + for (const guidelinePath of testCase.guideline_paths) { + console.log(` - ${guidelinePath}`); + } + } else { + console.log(' No guidelines found'); + } + } + + results.push(testCase); + } + + return results; +} + +function asString(value: unknown): string | undefined { + return typeof value === 'string' ? value : undefined; +} + +function logWarning(message: string, details?: readonly string[]): void { + if (details && details.length > 0) { + const detailBlock = details.join('\n'); + console.warn(`${ANSI_YELLOW}Warning: ${message}\n${detailBlock}${ANSI_RESET}`); + } else { + console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`); + } +} + +function logError(message: string, details?: readonly string[]): void { + if (details && details.length > 0) { + const detailBlock = details.join('\n'); + console.error(`${ANSI_RED}Error: ${message}\n${detailBlock}${ANSI_RESET}`); + } else { + console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET}`); + } +} diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index c14826fc..d481f567 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -5,6 +5,7 @@ import { parse } from 'yaml'; import { extractTargetFromSuite, loadConfig } from './loaders/config-loader.js'; import { coerceEvaluator, parseEvaluators } from './loaders/evaluator-parser.js'; import { buildSearchRoots, resolveToAbsolutePath } from './loaders/file-resolver.js'; +import { detectFormat, loadEvalCasesFromJsonl } from './loaders/jsonl-parser.js'; import { processExpectedMessages, processMessages } from './loaders/message-processor.js'; import type { EvalCase, JsonObject, JsonValue, TestMessage } from './types.js'; import { isJsonObject, isTestMessage } from './types.js'; @@ -12,6 +13,7 @@ import { isJsonObject, isTestMessage } from './types.js'; // Re-export public APIs from modules export { buildPromptInputs, type PromptInputs } from './formatting/prompt-builder.js'; export { isGuidelineFile } from './loaders/config-loader.js'; +export { detectFormat } from './loaders/jsonl-parser.js'; const ANSI_YELLOW = '\u001b[33m'; const ANSI_RED = '\u001b[31m'; @@ -62,13 +64,21 @@ export async function readTestSuiteMetadata(testFilePath: string): Promise<{ tar } /** - * Load eval cases from a AgentV YAML 
+ * Load eval cases from an AgentV specification file (YAML or JSONL).
+ * Format is detected by file extension: .yaml/.yml for YAML, .jsonl for JSONL.
+ */
 export async function loadEvalCases(
   evalFilePath: string,
   repoRoot: URL | string,
   options?: LoadOptions,
 ): Promise<EvalCase[]> {
+  // Detect format and route to appropriate parser
+  const format = detectFormat(evalFilePath);
+  if (format === 'jsonl') {
+    return loadEvalCasesFromJsonl(evalFilePath, repoRoot, options);
+  }
+
+  // YAML parsing (existing implementation)
   const verbose = options?.verbose ?? false;
   const evalIdFilter = options?.evalId;
   const absoluteTestPath = path.resolve(evalFilePath);
diff --git a/packages/core/test/evaluation/loaders/jsonl-parser.test.ts b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
new file mode 100644
index 00000000..9107d8d8
--- /dev/null
+++ b/packages/core/test/evaluation/loaders/jsonl-parser.test.ts
@@ -0,0 +1,396 @@
+import { afterAll, beforeAll, describe, expect, it } from 'bun:test';
+import { mkdir, rm, writeFile } from 'node:fs/promises';
+import os from 'node:os';
+import path from 'node:path';
+
+import {
+  detectFormat,
+  loadEvalCasesFromJsonl,
+} from '../../../src/evaluation/loaders/jsonl-parser.js';
+import { loadEvalCases } from '../../../src/evaluation/yaml-parser.js';
+
+describe('detectFormat', () => {
+  it('returns jsonl for .jsonl extension', () => {
+    expect(detectFormat('test.jsonl')).toBe('jsonl');
+    expect(detectFormat('/path/to/dataset.jsonl')).toBe('jsonl');
+  });
+
+  it('returns yaml for .yaml extension', () => {
+    expect(detectFormat('test.yaml')).toBe('yaml');
+    expect(detectFormat('/path/to/config.yaml')).toBe('yaml');
+  });
+
+  it('returns yaml for .yml extension', () => {
+    expect(detectFormat('test.yml')).toBe('yaml');
+    expect(detectFormat('/path/to/config.yml')).toBe('yaml');
+  });
+
+  it('throws for unsupported extensions', () => {
+    expect(() => detectFormat('test.json')).toThrow('Unsupported file format');
+    expect(() => detectFormat('test.txt')).toThrow('Unsupported file format');
+    expect(() => detectFormat('test')).toThrow('Unsupported file format');
+  });
+});
+
+describe('loadEvalCasesFromJsonl', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-jsonl-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('parses valid single-line JSONL', async () => {
+    const jsonlPath = path.join(tempDir, 'single.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[0].expected_outcome).toBe('Goal');
+    expect(cases[0].input_messages).toHaveLength(1);
+    expect(cases[0].input_messages[0].role).toBe('user');
+    expect(cases[0].input_messages[0].content).toBe('Query');
+  });
+
+  it('parses multi-line JSONL', async () => {
+    const jsonlPath = path.join(tempDir, 'multi.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(3);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-2');
+    expect(cases[2].id).toBe('test-3');
+    expect(cases[0].expected_outcome).toBe('Goal 1');
+    expect(cases[1].expected_outcome).toBe('Goal 2');
+    expect(cases[2].expected_outcome).toBe('Goal 3');
+  });
+
+  it('skips empty lines and whitespace-only lines', async () => {
+    const jsonlPath = path.join(tempDir, 'empty-lines.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '   ',
+        '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}',
+        '',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(3);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-2');
+    expect(cases[2].id).toBe('test-3');
+  });
+
+  it('throws error on malformed JSON with line number', async () => {
+    const jsonlPath = path.join(tempDir, 'malformed.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}',
+        '{"id": "test-3", "expected_outcome": "Goal 3" "input_messages": []}', // Missing comma
+      ].join('\n'),
+    );
+
+    await expect(loadEvalCasesFromJsonl(jsonlPath, tempDir)).rejects.toThrow(/Line 3/);
+  });
+
+  it('skips cases with missing required fields', async () => {
+    const jsonlPath = path.join(tempDir, 'missing-fields.jsonl');
+    await writeFile(
+      jsonlPath,
+      [
+        '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}',
+        '{"id": "test-2", "input_messages": [{"role": "user", "content": "Query 2"}]}', // Missing expected_outcome
+        '{"expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}', // Missing id
+        '{"id": "test-4", "expected_outcome": "Goal 4"}', // Missing input_messages
+        '{"id": "test-5", "expected_outcome": "Goal 5", "input_messages": [{"role": "user", "content": "Query 5"}]}',
+      ].join('\n'),
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(2);
+    expect(cases[0].id).toBe('test-1');
+    expect(cases[1].id).toBe('test-5');
+  });
+
+  it('loads sidecar YAML metadata', async () => {
+    const jsonlPath = path.join(tempDir, 'with-sidecar.jsonl');
+    const sidecarPath = path.join(tempDir, 'with-sidecar.yaml');
+
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+    await writeFile(
+      sidecarPath,
+      'description: Test dataset\ndataset: my-tests\nevaluator: llm_judge\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].dataset).toBe('my-tests');
+    expect(cases[0].evaluator).toBe('llm_judge');
+  });
+
+  it('uses default dataset name from filename when no sidecar', async () => {
+    const jsonlPath = path.join(tempDir, 'my-dataset.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
"content": "Query"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].dataset).toBe('my-dataset'); + }); + + it('supports per-case evaluators override', async () => { + const jsonlPath = path.join(tempDir, 'with-evaluators.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "evaluators": [{"name": "rubric-check", "type": "llm_judge", "rubrics": [{"id": "r1", "description": "Must be polite", "weight": 1.0, "required": true}]}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].evaluators).toHaveLength(1); + expect(cases[0].evaluators?.[0].name).toBe('rubric-check'); + }); + + it('supports inline rubrics field', async () => { + const jsonlPath = path.join(tempDir, 'with-rubrics.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "rubrics": ["Must be polite", "Must be helpful"]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].evaluators).toHaveLength(1); + expect(cases[0].evaluators?.[0].type).toBe('llm_judge'); + const rubricEvaluator = cases[0].evaluators?.[0] as { type: string; rubrics?: unknown[] }; + expect(rubricEvaluator.rubrics).toHaveLength(2); + }); + + it('filters by evalId', async () => { + const jsonlPath = path.join(tempDir, 'filter.jsonl'); + await writeFile( + jsonlPath, + [ + '{"id": "test-1", "expected_outcome": "Goal 1", "input_messages": [{"role": "user", "content": "Query 1"}]}', + '{"id": "test-2", "expected_outcome": "Goal 2", "input_messages": [{"role": "user", "content": "Query 2"}]}', + '{"id": "test-3", "expected_outcome": "Goal 3", "input_messages": [{"role": "user", "content": "Query 3"}]}', + ].join('\n'), + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir, { evalId: 'test-2' }); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe('test-2'); + }); + + it('supports conversation_id field', async () => { + const jsonlPath = path.join(tempDir, 'with-conv-id.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "conversation_id": "conv-123", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].conversation_id).toBe('conv-123'); + }); + + it('supports expected_messages field', async () => { + const jsonlPath = path.join(tempDir, 'with-expected.jsonl'); + await writeFile( + jsonlPath, + '{"id": "test-1", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}], "expected_messages": [{"role": "assistant", "content": "Response"}]}\n', + ); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(1); + expect(cases[0].expected_messages).toHaveLength(1); + expect(cases[0].reference_answer).toBe('Response'); + }); + + it('handles empty JSONL file', async () => { + const jsonlPath = path.join(tempDir, 'empty.jsonl'); + await writeFile(jsonlPath, ''); + + const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir); + + expect(cases).toHaveLength(0); + }); + + it('supports backward-compatible outcome field', async () => { + const jsonlPath = path.join(tempDir, 'outcome-field.jsonl'); 
+    await writeFile(
+      jsonlPath,
+      '{"id": "test-1", "outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCasesFromJsonl(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].expected_outcome).toBe('Goal');
+  });
+});
+
+describe('loadEvalCases with format detection', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-loadEvalCases-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('routes .jsonl to JSONL parser', async () => {
+    const jsonlPath = path.join(tempDir, 'test.jsonl');
+    await writeFile(
+      jsonlPath,
+      '{"id": "jsonl-test", "expected_outcome": "Goal", "input_messages": [{"role": "user", "content": "Query"}]}\n',
+    );
+
+    const cases = await loadEvalCases(jsonlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('jsonl-test');
+  });
+
+  it('routes .yaml to YAML parser', async () => {
+    const yamlPath = path.join(tempDir, 'test.yaml');
+    await writeFile(
+      yamlPath,
+      `evalcases:
+  - id: yaml-test
+    expected_outcome: Goal
+    input_messages:
+      - role: user
+        content: Query
+`,
+    );
+
+    const cases = await loadEvalCases(yamlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('yaml-test');
+  });
+
+  it('routes .yml to YAML parser', async () => {
+    const ymlPath = path.join(tempDir, 'test.yml');
+    await writeFile(
+      ymlPath,
+      `evalcases:
+  - id: yml-test
+    expected_outcome: Goal
+    input_messages:
+      - role: user
+        content: Query
+`,
+    );
+
+    const cases = await loadEvalCases(ymlPath, tempDir);
+
+    expect(cases).toHaveLength(1);
+    expect(cases[0].id).toBe('yml-test');
+  });
+
+  it('throws for unsupported extensions via loadEvalCases', async () => {
+    const txtPath = path.join(tempDir, 'test.txt');
+    await writeFile(txtPath, '{}');
+
+    await expect(loadEvalCases(txtPath, tempDir)).rejects.toThrow('Unsupported file format');
+  });
+});
+
+describe('JSONL and YAML produce equivalent EvalCases', () => {
+  let tempDir: string;
+
+  beforeAll(async () => {
+    tempDir = path.join(os.tmpdir(), `agentv-test-equivalence-${Date.now()}`);
+    await mkdir(tempDir, { recursive: true });
+  });
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  it('produces identical EvalCase structure from both formats', async () => {
+    // Create equivalent YAML and JSONL files
+    const yamlPath = path.join(tempDir, 'equiv.yaml');
+    const jsonlPath = path.join(tempDir, 'equiv.jsonl');
+
+    await writeFile(
+      yamlPath,
+      `dataset: my-dataset
+evalcases:
+  - id: test-1
+    expected_outcome: "The agent should respond with a helpful answer"
+    input_messages:
+      - role: user
+        content: "What is 2+2?"
+`,
+    );
+
+    // JSONL with equivalent sidecar
+    const sidecarPath = path.join(tempDir, 'equiv-sidecar.yaml');
+    await writeFile(sidecarPath, 'dataset: my-dataset\n');
+
+    const jsonlPath2 = path.join(tempDir, 'equiv-sidecar.jsonl');
+    await writeFile(
+      jsonlPath2,
+      '{"id": "test-1", "expected_outcome": "The agent should respond with a helpful answer", "input_messages": [{"role": "user", "content": "What is 2+2?"}]}\n',
+    );
+
+    const yamlCases = await loadEvalCases(yamlPath, tempDir);
+    const jsonlCases = await loadEvalCases(jsonlPath2, tempDir);
+
+    expect(yamlCases).toHaveLength(1);
+    expect(jsonlCases).toHaveLength(1);
+
+    // Core fields should match
+    expect(jsonlCases[0].id).toBe(yamlCases[0].id);
+    expect(jsonlCases[0].expected_outcome).toBe(yamlCases[0].expected_outcome);
+    expect(jsonlCases[0].dataset).toBe(yamlCases[0].dataset);
+    expect(jsonlCases[0].input_messages.length).toBe(yamlCases[0].input_messages.length);
+    expect(jsonlCases[0].input_messages[0].role).toBe(yamlCases[0].input_messages[0].role);
+    expect(jsonlCases[0].input_messages[0].content).toBe(yamlCases[0].input_messages[0].content);
+  });
+});