Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/workspace-sync.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"agentv": minor
---

Add `agentv workspace` commands for creating and syncing a workspace, and an `eval --workspace-root` option that sets the default working directory for agentic providers.
8 changes: 8 additions & 0 deletions .claude/skills/agentv-eval-builder/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ description: Create and maintain AgentV YAML evaluation files for testing AI age
- Batch CLI: `references/batch-cli-evaluator.md` - Evaluate batch runner output (JSONL)
- Compare: `references/compare-command.md` - Compare evaluation results between runs

## Workspace-Aware Runs

If your target provider needs a consistent working directory (e.g., prompt files, skills, fixtures), create and sync a workspace folder and then run evals with `--workspace-root`.

- `agentv workspace create --out <dir>` writes `<dir>/.agentv/workspace.yaml` (`--workspace-root` is accepted as an alias for `--out`)
- `agentv workspace sync --config <dir>/.agentv/workspace.yaml` refreshes all configured sources
- `agentv eval --workspace-root <dir> ...` uses `<dir>` as the default `cwd` for `cli`, `codex`, `claude-code`, and `pi-coding-agent` targets, and as the default `workspaceTemplate` for `vscode`/`vscode-insiders` targets, whenever those fields are not set in `targets.yaml`

## Structure Requirements
- Root level: `description` (optional), `execution` (with `target`), `evalcases` (required)
- Eval case fields: `id` (required), `expected_outcome` (required), `input_messages` (required)
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,25 @@ agentv eval --eval-id case-123 evals/my-eval.yaml

# Dry-run with mock provider
agentv eval --dry-run evals/my-eval.yaml

# Use ./agent-workspace as the default cwd/workspace template for agentic providers
agentv eval --workspace-root ./agent-workspace evals/my-eval.yaml
```

See `agentv eval --help` for all options: workers, timeouts, output formats, trace dumping, and more.

### Workspace Sync

Use `agentv workspace` to build and refresh a local working folder from multiple sources (local folders or git repos). This is useful for agentic targets that need a consistent working directory.

```bash
# Create a new workspace config
agentv workspace create --out ./agent-workspace

# Edit ./agent-workspace/.agentv/workspace.yaml to add sources, then sync
agentv workspace sync --config ./agent-workspace/.agentv/workspace.yaml
```

### Create Custom Evaluators

Write code judges in Python or TypeScript:
Expand Down
15 changes: 15 additions & 0 deletions apps/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,25 @@ agentv eval --eval-id case-123 evals/my-eval.yaml

# Dry-run with mock provider
agentv eval --dry-run evals/my-eval.yaml

# Use ./agent-workspace as the default cwd/workspace template for agentic providers
agentv eval --workspace-root ./agent-workspace evals/my-eval.yaml
```

See `agentv eval --help` for all options: workers, timeouts, output formats, trace dumping, and more.

### Workspace Sync

Use `agentv workspace` to build and refresh a local working folder from multiple sources (local folders or git repos). This is useful for agentic targets that need a consistent working directory.

```bash
# Create a new workspace config
agentv workspace create --out ./agent-workspace

# Edit ./agent-workspace/.agentv/workspace.yaml to add sources, then sync
agentv workspace sync --config ./agent-workspace/.agentv/workspace.yaml
```

### Create Custom Evaluators

Write code judges in Python or TypeScript:
Expand Down
5 changes: 1 addition & 4 deletions apps/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@
"bin": {
"agentv": "./dist/cli.js"
},
"files": [
"dist",
"README.md"
],
"files": ["dist", "README.md"],
"scripts": {
"dev": "bun --watch src/index.ts",
"build": "tsup && bun run copy-readme",
Expand Down
7 changes: 7 additions & 0 deletions apps/cli/src/commands/eval/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ export const evalCommand = command({
description: 'Retry count for timeout recoveries (default: 2)',
defaultValue: () => 2,
}),
workspaceRoot: option({
type: optional(string),
long: 'workspace-root',
description:
'Default workspace root for agentic providers (applied as target cwd/workspaceTemplate when not set in targets.yaml)',
}),
cache: flag({
long: 'cache',
description: 'Enable in-memory provider response cache',
Expand All @@ -117,6 +123,7 @@ export const evalCommand = command({
dryRunDelayMax: args.dryRunDelayMax,
agentTimeout: args.agentTimeout,
maxRetries: args.maxRetries,
workspaceRoot: args.workspaceRoot,
cache: args.cache,
verbose: args.verbose,
};
Expand Down
115 changes: 111 additions & 4 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ interface NormalizedOptions {
readonly dryRunDelayMax: number;
readonly agentTimeoutSeconds: number;
readonly maxRetries: number;
readonly workspaceRoot?: string;
readonly cache: boolean;
readonly verbose: boolean;
}
Expand Down Expand Up @@ -93,6 +94,7 @@ function normalizeOptions(rawOptions: Record<string, unknown>): NormalizedOption
dryRunDelayMax: normalizeNumber(rawOptions.dryRunDelayMax, 0),
agentTimeoutSeconds: normalizeNumber(rawOptions.agentTimeout, 120),
maxRetries: normalizeNumber(rawOptions.maxRetries, 2),
workspaceRoot: normalizeString(rawOptions.workspaceRoot),
cache: normalizeBoolean(rawOptions.cache),
verbose: normalizeBoolean(rawOptions.verbose),
} satisfies NormalizedOptions;
Expand Down Expand Up @@ -217,18 +219,119 @@ function applyVerboseOverride(selection: TargetSelection, cliVerbose: boolean):
};
}

/**
 * Applies a CLI-supplied workspace root as the default working location of the
 * resolved target, without overriding a value the target already carries.
 *
 * - `vscode` / `vscode-insiders`: fills `config.workspaceTemplate`.
 * - `cli`, `codex`, `pi-coding-agent`, `claude-code`: fills `config.cwd`.
 * - Any other target kind is returned untouched.
 *
 * A field counts as "already set" only when it is a string containing at least
 * one non-whitespace character. The input selection is never mutated; when a
 * default is applied, a shallow copy with the updated `config` is returned.
 *
 * @param selection - The target selection whose resolved target may receive the default.
 * @param workspaceRoot - Workspace root to apply; surrounding whitespace is trimmed,
 *   and a missing or blank value leaves the selection as-is.
 * @returns The original `selection` when nothing changes, otherwise an updated copy.
 */
export function applyWorkspaceRootOverride(
  selection: TargetSelection,
  workspaceRoot?: string,
): TargetSelection {
  const trimmedRoot = workspaceRoot?.trim();
  if (trimmedRoot === undefined || trimmedRoot.length === 0) {
    return selection;
  }

  // True when the target already specifies a usable (non-blank) string value.
  const isConfigured = (value: unknown): boolean =>
    typeof value === 'string' && value.trim().length > 0;

  const target = selection.resolvedTarget;

  // Each `cwd` kind keeps its own case so the discriminated union narrows to a
  // single variant before `config` is spread.
  switch (target.kind) {
    case 'vscode':
    case 'vscode-insiders': {
      if (isConfigured(target.config.workspaceTemplate)) {
        return selection;
      }
      const config = { ...target.config, workspaceTemplate: trimmedRoot };
      return { ...selection, resolvedTarget: { ...target, config } };
    }

    case 'cli': {
      if (isConfigured(target.config.cwd)) {
        return selection;
      }
      const config = { ...target.config, cwd: trimmedRoot };
      return { ...selection, resolvedTarget: { ...target, config } };
    }

    case 'codex': {
      if (isConfigured(target.config.cwd)) {
        return selection;
      }
      const config = { ...target.config, cwd: trimmedRoot };
      return { ...selection, resolvedTarget: { ...target, config } };
    }

    case 'pi-coding-agent': {
      if (isConfigured(target.config.cwd)) {
        return selection;
      }
      const config = { ...target.config, cwd: trimmedRoot };
      return { ...selection, resolvedTarget: { ...target, config } };
    }

    case 'claude-code': {
      if (isConfigured(target.config.cwd)) {
        return selection;
      }
      const config = { ...target.config, cwd: trimmedRoot };
      return { ...selection, resolvedTarget: { ...target, config } };
    }

    default:
      return selection;
  }
}

async function prepareFileMetadata(params: {
readonly testFilePath: string;
readonly repoRoot: string;
readonly cwd: string;
readonly workspaceRoot?: string;
readonly options: NormalizedOptions;
}): Promise<{
readonly evalIds: readonly string[];
readonly evalCases: readonly EvalCase[];
readonly selection: TargetSelection;
readonly inlineTargetLabel: string;
}> {
const { testFilePath, repoRoot, cwd, options } = params;
const { testFilePath, repoRoot, cwd, options, workspaceRoot } = params;

await ensureFileExists(testFilePath, 'Test file');
await loadEnvFromHierarchy({
Expand All @@ -250,10 +353,12 @@ async function prepareFileMetadata(params: {
env: process.env,
});

const selectionWithWorkspaceRoot = applyWorkspaceRootOverride(selection, workspaceRoot);

const providerLabel = options.dryRun
? `${selection.resolvedTarget.kind} (dry-run)`
: selection.resolvedTarget.kind;
const inlineTargetLabel = `${selection.targetName} [provider=${providerLabel}]`;
? `${selectionWithWorkspaceRoot.resolvedTarget.kind} (dry-run)`
: selectionWithWorkspaceRoot.resolvedTarget.kind;
const inlineTargetLabel = `${selectionWithWorkspaceRoot.targetName} [provider=${providerLabel}]`;

const evalCases = await loadEvalCases(testFilePath, repoRoot, {
verbose: options.verbose,
Expand Down Expand Up @@ -404,6 +509,7 @@ async function runSingleEvalFile(params: {

export async function runEvalCommand(input: RunEvalCommandInput): Promise<void> {
const options = normalizeOptions(input.rawOptions);
const workspaceRoot = options.workspaceRoot ? path.resolve(options.workspaceRoot) : undefined;
const cwd = process.cwd();
const repoRoot = await findRepoRoot(cwd);

Expand Down Expand Up @@ -447,6 +553,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
testFilePath,
repoRoot,
cwd,
workspaceRoot,
options,
});
fileMetadata.set(testFilePath, meta);
Expand Down
Loading