From fba6acec00bded00f86888a097154530ae9248c9 Mon Sep 17 00:00:00 2001 From: Alon Mishne Date: Tue, 23 Sep 2025 13:13:53 -0700 Subject: [PATCH 1/2] feat: add tool logs --- pnpm-lock.yaml | 10 ++++ report-app/package.json | 1 + .../pages/report-viewer/report-viewer.html | 26 ++++++++++ .../pages/report-viewer/report-viewer.scss | 31 ++++++++++- .../app/pages/report-viewer/report-viewer.ts | 2 + .../codegen/gemini-cli/gemini-cli-runner.ts | 2 +- runner/codegen/genkit/genkit-runner.ts | 51 ++++++++++++++++++- runner/codegen/llm-runner.ts | 20 ++++---- runner/orchestration/build.ts | 17 +++---- runner/orchestration/codegen.ts | 5 ++ runner/orchestration/generate.ts | 23 ++++++--- runner/shared-interfaces.ts | 22 ++++++++ 12 files changed, 182 insertions(+), 28 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 802b3d8..785b57a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -165,6 +165,9 @@ importers: jszip: specifier: ^3.10.1 version: 3.10.1 + ngx-json-viewer: + specifier: ^3.2.1 + version: 3.2.1 rxjs: specifier: ~7.8.0 version: 7.8.2 @@ -4969,6 +4972,9 @@ packages: resolution: {integrity: sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==} engines: {node: '>= 0.4.0'} + ngx-json-viewer@3.2.1: + resolution: {integrity: sha512-TTHtXsrBX+IXPqqAIsxklHPqSNmyGeQaziFZbCDJq1PnPOQmTrEHfwNrzN3LnWGhf7UxeM1cK0njegVPChwEcg==} + node-addon-api@6.1.0: resolution: {integrity: sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==} @@ -12211,6 +12217,10 @@ snapshots: netmask@2.0.2: {} + ngx-json-viewer@3.2.1: + dependencies: + tslib: 2.8.1 + node-addon-api@6.1.0: optional: true diff --git a/report-app/package.json b/report-app/package.json index 3dc49ec..4a22670 100644 --- a/report-app/package.json +++ b/report-app/package.json @@ -23,6 +23,7 @@ "@shikijs/themes": "3.13.0", "express": "^4.18.2", "jszip": "^3.10.1", + "ngx-json-viewer": "^3.2.1", "rxjs": "~7.8.0", "shiki": "^3.6.0", "tinyglobby": "^0.2.14", diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index 2f74f08..bb2f95d 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -411,6 +411,32 @@

Debugging Tools

(click)="downloadDebuggingZip(result)"> Download ZIP for debugging + @if (result.toolLogs.length > 0) { + + Tool Logs + + + } @if (finalBuild.runtimeErrors) { diff --git a/report-app/src/app/pages/report-viewer/report-viewer.scss b/report-app/src/app/pages/report-viewer/report-viewer.scss index 23eaa6b..6887773 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.scss +++ b/report-app/src/app/pages/report-viewer/report-viewer.scss @@ -63,7 +63,7 @@ expansion-panel { padding: 0 1rem 1rem; } -.app-details-section expansion-panel { +.app-details-section expansion-panel, .app-details-section button { margin-bottom: 0.5rem; } @@ -233,3 +233,32 @@ expansion-panel { padding: 0px 20px; } +.mcp-log-entry { + border: 1px solid var(--border-color); + border-radius: var(--border-radius); + margin: 1rem 0; + + & > summary { + cursor: pointer; + font-weight: 500; + padding: 1rem 1.5rem; + + &:hover { + background-color: var(--button-active-bg-color); + } + } + + & .mcp-log-content { + padding: 0 1.5rem 1.5rem; + + h5 { + margin-top: 1.5rem; + margin-bottom: 0.5rem; + } + } +} + +.tool-logs-list { + list-style: none; + padding: 0; +} diff --git a/report-app/src/app/pages/report-viewer/report-viewer.ts b/report-app/src/app/pages/report-viewer/report-viewer.ts index 16d8ce2..14acc8b 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.ts +++ b/report-app/src/app/pages/report-viewer/report-viewer.ts @@ -11,6 +11,7 @@ import { signal, viewChild, } from '@angular/core'; +import { NgxJsonViewerModule } from 'ngx-json-viewer'; import { BuildErrorType } from '../../../../../runner/builder/builder-types'; import { AssessmentResult, @@ -55,6 +56,7 @@ import { ProviderLabel } from '../../shared/provider-label'; ExpansionPanel, ExpansionPanelHeader, ProviderLabel, + NgxJsonViewerModule, ], templateUrl: './report-viewer.html', styleUrls: ['./report-viewer.scss'], diff --git a/runner/codegen/gemini-cli/gemini-cli-runner.ts b/runner/codegen/gemini-cli/gemini-cli-runner.ts index b711354..549423e 100644 --- a/runner/codegen/gemini-cli/gemini-cli-runner.ts +++ b/runner/codegen/gemini-cli/gemini-cli-runner.ts @@ -89,7 +89,7 @@ export class GeminiCliRunner implements LlmRunner { }); } - return { files, reasoning }; + return { files, reasoning, toolLogs: [] }; } generateText(): Promise { diff --git a/runner/codegen/genkit/genkit-runner.ts b/runner/codegen/genkit/genkit-runner.ts index cda3bd3..3064cf3 100644 --- a/runner/codegen/genkit/genkit-runner.ts +++ b/runner/codegen/genkit/genkit-runner.ts @@ -1,5 +1,6 @@ import { DynamicResourceAction, + GenerateResponse, genkit, ModelReference, ToolAction, @@ -27,6 +28,7 @@ import { GenkitModelProvider, PromptDataForCounting, } from './model-provider.js'; +import { ToolLogEntry } from '../../shared-interfaces.js'; const globalLogger = new GenkitLogger(); logger.init(globalLogger); @@ -38,6 +40,7 @@ export class GenkitRunner implements LlmRunner { readonly hasBuiltInRepairLoop = false; private readonly genkitInstance = this.getGenkitInstance(); private mcpHost: GenkitMcpHost | null = null; + private toolLogs: ToolLogEntry[] = []; async generateConstrained( options: LlmConstrainedOutputGenerateRequestOptions @@ -75,9 +78,14 @@ export class GenkitRunner implements LlmRunner { files: result.output.outputFiles || [], usage: result.usage, reasoning: result.reasoning, + toolLogs: this.flushToolLogs(), }; } + flushToolLogs(): ToolLogEntry[] { + return this.toolLogs.splice(0); + } + async generateText( options: LlmGenerateTextRequestOptions ): Promise { @@ -87,6 +95,7 @@ export class GenkitRunner implements LlmRunner { text: result.text, usage: result.usage, reasoning: result.reasoning, + toolLogs: this.flushToolLogs(), }; } @@ -120,7 +129,7 @@ export class GenkitRunner implements LlmRunner { ]); } - return this.genkitInstance.generate({ + const response = await this.genkitInstance.generate({ prompt: options.prompt, model, output: schema @@ -145,6 +154,10 @@ export class GenkitRunner implements LlmRunner { resources, abortSignal: options.abortSignal, }); + + this._logToolUsage(response); + + return response; }; return options.timeout @@ -158,6 +171,42 @@ export class GenkitRunner implements LlmRunner { ); } + private _logToolUsage(response: GenerateResponse) { + const toolRequests = new Map(); + const toolResponses = new Map(); + + if (response.request?.messages) { + for (const message of response.request.messages) { + if (!message.content) { + continue; + } + for (const contentPart of message.content) { + if (contentPart.toolRequest) { + toolRequests.set( + contentPart.toolRequest.ref || '0', + contentPart.toolRequest + ); + } else if (contentPart.toolResponse) { + toolResponses.set( + contentPart.toolResponse.ref || '0', + contentPart.toolResponse + ); + } + } + } + } + + for (const [ref, toolRequest] of toolRequests.entries()) { + const toolResponse = toolResponses.get(ref); + if (toolResponse) { + this.toolLogs.push({ + request: toolRequest, + response: toolResponse, + }); + } + } + } + startMcpServerHost(hostName: string, servers: McpServerOptions[]): void { if (this.mcpHost !== null) { throw new Error('MCP host is already started'); diff --git a/runner/codegen/llm-runner.ts b/runner/codegen/llm-runner.ts index 279e99d..9a0a715 100644 --- a/runner/codegen/llm-runner.ts +++ b/runner/codegen/llm-runner.ts @@ -1,5 +1,5 @@ import { z } from 'zod'; -import { LlmResponseFile, Usage } from '../shared-interfaces.js'; +import { LlmResponseFile, ToolLogEntry, Usage } from '../shared-interfaces.js'; import { UserFacingError } from '../utils/errors.js'; export function assertValidModelName(value: string, availableModels: string[]) { @@ -141,22 +141,24 @@ export interface LlmConstrainedOutputGenerateResponse< reasoning: string; } -/** File generation response from the LLM. */ -export interface LlmGenerateFilesResponse { - files: LlmResponseFile[]; +/** LLM response. */ +interface BaseLlmGenerateResponse { /** Token usage data, if available. */ usage?: Partial; /** Reasoning messages from the LLM. */ reasoning: string; + /** Tool requests and responses. */ + toolLogs: ToolLogEntry[]; +} + +/** File generation response from the LLM. */ +export interface LlmGenerateFilesResponse extends BaseLlmGenerateResponse { + files: LlmResponseFile[]; } /** Text response from the LLM. */ -export interface LlmGenerateTextResponse { +export interface LlmGenerateTextResponse extends BaseLlmGenerateResponse { text: string; - /** Token usage data, if available. */ - usage?: Partial; - /** Reasoning messages from the LLM. */ - reasoning: string; } /** Schema for the LLM server options. */ diff --git a/runner/orchestration/build.ts b/runner/orchestration/build.ts index 8117c20..a05a1cd 100644 --- a/runner/orchestration/build.ts +++ b/runner/orchestration/build.ts @@ -4,7 +4,7 @@ import { BuildWorkerMessage, RepairType, } from '../builder/builder-types.js'; -import { LlmRunner } from '../codegen/llm-runner.js'; +import { LlmGenerateFilesResponse, LlmRunner } from '../codegen/llm-runner.js'; import { Environment } from '../configuration/environment.js'; import { AttemptDetails, @@ -44,11 +44,7 @@ export async function attemptBuild( rootPromptDef: RootPromptDefinition, directory: string, contextFiles: LlmContextFile[], - initialResponse: { - usage: Usage; - outputFiles: LlmResponseFile[]; - reasoning: string; - }, + initialResponse: LlmGenerateFilesResponse, attemptDetails: AttemptDetails[], skipScreenshots: boolean, skipAxeTesting: boolean, @@ -72,7 +68,7 @@ export async function attemptBuild( // Clone the original files, because we're going to mutate them between repair // attempts and we don't want the different runs to influence each other. - const finalOutputFiles = initialResponse.outputFiles.map((file) => ({ + const finalOutputFiles = initialResponse.files.map((file) => ({ ...file, })); let buildResult = await workerConcurrencyQueue.add( @@ -86,8 +82,11 @@ export async function attemptBuild( : DEFAULT_MAX_REPAIR_ATTEMPTS; attemptDetails.push({ - outputFiles: initialResponse.outputFiles, - usage: initialResponse.usage, + outputFiles: initialResponse.files, + usage: { + ...{ inputTokens: 0, outputTokens: 0, totalTokens: 0 }, + ...initialResponse.usage, + }, reasoning: initialResponse.reasoning, buildResult, attempt: 0, diff --git a/runner/orchestration/codegen.ts b/runner/orchestration/codegen.ts index 3975e27..1550f6f 100644 --- a/runner/orchestration/codegen.ts +++ b/runner/orchestration/codegen.ts @@ -3,6 +3,7 @@ import { LlmResponse, LlmResponseFile, RootPromptDefinition, + ToolLogEntry, Usage, } from '../shared-interfaces.js'; import { @@ -49,6 +50,7 @@ export async function generateCodeWithAI( let usage: Usage; let success: boolean; let reasoning: string; + let toolLogs: ToolLogEntry[]; const contextMessageData = prepareContextFilesMessage(contextFiles); const messages: PromptDataMessage[] | undefined = contextMessageData @@ -72,6 +74,7 @@ export async function generateCodeWithAI( totalTokens: response.usage?.totalTokens ?? 0, }; reasoning = response.reasoning; + toolLogs = response.toolLogs ?? []; progress.log( promptDef, @@ -100,6 +103,7 @@ export async function generateCodeWithAI( usage = { inputTokens: 0, outputTokens: 0, totalTokens: 0 }; success = false; reasoning = ''; + toolLogs = []; errors.push(error + ''); progress.log( promptDef, @@ -117,6 +121,7 @@ export async function generateCodeWithAI( errors, usage, reasoning, + toolLogs, } satisfies LlmResponse; } diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index f267b6c..19b7ad8 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -5,7 +5,11 @@ import { randomUUID } from 'crypto'; import PQueue from 'p-queue'; import { basename, join } from 'path'; import { existsSync, readdirSync } from 'fs'; -import { LlmGenerateFilesContext, LlmRunner } from '../codegen/llm-runner.js'; +import { + LlmGenerateFilesContext, + LlmGenerateFilesResponse, + LlmRunner, +} from '../codegen/llm-runner.js'; import { DEFAULT_AUTORATER_MODEL_NAME, LLM_OUTPUT_DIR, @@ -346,6 +350,8 @@ async function startEvaluationTask( progress ); + const toolLogs = initialResponse.toolLogs; + if (!initialResponse) { progress.log( promptDef, @@ -363,7 +369,7 @@ async function startEvaluationTask( // Write the generated files to disk within the project directory. await writeResponseFiles( directory, - initialResponse.outputFiles, + initialResponse.files, env, rootPromptDef.name ); @@ -373,7 +379,7 @@ async function startEvaluationTask( if (rootPromptDef.kind === 'multi-step') { await writeResponseFiles( directory, - initialResponse.outputFiles, + initialResponse.files, env, promptDef.name ); @@ -400,7 +406,7 @@ async function startEvaluationTask( ratingLlm, rootPromptDef.name, defsToExecute[0].prompt, - initialResponse.outputFiles, + initialResponse.files, abortSignal ); @@ -469,6 +475,7 @@ async function startEvaluationTask( attemptDetails, userJourneys: userJourneys, axeRepairAttempts: attempt.axeRepairAttempts, + toolLogs, } satisfies AssessmentResult); } @@ -497,7 +504,7 @@ async function generateInitialFiles( localMode: boolean, abortSignal: AbortSignal, progress: ProgressLogger -) { +): Promise { if (localMode) { const localFilesDirectory = join(LLM_OUTPUT_DIR, env.id, promptDef.name); const filePaths = globSync('**/*', { cwd: localFilesDirectory }); @@ -509,7 +516,7 @@ async function generateInitialFiles( } return { - outputFiles: await Promise.all( + files: await Promise.all( filePaths.map(async (filePath) => ({ filePath, code: await readFile(join(localFilesDirectory, filePath), 'utf8'), @@ -521,6 +528,7 @@ async function generateInitialFiles( } satisfies Usage, // TODO: We could also try save/restore reasoning locally. reasoning: '', + toolLogs: [], }; } @@ -542,9 +550,10 @@ async function generateInitialFiles( } return { - outputFiles: response.outputFiles!, + files: response.outputFiles!, usage: response.usage, reasoning: response.reasoning, + toolLogs: response.toolLogs, }; } diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index 42648bc..28c2d6b 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -105,6 +105,8 @@ export interface LlmResponse { usage: Usage; /** Reasoning messages from the LLM for generating this response. */ reasoning: string; + /** Tool requests logs (e.g. MCP requests and responses). */ + toolLogs: ToolLogEntry[]; } /** Error response from an LLM API. */ @@ -366,6 +368,24 @@ export interface RunDetails { systemInstructionsPrompt?: string; } +/** + * Logs for a single tool request and response (e.g. an MCP tool). + * + * Fields are coming from GenerateRequestSchema. + */ +export interface ToolLogEntry { + request: { + name: string; + ref?: string | undefined; + input?: unknown; + }; + response: { + name: string; + output?: unknown; + ref?: string | undefined; + }; +} + /** * Encapsulates all results and details for the assessment of a single prompt. * This includes the original prompt definition, the final generated code, @@ -388,6 +408,8 @@ export interface AssessmentResult { userJourneys?: UserJourneysResult; /** The number of repair attempts made after the axe initial failures. */ axeRepairAttempts: number; + /** Tool requests logs (e.g. MCP requests and responses). */ + toolLogs: ToolLogEntry[]; } /** From f5baadd3081170b8851e7ef7f975b75b0583f5fc Mon Sep 17 00:00:00 2001 From: Alon Mishne Date: Wed, 24 Sep 2025 14:06:38 -0700 Subject: [PATCH 2/2] fix: make toolLogs optional --- report-app/src/app/pages/report-viewer/report-viewer.html | 2 +- runner/codegen/llm-runner.ts | 2 +- runner/orchestration/generate.ts | 4 ++-- runner/shared-interfaces.ts | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/report-app/src/app/pages/report-viewer/report-viewer.html b/report-app/src/app/pages/report-viewer/report-viewer.html index bb2f95d..2b52b13 100644 --- a/report-app/src/app/pages/report-viewer/report-viewer.html +++ b/report-app/src/app/pages/report-viewer/report-viewer.html @@ -411,7 +411,7 @@

Debugging Tools

(click)="downloadDebuggingZip(result)"> Download ZIP for debugging - @if (result.toolLogs.length > 0) { + @if (result.toolLogs && result.toolLogs.length > 0) { Tool Logs
    diff --git a/runner/codegen/llm-runner.ts b/runner/codegen/llm-runner.ts index 9a0a715..dd4b6f9 100644 --- a/runner/codegen/llm-runner.ts +++ b/runner/codegen/llm-runner.ts @@ -148,7 +148,7 @@ interface BaseLlmGenerateResponse { /** Reasoning messages from the LLM. */ reasoning: string; /** Tool requests and responses. */ - toolLogs: ToolLogEntry[]; + toolLogs?: ToolLogEntry[]; } /** File generation response from the LLM. */ diff --git a/runner/orchestration/generate.ts b/runner/orchestration/generate.ts index 19b7ad8..af40f13 100644 --- a/runner/orchestration/generate.ts +++ b/runner/orchestration/generate.ts @@ -350,7 +350,7 @@ async function startEvaluationTask( progress ); - const toolLogs = initialResponse.toolLogs; + const toolLogs = initialResponse.toolLogs ?? []; if (!initialResponse) { progress.log( @@ -688,7 +688,7 @@ async function installChrome(): Promise { try { await chromeInstallPromise; - } catch {} + } catch {} // Ignore errors here, as it might be already installed. } /** diff --git a/runner/shared-interfaces.ts b/runner/shared-interfaces.ts index 28c2d6b..4c33409 100644 --- a/runner/shared-interfaces.ts +++ b/runner/shared-interfaces.ts @@ -106,7 +106,7 @@ export interface LlmResponse { /** Reasoning messages from the LLM for generating this response. */ reasoning: string; /** Tool requests logs (e.g. MCP requests and responses). */ - toolLogs: ToolLogEntry[]; + toolLogs?: ToolLogEntry[]; } /** Error response from an LLM API. */ @@ -409,7 +409,7 @@ export interface AssessmentResult { /** The number of repair attempts made after the axe initial failures. */ axeRepairAttempts: number; /** Tool requests logs (e.g. MCP requests and responses). */ - toolLogs: ToolLogEntry[]; + toolLogs?: ToolLogEntry[]; } /**