Skip to content

Commit cdc7475

Browse files
committed
🤖 feat: add web_fetch tool for fetching web pages as markdown
Adds a new `web_fetch` tool that:
- Fetches web pages using curl via the Runtime (respects workspace network context)
- Extracts main content using Mozilla Readability
- Converts to clean markdown using Turndown
- Supports both markdown and plain text output formats

Features:
- Network isolation: requests originate from the workspace, not the Mux host
- Curl handles redirects, SSL, encoding, and compression natively
- Output is truncated to 64KB; HTML input is limited to 5MB
- Graceful error handling for DNS failures, timeouts, and empty responses

Dependencies added:
- @mozilla/readability: article extraction
- jsdom: DOM parsing for Readability
- turndown: HTML to markdown conversion

Also updates the AGENTS.md testing section to clarify the preferred test types:
1. True integration tests (no mocks)
2. Unit tests on pure/isolated logic

_Generated with `mux`_
1 parent 6830b71 commit cdc7475

File tree

9 files changed

+514
-9
lines changed

9 files changed

+514
-9
lines changed

bun.lock

Lines changed: 110 additions & 9 deletions
Large diffs are not rendered by default.

docs/AGENTS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ gh pr view <number> --json mergeable,mergeStateStatus | jq '.'
5757

5858
## Testing Doctrine
5959

60+
Two types of tests are preferred:
61+
62+
1. **True integration tests** — use real runtimes, real filesystems, real network calls. No mocks, stubs, or fakes. These prove the system works end-to-end.
63+
2. **Unit tests on pure/isolated logic** — test pure functions or well-isolated modules where inputs and outputs are clear. No mocks needed because the code has no external dependencies.
64+
65+
Avoid mock-heavy tests that verify implementation details rather than behavior. If you need mocks to test something, consider whether the code should be restructured to be more testable.
66+
6067
### Storybook
6168

6269
- Prefer full-app stories (`App.stories.tsx`) to isolated components.

package.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
"@ai-sdk/openai": "^2.0.72",
5151
"@ai-sdk/xai": "^2.0.36",
5252
"@lydell/node-pty": "1.1.0",
53+
"@mozilla/readability": "^0.6.0",
5354
"@openrouter/ai-sdk-provider": "^1.2.5",
5455
"@radix-ui/react-checkbox": "^1.3.3",
5556
"@radix-ui/react-dialog": "^1.1.15",
@@ -72,6 +73,7 @@
7273
"electron-updater": "^6.6.2",
7374
"express": "^5.1.0",
7475
"ghostty-web": "0.2.1",
76+
"jsdom": "^27.2.0",
7577
"jsonc-parser": "^3.3.1",
7678
"lru-cache": "^11.2.2",
7779
"lucide-react": "^0.553.0",
@@ -83,6 +85,7 @@
8385
"shescape": "^2.1.6",
8486
"source-map-support": "^0.5.21",
8587
"streamdown": "^1.4.0",
88+
"turndown": "^7.2.2",
8689
"undici": "^7.16.0",
8790
"write-file-atomic": "^6.0.0",
8891
"ws": "^8.18.3",
@@ -106,11 +109,13 @@
106109
"@types/escape-html": "^1.0.4",
107110
"@types/express": "^5.0.3",
108111
"@types/jest": "^30.0.0",
112+
"@types/jsdom": "^27.0.0",
109113
"@types/katex": "^0.16.7",
110114
"@types/markdown-it": "^14.1.2",
111115
"@types/minimist": "^1.2.5",
112116
"@types/react": "^18.2.0",
113117
"@types/react-dom": "^18.2.0",
118+
"@types/turndown": "^5.0.6",
114119
"@types/write-file-atomic": "^4.0.3",
115120
"@types/ws": "^8.18.1",
116121
"@typescript-eslint/eslint-plugin": "^8.44.1",

src/common/constants/toolLimits.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,8 @@ export const BASH_MAX_LINE_BYTES = 1024; // 1KB per line for AI agent
1717
export const MAX_TODOS = 7; // Maximum number of TODO items in a list
1818

1919
export const STATUS_MESSAGE_MAX_LENGTH = 60; // Maximum length for status messages (auto-truncated)
20+
21+
// Web fetch tool limits
22+
/** Timeout, in seconds, applied to the curl request made by the web fetch tool. */
export const WEB_FETCH_TIMEOUT_SECS = 15; // curl timeout
23+
/** Upper bound, in bytes (64 * 1024), on the converted output returned by the web fetch tool. */
export const WEB_FETCH_MAX_OUTPUT_BYTES = 64 * 1024; // 64KB markdown output
24+
/** Upper bound, in bytes (5 * 1024 * 1024), on the HTML input; intended for curl's --max-filesize. */
export const WEB_FETCH_MAX_HTML_BYTES = 5 * 1024 * 1024; // 5MB HTML input (curl --max-filesize)

src/common/types/tools.ts

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,3 +201,23 @@ export type StatusSetToolResult =
201201
success: false;
202202
error: string;
203203
};
204+
205+
// Web Fetch Tool Types
206+
/** Arguments accepted by the web_fetch tool. */
export interface WebFetchToolArgs {
207+
/** The URL to fetch. */
url: string;
208+
/** Requested output format; optional. NOTE(review): the tool schema describes "markdown" as the default — confirm in the implementation. */
format?: "markdown" | "text";
209+
}
210+
211+
/**
 * Result of a web_fetch call: a discriminated union keyed on `success`.
 * Narrow on `success` before reading the variant-specific fields.
 */
export type WebFetchToolResult =
212+
// Success variant: the page was fetched and converted.
| {
213+
success: true;
214+
// Page title.
title: string;
215+
// Extracted page content in the requested format.
content: string;
216+
// URL associated with the result — presumably the requested URL; confirm against the implementation.
url: string;
217+
// Optional author/byline — presumably as reported by the article extractor; confirm.
byline?: string;
218+
// NOTE(review): unit is not visible here (characters vs bytes, raw vs converted) — confirm.
length: number;
219+
}
220+
// Failure variant: carries a human-readable error message.
| {
221+
success: false;
222+
error: string;
223+
};

src/common/utils/tools/toolDefinitions.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
BASH_MAX_LINE_BYTES,
1313
BASH_MAX_TOTAL_BYTES,
1414
STATUS_MESSAGE_MAX_LENGTH,
15+
WEB_FETCH_MAX_OUTPUT_BYTES,
1516
} from "@/common/constants/toolLimits";
1617
import { TOOL_EDIT_WARNING } from "@/common/types/tools";
1718

@@ -228,6 +229,20 @@ export const TOOL_DEFINITIONS = {
228229
})
229230
.strict(),
230231
},
232+
// Definition of the web_fetch tool: model-facing description plus the Zod schema for its arguments.
web_fetch: {
233+
description:
234+
// The advertised truncation size below is derived from WEB_FETCH_MAX_OUTPUT_BYTES so the text tracks the constant.
`Fetch a web page and extract its main content as clean markdown. ` +
235+
`Uses the workspace's network context (requests originate from the workspace, not Mux host). ` +
236+
`Requires curl to be installed in the workspace. ` +
237+
`Output is truncated to ${Math.floor(WEB_FETCH_MAX_OUTPUT_BYTES / 1024)}KB.`,
238+
schema: z.object({
239+
// NOTE(review): the description says "http or https", but no scheme restriction is visible here and the
// tests in this commit call the tool with file:// URLs — confirm whether other schemes should be rejected.
url: z.string().url().describe("The URL to fetch (http or https)"),
240+
// Optional; the description documents 'markdown' as the default when omitted.
format: z
241+
.enum(["markdown", "text"])
242+
.optional()
243+
.describe("Output format: 'markdown' (default) preserves structure, 'text' is plain text"),
244+
}),
245+
},
231246
} as const;
232247

233248
/**
@@ -268,6 +283,7 @@ export function getAvailableTools(modelString: string): string[] {
268283
"todo_write",
269284
"todo_read",
270285
"status_set",
286+
"web_fetch",
271287
];
272288

273289
// Add provider-specific tools

src/common/utils/tools/tools.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { createFileEditInsertTool } from "@/node/services/tools/file_edit_insert
77
import { createProposePlanTool } from "@/node/services/tools/propose_plan";
88
import { createTodoWriteTool, createTodoReadTool } from "@/node/services/tools/todo";
99
import { createStatusSetTool } from "@/node/services/tools/status_set";
10+
import { createWebFetchTool } from "@/node/services/tools/web_fetch";
1011
import { wrapWithInitWait } from "@/node/services/tools/wrapWithInitWait";
1112
import { log } from "@/node/services/log";
1213

@@ -95,6 +96,7 @@ export async function getToolsForModel(
9596
// and line number miscalculations. Use file_edit_replace_string instead.
9697
// file_edit_replace_lines: wrap(createFileEditReplaceLinesTool(config)),
9798
bash: wrap(createBashTool(config)),
99+
web_fetch: wrap(createWebFetchTool(config)),
98100
};
99101

100102
// Non-runtime tools execute immediately (no init wait needed)
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
import { describe, it, expect } from "bun:test";
2+
import { createWebFetchTool } from "./web_fetch";
3+
import type { WebFetchToolArgs, WebFetchToolResult } from "@/common/types/tools";
4+
import { WEB_FETCH_MAX_OUTPUT_BYTES } from "@/common/constants/toolLimits";
5+
import { TestTempDir, createTestToolConfig } from "./testHelpers";
6+
import * as fs from "fs/promises";
7+
import * as path from "path";
8+
9+
import type { ToolCallOptions } from "ai";
10+
11+
// ToolCallOptions stub for testing
12+
// Shared second argument for every tool.execute call in this file: a fixed call id and no prior messages.
const toolCallOptions: ToolCallOptions = {
13+
toolCallId: "test-call-id",
14+
messages: [],
15+
};
16+
17+
// Helper to create web_fetch tool with real LocalRuntime
18+
/**
 * Builds a web_fetch tool bound to a fresh temporary directory.
 *
 * The returned object is disposable, so callers can declare it with `using`
 * and have the temporary directory cleaned up when the test scope exits.
 *
 * @returns The tool under test, its temp directory, and a dispose hook that
 *          forwards to the temp directory's own disposer.
 */
function createTestWebFetchTool() {
  const tempDir = new TestTempDir("test-web-fetch");
  const tool = createWebFetchTool(createTestToolConfig(tempDir.path));

  // Disposal is delegated entirely to the temp directory.
  const dispose = (): void => {
    tempDir[Symbol.dispose]();
  };

  return { tool, tempDir, [Symbol.dispose]: dispose };
}
31+
32+
/** Disposable test environment produced by createTestWebFetchTool(). */
type WebFetchTestEnv = ReturnType<typeof createTestWebFetchTool>;

/**
 * Invokes the web_fetch tool once with the shared toolCallOptions.
 *
 * Centralises the "execute may be undefined" guard and the result cast so
 * individual tests do not each need a non-null assertion.
 *
 * @throws Error if the tool was created without an execute handler.
 */
async function runWebFetch(
  testEnv: WebFetchTestEnv,
  args: WebFetchToolArgs
): Promise<WebFetchToolResult> {
  if (!testEnv.tool.execute) {
    throw new Error("web_fetch tool does not expose an execute() handler");
  }
  return (await testEnv.tool.execute(args, toolCallOptions)) as WebFetchToolResult;
}

/** Writes `html` to `fileName` inside the test's temp dir and returns a file:// URL for it. */
async function writeHtmlFixture(
  testEnv: WebFetchTestEnv,
  fileName: string,
  html: string
): Promise<string> {
  const htmlPath = path.join(testEnv.tempDir.path, fileName);
  await fs.writeFile(htmlPath, html);
  return `file://${htmlPath}`;
}

/**
 * Narrows a result to its success variant.
 * Throws with the tool's own error text so a failing test shows *why* the fetch failed,
 * instead of a bare "expected true, received false".
 */
function expectSuccess(
  result: WebFetchToolResult
): Extract<WebFetchToolResult, { success: true }> {
  if (!result.success) {
    throw new Error(`Expected web_fetch to succeed, but it failed with: ${result.error}`);
  }
  return result;
}

/** Narrows a result to its failure variant and returns the error message. */
function expectFailure(result: WebFetchToolResult): string {
  if (result.success) {
    throw new Error(`Expected web_fetch to fail, but it succeeded for ${result.url}`);
  }
  return result.error;
}

describe("web_fetch tool", () => {
  // Integration test: fetch a real public URL
  it("should fetch and convert a real web page to markdown", async () => {
    using testEnv = createTestWebFetchTool();

    // example.com is a stable, simple HTML page maintained by IANA
    const result = expectSuccess(await runWebFetch(testEnv, { url: "https://example.com" }));

    expect(result.title).toContain("Example Domain");
    expect(result.url).toBe("https://example.com");
    // example.com mentions documentation examples
    expect(result.content).toContain("documentation");
    expect(result.length).toBeGreaterThan(0);
  });

  it("should return plain text when format is 'text'", async () => {
    using testEnv = createTestWebFetchTool();

    const result = expectSuccess(
      await runWebFetch(testEnv, { url: "https://example.com", format: "text" })
    );

    // Plain text should not have markdown formatting
    expect(result.content).not.toContain("#");
    expect(result.content).not.toContain("**");
    // example.com content mentions documentation
    expect(result.content).toContain("documentation");
  });

  it("should handle DNS failure gracefully", async () => {
    using testEnv = createTestWebFetchTool();

    // .invalid TLD is reserved and guaranteed to never resolve
    const error = expectFailure(
      await runWebFetch(testEnv, { url: "https://this-domain-does-not-exist.invalid/page" })
    );

    expect(error).toContain("Failed to fetch URL");
  });

  it("should handle connection refused gracefully", async () => {
    using testEnv = createTestWebFetchTool();

    // localhost on a high port that is expected to have no listener
    const error = expectFailure(
      await runWebFetch(testEnv, { url: "http://127.0.0.1:59999/page" })
    );

    expect(error).toContain("Failed to fetch URL");
  });

  // Test with a local file served via file:// - tests HTML parsing without network
  it("should handle local HTML content via file:// URL", async () => {
    using testEnv = createTestWebFetchTool();

    const url = await writeHtmlFixture(
      testEnv,
      "test.html",
      `
<!DOCTYPE html>
<html>
<head><title>Local Test Page</title></head>
<body>
  <article>
    <h1>Test Heading</h1>
    <p>This is test content with <strong>bold</strong> and <em>italic</em> text.</p>
  </article>
</body>
</html>`
    );

    const result = expectSuccess(await runWebFetch(testEnv, { url }));

    expect(result.title).toBe("Local Test Page");
    expect(result.content).toContain("Test Heading");
    expect(result.content).toContain("**bold**");
    expect(result.content).toContain("_italic_");
  });

  it("should truncate oversized output from local file", async () => {
    using testEnv = createTestWebFetchTool();

    // Create HTML that will produce content larger than WEB_FETCH_MAX_OUTPUT_BYTES
    const largeContent = "x".repeat(WEB_FETCH_MAX_OUTPUT_BYTES + 1000);
    const url = await writeHtmlFixture(
      testEnv,
      "large.html",
      `
<!DOCTYPE html>
<html>
<head><title>Large Page</title></head>
<body><article><p>${largeContent}</p></article></body>
</html>`
    );

    const result = expectSuccess(await runWebFetch(testEnv, { url }));

    // The limit is expressed in bytes, so measure encoded bytes rather than
    // string length (UTF-16 code units), which undercounts non-ASCII output.
    expect(Buffer.byteLength(result.content, "utf8")).toBeLessThanOrEqual(
      WEB_FETCH_MAX_OUTPUT_BYTES + 100 // Allow for truncation message
    );
    expect(result.content).toContain("[Content truncated]");
  });

  it("should handle non-article HTML gracefully", async () => {
    using testEnv = createTestWebFetchTool();

    // Minimal HTML that Readability may not parse as an article
    const url = await writeHtmlFixture(
      testEnv,
      "minimal.html",
      "<html><body><p>Just some text</p></body></html>"
    );

    const result = await runWebFetch(testEnv, { url });

    // Either outcome is acceptable, but whichever branch is taken must be well-formed:
    // a success carries string content, a failure carries a non-empty error message.
    expect(typeof result.success).toBe("boolean");
    if (result.success) {
      expect(typeof result.content).toBe("string");
    } else {
      expect(typeof result.error).toBe("string");
      expect(result.error.length).toBeGreaterThan(0);
    }
  });

  it("should handle empty file", async () => {
    using testEnv = createTestWebFetchTool();

    const url = await writeHtmlFixture(testEnv, "empty.html", "");

    const error = expectFailure(await runWebFetch(testEnv, { url }));

    expect(error).toContain("Empty response");
  });

  it("should handle missing file", async () => {
    using testEnv = createTestWebFetchTool();

    const error = expectFailure(
      await runWebFetch(testEnv, { url: `file://${testEnv.tempDir.path}/nonexistent.html` })
    );

    expect(error).toContain("Failed to fetch URL");
  });
});

0 commit comments

Comments
 (0)