🤖 feat: add web_fetch tool for fetching web pages as markdown

ammar-agent · ammar-agent · commit cdc747528d47 · 2025-11-24T19:18:20.000-06:00
Adds a new web_fetch tool that:
- Fetches web pages using curl via the Runtime (respects workspace network context)
- Extracts main content using Mozilla Readability
- Converts to clean markdown using Turndown
- Supports both markdown and plain text output formats

Features:
- Network isolation: requests originate from workspace, not Mux host
- Curl handles redirects, SSL, encoding, compression natively
- Output truncated to 64KB, HTML input limited to 5MB
- Graceful error handling for DNS failures, timeouts, empty responses

Dependencies added:
- @mozilla/readability: article extraction
- jsdom: DOM parsing for Readability
- turndown: HTML to markdown conversion

Also updates the testing section of AGENTS.md to clarify the preferred test types:
1. True integration tests (no mocks)
2. Unit tests on pure/isolated logic

_Generated with `mux`_
diff --git a/bun.lock b/bun.lock
diff --git a/docs/AGENTS.md b/docs/AGENTS.md
@@ -57,6 +57,13 @@ gh pr view <number> --json mergeable,mergeStateStatus | jq '.'
 
 ## Testing Doctrine
 
+Two types of tests are preferred:
+
+1. **True integration tests** — use real runtimes, real filesystems, real network calls. No mocks, stubs, or fakes. These prove the system works end-to-end.
+2. **Unit tests on pure/isolated logic** — test pure functions or well-isolated modules where inputs and outputs are clear. No mocks needed because the code has no external dependencies.
+
+Avoid mock-heavy tests that verify implementation details rather than behavior. If you need mocks to test something, consider whether the code should be restructured to be more testable.
+
 ### Storybook
 
 - Prefer full-app stories (`App.stories.tsx`) to isolated components.
diff --git a/package.json b/package.json
@@ -50,6 +50,7 @@
     "@ai-sdk/openai": "^2.0.72",
     "@ai-sdk/xai": "^2.0.36",
     "@lydell/node-pty": "1.1.0",
+    "@mozilla/readability": "^0.6.0",
     "@openrouter/ai-sdk-provider": "^1.2.5",
     "@radix-ui/react-checkbox": "^1.3.3",
     "@radix-ui/react-dialog": "^1.1.15",
@@ -72,6 +73,7 @@
     "electron-updater": "^6.6.2",
     "express": "^5.1.0",
     "ghostty-web": "0.2.1",
+    "jsdom": "^27.2.0",
     "jsonc-parser": "^3.3.1",
     "lru-cache": "^11.2.2",
     "lucide-react": "^0.553.0",
@@ -83,6 +85,7 @@
     "shescape": "^2.1.6",
     "source-map-support": "^0.5.21",
     "streamdown": "^1.4.0",
+    "turndown": "^7.2.2",
     "undici": "^7.16.0",
     "write-file-atomic": "^6.0.0",
     "ws": "^8.18.3",
@@ -106,11 +109,13 @@
     "@types/escape-html": "^1.0.4",
     "@types/express": "^5.0.3",
     "@types/jest": "^30.0.0",
+    "@types/jsdom": "^27.0.0",
     "@types/katex": "^0.16.7",
     "@types/markdown-it": "^14.1.2",
     "@types/minimist": "^1.2.5",
     "@types/react": "^18.2.0",
     "@types/react-dom": "^18.2.0",
+    "@types/turndown": "^5.0.6",
     "@types/write-file-atomic": "^4.0.3",
     "@types/ws": "^8.18.1",
     "@typescript-eslint/eslint-plugin": "^8.44.1",
diff --git a/src/common/constants/toolLimits.ts b/src/common/constants/toolLimits.ts
@@ -17,3 +17,8 @@ export const BASH_MAX_LINE_BYTES = 1024; // 1KB per line for AI agent
 export const MAX_TODOS = 7; // Maximum number of TODO items in a list
 
 export const STATUS_MESSAGE_MAX_LENGTH = 60; // Maximum length for status messages (auto-truncated)
+
+// Web fetch tool limits
+export const WEB_FETCH_TIMEOUT_SECS = 15; // curl timeout
+export const WEB_FETCH_MAX_OUTPUT_BYTES = 64 * 1024; // 64KB markdown output
+export const WEB_FETCH_MAX_HTML_BYTES = 5 * 1024 * 1024; // 5MB HTML input (curl --max-filesize)
diff --git a/src/common/types/tools.ts b/src/common/types/tools.ts
@@ -201,3 +201,23 @@ export type StatusSetToolResult =
       success: false;
       error: string;
     };
+
+// Web Fetch Tool Types
+export interface WebFetchToolArgs {
+  url: string;
+  format?: "markdown" | "text";
+}
+
+export type WebFetchToolResult =
+  | {
+      success: true;
+      title: string;
+      content: string;
+      url: string;
+      byline?: string;
+      length: number;
+    }
+  | {
+      success: false;
+      error: string;
+    };
diff --git a/src/common/utils/tools/toolDefinitions.ts b/src/common/utils/tools/toolDefinitions.ts
@@ -12,6 +12,7 @@ import {
   BASH_MAX_LINE_BYTES,
   BASH_MAX_TOTAL_BYTES,
   STATUS_MESSAGE_MAX_LENGTH,
+  WEB_FETCH_MAX_OUTPUT_BYTES,
 } from "@/common/constants/toolLimits";
 import { TOOL_EDIT_WARNING } from "@/common/types/tools";
 
@@ -228,6 +229,20 @@ export const TOOL_DEFINITIONS = {
       })
       .strict(),
   },
+  web_fetch: {
+    description:
+      `Fetch a web page and extract its main content as clean markdown. ` +
+      `Uses the workspace's network context (requests originate from the workspace, not Mux host). ` +
+      `Requires curl to be installed in the workspace. ` +
+      `Output is truncated to ${Math.floor(WEB_FETCH_MAX_OUTPUT_BYTES / 1024)}KB.`,
+    schema: z.object({
+      url: z.string().url().describe("The URL to fetch (http or https)"),
+      format: z
+        .enum(["markdown", "text"])
+        .optional()
+        .describe("Output format: 'markdown' (default) preserves structure, 'text' is plain text"),
+    }),
+  },
 } as const;
 
 /**
@@ -268,6 +283,7 @@ export function getAvailableTools(modelString: string): string[] {
     "todo_write",
     "todo_read",
     "status_set",
+    "web_fetch",
   ];
 
   // Add provider-specific tools
diff --git a/src/common/utils/tools/tools.ts b/src/common/utils/tools/tools.ts
@@ -7,6 +7,7 @@ import { createFileEditInsertTool } from "@/node/services/tools/file_edit_insert
 import { createProposePlanTool } from "@/node/services/tools/propose_plan";
 import { createTodoWriteTool, createTodoReadTool } from "@/node/services/tools/todo";
 import { createStatusSetTool } from "@/node/services/tools/status_set";
+import { createWebFetchTool } from "@/node/services/tools/web_fetch";
 import { wrapWithInitWait } from "@/node/services/tools/wrapWithInitWait";
 import { log } from "@/node/services/log";
 
@@ -95,6 +96,7 @@ export async function getToolsForModel(
     // and line number miscalculations. Use file_edit_replace_string instead.
     // file_edit_replace_lines: wrap(createFileEditReplaceLinesTool(config)),
     bash: wrap(createBashTool(config)),
+    web_fetch: wrap(createWebFetchTool(config)),
   };
 
   // Non-runtime tools execute immediately (no init wait needed)
diff --git a/src/node/services/tools/web_fetch.test.ts b/src/node/services/tools/web_fetch.test.ts
@@ -0,0 +1,215 @@
+import { describe, it, expect } from "bun:test";
+import { createWebFetchTool } from "./web_fetch";
+import type { WebFetchToolArgs, WebFetchToolResult } from "@/common/types/tools";
+import { WEB_FETCH_MAX_OUTPUT_BYTES } from "@/common/constants/toolLimits";
+import { TestTempDir, createTestToolConfig } from "./testHelpers";
+import * as fs from "fs/promises";
+import * as path from "path";
+
+import type { ToolCallOptions } from "ai";
+
+// ToolCallOptions stub for testing
+const toolCallOptions: ToolCallOptions = {
+  toolCallId: "test-call-id",
+  messages: [],
+};
+
+// Helper to create web_fetch tool with real LocalRuntime
+function createTestWebFetchTool() {
+  const tempDir = new TestTempDir("test-web-fetch");
+  const config = createTestToolConfig(tempDir.path);
+  const tool = createWebFetchTool(config);
+
+  return {
+    tool,
+    tempDir,
+    [Symbol.dispose]() {
+      tempDir[Symbol.dispose]();
+    },
+  };
+}
+
+describe("web_fetch tool", () => {
+  // Integration test: fetch a real public URL
+  it("should fetch and convert a real web page to markdown", async () => {
+    using testEnv = createTestWebFetchTool();
+    const args: WebFetchToolArgs = {
+      // example.com is a stable, simple HTML page maintained by IANA
+      url: "https://example.com",
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.title).toContain("Example Domain");
+      expect(result.url).toBe("https://example.com");
+      // example.com mentions documentation examples
+      expect(result.content).toContain("documentation");
+      expect(result.length).toBeGreaterThan(0);
+    }
+  });
+
+  it("should return plain text when format is 'text'", async () => {
+    using testEnv = createTestWebFetchTool();
+    const args: WebFetchToolArgs = {
+      url: "https://example.com",
+      format: "text",
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      // Plain text should not have markdown formatting
+      expect(result.content).not.toContain("#");
+      expect(result.content).not.toContain("**");
+      // example.com content mentions documentation
+      expect(result.content).toContain("documentation");
+    }
+  });
+
+  it("should handle DNS failure gracefully", async () => {
+    using testEnv = createTestWebFetchTool();
+    const args: WebFetchToolArgs = {
+      // .invalid TLD is reserved and guaranteed to never resolve
+      url: "https://this-domain-does-not-exist.invalid/page",
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      expect(result.error).toContain("Failed to fetch URL");
+    }
+  });
+
+  it("should handle connection refused gracefully", async () => {
+    using testEnv = createTestWebFetchTool();
+    const args: WebFetchToolArgs = {
+      // loopback on a fixed high port (59999) that is assumed to have no listener, so the connection is refused
+      url: "http://127.0.0.1:59999/page",
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      expect(result.error).toContain("Failed to fetch URL");
+    }
+  });
+
+  // Test with a local file read through a file:// URL - exercises HTML parsing without network access
+  it("should handle local HTML content via file:// URL", async () => {
+    using testEnv = createTestWebFetchTool();
+
+    // Create a test HTML file
+    const htmlContent = `
+<!DOCTYPE html>
+<html>
+<head><title>Local Test Page</title></head>
+<body>
+  <article>
+    <h1>Test Heading</h1>
+    <p>This is test content with <strong>bold</strong> and <em>italic</em> text.</p>
+  </article>
+</body>
+</html>`;
+    const htmlPath = path.join(testEnv.tempDir.path, "test.html");
+    await fs.writeFile(htmlPath, htmlContent);
+
+    const args: WebFetchToolArgs = {
+      url: `file://${htmlPath}`,
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.title).toBe("Local Test Page");
+      expect(result.content).toContain("Test Heading");
+      expect(result.content).toContain("**bold**");
+      expect(result.content).toContain("_italic_");
+    }
+  });
+
+  it("should truncate oversized output from local file", async () => {
+    using testEnv = createTestWebFetchTool();
+
+    // Create HTML that will produce content larger than WEB_FETCH_MAX_OUTPUT_BYTES
+    const largeContent = "x".repeat(WEB_FETCH_MAX_OUTPUT_BYTES + 1000);
+    const htmlContent = `
+<!DOCTYPE html>
+<html>
+<head><title>Large Page</title></head>
+<body><article><p>${largeContent}</p></article></body>
+</html>`;
+    const htmlPath = path.join(testEnv.tempDir.path, "large.html");
+    await fs.writeFile(htmlPath, htmlContent);
+
+    const args: WebFetchToolArgs = {
+      url: `file://${htmlPath}`,
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.content.length).toBeLessThanOrEqual(
+        WEB_FETCH_MAX_OUTPUT_BYTES + 100 // Allow for truncation message
+      );
+      expect(result.content).toContain("[Content truncated]");
+    }
+  });
+
+  it("should handle non-article HTML gracefully", async () => {
+    using testEnv = createTestWebFetchTool();
+
+    // Minimal HTML that Readability may not parse as an article
+    const htmlContent = "<html><body><p>Just some text</p></body></html>";
+    const htmlPath = path.join(testEnv.tempDir.path, "minimal.html");
+    await fs.writeFile(htmlPath, htmlContent);
+
+    const args: WebFetchToolArgs = {
+      url: `file://${htmlPath}`,
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    // Readability may or may not parse this - the important thing is we don't crash
+    expect(typeof result.success).toBe("boolean");
+  });
+
+  it("should handle empty file", async () => {
+    using testEnv = createTestWebFetchTool();
+
+    const htmlPath = path.join(testEnv.tempDir.path, "empty.html");
+    await fs.writeFile(htmlPath, "");
+
+    const args: WebFetchToolArgs = {
+      url: `file://${htmlPath}`,
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      expect(result.error).toContain("Empty response");
+    }
+  });
+
+  it("should handle missing file", async () => {
+    using testEnv = createTestWebFetchTool();
+
+    const args: WebFetchToolArgs = {
+      url: `file://${testEnv.tempDir.path}/nonexistent.html`,
+    };
+
+    const result = (await testEnv.tool.execute!(args, toolCallOptions)) as WebFetchToolResult;
+
+    expect(result.success).toBe(false);
+    if (!result.success) {
+      expect(result.error).toContain("Failed to fetch URL");
+    }
+  });
+});
diff --git a/src/node/services/tools/web_fetch.ts b/src/node/services/tools/web_fetch.ts