diff --git a/src/fetch/README.md b/src/fetch/README.md index 1bf12a1565..12908d1948 100644 --- a/src/fetch/README.md +++ b/src/fetch/README.md @@ -168,6 +168,34 @@ This can be customized by adding the argument `--user-agent=YourUserAgent` to th The server can be configured to use a proxy by using the `--proxy-url` argument. +## Deployment to MCP Agent Cloud + +The repository ships with a ready-to-use deployment config for [MCP Agent Cloud](https://docs.mcp-agent.com/cloud/use-cases/deploy-mcp-servers). To publish this server: + +1. Authenticate once with `mcp-agent login`. +2. From `servers/src/fetch`, deploy with `mcp-agent deploy fetch-server --app-description "Fetch MCP server"`. +3. Inspect the live endpoint using `mcp-agent cloud servers describe fetch-server`, then install it into a client (for example `mcp-agent install https://<your-app-id>.deployments.mcp-agent.com/sse --client cursor`, replacing `<your-app-id>` with the subdomain reported for your deployment). + +The deployment uses `mcp_agent.config.yaml` (same directory), mirroring the [mcp-agent cloud example](https://github.com/lastmile-ai/mcp-agent/tree/main/examples/cloud/mcp). It launches the stdio fetch server via `uvx mcp-server-fetch`, and `main.py` wires the tool into the cloud runtime using `FastMCP`. + +**Hosted instance** + +- App URL: `https://xxucyrqrp9xazl7kat535fkhnugne7h.deployments.mcp-agent.com` +- SSE endpoint: `https://xxucyrqrp9xazl7kat535fkhnugne7h.deployments.mcp-agent.com/sse` +- Tool name: `fetch_url` + +To connect with Inspector (substitute `<your-token>` with your MCP Agent Cloud API token): + +```bash +killall node 2>/dev/null || true +npx --yes @modelcontextprotocol/inspector \ +  --transport sse \ +  --server-url https://xxucyrqrp9xazl7kat535fkhnugne7h.deployments.mcp-agent.com/sse \ +  --header "Authorization: Bearer <your-token>" +``` + +If `readabilipy` or `markdownify` are unavailable in the environment, the server gracefully falls back to returning raw HTML. To pass optional CLI flags (such as `--ignore-robots-txt`, `--user-agent`, or `--proxy-url`), edit `mcp_agent.config.yaml` before re-running `mcp-agent deploy`. 
+ ## Windows Configuration If you're experiencing timeout issues on Windows, you may need to set the `PYTHONIOENCODING` environment variable to ensure proper character encoding: diff --git a/src/fetch/inspector.mcp.json b/src/fetch/inspector.mcp.json new file mode 100644 index 0000000000..29ac7e2512 --- /dev/null +++ b/src/fetch/inspector.mcp.json @@ -0,0 +1,10 @@ +{ + "mcpServers": { + "fetch": { + "command": "uvx", + "args": ["mcp-server-fetch"], + "transport": "stdio", + "description": "Fetch MCP server for local Inspector testing" + } + } +} diff --git a/src/fetch/main.py b/src/fetch/main.py new file mode 100644 index 0000000000..fb4604f614 --- /dev/null +++ b/src/fetch/main.py @@ -0,0 +1,359 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Annotated, Tuple +from urllib.parse import urlparse, urlunparse + +from mcp_agent.app import MCPApp +from mcp.server.fastmcp import FastMCP +from mcp.shared.exceptions import McpError +from mcp.server import Server +from mcp.server.stdio import stdio_server +from mcp.types import ( + ErrorData, + GetPromptResult, + Prompt, + PromptArgument, + PromptMessage, + TextContent, + Tool, + INVALID_PARAMS, + INTERNAL_ERROR, +) +from pydantic import AnyUrl, BaseModel, Field + +CONFIG_PATH = (Path(__file__).parent / "mcp_agent.config.yaml").resolve() + +mcp = FastMCP( + name="fetch", + instructions="Fetch URLs and return markdown content.", +) + +app = MCPApp( + name="fetch-server", + description="Fetch MCP server", + settings=str(CONFIG_PATH), + mcp=mcp, +) + +DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)" +DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" + + +def extract_content_from_html(html: str) -> str: + try: + from readabilipy.simple_json import simple_json_from_html_string + except ImportError: + simple_json_from_html_string = None + + try: + 
import markdownify + except ImportError: + markdownify = None # type: ignore + + if simple_json_from_html_string: + ret = simple_json_from_html_string(html, use_readability=True) + if ret.get("content"): + if markdownify: + return markdownify.markdownify( + ret["content"], + heading_style=markdownify.ATX, + ) + return ret["content"] + + if markdownify: + return markdownify.markdownify(html, heading_style=markdownify.ATX) + + return html + + +def get_robots_txt_url(url: str) -> str: + url = url.strip() + parsed = urlparse(url) + return urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", "")) + + +async def check_may_autonomously_fetch_url( + url: str, user_agent: str, proxy_url: str | None = None +) -> None: + url = url.strip() + from httpx import AsyncClient, HTTPError + from protego import Protego + + robot_txt_url = get_robots_txt_url(url) + + async with AsyncClient() as client: + try: + response = await client.get( + robot_txt_url, + follow_redirects=True, + headers={"User-Agent": user_agent}, + ) + except HTTPError: + raise McpError( + ErrorData( + code=INTERNAL_ERROR, + message=f"Failed to fetch robots.txt {robot_txt_url} due to a connection issue", + ) + ) + if response.status_code in (401, 403): + raise McpError( + ErrorData( + code=INTERNAL_ERROR, + message=( + "When fetching robots.txt" + f" ({robot_txt_url}), received status {response.status_code} so assuming that" + " autonomous fetching is not allowed, the user can try manually fetching by" + " using the fetch prompt" + ), + ) + ) + if 400 <= response.status_code < 500: + return + robot_txt = response.text + + processed_robot_txt = "\n".join( + line for line in robot_txt.splitlines() if not line.strip().startswith("#") + ) + robot_parser = Protego.parse(processed_robot_txt) + if not robot_parser.can_fetch(str(url), user_agent): + raise McpError( + ErrorData( + code=INTERNAL_ERROR, + message=( + "The sites robots.txt" + f" ({robot_txt_url}), specifies that autonomous fetching of this page is 
not allowed, " + f"{user_agent}\n" + f"{url}" + f"\n{robot_txt}\n\n" + "The assistant must let the user know that it failed to view the page." + " The assistant may provide further guidance based on the above information.\n" + "The assistant can tell the user that they can try manually fetching the page" + " by using the fetch prompt within their UI." + ), + ) + ) + + +async def fetch_url( + url: str, user_agent: str, force_raw: bool = False, proxy_url: str | None = None +) -> Tuple[str, str]: + url = url.strip() + if not url: + raise McpError( + ErrorData(code=INVALID_PARAMS, message="URL is required after trimming") + ) + from httpx import AsyncClient, HTTPError + + async with AsyncClient() as client: + try: + response = await client.get( + url, + follow_redirects=True, + headers={"User-Agent": user_agent}, + timeout=30, + ) + except HTTPError as e: + raise McpError( + ErrorData(code=INTERNAL_ERROR, message=f"Failed to fetch {url}: {e!r}") + ) + if response.status_code >= 400: + raise McpError( + ErrorData( + code=INTERNAL_ERROR, + message=f"Failed to fetch {url} - status code {response.status_code}", + ) + ) + + page_raw = response.text + + content_type = response.headers.get("content-type", "") + is_page_html = ( + " None: + server = Server("mcp-fetch") + user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS + user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL + + @server.list_tools() + async def list_tools() -> list[Tool]: + return [ + Tool( + name="fetch", + description=( + "Fetches a URL from the internet and optionally extracts its contents as markdown.\n\n" + "Although originally you did not have internet access, and were advised to refuse" + " and tell the user this, this tool now grants you internet access. Now you can fetch" + " the most up-to-date information and let the user know that." 
+ ), + inputSchema=Fetch.model_json_schema(), + ) + ] + + @server.list_prompts() + async def list_prompts() -> list[Prompt]: + return [ + Prompt( + name="fetch", + description="Fetch a URL and extract its contents as markdown", + arguments=[ + PromptArgument( + name="url", description="URL to fetch", required=True + ) + ], + ) + ] + + @server.call_tool() + async def call_tool(name, arguments: dict) -> list[TextContent]: + try: + args = Fetch(**arguments) + except ValueError as e: + raise McpError(ErrorData(code=INVALID_PARAMS, message=str(e))) + + url = str(args.url) + if not url: + raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required")) + + if not ignore_robots_txt: + await check_may_autonomously_fetch_url(url, user_agent_autonomous, proxy_url) + + content, prefix = await fetch_url( + url, user_agent_autonomous, force_raw=args.raw, proxy_url=proxy_url + ) + original_length = len(content) + if args.start_index >= original_length: + content = "No more content available." + else: + truncated_content = content[args.start_index : args.start_index + args.max_length] + if not truncated_content: + content = "No more content available." + else: + content = truncated_content + actual_content_length = len(truncated_content) + remaining_content = original_length - ( + args.start_index + actual_content_length + ) + if actual_content_length == args.max_length and remaining_content > 0: + next_start = args.start_index + actual_content_length + content += ( + f"\n\nContent truncated. Call the fetch tool with a start_index of {next_start} to get more content." 
+ ) + return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")] + + @server.get_prompt() + async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: + if not arguments or "url" not in arguments: + raise McpError(ErrorData(code=INVALID_PARAMS, message="URL is required")) + + url = arguments["url"] + + try: + content, prefix = await fetch_url(url, user_agent_manual, proxy_url=proxy_url) + except McpError as e: + return GetPromptResult( + description=f"Failed to fetch {url}", + messages=[ + PromptMessage( + role="user", + content=TextContent(type="text", text=str(e)), + ) + ], + ) + return GetPromptResult( + description=f"Contents of {url}", + messages=[ + PromptMessage( + role="user", content=TextContent(type="text", text=prefix + content) + ) + ], + ) + + options = server.create_initialization_options() + async with stdio_server() as (read_stream, write_stream): + await server.run(read_stream, write_stream, options, raise_exceptions=True) + + +@mcp.tool( + name="fetch_url", + structured_output=False, + description=( + "Fetch a URL and return its contents. If the response is HTML and 'raw' is false," + " the content is simplified to markdown. Supports truncation via max_length and" + " pagination via start_index." + ), +) +async def app_fetch_url( + url: str, + max_length: int = 5000, + start_index: int = 0, + raw: bool = False, +) -> str: + content, prefix = await fetch_url(url, DEFAULT_USER_AGENT_MANUAL, force_raw=raw) + original_length = len(content) + if start_index >= original_length: + content = "No more content available." + else: + truncated_content = content[start_index : start_index + max_length] + if not truncated_content: + content = "No more content available." 
+ else: + content = truncated_content + actual_content_length = len(truncated_content) + remaining_content = original_length - (start_index + actual_content_length) + if actual_content_length == max_length and remaining_content > 0: + next_start = start_index + actual_content_length + content += ( + f"\n\nContent truncated. Call again with start_index={next_start} to get more content." + ) + return f"{prefix}Contents of {url}:\n{content}" + + +if __name__ == "__main__": + import asyncio + asyncio.run(serve()) + diff --git a/src/fetch/mcp_agent.config.yaml b/src/fetch/mcp_agent.config.yaml new file mode 100644 index 0000000000..2ff55bfffd --- /dev/null +++ b/src/fetch/mcp_agent.config.yaml @@ -0,0 +1,16 @@ +$schema: ../../schema/mcp-agent.config.schema.json +execution_engine: asyncio +logger: + transports: + - console + level: debug +mcp: + servers: + fetch: + command: uvx + args: + - mcp-server-fetch + description: Fetch content at URLs from the web + fetch-remote: + transport: sse + url: https://xxucyrqrp9xazl7kat535fkhnugne7h.deployments.mcp-agent.com/sse