diff --git a/pyproject.toml b/pyproject.toml
index f6c3c8bac..820e380f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
     "sentry-sdk==2.23.1",
     "click>=8.1.7",
     "requests>=2.32.3",
+    "beautifulsoup4>=4.12.3",
     "lazy-object-proxy>=0.0.0",
     "pydantic-core>=2.23.4",
     "hatch-vcs>=0.4.0",
@@ -169,6 +170,7 @@ dev-dependencies = [
     "modal>=0.73.25",
     "pytest-lsp>=1.0.0b1",
     "cython>=3.0.11",
+    "responses>=0.25.0",
 ]
 
 [tool.uv.workspace]
diff --git a/src/codegen/extensions/tools/__init__.py b/src/codegen/extensions/tools/__init__.py
index 33e864e39..27c25de6f 100644
--- a/src/codegen/extensions/tools/__init__.py
+++ b/src/codegen/extensions/tools/__init__.py
@@ -27,6 +27,7 @@
 from .semantic_edit import semantic_edit
 from .semantic_search import semantic_search
 from .view_file import view_file
+from .web import web_search_tool, web_view_page_tool
 
 __all__ = [
     # Git operations
@@ -61,4 +62,7 @@
     "semantic_search",
     "view_file",
     "view_pr",
+    # Web browsing operations
+    "web_search_tool",
+    "web_view_page_tool",
 ]
diff --git a/src/codegen/extensions/tools/web/__init__.py b/src/codegen/extensions/tools/web/__init__.py
new file mode 100644
index 000000000..4219be608
--- /dev/null
+++ b/src/codegen/extensions/tools/web/__init__.py
@@ -0,0 +1,5 @@
+"""Web browsing tools."""
+
+from .web import WebPageObservation, WebSearchObservation, web_search_tool, web_view_page_tool
+
+__all__ = ["WebPageObservation", "WebSearchObservation", "web_search_tool", "web_view_page_tool"]
diff --git a/src/codegen/extensions/tools/web/web.py b/src/codegen/extensions/tools/web/web.py
new file mode 100644
index 000000000..aa6476fdf
--- /dev/null
+++ b/src/codegen/extensions/tools/web/web.py
@@ -0,0 +1,143 @@
+"""Tools for browsing the web."""
+
+from typing import ClassVar
+
+import requests
+from bs4 import BeautifulSoup
+from pydantic import Field
+
+from ..observation import Observation
+
+
+class WebSearchObservation(Observation):
+    """Response from searching the web."""
+
+    query: str = Field(description="Search query used")
+    results: list[dict] = Field(description="List of search results")
+
+    str_template: ClassVar[str] = "Found {result_count} results for '{query}'"
+
+    def _get_details(self) -> dict[str, str | int]:
+        """Get details for string representation."""
+        return {
+            "result_count": len(self.results),
+            "query": self.query,
+        }
+
+
+class WebPageObservation(Observation):
+    """Response from viewing a web page."""
+
+    url: str = Field(description="URL of the web page")
+    title: str = Field(description="Title of the web page")
+    content: str = Field(description="Content of the web page")
+
+    str_template: ClassVar[str] = "Viewed web page: {title}"
+
+
+def web_search_tool(query: str, num_results: int = 5) -> WebSearchObservation:
+    """Search the web and get content snippets from search results.
+
+    Args:
+        query: Search query string
+        num_results: Maximum number of results to return (default: 5)
+
+    Returns:
+        WebSearchObservation with search results
+    """
+    try:
+        # Note: In a real implementation, this would use a search API like Google Custom Search API,
+        # Bing Search API, or a similar service. For this example, we'll return mock data.
+        # The actual implementation would require API keys and proper error handling.
+
+        # Mock search results for demonstration
+        mock_results = [
+            {
+                "title": f"Result {i + 1} for {query}",
+                "url": f"https://example.com/result{i + 1}",
+                "snippet": f"This is a snippet of content for result {i + 1} related to {query}...",
+            }
+            for i in range(min(num_results, 5))
+        ]
+
+        return WebSearchObservation(
+            status="success",
+            query=query,
+            results=mock_results,
+        )
+    except requests.exceptions.RequestException as e:
+        # Network-related errors
+        return WebSearchObservation(
+            status="error",
+            error=f"Network error when searching: {e!s}",
+            query=query,
+            results=[],
+        )
+    except Exception as e:
+        # Catch-all for other errors
+        return WebSearchObservation(
+            status="error",
+            error=f"Failed to search the web: {e!s}",
+            query=query,
+            results=[],
+        )
+
+
+def web_view_page_tool(url: str, max_length: int = 10000) -> WebPageObservation:
+    """View the content of a specific webpage.
+
+    Args:
+        url: URL of the webpage to view
+        max_length: Maximum length of content to return (default: 10000)
+
+    Returns:
+        WebPageObservation with page content
+    """
+    try:
+        # Send a GET request to the URL
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise an exception for 4XX/5XX responses
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Extract the title (soup.title.string is None for an empty <title>, which would fail str validation)
+        title = soup.title.string if soup.title and soup.title.string else "No title found"
+
+        # Extract the main content (simplified approach)
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.extract()
+
+        # Get text content
+        text = soup.get_text(separator="\n", strip=True)
+
+        # Truncate if necessary
+        if len(text) > max_length:
+            text = text[:max_length] + "... [content truncated]"
+
+        return WebPageObservation(
+            status="success",
+            url=url,
+            title=title,
+            content=text,
+        )
+    except requests.exceptions.RequestException as e:
+        # Network-related errors
+        return WebPageObservation(
+            status="error",
+            error=f"Network error when fetching page: {e!s}",
+            url=url,
+            title="",
+            content="",
+        )
+    except Exception as e:
+        # Catch-all for other errors
+        return WebPageObservation(
+            status="error",
+            error=f"Failed to view web page: {e!s}",
+            url=url,
+            title="",
+            content="",
+        )
diff --git a/tests/integration/extension/test_web.py b/tests/integration/extension/test_web.py
new file mode 100644
index 000000000..59648dd93
--- /dev/null
+++ b/tests/integration/extension/test_web.py
@@ -0,0 +1,59 @@
+"""Tests for web browsing tools."""
+
+import pytest
+import responses
+
+from codegen.extensions.tools.web import web_search_tool, web_view_page_tool
+
+
+@pytest.fixture
+def mock_responses():
+    """Set up mock responses for testing."""
+    with responses.RequestsMock() as rsps:
+        # Mock a web page response
+        rsps.add(
+            responses.GET,
+            "https://example.com",
+            body="<html><head><title>Example Domain</title></head><body><h1>Example Domain</h1><p>This is a test page.</p></body></html>",
+            status=200,
+            content_type="text/html",
+        )
+        yield rsps
+
+
+def test_web_search_tool():
+    """Test the web search tool."""
+    # Since this is a mock implementation, we just test the basic functionality
+    result = web_search_tool("test query", num_results=3)
+
+    assert result.status == "success"
+    assert result.query == "test query"
+    assert len(result.results) == 3
+
+    # Check that the results have the expected structure
+    for i, item in enumerate(result.results):
+        assert item["title"] == f"Result {i + 1} for test query"
+        assert item["url"] == f"https://example.com/result{i + 1}"
+        assert "snippet" in item
+
+
+def test_web_view_page_tool(mock_responses):
+    """Test the web view page tool."""
+    result = web_view_page_tool("https://example.com")
+
+    assert result.status == "success"
+    assert result.url == "https://example.com"
+    assert result.title == "Example Domain"
+    assert "Example Domain" in result.content
+    assert "This is a test page." in result.content
+
+
+def test_web_view_page_tool_error():
+    """Test the web view page tool with an error."""
+    result = web_view_page_tool("https://nonexistent-domain-that-should-fail.com")
+
+    assert result.status == "error"
+    assert result.error is not None
+    assert result.url == "https://nonexistent-domain-that-should-fail.com"
+    assert result.title == ""
+    assert result.content == ""