get text coordinates

SentienceDEV · SentienceDEV · commit 8e3a8b87ad9f · 2025-12-29T07:23:29.000-08:00
diff --git a/README.md b/README.md
@@ -401,6 +401,74 @@ data_url = screenshot(browser, format="jpeg", quality=85)
 
 </details>
 
+<details>
+<summary><h3>🔎 Text Search - Find Elements by Visible Text</h3></summary>
+
+**`find_text_rect(browser, text, case_sensitive=False, whole_word=False, max_results=10)`** - Find text on page and get exact pixel coordinates
+
+Find buttons, links, or any UI elements by their visible text without needing element IDs or CSS selectors. Returns exact pixel coordinates for each match.
+
+**Example:**
+```python
+from sentience import SentienceBrowser, find_text_rect, click_rect
+
+with SentienceBrowser() as browser:
+    browser.page.goto("https://example.com")
+
+    # Find "Sign In" button
+    result = find_text_rect(browser, "Sign In")
+    if result.status == "success" and result.results:
+        first_match = result.results[0]
+        print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})")
+        print(f"In viewport: {first_match.in_viewport}")
+
+        # Click on the found text
+        if first_match.in_viewport:
+            click_rect(browser, {
+                "x": first_match.rect.x,
+                "y": first_match.rect.y,
+                "w": first_match.rect.width,
+                "h": first_match.rect.height
+            })
+```
+
+**Advanced Options:**
+```python
+# Case-sensitive search
+result = find_text_rect(browser, "LOGIN", case_sensitive=True)
+
+# Whole word only (won't match "log" as part of "login")
+result = find_text_rect(browser, "log", whole_word=True)
+
+# Find multiple matches
+result = find_text_rect(browser, "Buy", max_results=10)
+for match in result.results:
+    if match.in_viewport:
+        print(f"Found '{match.text}' at ({match.rect.x}, {match.rect.y})")
+        print(f"Context: ...{match.context.before}[{match.text}]{match.context.after}...")
+```
+
+**Returns:** `TextRectSearchResult` with:
+- **`status`**: "success" or "error"
+- **`results`**: List of `TextMatch` objects with:
+  - `text` - The matched text
+  - `rect` - Absolute coordinates (with scroll offset)
+  - `viewport_rect` - Viewport-relative coordinates
+  - `context` - Surrounding text (before/after)
+  - `in_viewport` - Whether visible in current viewport
+
+**Use Cases:**
+- Find buttons/links by visible text without CSS selectors
+- Get exact pixel coordinates for click automation
+- Verify text visibility and position on page
+- Search dynamic content that changes frequently
+
+**Note:** Does not consume API credits (runs locally in browser)
+
+**See example:** `examples/find_text_demo.py`
+
+</details>
+
 ---
 
 ## 📋 Reference
diff --git a/examples/find_text_demo.py b/examples/find_text_demo.py
@@ -0,0 +1,102 @@
+"""
+Text Search Demo - Using find_text_rect() to locate elements by visible text
+
+This example demonstrates how to:
+1. Find text on a webpage and get exact pixel coordinates
+2. Use case-sensitive and whole-word matching options
+3. Click on found text using click_rect()
+4. Handle multiple matches and filter by viewport visibility
+"""
+
+from sentience import SentienceBrowser, find_text_rect, click_rect
+
+
+def main():
+    """Run five find_text_rect() demos against google.com and print the results."""
+    with SentienceBrowser() as browser:
+        # Navigate to a search page
+        browser.page.goto("https://www.google.com")
+        browser.page.wait_for_load_state("networkidle")
+
+        print("\n" + "=" * 60)
+        print("Text Search Demo")
+        print("=" * 60 + "\n")
+
+        # Example 1: Simple text search
+        print("Example 1: Finding 'Google Search' button")
+        print("-" * 60)
+        result = find_text_rect(browser, "Google Search")
+
+        if result.status == "success" and result.results:
+            print(f"✓ Found {result.matches} match(es) for '{result.query}'")
+            for i, match in enumerate(result.results[:3]):  # Show first 3
+                print(f"\nMatch {i + 1}:")
+                print(f"  Text: '{match.text}'")
+                print(f"  Position: ({match.rect.x:.1f}, {match.rect.y:.1f})")
+                print(f"  Size: {match.rect.width:.1f}x{match.rect.height:.1f} pixels")
+                print(f"  In viewport: {match.in_viewport}")
+                print(
+                    f"  Context: ...{match.context.before}[{match.text}]{match.context.after}..."
+                )
+        else:
+            # A search can succeed with zero matches, in which case `error` is
+            # unset; avoid printing "Search failed: None" for that case.
+            print(f"✗ Search failed: {result.error or 'no matches found'}")
+
+        # Example 2: Find and click search box
+        print("\n\nExample 2: Finding and clicking the search box")
+        print("-" * 60)
+        result = find_text_rect(browser, "Search", max_results=5)
+
+        if result.status == "success" and result.results:
+            # Find the first visible match
+            for match in result.results:
+                if match.in_viewport:
+                    print(f"✓ Found visible match: '{match.text}'")
+                    print(f"  Clicking at ({match.rect.x:.1f}, {match.rect.y:.1f})")
+
+                    # Click the rectangle covering the matched text.
+                    # NOTE(review): `rect` is page-absolute; if click_rect expects
+                    # viewport coordinates, switch to match.viewport_rect for
+                    # scrolled pages - confirm against click_rect.
+                    click_result = click_rect(
+                        browser,
+                        {
+                            "x": match.rect.x,
+                            "y": match.rect.y,
+                            "w": match.rect.width,
+                            "h": match.rect.height,
+                        },
+                    )
+
+                    if click_result.success:
+                        print("  ✓ Click successful!")
+                    break
+
+        # Example 3: Case-sensitive search
+        print("\n\nExample 3: Case-sensitive search for 'GOOGLE'")
+        print("-" * 60)
+        result_insensitive = find_text_rect(browser, "GOOGLE", case_sensitive=False)
+        result_sensitive = find_text_rect(browser, "GOOGLE", case_sensitive=True)
+
+        # `matches` is None on error results, hence the `or 0` fallback
+        print(f"Case-insensitive search: {result_insensitive.matches or 0} matches")
+        print(f"Case-sensitive search: {result_sensitive.matches or 0} matches")
+
+        # Example 4: Whole word search
+        print("\n\nExample 4: Whole word search")
+        print("-" * 60)
+        result_partial = find_text_rect(browser, "Search", whole_word=False)
+        result_whole = find_text_rect(browser, "Search", whole_word=True)
+
+        print(f"Partial word match: {result_partial.matches or 0} matches")
+        print(f"Whole word only: {result_whole.matches or 0} matches")
+
+        # Example 5: Get viewport information
+        print("\n\nExample 5: Viewport and scroll information")
+        print("-" * 60)
+        result = find_text_rect(browser, "Google")
+        if result.status == "success" and result.viewport:
+            print(f"Viewport size: {result.viewport.width}x{result.viewport.height}")
+            # Note: scroll position would be available if viewport had scroll_x/scroll_y fields
+
+        print("\n" + "=" * 60)
+        print("Demo complete!")
+        print("=" * 60 + "\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sentience/__init__.py b/sentience/__init__.py
@@ -34,14 +34,19 @@
     BBox,
     Cookie,
     Element,
+    TextRectSearchResult,
     LocalStorageItem,
     OriginStorage,
     ScreenshotConfig,
     Snapshot,
     SnapshotFilter,
     SnapshotOptions,
     StorageState,
+    TextContext,
+    TextMatch,
+    TextRect,
     TokenStats,
+    ViewportRect,
     Viewport,
     WaitResult,
 )
@@ -51,6 +56,7 @@
 from .recorder import Recorder, Trace, TraceStep, record
 from .screenshot import screenshot
 from .snapshot import snapshot
+from .text_search import find_text_rect
 from .tracer_factory import SENTIENCE_API_URL, create_tracer
 from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink
 
@@ -96,6 +102,13 @@
     "screenshot",
     "show_overlay",
     "clear_overlay",
+    # Text Search
+    "find_text_rect",
+    "TextRectSearchResult",
+    "TextMatch",
+    "TextRect",
+    "ViewportRect",
+    "TextContext",
     # Agent Layer (Phase 1 & 2)
     "BaseAgent",
     "LLMProvider",
diff --git a/sentience/models.py b/sentience/models.py
@@ -339,3 +339,66 @@ def to_playwright_dict(self) -> dict:
                 for origin in self.origins
             ],
         }
+
+
+# ========== Text Search Models (findTextRect) ==========
+
+
+class TextRect(BaseModel):
+    """
+    Absolute (page-coordinate) rectangle of a text occurrence.
+    All fields include the scroll offset; the viewport-relative rectangle is modeled by ViewportRect.
+    """
+
+    x: float = Field(..., description="Absolute X coordinate (page coordinate with scroll offset)")
+    y: float = Field(..., description="Absolute Y coordinate (page coordinate with scroll offset)")
+    width: float = Field(..., description="Rectangle width in pixels")
+    height: float = Field(..., description="Rectangle height in pixels")
+    # Edge fields restate x/y/width/height per their descriptions; this model
+    # declares no validator enforcing that they stay consistent.
+    left: float = Field(..., description="Absolute left position (same as x)")
+    top: float = Field(..., description="Absolute top position (same as y)")
+    right: float = Field(..., description="Absolute right position (x + width)")
+    bottom: float = Field(..., description="Absolute bottom position (y + height)")
+
+
+class ViewportRect(BaseModel):
+    """Rectangle in viewport-relative coordinates (no scroll offset applied)"""
+
+    x: float = Field(..., description="Viewport-relative X coordinate")
+    y: float = Field(..., description="Viewport-relative Y coordinate")
+    width: float = Field(..., description="Rectangle width in pixels")
+    height: float = Field(..., description="Rectangle height in pixels")
+
+
+class TextContext(BaseModel):
+    """Snippets of text immediately before and after a match"""
+
+    # The 20-character limit is stated in the descriptions only; this model does not enforce it.
+    before: str = Field(..., description="Text before the match (up to 20 chars)")
+    after: str = Field(..., description="Text after the match (up to 20 chars)")
+
+
+class TextMatch(BaseModel):
+    """A single text match: its page and viewport rectangles, surrounding context, and visibility flag"""
+
+    text: str = Field(..., description="The matched text")
+    rect: TextRect = Field(..., description="Absolute rectangle coordinates (with scroll offset)")
+    viewport_rect: ViewportRect = Field(
+        ..., description="Viewport-relative rectangle (without scroll offset)"
+    )
+    context: TextContext = Field(..., description="Surrounding text context")
+    in_viewport: bool = Field(..., description="Whether the match is currently visible in viewport")
+
+
+class TextRectSearchResult(BaseModel):
+    """
+    Result of findTextRect operation.
+    Returns all occurrences of text on the page with their exact pixel coordinates.
+
+    Only `status` is required; every other field defaults to None, so callers
+    should check `status` and guard against None before using `results`.
+    """
+
+    status: Literal["success", "error"]
+    query: str | None = Field(None, description="The search text that was queried")
+    case_sensitive: bool | None = Field(None, description="Whether search was case-sensitive")
+    whole_word: bool | None = Field(None, description="Whether whole-word matching was used")
+    matches: int | None = Field(None, description="Number of matches found")
+    results: list[TextMatch] | None = Field(None, description="List of text matches with coordinates")
+    viewport: Viewport | None = Field(None, description="Current viewport dimensions")
+    error: str | None = Field(None, description="Error message if status is 'error'")
diff --git a/sentience/text_search.py b/sentience/text_search.py
@@ -0,0 +1,107 @@
+"""
+Text search utilities - find text and get pixel coordinates
+"""
+
+from .browser import SentienceBrowser
+from .models import TextRectSearchResult
+
+
+def find_text_rect(
+    browser: SentienceBrowser,
+    text: str,
+    case_sensitive: bool = False,
+    whole_word: bool = False,
+    max_results: int = 10,
+) -> TextRectSearchResult:
+    """
+    Find all occurrences of text on the page and get their exact pixel coordinates.
+
+    This function searches for text in all visible text nodes on the page and returns
+    the bounding rectangles for each match. The search itself is delegated to the
+    extension's ``window.sentience.findTextRect``; this wrapper validates the
+    arguments, forwards them, and parses the response. Useful for:
+    - Finding specific UI elements by their text content
+    - Locating buttons, links, or labels without element IDs
+    - Getting exact coordinates for click automation
+    - Highlighting search results visually
+
+    Args:
+        browser: SentienceBrowser instance
+        text: Text to search for (required)
+        case_sensitive: If True, search is case-sensitive (default: False)
+        whole_word: If True, only match whole words surrounded by whitespace (default: False)
+        max_results: Maximum number of matches to return (default: 10);
+            values are clamped to the range 1-100
+
+    Returns:
+        TextRectSearchResult with:
+            - status: "success" or "error"
+            - query: The search text
+            - case_sensitive: Whether search was case-sensitive
+            - whole_word: Whether whole-word matching was used
+            - matches: Number of matches found
+            - results: List of TextMatch objects, each containing:
+                - text: The matched text
+                - rect: Absolute rectangle (with scroll offset)
+                - viewport_rect: Viewport-relative rectangle
+                - context: Surrounding text (before/after)
+                - in_viewport: Whether visible in current viewport
+            - viewport: Current viewport information reported by the extension
+            - error: Error message if status is "error"
+
+        An "error" result (not an exception) is returned when ``text`` is empty or
+        whitespace-only, or when the extension call yields no result object.
+
+    Raises:
+        RuntimeError: If ``browser.page`` is not set (browser not started).
+        Exceptions raised by ``page.evaluate()`` or by model validation are not
+        caught here and propagate to the caller.
+
+    Examples:
+        # Find "Sign In" button
+        result = find_text_rect(browser, "Sign In")
+        if result.status == "success" and result.results:
+            first_match = result.results[0]
+            print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})")
+            print(f"Size: {first_match.rect.width}x{first_match.rect.height}")
+            print(f"In viewport: {first_match.in_viewport}")
+
+        # Case-sensitive search
+        result = find_text_rect(browser, "LOGIN", case_sensitive=True)
+
+        # Whole word only
+        result = find_text_rect(browser, "log", whole_word=True)  # Won't match "login"
+
+        # Find all matches and click the first visible one
+        result = find_text_rect(browser, "Buy Now", max_results=5)
+        if result.status == "success" and result.results:
+            for match in result.results:
+                if match.in_viewport:
+                    # Use click_rect from actions module
+                    from sentience import click_rect
+                    click_result = click_rect(browser, {
+                        "x": match.rect.x,
+                        "y": match.rect.y,
+                        "w": match.rect.width,
+                        "h": match.rect.height
+                    })
+                    break
+    """
+    if not browser.page:
+        raise RuntimeError("Browser not started. Call browser.start() first.")
+
+    if not text or not text.strip():
+        return TextRectSearchResult(
+            status="error",
+            error="Text parameter is required and cannot be empty",
+        )
+
+    # Clamp to 1..100: the upper bound prevents performance issues, the lower
+    # bound keeps zero/negative limits from being forwarded to the extension.
+    max_results = max(1, min(max_results, 100))
+
+    # Call the extension's findTextRect method
+    result_dict = browser.page.evaluate(
+        """
+        (options) => {
+            return window.sentience.findTextRect(options);
+        }
+        """,
+        {
+            "text": text,
+            "caseSensitive": case_sensitive,
+            "wholeWord": whole_word,
+            "maxResults": max_results,
+        },
+    )
+
+    # evaluate() returns None when the page function yields undefined/null;
+    # report that as an error result instead of failing on `**None` below.
+    if not isinstance(result_dict, dict):
+        return TextRectSearchResult(
+            status="error",
+            error="findTextRect returned no result (is the Sentience extension loaded?)",
+        )
+
+    # Parse and validate with Pydantic
+    return TextRectSearchResult(**result_dict)