Skip to content

Commit 8e3a8b8

Browse files
author
SentienceDEV
committed
get text coordinates
1 parent 7bafeb3 commit 8e3a8b8

File tree

5 files changed

+353
-0
lines changed

5 files changed

+353
-0
lines changed

README.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,74 @@ data_url = screenshot(browser, format="jpeg", quality=85)
401401

402402
</details>
403403

404+
<details>
405+
<summary><h3>🔎 Text Search - Find Elements by Visible Text</h3></summary>
406+
407+
**`find_text_rect(browser, text, case_sensitive=False, whole_word=False, max_results=10)`** - Find text on page and get exact pixel coordinates
408+
409+
Find buttons, links, or any UI elements by their visible text without needing element IDs or CSS selectors. Returns exact pixel coordinates for each match.
410+
411+
**Example:**
412+
```python
413+
from sentience import SentienceBrowser, find_text_rect, click_rect
414+
415+
with SentienceBrowser() as browser:
416+
browser.page.goto("https://example.com")
417+
418+
# Find "Sign In" button
419+
result = find_text_rect(browser, "Sign In")
420+
if result.status == "success" and result.results:
421+
first_match = result.results[0]
422+
print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})")
423+
print(f"In viewport: {first_match.in_viewport}")
424+
425+
# Click on the found text
426+
if first_match.in_viewport:
427+
click_rect(browser, {
428+
"x": first_match.rect.x,
429+
"y": first_match.rect.y,
430+
"w": first_match.rect.width,
431+
"h": first_match.rect.height
432+
})
433+
```
434+
435+
**Advanced Options:**
436+
```python
437+
# Case-sensitive search
438+
result = find_text_rect(browser, "LOGIN", case_sensitive=True)
439+
440+
# Whole word only (won't match "log" as part of "login")
441+
result = find_text_rect(browser, "log", whole_word=True)
442+
443+
# Find multiple matches
444+
result = find_text_rect(browser, "Buy", max_results=10)
445+
for match in result.results or []:
446+
if match.in_viewport:
447+
print(f"Found '{match.text}' at ({match.rect.x}, {match.rect.y})")
448+
print(f"Context: ...{match.context.before}[{match.text}]{match.context.after}...")
449+
```
450+
451+
**Returns:** `TextRectSearchResult` with:
452+
- **`status`**: "success" or "error"
453+
- **`results`**: List of `TextMatch` objects with:
454+
- `text` - The matched text
455+
- `rect` - Absolute coordinates (with scroll offset)
456+
- `viewport_rect` - Viewport-relative coordinates
457+
- `context` - Surrounding text (before/after)
458+
- `in_viewport` - Whether visible in current viewport
459+
460+
**Use Cases:**
461+
- Find buttons/links by visible text without CSS selectors
462+
- Get exact pixel coordinates for click automation
463+
- Verify text visibility and position on page
464+
- Search dynamic content that changes frequently
465+
466+
**Note:** Does not consume API credits (runs locally in browser)
467+
468+
**See example:** `examples/find_text_demo.py`
469+
470+
</details>
471+
404472
---
405473

406474
## 📋 Reference

examples/find_text_demo.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""
2+
Text Search Demo - Using find_text_rect() to locate elements by visible text
3+
4+
This example demonstrates how to:
5+
1. Find text on a webpage and get exact pixel coordinates
6+
2. Use case-sensitive and whole-word matching options
7+
3. Click on found text using click_rect()
8+
4. Handle multiple matches and filter by viewport visibility
9+
"""
10+
11+
from sentience import SentienceBrowser, find_text_rect, click_rect
12+
13+
14+
def main():
    """
    Run the find_text_rect() demo against the Google home page.

    Walks through five examples: a plain search, find-and-click, a
    case-sensitive vs. case-insensitive comparison, a whole-word vs. partial
    comparison, and reading the viewport size from a search result.
    """
    with SentienceBrowser() as browser:
        # Navigate to a search page
        browser.page.goto("https://www.google.com")
        browser.page.wait_for_load_state("networkidle")

        print("\n" + "=" * 60)
        print("Text Search Demo")
        print("=" * 60 + "\n")

        # Example 1: Simple text search
        print("Example 1: Finding 'Google Search' button")
        print("-" * 60)
        result = find_text_rect(browser, "Google Search")

        # Distinguish a failed search from a successful search with no hits;
        # only the former carries a meaningful `error` message.
        if result.status != "success":
            print(f"✗ Search failed: {result.error}")
        elif not result.results:
            print("✗ No matches found for 'Google Search'")
        else:
            print(f"✓ Found {result.matches} match(es) for '{result.query}'")
            for i, match in enumerate(result.results[:3], start=1):  # Show first 3
                print(f"\nMatch {i}:")
                print(f" Text: '{match.text}'")
                print(f" Position: ({match.rect.x:.1f}, {match.rect.y:.1f})")
                print(f" Size: {match.rect.width:.1f}x{match.rect.height:.1f} pixels")
                print(f" In viewport: {match.in_viewport}")
                print(
                    f" Context: ...{match.context.before}[{match.text}]{match.context.after}..."
                )

        # Example 2: Find and click search box
        print("\n\nExample 2: Finding and clicking the search box")
        print("-" * 60)
        result = find_text_rect(browser, "Search", max_results=5)

        if result.status != "success":
            print(f"✗ Search failed: {result.error}")
        else:
            # Only the first match that is currently on screen is clicked.
            visible = next((m for m in result.results or [] if m.in_viewport), None)
            if visible is None:
                print("✗ No visible match to click")
            else:
                print(f"✓ Found visible match: '{visible.text}'")
                print(f" Clicking at ({visible.rect.x:.1f}, {visible.rect.y:.1f})")

                # NOTE(review): `rect` is page-absolute (includes scroll offset).
                # Confirm click_rect expects page rather than viewport
                # coordinates; otherwise `viewport_rect` is the right source.
                click_result = click_rect(
                    browser,
                    {
                        "x": visible.rect.x,
                        "y": visible.rect.y,
                        "w": visible.rect.width,
                        "h": visible.rect.height,
                    },
                )

                if click_result.success:
                    print(" ✓ Click successful!")
                else:
                    print(" ✗ Click failed")

        # Example 3: Case-sensitive search
        print("\n\nExample 3: Case-sensitive search for 'GOOGLE'")
        print("-" * 60)
        result_insensitive = find_text_rect(browser, "GOOGLE", case_sensitive=False)
        result_sensitive = find_text_rect(browser, "GOOGLE", case_sensitive=True)

        # `matches` is None on an error result, so fall back to 0 for display.
        print(f"Case-insensitive search: {result_insensitive.matches or 0} matches")
        print(f"Case-sensitive search: {result_sensitive.matches or 0} matches")

        # Example 4: Whole word search
        print("\n\nExample 4: Whole word search")
        print("-" * 60)
        result_partial = find_text_rect(browser, "Search", whole_word=False)
        result_whole = find_text_rect(browser, "Search", whole_word=True)

        print(f"Partial word match: {result_partial.matches or 0} matches")
        print(f"Whole word only: {result_whole.matches or 0} matches")

        # Example 5: Get viewport information
        print("\n\nExample 5: Viewport and scroll information")
        print("-" * 60)
        result = find_text_rect(browser, "Google")
        if result.status == "success" and result.viewport:
            print(f"Viewport size: {result.viewport.width}x{result.viewport.height}")
            # Note: scroll position would be available if viewport had scroll_x/scroll_y fields

        print("\n" + "=" * 60)
        print("Demo complete!")
        print("=" * 60 + "\n")
99+
100+
101+
if __name__ == "__main__":
102+
main()

sentience/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,19 @@
3434
BBox,
3535
Cookie,
3636
Element,
37+
TextRectSearchResult,
3738
LocalStorageItem,
3839
OriginStorage,
3940
ScreenshotConfig,
4041
Snapshot,
4142
SnapshotFilter,
4243
SnapshotOptions,
4344
StorageState,
45+
TextContext,
46+
TextMatch,
47+
TextRect,
4448
TokenStats,
49+
ViewportRect,
4550
Viewport,
4651
WaitResult,
4752
)
@@ -51,6 +56,7 @@
5156
from .recorder import Recorder, Trace, TraceStep, record
5257
from .screenshot import screenshot
5358
from .snapshot import snapshot
59+
from .text_search import find_text_rect
5460
from .tracer_factory import SENTIENCE_API_URL, create_tracer
5561
from .tracing import JsonlTraceSink, TraceEvent, Tracer, TraceSink
5662

@@ -96,6 +102,13 @@
96102
"screenshot",
97103
"show_overlay",
98104
"clear_overlay",
105+
# Text Search
106+
"find_text_rect",
107+
"TextRectSearchResult",
108+
"TextMatch",
109+
"TextRect",
110+
"ViewportRect",
111+
"TextContext",
99112
# Agent Layer (Phase 1 & 2)
100113
"BaseAgent",
101114
"LLMProvider",

sentience/models.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,3 +339,66 @@ def to_playwright_dict(self) -> dict:
339339
for origin in self.origins
340340
],
341341
}
342+
343+
344+
# ========== Text Search Models (findTextRect) ==========
345+
346+
347+
class TextRect(BaseModel):
    """
    Absolute (page-coordinate) rectangle of a text occurrence.

    Every field includes the page scroll offset. Viewport-relative
    coordinates are not part of this model; they are carried by the
    separate ViewportRect model.
    """

    x: float = Field(..., description="Absolute X coordinate (page coordinate with scroll offset)")
    y: float = Field(..., description="Absolute Y coordinate (page coordinate with scroll offset)")
    width: float = Field(..., description="Rectangle width in pixels")
    height: float = Field(..., description="Rectangle height in pixels")
    left: float = Field(..., description="Absolute left position (same as x)")
    top: float = Field(..., description="Absolute top position (same as y)")
    right: float = Field(..., description="Absolute right position (x + width)")
    bottom: float = Field(..., description="Absolute bottom position (y + height)")
361+
362+
363+
class ViewportRect(BaseModel):
    """Rectangle expressed relative to the viewport, i.e. with no scroll offset applied."""

    x: float = Field(
        default=...,
        description="Viewport-relative X coordinate",
    )
    y: float = Field(
        default=...,
        description="Viewport-relative Y coordinate",
    )
    width: float = Field(
        default=...,
        description="Rectangle width in pixels",
    )
    height: float = Field(
        default=...,
        description="Rectangle height in pixels",
    )
370+
371+
372+
class TextContext(BaseModel):
    """Snippets of page text immediately before and after a match."""

    before: str = Field(
        default=...,
        description="Text before the match (up to 20 chars)",
    )
    after: str = Field(
        default=...,
        description="Text after the match (up to 20 chars)",
    )
377+
378+
379+
class TextMatch(BaseModel):
    """One occurrence of the searched text: its rectangles, context, and visibility."""

    text: str = Field(
        default=...,
        description="The matched text",
    )
    rect: TextRect = Field(
        default=...,
        description="Absolute rectangle coordinates (with scroll offset)",
    )
    viewport_rect: ViewportRect = Field(
        default=...,
        description="Viewport-relative rectangle (without scroll offset)",
    )
    context: TextContext = Field(
        default=...,
        description="Surrounding text context",
    )
    in_viewport: bool = Field(
        default=...,
        description="Whether the match is currently visible in viewport",
    )
389+
390+
391+
class TextRectSearchResult(BaseModel):
    """
    Outcome of a findTextRect call.

    Only `status` is required. Every other field defaults to None, so an
    error result can be built from just `status` and `error`.
    """

    status: Literal["success", "error"]
    query: str | None = Field(
        default=None,
        description="The search text that was queried",
    )
    case_sensitive: bool | None = Field(
        default=None,
        description="Whether search was case-sensitive",
    )
    whole_word: bool | None = Field(
        default=None,
        description="Whether whole-word matching was used",
    )
    matches: int | None = Field(
        default=None,
        description="Number of matches found",
    )
    results: list[TextMatch] | None = Field(
        default=None,
        description="List of text matches with coordinates",
    )
    viewport: Viewport | None = Field(
        default=None,
        description="Current viewport dimensions",
    )
    error: str | None = Field(
        default=None,
        description="Error message if status is 'error'",
    )

sentience/text_search.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
Text search utilities - find text and get pixel coordinates
3+
"""
4+
5+
from .browser import SentienceBrowser
6+
from .models import TextRectSearchResult
7+
8+
9+
def find_text_rect(
    browser: SentienceBrowser,
    text: str,
    case_sensitive: bool = False,
    whole_word: bool = False,
    max_results: int = 10,
) -> TextRectSearchResult:
    """
    Find occurrences of text on the page and get their exact pixel coordinates.

    The search itself is delegated to ``window.sentience.findTextRect`` in the
    page; this function validates the arguments, forwards them, and parses the
    response into a TextRectSearchResult. Useful for:
    - Finding specific UI elements by their text content
    - Locating buttons, links, or labels without element IDs
    - Getting exact coordinates for click automation
    - Highlighting search results visually

    Args:
        browser: SentienceBrowser instance with a started page
        text: Text to search for (required, must not be blank)
        case_sensitive: If True, search is case-sensitive (default: False)
        whole_word: If True, only match whole words, e.g. "log" will not
            match inside "login" (default: False)
        max_results: Maximum number of matches to return (default: 10).
            Values are clamped to the range 1..100.

    Returns:
        TextRectSearchResult with:
        - status: "success" or "error"
        - query: The search text
        - case_sensitive: Whether search was case-sensitive
        - whole_word: Whether whole-word matching was used
        - matches: Number of matches found
        - results: List of TextMatch objects, each containing:
            - text: The matched text
            - rect: Absolute rectangle (with scroll offset)
            - viewport_rect: Viewport-relative rectangle
            - context: Surrounding text (before/after)
            - in_viewport: Whether visible in current viewport
        - viewport: Current viewport dimensions
        - error: Error message if status is "error"

        An "error" result (rather than an exception) is returned when `text`
        is blank or when the page call does not return an object.

    Raises:
        RuntimeError: If the browser has no page (browser not started).

    Examples:
        # Find "Sign In" button
        result = find_text_rect(browser, "Sign In")
        if result.status == "success" and result.results:
            first_match = result.results[0]
            print(f"Found at: ({first_match.rect.x}, {first_match.rect.y})")
            print(f"Size: {first_match.rect.width}x{first_match.rect.height}")
            print(f"In viewport: {first_match.in_viewport}")

        # Case-sensitive search
        result = find_text_rect(browser, "LOGIN", case_sensitive=True)

        # Whole word only
        result = find_text_rect(browser, "log", whole_word=True)  # Won't match "login"

        # Find all matches and click the first visible one
        from sentience import click_rect

        result = find_text_rect(browser, "Buy Now", max_results=5)
        if result.status == "success" and result.results:
            for match in result.results:
                if match.in_viewport:
                    click_result = click_rect(browser, {
                        "x": match.rect.x,
                        "y": match.rect.y,
                        "w": match.rect.width,
                        "h": match.rect.height
                    })
                    break
    """
    if not browser.page:
        raise RuntimeError("Browser not started. Call browser.start() first.")

    if not text or not text.strip():
        return TextRectSearchResult(
            status="error",
            error="Text parameter is required and cannot be empty",
        )

    # Limit max_results to prevent performance issues. The lower bound keeps
    # zero/negative values from reaching the page, where their meaning is
    # undefined from this side and could bypass the cap.
    max_results = max(1, min(max_results, 100))

    # Call the extension's findTextRect method
    result_dict = browser.page.evaluate(
        """
        (options) => {
            return window.sentience.findTextRect(options);
        }
        """,
        {
            "text": text,
            "caseSensitive": case_sensitive,
            "wholeWord": whole_word,
            "maxResults": max_results,
        },
    )

    # A JS `undefined`/`null` return arrives here as None; unpacking that with
    # ** would raise an opaque TypeError, so report it as an error result.
    if not isinstance(result_dict, dict):
        return TextRectSearchResult(
            status="error",
            error=(
                "Unexpected result from findTextRect: expected an object, "
                f"got {type(result_dict).__name__}"
            ),
        )

    # Parse and validate with Pydantic
    return TextRectSearchResult(**result_dict)

0 commit comments

Comments
 (0)