Skip to content

Commit 1c5df17

Browse files
author
SentienceDEV
committed
sample snapshot
1 parent a1e56ef commit 1c5df17

File tree

5 files changed

+363
-2
lines changed

5 files changed

+363
-2
lines changed

sentience/agent_runtime.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,50 @@ async def snapshot(self, **kwargs: Any) -> Snapshot:
358358
await self._handle_captcha_if_needed(self.last_snapshot, source="gateway")
359359
return self.last_snapshot
360360

361+
async def sampled_snapshot(
    self,
    *,
    samples: int = 4,
    scroll_delta_y: float | None = None,
    settle_ms: int = 250,
    union_limit: int | None = None,
    restore_scroll: bool = True,
    **kwargs: Any,
) -> Snapshot:
    """
    Take multiple snapshots while scrolling and merge them into a "union snapshot".

    Intended for analysis/extraction on long / virtualized pages where a single
    viewport snapshot is insufficient.

    Args:
        samples: Number of snapshots to take (forwarded to the backend helper).
        scroll_delta_y: Scroll distance between samples; ``None`` lets the
            backend helper choose one.
        settle_ms: Pause after each scroll, in milliseconds.
        union_limit: Optional cap on the number of merged elements.
        restore_scroll: Whether the backend helper scrolls back when done.
        **kwargs: Per-call overrides layered on top of ``self._snapshot_options``.

    Returns:
        The merged snapshot, or a single ordinary snapshot on the legacy
        browser path.

    IMPORTANT:
    - The returned snapshot's element bboxes may not correspond to the current viewport.
      Do NOT use it for clicking unless you also scroll to the right position.
    - This method does NOT update `self.last_snapshot` (to avoid confusing verification
      loops that depend on the current viewport). This also holds on the legacy
      fallback path, where the previous value is put back after delegating to
      `self.snapshot()`.
    """
    # Legacy browser path: fall back to a single snapshot (we can't rely on backend ops).
    if hasattr(self, "_legacy_browser") and hasattr(self, "_legacy_page"):
        # self.snapshot() is expected to overwrite last_snapshot; remember the
        # current value so the documented "does not update" contract still holds.
        had_last_snapshot = hasattr(self, "last_snapshot")
        previous_snapshot = getattr(self, "last_snapshot", None)
        try:
            return await self.snapshot(**kwargs)
        finally:
            if had_last_snapshot:
                self.last_snapshot = previous_snapshot

    from .backends.snapshot import sampled_snapshot as backend_sampled_snapshot

    # Merge default options with call-specific kwargs (kwargs win).
    options_dict = self._snapshot_options.model_dump(exclude_none=True)
    options_dict.update(kwargs)
    options = SnapshotOptions(**options_dict)

    return await backend_sampled_snapshot(
        self.backend,
        options=options,
        samples=samples,
        scroll_delta_y=scroll_delta_y,
        settle_ms=settle_ms,
        union_limit=union_limit,
        restore_scroll=restore_scroll,
    )
404+
361405
async def evaluate_js(self, request: EvaluateJsRequest) -> EvaluateJsResult:
362406
"""
363407
Evaluate JavaScript expression in the active backend.

sentience/backends/snapshot.py

Lines changed: 177 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from typing import TYPE_CHECKING, Any
2727

2828
from ..constants import SENTIENCE_API_URL
29-
from ..models import Snapshot, SnapshotOptions
29+
from ..models import Element, Snapshot, SnapshotOptions
3030
from ..snapshot import (
3131
_build_snapshot_payload,
3232
_merge_api_result_with_local,
@@ -259,6 +259,182 @@ async def snapshot(
259259
return await _snapshot_via_extension(backend, options)
260260

261261

262+
def _normalize_ws(text: str) -> str:
263+
return " ".join((text or "").split()).strip()
264+
265+
266+
def _dedupe_key(el: Element) -> tuple:
267+
"""
268+
Best-effort stable dedupe key across scroll-sampled snapshots.
269+
270+
Notes:
271+
- IDs are not reliable across snapshots (virtualization can remount nodes).
272+
- BBox coordinates are viewport-relative and depend on scroll position.
273+
- Prefer href/name/text + approximate document position when available.
274+
"""
275+
href = (el.href or "").strip()
276+
if href:
277+
return ("href", href)
278+
279+
name = _normalize_ws(el.name or "")
280+
if name:
281+
return ("role_name", el.role, name)
282+
283+
text = _normalize_ws(el.text or "")
284+
doc_y = el.doc_y
285+
if text:
286+
# Use doc_y when present (more stable across scroll positions than bbox.y).
287+
if isinstance(doc_y, (int, float)):
288+
return ("role_text_docy", el.role, text[:120], int(float(doc_y) // 10))
289+
return ("role_text", el.role, text[:120])
290+
291+
# Fallback: role + approximate position
292+
if isinstance(doc_y, (int, float)):
293+
return ("role_docy", el.role, int(float(doc_y) // 10))
294+
295+
# Last resort (can still dedupe within a single snapshot)
296+
return ("id", int(el.id))
297+
298+
299+
def merge_snapshots(
    snaps: list[Snapshot],
    *,
    union_limit: int | None = None,
) -> Snapshot:
    """
    Merge multiple snapshots into a single "union snapshot" for analysis/extraction.

    Elements are deduplicated with ``_dedupe_key``; for each key the candidate
    with the best (importance, has-href, has-text, has-name, has-doc_y) tuple is
    kept. The result is ordered by ``doc_y`` where available (ties: higher
    importance first), followed by elements without ``doc_y`` in first-seen order.

    Args:
        snaps: Snapshots to merge; the first one supplies every non-element field.
        union_limit: Optional cap on the merged element count (values below 1
            are raised to 1; values that cannot be converted to int are ignored).

    Returns:
        A new Snapshot built from the first snapshot's data, with the merged
        elements and ``screenshot`` cleared.

    Raises:
        ValueError: If ``snaps`` is empty.

    CRITICAL:
    - Element bboxes are viewport-relative to the scroll position at the time each snapshot
      was taken. Do NOT use merged elements for direct clicking unless you also scroll
      back to their position.
    """
    if not snaps:
        raise ValueError("merge_snapshots requires at least one snapshot")

    def _rank(candidate: Element) -> tuple:
        # Higher importance first (usually means in-viewport when sampled),
        # then prefer elements carrying more extractable data.
        return (
            candidate.importance,
            int(bool((candidate.href or "").strip())),
            int(bool(_normalize_ws(candidate.text or ""))),
            int(bool(_normalize_ws(candidate.name or ""))),
            int(isinstance(candidate.doc_y, (int, float))),
        )

    winners: dict[tuple, Element] = {}
    arrival: dict[tuple, int] = {}
    seen = 0
    for snap in snaps:
        for element in getattr(snap, "elements", None) or []:
            key = _dedupe_key(element)
            arrival.setdefault(key, seen)
            seen += 1
            incumbent = winners.get(key)
            if incumbent is None or _rank(element) > _rank(incumbent):
                # Re-assigning keeps the key's original dict position.
                winners[key] = element

    def _order(candidate: Element) -> tuple:
        y = candidate.doc_y
        if not isinstance(y, (int, float)):
            # No document position: sort after everything else, by first sighting.
            return (1, float("inf"), arrival.get(_dedupe_key(candidate), 10**9))
        return (0, float(y), -int(candidate.importance))

    # sorted() is stable, so equal keys keep their first-seen order.
    merged: list[Element] = sorted(winners.values(), key=_order)

    if union_limit is not None:
        try:
            merged = merged[: max(1, int(union_limit))]
        except (TypeError, ValueError):
            # Unusable limit: leave the union uncapped.
            pass

    # Build a fresh Snapshot from the first sample's url/viewport/diagnostics.
    # The screenshot is dropped because it only depicts one of the sampled positions.
    payload = snaps[0].model_dump()
    payload.update(
        elements=[item.model_dump() for item in merged],
        screenshot=None,
    )
    return Snapshot(**payload)
366+
367+
368+
async def sampled_snapshot(
    backend: "BrowserBackend",
    *,
    options: SnapshotOptions | None = None,
    samples: int = 4,
    scroll_delta_y: float | None = None,
    settle_ms: int = 250,
    union_limit: int | None = None,
    restore_scroll: bool = True,
) -> Snapshot:
    """
    Take multiple snapshots while scrolling downward and return a merged union snapshot.

    Designed for long / virtualized results pages where a single viewport snapshot
    cannot cover enough relevant items.

    Args:
        backend: Browser backend used for page info, scrolling and snapshots.
        options: Snapshot options; a default ``SnapshotOptions()`` when ``None``.
        samples: Number of snapshots to take; values below 1 are treated as 1,
            in which case a single plain snapshot is returned.
        scroll_delta_y: Scroll distance between samples. ``None`` means 90% of
            the viewport height; a non-positive value is replaced by
            ``max(200, 0.9 * viewport height)``.
        settle_ms: Pause after each scroll, in milliseconds (skipped when <= 0).
        union_limit: Optional cap passed through to ``merge_snapshots``.
        restore_scroll: Scroll back to the starting position when finished,
            including when a snapshot raises.

    Returns:
        The union of all snapshots that were successfully taken.
    """
    opts = SnapshotOptions() if options is None else options

    total = max(1, int(samples))
    if total == 1:
        return await snapshot(backend, options=opts)

    # Starting scroll offset and viewport height; fall back to defaults if the
    # backend cannot report them.
    try:
        page_info = await backend.refresh_page_info()
        start_y = float(getattr(page_info, "scroll_y", 0.0) or 0.0)
        viewport_h = float(getattr(page_info, "height", 800) or 800)
    except Exception:  # pylint: disable=broad-exception-caught
        start_y, viewport_h = 0.0, 800.0

    # Slightly less than one viewport, so consecutive samples overlap.
    default_step = viewport_h * 0.9
    step = default_step if scroll_delta_y is None else float(scroll_delta_y)
    if step <= 0:
        step = max(200.0, default_step)

    scroll_to_js = "(y) => window.scrollTo(0, y)"

    async def _scroll_down() -> bool:
        """Advance the page by ``step``; return False when no scroll method works."""
        try:
            # Wheel events play nicer with sites that hook scroll listeners.
            await backend.wheel(delta_y=step)
            return True
        except Exception:  # pylint: disable=broad-exception-caught
            pass
        try:
            # Fallback: direct scrollTo from the current offset.
            current_y = await backend.eval("window.scrollY")
            await backend.call(scroll_to_js, [float(current_y) + step])
        except Exception:  # pylint: disable=broad-exception-caught
            return False
        return True

    collected: list[Snapshot] = []
    try:
        # First sample at the current position.
        collected.append(await snapshot(backend, options=opts))

        for _ in range(total - 1):
            if not await _scroll_down():
                break

            if settle_ms > 0:
                await asyncio.sleep(float(settle_ms) / 1000.0)

            collected.append(await snapshot(backend, options=opts))
    finally:
        if restore_scroll:
            # Best effort: a failed restore must not mask the result or an
            # exception already in flight.
            try:
                await backend.call(scroll_to_js, [float(start_y)])
                if settle_ms > 0:
                    await asyncio.sleep(min(0.2, float(settle_ms) / 1000.0))
            except Exception:  # pylint: disable=broad-exception-caught
                pass

    return merge_snapshots(collected, union_limit=union_limit)
436+
437+
262438
async def _wait_for_extension(
263439
backend: "BrowserBackend",
264440
timeout_ms: int = 5000,
@@ -273,7 +449,6 @@ async def _wait_for_extension(
273449
Raises:
274450
RuntimeError: If extension not injected within timeout
275451
"""
276-
import asyncio
277452
import logging
278453

279454
logger = logging.getLogger("sentience.backends.snapshot")

sentience/llm_provider.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,21 @@ def __init__(
353353
)
354354
super().__init__(api_key=api_key, model=model, base_url=base_url)
355355

356+
def supports_vision(self) -> bool:
    """
    Report whether the configured DeepInfra model is treated as vision-capable.

    DeepInfra hosts many non-OpenAI multimodal models, and its OpenAI-compatible
    API accepts the same `image_url` message format:
    `{"type":"image_url","image_url":{"url":"data:image/png;base64,..."}}`

    A model counts as vision-capable when its lower-cased name contains one of
    the common multimodal markers below; otherwise the parent class decides.
    """
    name = self._model_name.lower()
    for marker in ("vision", "llava", "qvq", "ocr"):
        if marker in name:
            return True
    # Preserve OpenAI-style vision detection for GPT models served via DeepInfra.
    return super().supports_vision()
370+
356371

357372
class AnthropicProvider(LLMProvider):
358373
"""

tests/test_llm_provider_vision.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from __future__ import annotations
2+
3+
import pytest
4+
5+
6+
def test_deepinfra_provider_supports_vision_for_common_multimodal_names() -> None:
    """DeepInfra models with multimodal name markers report vision support; others do not."""
    pytest.importorskip("openai")

    from sentience.llm_provider import DeepInfraProvider

    # (model name, expected supports_vision() result)
    expectations = (
        ("meta-llama/Llama-3.2-11B-Vision-Instruct", True),
        ("deepseek-ai/DeepSeek-OCR", True),
        ("deepseek-ai/DeepSeek-V3.1", False),
    )
    for model_name, expected in expectations:
        provider = DeepInfraProvider(api_key="x", model=model_name)
        assert provider.supports_vision() is expected
19+

0 commit comments

Comments
 (0)