diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..c35f038
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,12 @@
+# Copy this file to .env and fill in your API keys
+# .env is gitignored and will not be committed
+
+# OpenRouter API Key (default provider)
+OPENROUTER_API_KEY=your-openrouter-api-key-here
+
+# Optional: Use OpenAI directly instead of OpenRouter
+# LLM_PROVIDER=openai
+# OPENAI_API_KEY=your-openai-api-key-here
+
+# Optional: Override the default model
+# LLM_MODEL=openai/gpt-4o
diff --git a/.gitignore b/.gitignore
index 62a9a79..76137f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,8 @@
 __pycache__/
 *.xml
 .env
-venv/venv/
-__pycache__/
-*.xml
 .DS_Store
-.env
-venv/
-venv/
 venv/
+myenv/
+logs/
+window_dump.xml
diff --git a/README.md b/README.md
index e84697d..c40e4c3 100644
--- a/README.md
+++ b/README.md
@@ -157,7 +157,7 @@ Browser agents can't reach these. Desktop agents don't fit. **Android Use is the
 - Python 3.10+
 - Android device or emulator (USB debugging enabled)
 - ADB (Android Debug Bridge)
-- OpenAI API key
+- OpenRouter API key (default) **or** OpenAI API key
 
 ### Installation
 
@@ -176,13 +176,21 @@ brew install android-platform-tools  # macOS
 
 # 4. Connect device & verify
 adb devices
 
-# 5. Set API key
-export OPENAI_API_KEY="sk-..."
+# 5. Set API key (OpenRouter is the default provider)
+export OPENROUTER_API_KEY="sk-or-..."
 
 # 6. Run your first agent
 python kernel.py
 ```
 
+### Alternative: Use OpenAI Directly
+
+```bash
+# Override to use OpenAI instead of OpenRouter
+export LLM_PROVIDER=openai
+export OPENAI_API_KEY="sk-..."
+```
+
 ### Try It: Logistics Example
 
 ```python
diff --git a/docs/IMPLEMENTATION_PLAN.md b/docs/IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..7c85d4c
--- /dev/null
+++ b/docs/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,62 @@
+# Implementation Plan: OpenRouter Default (GPT-4o via OpenRouter)
+
+## Goal
+Make **OpenRouter** the default LLM provider while preserving the current agent loop described in `README.md` and implemented in `kernel.py`:
+
+- Perception: dump the Android accessibility tree via `uiautomator` and sanitize it
+- Reasoning: ask an LLM for the next action as **a single JSON object**
+- Action: execute via ADB (`tap`, `type`, `home`, `back`, `wait`, `done`)
+
+## Non-Goals
+- Changing the agent UX (still `python kernel.py` → prompts for goal)
+- Adding new actions/tool calling
+- Rewriting the sanitizer logic
+
+## Default Provider Decision
+- Default provider: **OpenRouter**
+- Default model via OpenRouter: **`openai/gpt-4o`**
+
+## New Configuration (env vars)
+- `OPENROUTER_API_KEY` (required by default)
+- `LLM_PROVIDER` (optional override; values: `openrouter`, `openai`)
+- `LLM_MODEL` (optional override; default depends on provider)
+- `OPENAI_API_KEY` (only required if `LLM_PROVIDER=openai`)
+
+## Work Breakdown (milestones)
+
+### Milestone 1 — Add docs-first implementation instructions
+- Create docs structure:
+  - `docs/features/openrouter-default.md`
+  - `docs/bugs/kernel-known-bugs.md`
+- Ensure instructions are atomic and include “why” for each step.
+
+### Milestone 2 — Implement provider abstraction (small refactor)
+- Add a small “LLM client factory” that chooses:
+  - OpenRouter client (default)
+  - OpenAI client (opt-in)
+- Keep the call site `client.chat.completions.create(...)` unchanged.
+
+### Milestone 3 — Preserve JSON-action contract across models/providers
+- Keep `response_format={"type":"json_object"}`.
+- Add parse/validation + one retry if the output is invalid JSON.
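+
+A minimal sketch of the Milestone 3 contract (names like `parse_and_validate` and `ask_llm` are illustrative, not the current `kernel.py` API):
+
+```python
+import json
+
+VALID_ACTIONS = {"tap", "type", "home", "back", "wait", "done"}
+
+def parse_and_validate(raw: str) -> dict:
+    """Parse the model output and enforce the JSON-action contract."""
+    decision = json.loads(raw)  # raises json.JSONDecodeError on invalid JSON
+    action = decision.get("action")
+    if action not in VALID_ACTIONS:
+        raise ValueError(f"unknown action: {action!r}")
+    if action == "tap":
+        coords = decision.get("coordinates")
+        if not (isinstance(coords, (list, tuple)) and len(coords) == 2):
+            raise ValueError("tap requires 2-item coordinates")
+    if action == "type" and not isinstance(decision.get("text"), str):
+        raise ValueError("type requires a text string")
+    return decision
+
+def decide_with_retry(ask_llm) -> dict:
+    """Call the model once more if the first output fails parsing/validation."""
+    try:
+        return parse_and_validate(ask_llm())
+    except ValueError:  # json.JSONDecodeError is a subclass of ValueError
+        return parse_and_validate(ask_llm())  # one retry, then let it raise
+```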
+ +### Milestone 4 — Fix correctness bugs discovered during review +- Fix issues documented in `docs/bugs/kernel-known-bugs.md`. + +### Milestone 5 — Update README and do a smoke test +- Update `README.md` Quick Start to prefer OpenRouter. +- Manual smoke test: + - Run `python kernel.py` with a simple goal (e.g. “go home”). + - Confirm ADB commands work and the model returns valid JSON actions. + +## Acceptance Criteria +- Running with **only** `OPENROUTER_API_KEY` set works (OpenRouter default). +- Setting `LLM_PROVIDER=openai` with `OPENAI_API_KEY` works. +- Actions returned by the model are validated (no crashes on missing fields). +- Key ADB actions (`home`, `back`) use correct keycodes. + +## Rollback Plan +- If OpenRouter routing/model output is unstable, keep OpenRouter default but allow fallback: + - `LLM_PROVIDER=openai` + - `LLM_MODEL=gpt-4o` + diff --git a/docs/bugs/kernel-known-bugs.md b/docs/bugs/kernel-known-bugs.md new file mode 100644 index 0000000..9ec06a6 --- /dev/null +++ b/docs/bugs/kernel-known-bugs.md @@ -0,0 +1,109 @@ +# Bugs: Known Issues in `kernel.py` (and Proposed Fixes) + +This document lists bugs discovered during review that will impact correctness and/or stability. Each bug includes a proposed fix and the reason it matters. + +## 1) Missing import: `List` used but not imported +**Where** +- `kernel.py`: `def run_adb_command(command: List[str]):` + +**Problem** +- `List` is not imported from `typing`, which will raise a `NameError` at runtime. + +**Proposed Fix** +- Change typing import to include `List`: + - `from typing import Dict, Any, List` + +**Why it matters** +- This prevents the script from running at all. + +## 2) Wrong ADB keyevent constants for Home/Back +**Where** +- `kernel.py`: + - `KEYWORDS_HOME` + - `KEYWORDS_BACK` + +**Problem** +- The Android keyevent constants are `KEYCODE_HOME` and `KEYCODE_BACK`. +- Current constants will cause ADB to fail (or do nothing) when trying to go home/back. + +**Proposed Fix** +- Replace with: + - `KEYCODE_HOME` + - `KEYCODE_BACK` + +**Why it matters** +- Navigation actions are core to the agent loop. + +## 3) Potential crash: `tap` coordinates unpacking without validation +**Where** +- `execute_action()`: + - `x, y = action.get("coordinates")` + +**Problem** +- If `coordinates` is missing or malformed, unpacking throws an exception. + +**Proposed Fix** +- Validate the action schema before executing: + - Ensure `coordinates` exists + - Ensure it is a 2-item list/tuple + - Ensure each value can be converted to int + +**Why it matters** +- LLMs occasionally return malformed payloads; the agent should fail gracefully. + +## 4) Potential crash: `type` action assumes `text` exists +**Where** +- `execute_action()`: + - `text = action.get("text").replace(" ", "%s")` + +**Problem** +- If `text` is missing, `action.get("text")` returns `None` and `.replace(...)` crashes. + +**Proposed Fix** +- Validate `text` exists and is a string before calling `.replace`. + +**Why it matters** +- Prevents agent from crashing mid-run. + +## 5) Hard exit inside library function (`exit(0)`) reduces reusability +**Where** +- `execute_action()` on `done`: + - `exit(0)` + +**Problem** +- If `run_agent()` is imported and used by another module, `exit(0)` will terminate the entire host process. + +**Proposed Fix** +- Prefer returning a sentinel (e.g. `True` for completed) or raising a specific exception that `run_agent()` catches. 
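+
+A minimal sketch of the exception variant (`AgentDone` is an illustrative name, not an existing class):
+
+```python
+class AgentDone(Exception):
+    """Signals that the model reported the goal as complete."""
+
+def execute_action(action: dict) -> None:
+    if action.get("action") == "done":
+        raise AgentDone(action.get("reason", ""))
+    ...  # handle tap/type/home/back/wait as before
+
+def run_agent(goal: str) -> None:
+    try:
+        ...  # perceive -> decide -> execute loop
+    except AgentDone:
+        print("Goal completed.")  # caller keeps control; no exit(0)
+```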
+ +**Why it matters** +- Enables embedding this library into other tools/services without unexpected process termination. + +## 6) ADB error detection is brittle +**Where** +- `run_adb_command()`: + - checks `if result.stderr and "error" in result.stderr.lower()` + +**Problem** +- Many ADB failures show up in stdout or return codes. +- Ignoring `returncode` can hide failures. + +**Proposed Fix** +- Check `result.returncode != 0` and include both stdout/stderr in the error message. + +**Why it matters** +- Makes debugging device connectivity and ADB issues far easier. + +## 7) Ambiguous `focus` usage in sanitizer (minor) +**Where** +- `sanitizer.py`: + - `is_editable = node.attrib.get("focus") == "true" or node.attrib.get("focusable") == "true"` + +**Problem** +- `focus/focusable` is not the same as "editable". + +**Proposed Fix** +- (Optional) Use attributes like `class` (`EditText`) or `long-clickable`/`enabled` to identify text fields more accurately. + +**Why it matters** +- Better context improves LLM decision quality; not required for OpenRouter switch. diff --git a/docs/features/openrouter-default.md b/docs/features/openrouter-default.md new file mode 100644 index 0000000..0091f48 --- /dev/null +++ b/docs/features/openrouter-default.md @@ -0,0 +1,121 @@ +# Feature: Make OpenRouter the Default LLM Provider (GPT-4o) + +## Summary +Refactor `kernel.py` so the default LLM provider is **OpenRouter**, using model **`openai/gpt-4o`**, while keeping the current agent loop and JSON action contract. + +## Target Behavior +- Running `python kernel.py` should work with only: + - `OPENROUTER_API_KEY` set +- OpenAI remains available as an override: + - `LLM_PROVIDER=openai` + `OPENAI_API_KEY` + +## Atomic Steps (with “Why”) + +### 1) Decide and document env var contract +**Do** +- Define these env vars: + - `OPENROUTER_API_KEY` (required by default) + - `LLM_PROVIDER` (optional; default `openrouter`) + - `LLM_MODEL` (optional; default depends on provider) + - `OPENAI_API_KEY` (only required if `LLM_PROVIDER=openai`) + +**Why** +- A junior engineer needs a single source of truth for configuration. +- Keeping OpenAI as opt-in reduces risk and makes debugging easier. + +### 2) Replace the global `MODEL` constant with provider-aware defaults +**Do** +- Introduce a provider-aware model selection: + - If provider is `openrouter`: default `openai/gpt-4o` + - If provider is `openai`: default `gpt-4o` +- Allow `LLM_MODEL` to override in both cases. + +**Why** +- OpenRouter uses namespaced model IDs; OpenAI does not. +- This prevents confusing “model not found” errors. + +### 3) Create a tiny “LLM client factory” in `kernel.py` +**Do** +- Add a function, e.g. `get_llm_client_and_model()` that returns: + - `client` + - `model` +- Build the OpenAI SDK client like: + - OpenRouter default: + - `OpenAI(api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1")` + - OpenAI override: + - `OpenAI(api_key=OPENAI_API_KEY)` + +**Why** +- Centralizes provider logic. +- Avoids littering conditionals across `get_llm_decision()`. +- Makes future provider additions (Claude/Gemini via OpenRouter, etc.) straightforward. + +### 4) Add OpenRouter optional headers (non-blocking) +**Do** +- If the OpenAI SDK version in this repo supports default headers: + - Add `HTTP-Referer` and `X-Title` for OpenRouter requests. +- If it does not, skip this step. + +**Why** +- OpenRouter recommends these headers for attribution/analytics. +- Not required for correctness; keep it optional to reduce implementation risk. 
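+
+A minimal sketch of Steps 3–4 combined, assuming the OpenAI Python SDK v1 client (the repo URL in `HTTP-Referer` is a placeholder):
+
+```python
+import os
+from openai import OpenAI
+
+def get_llm_client_and_model() -> tuple[OpenAI, str]:
+    """Return a configured client plus the provider-appropriate default model."""
+    provider = os.getenv("LLM_PROVIDER", "openrouter")
+    if provider == "openai":
+        return OpenAI(api_key=os.environ["OPENAI_API_KEY"]), os.getenv("LLM_MODEL", "gpt-4o")
+    client = OpenAI(
+        api_key=os.environ["OPENROUTER_API_KEY"],
+        base_url="https://openrouter.ai/api/v1",
+        default_headers={
+            "HTTP-Referer": "https://example.com/your-repo",  # placeholder URL
+            "X-Title": "Android Use",
+        },
+    )
+    return client, os.getenv("LLM_MODEL", "openai/gpt-4o")
+```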
+
+### 5) Keep JSON response mode, but add a fallback parsing strategy
+**Do**
+- Keep `response_format={"type": "json_object"}`.
+- Wrap JSON parsing in a try/except.
+- If parsing fails:
+  - Retry once with a stricter prompt (still requiring only JSON output)
+  - If it still fails, raise a clear error that includes the raw response text.
+
+**Why**
+- Different routed models can be slightly less strict about JSON-only output.
+- A single retry often fixes transient “formatting drift” without changing the UX.
+
+### 6) Validate the returned action schema before executing
+**Do**
+- Before `execute_action(decision)`:
+  - Validate `decision["action"]` is one of:
+    - `tap`, `type`, `home`, `back`, `wait`, `done`
+  - If `tap`, require `coordinates` as a 2-item list of ints.
+  - If `type`, require `text` as a non-empty string.
+
+**Why**
+- Prevents crashes and device misclicks.
+- Makes the behavior consistent even when the LLM is imperfect.
+
+### 7) Update README “Quick Start” to prefer OpenRouter
+**Do**
+- Replace or augment the existing OpenAI setup section with:
+  - `export OPENROUTER_API_KEY="..."`
+  - (optional) `export LLM_MODEL="openai/gpt-4o"`
+- Add an “OpenAI override” snippet:
+  - `export LLM_PROVIDER=openai`
+  - `export OPENAI_API_KEY="..."`
+
+**Why**
+- Docs should match the new default so new users don’t get blocked.
+
+### 8) Add a minimal manual smoke test checklist
+**Do**
+- Validate both modes:
+  - OpenRouter default
+  - OpenAI override
+- Use a simple goal and verify at least one valid action executes.
+
+**Why**
+- Prevents regressions before merging.
+- Junior engineers get confidence quickly with concrete steps.
+
+## Expected Code Touch Points
+- `kernel.py`
+  - Add provider config + client factory
+  - Update model constant usage
+  - Add JSON parsing fallback + action validation
+- `README.md`
+  - Update environment variable setup instructions
+
+## Definition of Done
+- With `OPENROUTER_API_KEY` set, `python kernel.py` starts and makes LLM calls successfully.
+- The LLM output is parsed into a JSON dict and validated.
+- Actions execute without runtime exceptions for missing fields.
diff --git a/docs/features/safetyculture-inspections-autopilot.md b/docs/features/safetyculture-inspections-autopilot.md
new file mode 100644
index 0000000..17513fd
--- /dev/null
+++ b/docs/features/safetyculture-inspections-autopilot.md
@@ -0,0 +1,180 @@
+# Feature: SafetyCulture (iAuditor) Inspections Autopilot (MVP)
+
+## Decision
+**Chosen use case:** Safety/EHS inspections autopilot using **SafetyCulture (iAuditor)**.
+
+## Why this one (vs the other ideas)
+- **Matches current capabilities**: form navigation + taps + text entry + submit.
+- **Does not require vision/OCR**: we can explicitly avoid photo upload steps in the MVP.
+- **Clear ROI + measurable outcomes**: time-per-inspection, completion rate, fewer missing fields.
+- **Mobile-native + repetitive**: exactly where UI automation is valuable and APIs are often not worth building for a pilot.
+
+## Goal
+Build a repeatable demo/MVP where the agent can:
+- Open SafetyCulture
+- Start a specific inspection template
+- Fill a bounded set of fields (pass/fail + short notes)
+- Submit the inspection
+
+## Non-goals (for MVP)
+- Handling photo capture / attachments
+- Handling signatures
+- Handling deep offline sync edge cases
+- Supporting arbitrary templates (we’ll support **one** stable template first)
+
+## Required Android apps
+- SafetyCulture (iAuditor)
+- (Optional) Slack or Microsoft Teams (only if you want an “inspection submitted” notification)
+
+## Preconditions (junior engineer checklist)
+- Android device connected via ADB (`adb devices` shows it)
+- USB debugging enabled
+- SafetyCulture installed
+- A test SafetyCulture account logged in already
+- A single inspection template created and kept stable for the MVP
+
+---
+
+## Implementation Plan (atomic steps)
+
+### Step 1 — Create a dedicated “use case” entry point
+**Do**
+- Add a new Python file (example): `use_cases/safetyculture_inspection.py` that calls `run_agent()` with a hardcoded SafetyCulture goal string and higher `max_steps`.
+
+**Why**
+- Keeps the core kernel generic and makes the use case demo reproducible.
+
+**Acceptance check**
+- Running the file launches the agent with a SafetyCulture-focused goal without you manually typing it each time.
+
+---
+
+### Step 2 — Add a stable way to open an app (do not rely on home-screen taps)
+**Do**
+- Add a new action: `{"action": "launch", "package": "..."}` OR a helper function used by the use-case entry point that runs:
+  - `adb shell monkey -p <package> -c android.intent.category.LAUNCHER 1`
+
+**Why**
+- Tapping an icon coordinate is brittle across devices, launchers, and layouts.
+
+**Acceptance check**
+- With the phone on any screen, the agent can reliably open SafetyCulture.
+
+---
+
+### Step 3 — Add a “wait_until” primitive (minimum viable)
+**Do**
+- Add a function (or action) that loops for up to N seconds until the current `screen_context` contains an element whose text contains a target substring.
+- Example API (choose one):
+  - `wait_until_text("Inspections", timeout_s=10)`
+  - or action: `{"action":"wait_until","text":"Inspections","timeout_s":10}`
+- A minimal sketch of this helper (and the Step 2 launcher) appears after the Definition of Done below.
+
+**Why**
+- SafetyCulture (and most enterprise apps) have loading states; a blind `sleep(2)` causes flakiness.
+
+**Acceptance check**
+- After launching SafetyCulture, the code waits until a known “home” element appears before proceeding.
+
+---
+
+### Step 4 — Add a minimal “interrupt handler” for popups
+**Do**
+- Before every decision step, scan `screen_context` for common dismiss buttons:
+  - `"Allow"`, `"Don’t allow"`, `"No thanks"`, `"Not now"`, `"Skip"`, `"Close"`, `"Later"`
+- If found, auto-tap it (using the node center coordinates) before calling the LLM.
+
+**Why**
+- Popups will derail pilots. Handling them early increases success rate dramatically.
+
+**Acceptance check**
+- If a popup appears during launch, the agent dismisses it and continues.
+
+---
+
+### Step 5 — Restrict the MVP template and define what “fill the inspection” means
+**Do**
+- Pick one template and explicitly define:
+  - How many questions it has (e.g., 10)
+  - Which answers to select for each (pass/fail/na)
+  - Which questions require a note (and what note text to use)
+- Store this as a structured object in code (dict/list) in `use_cases/safetyculture_inspection.py`.
+
+**Why**
+- “Do an inspection” is too open-ended; bounded scope makes it reliable and testable.
+
+**Acceptance check**
+- A human can read the config and understand what the agent will do before running.
+
+---
+
+### Step 6 — Upgrade the prompt to be template-driven (but still generic)
+**Do**
+- Instead of a single natural-language goal, construct a goal like:
+  - "Open SafetyCulture, start template '<template name>', answer questions 1..N using this plan: ... then submit."
+- Include the structured plan in the user message.
+
+**Why**
+- The LLM performs better when it has an explicit plan and fewer degrees of freedom.
+
+**Acceptance check**
+- The agent stops trying random navigation paths and focuses on the inspection flow.
+
+---
+
+### Step 7 — Add logging artifacts for demos and debugging
+**Do**
+- Write a JSONL or JSON file per run with:
+  - timestamp
+  - each decision
+  - whether the run ended in done/timeout
+- Do NOT store full raw screen dumps if they may contain sensitive info; store only:
+  - step index
+  - action
+  - reason
+  - optionally a small list of visible element texts (redacted)
+
+**Why**
+- Business pilots require traceability: “what happened and where did it fail?”
+
+**Acceptance check**
+- After a run, a file exists that explains each step.
+
+---
+
+### Step 8 — Add success detection for the “submitted” state
+**Do**
+- Define 1–2 text anchors that indicate completion (examples):
+  - `"Submitted"`, `"Inspection submitted"`, `"Success"`
+- After tapping submit, call `wait_until_text("Submitted", timeout_s=15)`.
+
+**Why**
+- Prevents false positives where the agent says done but nothing was submitted.
+
+**Acceptance check**
+- The run only returns success if a completion text is detected.
+
+---
+
+### Step 9 — Define a reliability test loop
+**Do**
+- Run the same template 20 times (a manual loop is fine at first).
+- Track:
+  - success rate
+  - median duration
+  - top failure reasons
+
+**Why**
+- Reliability is the product for this use case.
+
+**Acceptance check**
+- You can report “X% success across 20 runs” to a pilot customer.
+
+---
+
+## MVP Demo Script (what to show a customer)
+- "Here is the exact SafetyCulture template"
+- "Watch the agent complete and submit it"
+- Show the run log + time saved estimate
+
+## Definition of Done
+- On a physical device, for one SafetyCulture template, the agent can complete and submit successfully with **>= 80% success over 20 runs**.
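+
+A minimal sketch of the Step 2/Step 3 helpers referenced above (the SafetyCulture package name is an assumption; verify it with `adb shell pm list packages`):
+
+```python
+import subprocess
+import time
+
+def launch_app(package: str) -> None:
+    """Step 2: open an app by package name instead of tapping an icon."""
+    subprocess.run(
+        ["adb", "shell", "monkey", "-p", package,
+         "-c", "android.intent.category.LAUNCHER", "1"],
+        check=True,
+    )
+
+def wait_until_text(target: str, timeout_s: int = 10) -> bool:
+    """Step 3: poll the accessibility dump until `target` is on screen."""
+    deadline = time.time() + timeout_s
+    while time.time() < deadline:
+        subprocess.run(
+            ["adb", "shell", "uiautomator", "dump", "/sdcard/window_dump.xml"],
+            check=True, capture_output=True,
+        )
+        dump = subprocess.run(
+            ["adb", "shell", "cat", "/sdcard/window_dump.xml"],
+            capture_output=True, text=True,
+        ).stdout
+        if target in dump:
+            return True
+        time.sleep(1)
+    return False
+
+# Example: launch_app("com.safetyculture.iauditor")  # package name is an assumption
+```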
diff --git a/docs/ideas/idea-dump-1.md b/docs/ideas/idea-dump-1.md
new file mode 100644
index 0000000..66bae34
--- /dev/null
+++ b/docs/ideas/idea-dump-1.md
@@ -0,0 +1,258 @@
+Section A: Use case table (12–20)
+
+| # | Use case | Who pays (buyer + industry) | Why Android UI automation (vs API/web/desktop) | Success looks like (measurable) | Key risks | Example Android apps (3–5) |
+| :-- | :-- | :-- | :-- | :-- | :-- | :-- |
+| 1 | Safety/EHS inspections autopilot (checklists, issue creation, submit report) | EHS Manager / Ops Director (construction, manufacturing, logistics) | Inspections are mobile-first, offline-capable, highly repetitive taps; APIs often don’t cover “fill this specific checklist flow” fast for pilots | 30–60% fewer taps/time per inspection; higher on-time completion; fewer “missing fields” | Accessibility gaps in custom widgets; frequent template changes; photo steps (you don’t have vision) | SafetyCulture (iAuditor), Microsoft Teams, Slack, Google Sheets, Trello |
+| 2 | Field service work order closeout autopilot (status, parts, notes, signature screens you can skip in pilot) | Field Service VP / Dispatch Manager (utilities, HVAC, telecom) | Mobile app UIs are the system-of-record for techs; API projects take longer than “get value this month” | Reduce closeout time 25–50%; fewer incomplete closeouts; faster invoice readiness | Offline/online state; signature/photo steps; UI drift; 2FA/SSO | Salesforce Field Service, Microsoft Teams, Slack, Google Sheets |
+| 3 | CRM “lead capture + follow-up” autopilot (create contact/company, log activity, task) | Sales Ops / VP Sales (SMB–midmarket) | Reps live on phones at events/in the field; fastest path is “drive the existing app”, not integrate CRM API + custom UI | 2× more leads logged; 30–50% less time per lead; fewer missed follow-ups | Login/2FA; duplicate detection flows; keyboard/input focus bugs; UI changes | HubSpot, Zoho CRM, Salesforce, Asana, Trello |
+| 4 | ITSM ticket triage on mobile autopilot (accept/assign/update/resolve) | IT Service Desk Manager (enterprise IT) | Mobile agents exist, but actions are still tedious; API automation often blocked by governance; pilots can run on pre-auth devices | 30–50% faster first-touch updates; better SLA compliance; fewer “stale” tickets | Role-based UI variants; 2FA; offline mode quirks | ServiceNow Agent, Jira Cloud, Microsoft Teams, Slack |
+| 5 | Expense reporting autopilot (non-receipt flows) (mileage/per diem/simple entries, submit/approve) | Finance Ops / Controller (any) | Receipts OCR is missing, but lots of expense flows are pure form entry + submit; “pilot now” without ERP integration | Cut submission time 30–60%; higher policy compliance; fewer rejected reports | Receipt-photo steps; 2FA; policy popups; UI changes | Expensify, SAP Concur, Zoho Expense, Google Sheets |
+| 6 | Daily operational reporting autopilot (fill structured spreadsheet rows on-device) | Ops Manager / Site Lead (warehousing, retail ops) | Lowest-friction “system” is often a shared sheet; field folks hate data entry; no API work needed | ≥90% on-time daily reports; fewer missing fields; time per report down 40% | Sheet layout changes; network; access permissions | Google Sheets, Microsoft Teams, Slack, Trello |
+| 7 | Project/status update autopilot (move cards, update tasks, comment) | PMO / Eng Manager (software teams) | Mobile apps exist but are clicky; web automation is brittle on mobile; quick pilot is “phone does it” | Higher update cadence; fewer stale boards; reduced PM chasing | Permission boundaries; notification popups; UI changes | Jira Cloud, Asana, Trello, Slack |
+| 8 | ChatOps incident comms autopilot (post templated updates, create channel/thread hygiene) | SRE Manager / IT Ops (tech + enterprise) | Many orgs restrict bots; but a phone session is already trusted; fast “operator assistant” | Faster comms (<5 min update intervals); fewer missing incident fields | Mention/autocomplete UIs; rate limits; workspace policy | Slack, Microsoft Teams, Jira Cloud |
+| 9 | MDM/BYOD onboarding autopilot (enroll device, install required apps, compliance checks) | IT Endpoint Admin (enterprise IT) | Onboarding is pure UI steps across standard screens; avoids custom scripts per OEM | Reduce helpdesk tickets; faster time-to-compliance | Device/OEM variance; permission prompts; 2FA for work account | Intune Company Portal, Microsoft Teams, Slack |
+| 10 | Service desk “quick actions” autopilot (update ticket + message requester) | Service Desk Lead (enterprise) | Combines ITSM + comms on-device; avoids integration approvals | Lower reopen rate; faster resolution notes | Copy/paste across apps; privacy of ticket content | ServiceNow Agent, Microsoft Teams, Slack |
+| 11 | Sales “activity logging” autopilot (log calls/meetings, next steps) | RevOps (SMB–midmarket) | APIs exist but reps won’t use separate tooling; you drive their existing app UI | +30% more activities logged; higher pipeline hygiene | Duplicate contact flows; 2FA; inconsistent field requirements | HubSpot, Zoho CRM, Salesforce |
+| 12 | “Mobile-first approvals” autopilot (approve/reject requests in queues) | Finance/HR/IT approvers | Approvals often happen on phones; automating via API can be gated; UI steps are consistent | Shorter approval cycle time; fewer stuck approvals | Role-based UI variants; 2FA | SAP Concur, ServiceNow Agent, Jira Cloud |
+| 13 | Ops audits + training acknowledgement autopilot (complete required checklists + ack items) | Ops + Compliance (manufacturing/logistics) | “Get compliance done” is a UI workflow; avoids building custom training portals | Higher completion; fewer overdue items | Accessibility support varies; content changes | SafetyCulture (iAuditor), Microsoft Teams, Google Sheets |
+| 14 | On-call “runbook on phone” autopilot (navigate apps, update boards, post updates) | SRE/IT Ops | During incidents, phones are the fallback; the agent reduces cognitive load | Faster coordinated actions; fewer missed steps | Unpredictable screens; notification interruptions | Slack, Teams, Jira Cloud, Trello |
+| 15 | “Executive assistant for mobile admin” (update CRM fields, assign tasks, ping owners) | Small-business owner / Sales manager | Owners run businesses on phones; fastest ROI is “do the annoying app taps” | Time saved per day; fewer missed follow-ups | Account security; app UI churn | HubSpot, Zoho CRM, Asana, Trello |
+
+Grounding note: Your agent’s “accessibility tree → action” approach maps well to how Android UI automation frameworks introspect UI nodes (text/content-desc/bounds) and trigger device actions.
+
+⸻
+
+Section B: Top 3 deep dives (fastest MVP → paid pilot in 2–4 weeks)
+
+Top 1) Safety/EHS inspections autopilot (SafetyCulture iAuditor)
+
+Why this is #1 for fast paid pilots
+ • Repetitive, form-y, and mobile-native (exactly where your agent already works).
+ • Clear ROI and easy “before/after” time study.
+ • Pilots can avoid camera/attachments entirely (or keep them manual) while still delivering big value.
+ +Apps required (Android, Google Play) + • SafetyCulture (iAuditor) (core workflow)  + • Optional for reporting/demo: + • Google Sheets (log run results / audit trail)  + • Microsoft Teams or Slack (send “inspection submitted” message)  + +Device requirements + • Physical Android preferred for pilots (more realistic permissions + offline behavior). + • Emulator is fine for dev, but you’ll want at least 1 real device for reliability baselines. + • Permissions: + • Accessibility Service enabled (your agent’s dependency) + • Network access; optional storage if exporting files (avoid if possible) + +Setup steps + 1. Create SafetyCulture org + user; install app; log in once.  + 2. Preconfigure: choose 1–2 inspection templates for pilot (keep them stable for 2–4 weeks). + 3. Ensure device stays logged in (no forced re-auth during pilot if possible). + 4. Disable “chaos”: turn off auto-rotate, reduce notifications, set display scaling fixed. + +Core workflows (agent “goals”) + 1. Start today’s inspection: open app → select template → create new inspection. + 2. Fill checklist: for each item, select pass/fail/na + type notes when prompted. + 3. Raise an issue (if triggered): open “issues” flow → add short description → set priority → submit. + 4. Submit inspection: reach review → confirm required fields → submit. + 5. Post-submission confirmation: capture “submitted” state → optionally message Teams/Slack channel with template text. + 6. Offline mode sanity (optional): start inspection with airplane mode → complete → wait → sync when online. + +Minimal product features needed beyond current + • Element-targeting policy (big one): tap by node bounds center from accessibility tree, not by raw screen coords when possible (still executed as tap(x,y), but derived from stable node selection). + • Wait-for condition: wait_until(text|content-desc|class present, timeout) (otherwise loops get flaky). + • Interrupt handler: detect/close popups (permissions, “rate us”, offline banners). + • Session recorder: store (sanitized) accessibility-tree snapshots + chosen action for debugging (no screenshots). + +How to test reliability + • Build a deterministic-ish “golden path suite”: + • 20 runs/day across 2 templates × 2 devices (or 1 device + 2 profiles) + • Success threshold for pilot: ≥90% completion without human intervention on golden paths + • Add “mutation tests”: + • random notification injection + • network toggle mid-run + • template with 1 new required field (should fail gracefully + report where) + +Privacy/compliance notes + • You’re not using screenshots, but accessibility trees can contain PII (names, addresses, ticket text). + • Do: + • Redact patterns in logs (emails, phone numbers, addresses, long numeric IDs). + • Store minimal logs; encrypt at rest; allow “no log mode” for sensitive customers. + • Customer-controlled device; keep auth on-device (no credential vault needed for pilot). + • SafetyCulture publishes Android “Data safety” info; use it to frame customer expectations.  + +Monetization / pricing hypothesis + • Pilot: $3k–$10k for 2–4 weeks (1–2 templates, 1–2 devices, reliability report). + • Ongoing: per-device + per-workflow bundle (e.g., $200–$500/device/month) + setup fees. + +Sales motion + • Target: EHS managers / Ops leaders already using iAuditor/SafetyCulture. + • Outreach angle: “Cut inspection time 30–50% without changing your templates or integrating anything.” + • Demo: run one inspection end-to-end in <2 minutes, show before/after tap count + completion log. 
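+
+Minimal sketch of the “element-targeting policy” feature above, assuming the standard uiautomator dump format (bounds="[x1,y1][x2,y2]"):
+
+```python
+import re
+import xml.etree.ElementTree as ET
+
+BOUNDS_RE = re.compile(r"\[(\d+),(\d+)\]\[(\d+),(\d+)\]")
+
+def node_center(node: ET.Element) -> tuple[int, int]:
+    """Derive a tap point from a node's bounds instead of raw screen coords."""
+    x1, y1, x2, y2 = map(int, BOUNDS_RE.match(node.attrib["bounds"]).groups())
+    return (x1 + x2) // 2, (y1 + y2) // 2
+
+def find_by_text(root: ET.Element, needle: str) -> ET.Element | None:
+    """Select a node by visible text or content-desc from the dumped tree."""
+    for node in root.iter("node"):
+        if needle in (node.attrib.get("text") or "") \
+           or needle in (node.attrib.get("content-desc") or ""):
+            return node
+    return None
+```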
+ +Sources:  + +⸻ + +Top 2) CRM lead capture + follow-up autopilot (HubSpot / Zoho CRM) + +Why this is fast + • Same “search + multi-step navigation + typing” pattern you already said works. + • Paid buyers (RevOps) care about pipeline hygiene; clear KPI lifts. + +Apps required + • HubSpot (primary)  + • Zoho CRM (alternative track if a customer uses it)  + • Optional: + • Asana or Trello for creating follow-up tasks  + +Device requirements + • Physical device preferred (real keyboard behaviors). + • Accounts: HubSpot / Zoho CRM user seat. + • Avoid frequent 2FA by: using a dedicated “demo workspace” + keep device session active for pilot. + +Setup steps + 1. Install HubSpot; log in once; land on CRM home.  + 2. Predefine a “lead schema” for pilot (which fields required, naming convention). + 3. Prepare 20–50 test leads (CSV off-device is fine; user can paste one at a time for MVP). + +Core workflows (agent goals) + 1. Create new contact: open Contacts → Create → fill name/email/company/phone → Save. + 2. Log activity: open contact → Log activity (call/meeting) → type notes → Save. + 3. Create follow-up task: open Tasks → new task → due date + title (e.g., “Call back”) → Save. + 4. Update lifecycle/stage: open contact/deal → set stage → Save. + 5. Search + update: search contact by name → update one field → Save. + 6. (Optional) Dedup flow: detect “possible duplicate” screen → choose merge/skip (pilot can skip and flag). + +Minimal product features beyond current + • Text-anchor navigation: prefer selecting elements by visible label text (“Create contact”, “Save”) from accessibility nodes. + • Form field focus helper: when typing, ensure correct input is focused; if not, tap field node first. + • “Human-in-the-loop step” (pilot-safe): if a login/2FA prompt appears, pause and ask user to complete it, then continue. + +Reliability testing + • 100-run batch: create contact + log activity + create task. + • Success threshold: ≥95% field-correct saves (no wrong-field typing), ≥90% full workflow completion. + • Add “layout drift” test: switch font size/display size once; your element-bound taps should still work. + +Privacy/compliance + • CRM data = PII. Keep: + • Redacted logs + • Optional “test mode” with synthetic leads only + • No screenshots by design (good positioning) + • Document that agent runs on customer-controlled device; you don’t store passwords. + +Pricing + • Pilot: $2k–$8k (depends on number of workflows + CRM). + • Ongoing: per-seat or per-device (sales teams often price per seat; but your tech is device-based—either works). + +Sales motion + • Target: RevOps / Sales Enablement at companies already paying for HubSpot/Zoho but complaining about “CRM busywork”. + • Demo: “I say ‘log this lead and set a follow-up’, phone does it in 20 seconds.” + +Sources:  + +⸻ + +Top 3) ITSM mobile triage autopilot (ServiceNow Agent / Jira Cloud) + +Why this is fast + • IT teams already measure SLAs; you can show impact quickly. + • Mobile agent apps exist; your value is “do the rote clicks + enforce consistent updates”. + +Apps required + • ServiceNow Agent  + • Jira Cloud (many orgs use Jira for incident/task tracking)  + • Optional comms: + • Microsoft Teams or Slack  + +Device requirements + • Physical device recommended (enterprise MDM + conditional access behaves differently on emulators). + • Accounts: ServiceNow fulfiller role / Jira user with permissions. + • If org uses Intune-managed variant, note there’s an Intune-specific listing; pilots should pick whichever IT already deploys.  
+ +Setup steps + 1. Install ServiceNow Agent; sign in; verify you can see the queue.  + 2. Identify 2–3 ticket types to automate (Incidents, Requests, Catalog Tasks). + 3. Standardize templates for updates (work notes format, resolution code mapping). + +Core workflows (agent goals) + 1. First-touch triage: open queue → open newest P1/P2 → set “In Progress” → add work note template → save. + 2. Assign to resolver group: open ticket → assign group/user → save. + 3. Request info: open ticket → add comment to requester → save. + 4. Resolve/close: set state “Resolved” → choose resolution code → add resolution notes → save. + 5. Post status update: open Teams/Slack → send “Ticket # / status / ETA” message. + 6. Jira mirror (optional): create/update Jira issue for incident tracking. + +Minimal product features beyond current + • Stable selector heuristic: prefer matching by (label text + class + proximity) for common buttons like “Save”, “Assign”, “State”. + • Retry envelope around navigation steps (backstack gets weird in enterprise apps). + • Rate-limit / debounce taps to avoid double-submit. + +Reliability testing + • Synthetic ticket sandbox (ServiceNow dev instance / Jira test project) and replay: + • 50 triage runs/day + • Fail threshold: ≤2% “wrong ticket updated” + • Add permission variance test: user with fewer rights should fail with a clear reason (“cannot see Assign”). + +Privacy/compliance + • Tickets may include sensitive internal info. + • Log only: + • ticket ID hashes + • action trace + • redacted snippets + • Emphasize: no screenshots; on-device execution; fits “least new integration” model. + +Pricing + • Pilot: $3k–$12k depending on ITSM complexity and environment access. + • Ongoing: per-device/month + “workflow pack” (triage pack, resolve pack, comms pack). + +Sales motion + • Target: IT Service Desk managers, SRE leads, IT ops directors. + • Demo: open queue → triage 3 tickets + post Teams update in <90 seconds. + +Sources:  + +⸻ + +Section C: Recommended MVP build list (engineering tasks) + 1. Accessibility-node tap targeting + • Choose node by text/content-desc/class; tap center of its bounds (still using tap(x,y)). + 2. wait_until(...) primitive + • Wait for node match / screen signature; timeout with structured error. + 3. Popup/interrupt handler + • Generic “Close/Not now/Allow” detector + backoff. + 4. Screen signature + state machine-lite + • Hash of key visible node texts to detect “we’re stuck”. + 5. Retry policies + • Per-step retries w/ back/home recovery + max attempts. + 6. Session trace + redaction + • Store action list + minimal UI node text (redacted); export JSON for debugging. + 7. Human-in-the-loop gates + • “Pause for login/2FA” + resume. + 8. Per-customer workflow configs + • YAML/JSON “goal library” with allowed screens, expected buttons, required fields. + 9. Reliability harness + • Batch runner: N runs → success rate, median duration, failure clustering. + +(Foundation references for the “UI tree → action” approach and device interaction patterns: ) + +⸻ + +Section D: What to demo in 5 minutes (scripts) + +Demo 1 — SafetyCulture inspection in 90 seconds + 1. “Here’s the template. Watch the agent run.” + 2. Agent: open SafetyCulture → start inspection → complete 10 items → submit. + 3. Show: run log (steps + timestamps) + “time saved” estimate. + 4. Optional: agent posts “Inspection submitted ✅” to Teams/Slack. + +Demo 2 — CRM lead capture (“from conversation to CRM in 20 seconds”) + 1. You paste a synthetic lead snippet (name/company/email). + 2. 
Agent: HubSpot → create contact → log call note → create follow-up task. + 3. Show: contact exists + task exists; show failure handling if duplicate prompt appears. + +Demo 3 — ITSM triage sprint + 1. Open ServiceNow Agent queue (or Jira). + 2. Agent triages 3 tickets: sets state, adds templated note, assigns group. + 3. Agent posts an update message in Teams/Slack with ticket IDs + status. + +(Apps shown in demos are all verified on Google Play: SafetyCulture iAuditor, HubSpot, Zoho CRM, ServiceNow Agent, Jira Cloud, Teams, Slack, Sheets. ) \ No newline at end of file diff --git a/docs/ideas/idea-dump-2.md b/docs/ideas/idea-dump-2.md new file mode 100644 index 0000000..78d0267 --- /dev/null +++ b/docs/ideas/idea-dump-2.md @@ -0,0 +1,346 @@ +# Comprehensive Report on Android UI Automation Agent Business Use Cases + +This report details business use cases for an advanced Android UI automation agent that leverages the Android Debug Bridge (ADB) and the accessibility tree for control, combined with LLM-based decision-making. + +--- + +## PART 1: USE CASE TABLE + +This table outlines various business use cases for an accessibility-tree-based Android automation agent. + +| Use Case Name | Workflow Description | Industry/Buyer Persona | Apps Involved | Technical Requirements | Pricing/Business Model | ROI/Success Metrics | Stability Score | +| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | +| **Logistics: Invoice Factoring** | The agent receives a photo of a Bill of Lading, uses a scanner app to digitize it, opens a factoring app, fills in the invoice details, uploads the document, and submits it for payment. | **Logistics/Trucking:** Owner-operators, small to large fleets (50+ drivers) [^27] | RTS Pro, OTR Capital, other factoring apps [^27] | Accessibility tree access, ADB control. Designed for apps without APIs. [^27] | **Per Action:** $0.01 per action. [^27] | **Time Saved:** Reduces 10+ minute manual process to ~30 seconds. **Cost Savings:** 15x cheaper than screenshot-based automation ($0.01 vs $0.15/action). **Speed:** <1s latency per action. [^27] | **High** | +| **RPA: Legacy App Data Sync** | An agent automates data extraction from a legacy mobile app (e.g., an old inventory or CRM app) that lacks an API and inputs that data into a modern system like Salesforce or a database. | **Enterprise IT/Operations:** Companies with legacy mobile systems in any industry (e.g., manufacturing, finance) [^22] | Proprietary legacy apps, Dynamics, Salesforce [^22] | Accessibility tree access is critical as APIs are unavailable. Mimics human interaction (clicks, text entry). [^22] | **Per Workflow/User:** Subscription model, similar to RPA platform pricing. | **Efficiency:** Breaks down data silos and automates highly manual, error-prone data entry tasks. **Security:** Reduces risk compared to manual data transfer. [^22] | **High** | +| **Healthcare: Virtual Scribe** | The agent listens to doctor-patient conversations, automatically summarizes key details, and populates the patient's electronic medical record (EMR) in a native mobile app. | **Healthcare:** Hospitals, clinics, private practices. Target buyers are Clinical Informatics Officers, Practice Managers. [^21] | Epic MyChart, Cerner, other EMR/EHR apps [^21][^27] | HIPAA-compliant environment, robust accessibility tree in EMR apps for field identification. | **Per Provider/Month:** Subscription model. | **Time Saved:** Saves doctors hours per day on documentation. 
**Reduced Burnout:** Allows clinicians to focus on patient care, not paperwork. **Fewer Errors:** Reduces manual data entry mistakes. [^21] | **Medium** | +| **QA: Automated Regression Testing** | The agent executes a suite of regression tests on every new build of an Android app, verifying that new updates haven't broken existing functionality across different OS versions and devices. | **Software Development/QA:** Mobile app development companies, corporate IT departments. Buyer is QA Manager/Lead. [^3] | Any native or hybrid Android application. | Integrates with CI/CD tools (Jenkins, Azure DevOps). Uses frameworks like Appium/UiAutomator as an underlying engine. [^3][^4] | **Per User/Month or Per Test Execution:** e.g., BrowserStack is $29/user/month. [^25] | **Faster Releases:** Enables continuous integration and faster feedback loops. **Stability:** Prevents regressions and maintains application quality. Reduces manual testing effort significantly. [^3] | **High** | +| **Finance: Treasury & Reconciliation** | The agent logs into a native mobile banking app, navigates to the transaction history, extracts daily transactions, and reconciles them against internal records or exports them for accounting. | **Finance/Accounting:** SMBs, enterprise finance departments. [^27] | Native mobile banking apps (e.g., Chase, Bank of America, Mercury). | Stable UI for transaction lists. May require OTP workarounds (e.g., notification reading). | **Per Workflow/Account:** Monthly fee for automated reconciliation. | **Time Saved:** Automates a daily, time-consuming manual task. **Accuracy:** Eliminates human error in data transcription. **Visa Case Study:** AI in finance prevented over $40 billion in fraud annually. [^20] | **Medium** | +| **Field Service: Work Order Automation** | A field technician completes a job, and the agent automatically fills out the digital work order, logs parts used from inventory, and generates a field service report on a mobile device. | **Field Service Industries:** HVAC, plumbing, electrical, landscaping. [^2][^21] | SafetyCulture, Sera, FieldEdge, Fieldwork, FSM Grid, Manage360, Service Autopilot. [^12] | Offline capabilities are a plus. Robust forms with clear accessibility labels. | **Per User/Month:** Part of a larger FSM platform fee (e.g., SafetyCulture Premium is $24/seat/month, Sera starts at $399/month). [^12] | **Efficiency:** Digitizes and streamlines reporting from the field. **Productivity:** Allows technicians to move to the next job faster. **Data Accuracy:** Ensures consistent and complete data capture. [^12] | **High** | +| **Gig Economy: Order Acceptance** | The agent monitors multiple gig work apps simultaneously and automatically accepts orders that meet pre-defined criteria (e.g., payout, distance, rating), maximizing driver earnings. | **Gig Economy:** Individual drivers, fleet managers. [^27] | DoorDash, Uber Eats, Instacart, Grubhub. [^27] | Requires monitoring dynamic UIs and fast response times. | **Per User/Month:** Subscription for the optimization service. | **Increased Earnings:** Optimizes acceptance of the most profitable orders. **Reduced Distraction:** Allows drivers to focus on driving safely instead of managing multiple apps. [^27] | **Low** | +| **Warehousing: Inventory Cycle Counts** | A warehouse worker scans a location barcode, and the agent navigates the WMS app to the cycle count screen, enters the scanned quantity, and submits the update, guided by the agent's LLM. 
| **Logistics/Warehousing:** 3PLs, e-commerce fulfillment centers, distribution centers. [^13][^15] | SkuVault Core, Shipedge, NetSuite, Fishbowl, Wasp InventoryCloud, Odoo. [^15] | Integration with barcode scanner hardware. WMS apps must have accessible input fields and buttons. | **Per User/Month or Per Device:** Included in a WMS license. | **Sparex Case Study:** Achieved 95% inventory accuracy. **Efficiency:** Eliminates manual data entry and paper-based counting. **Real-Time Data:** Keeps inventory levels accurate in real-time. [^15][^20] | **High** | +| **Retail: Competitor Price Monitoring** | The agent navigates through competitor e-commerce apps, searches for specific products, extracts pricing and stock information, and aggregates the data for analysis. | **Retail/E-commerce:** Online retailers, marketing and pricing analysts. [^21] | Amazon, Walmart, Target, other e-commerce apps. | Must handle dynamic search results and product pages. | **Per Report/Subscription:** Data-as-a-service model. | **Amazon Example:** Amazon uses AI for daily price adjustments to drive margins and conversions, a process that can be informed by this type of data gathering. [^21] | **Medium** | +| **HR: Candidate Screening** | The agent accesses an Applicant Tracking System (ATS) mobile app, reviews new candidate applications, extracts key information from resumes, and screens them against job requirements. | **Human Resources:** Corporate recruiting teams, staffing agencies. [^20] | Workday, Greenhouse, Lever, other ATS mobile apps. | Must handle various document formats (PDF, DOCX) and parse unstructured text within app views. | **Per Hire/Subscription:** Part of a larger recruitment automation platform. | **Eightfold AI Case Study:** Decreased time-to-hire by 40% (60 to 36 days) and reduced hiring costs by 30%. [^20] | **Medium** | +| **QA: App Permissions Testing** | The agent automates tests that verify an app correctly requests and handles device permissions (camera, location, contacts) and functions as expected when permissions are granted or denied. | **Software Development/QA:** Companies in regulated industries (Healthcare, Finance) handling sensitive data. [^3] | Any app requiring device permissions. | Requires interaction with system-level permission dialogs, which is possible with UI Automator-based tools. [^3] | **Per Test Run:** Part of a comprehensive QA testing suite. | **Compliance:** Ensures adherence to privacy policies (GDPR, CCPA). **Reliability:** Guarantees functionality of features that depend on device integration. [^3] | **High** | +| **Manufacturing: Predictive Maintenance** | A technician on the factory floor uses a handheld device to scan a machine's QR code. The agent opens the maintenance app, pulls up the machine's sensor data, and logs a maintenance request predicted by the system. | **Manufacturing:** Plant managers, maintenance teams. [^21] | Proprietary maintenance apps, CMMS apps. | Requires interaction with QR code scanners and data visualization components. | **Per Asset/Subscription:** Part of an enterprise asset management (EAM) or predictive maintenance platform. | **GE/Siemens Example:** Reduced machine downtime significantly, saving millions in repair costs and lost production. [^21] | **High** | + +--- + +## PART 2: TOP 3 DEEP DIVES + +Based on ROI potential, market validation, and technical feasibility, the following three use cases represent the strongest opportunities. 
+ +--- + +### Use Case 1: Logistics Invoice Factoring for Trucking + +**Executive Summary** + +This use case presents a high-impact opportunity to automate a critical and time-consuming workflow for the trucking industry. By automating invoice submission to factoring companies, the agent can reduce a 10+ minute manual task to under 30 seconds, providing immediate and measurable value to drivers and fleet owners. The `android-action-kernel` project has already validated this specific use case with 5 active pilot programs, demonstrating strong market demand and technical viability [^27]. + +![Android Action Kernel Demo](https://github.com/actionstatelabs/android-action-kernel/raw/main/assets/demo.gif) +*Demonstration of the android-action-kernel automating a workflow on an Android device [^27].* + +**Detailed Workflow Breakdown** + +- **Current Manual Process:** A truck driver completes a delivery and receives a physical Bill of Lading (BoL). To get paid, they must: + 1. Take a clear photo of the BoL. + 2. Email or message the photo to a back-office employee (or do it themselves). + 3. The employee downloads the photo. + 4. Opens a factoring company's mobile app (e.g., RTS Pro). + 5. Manually enters all invoice data: load number, amount, broker, etc. + 6. Uploads the BoL photo. + 7. Submits the invoice for payment. + - *Time/Cost:* This process takes 10-15 minutes of manual work per invoice and is prone to errors [^27]. + +- **Automated Process:** + 1. The driver texts or emails the BoL photo to a designated number/address. + 2. The AI agent receives the image. + 3. **Step 1 (Perception):** The agent opens a scanner app (e.g., Adobe Scan) to digitize the document. + 4. **Step 2 (Reasoning):** An LLM extracts the necessary invoice data from the digitized text. + 5. **Step 3 (Action):** The agent opens the factoring app (RTS Pro, OTR Capital), navigates to the submission form, and types the extracted data into the correct fields using ADB `input` commands based on the accessibility tree. + 6. **Step 4 (Action):** The agent uploads the BoL image and submits the form. + - *Time/Cost:* The entire automated process takes approximately 30 seconds [^27]. The cost per action is around $0.01 [^27]. + +**Market Opportunity** + +- **Target Buyer Personas:** + - Owner-Operator Truck Drivers + - Small to Mid-Sized Trucking Fleets (5-100 trucks) + - Large Enterprise Fleets (100+ trucks) + - Factoring Companies (as a value-add service for their clients) +- **Market Size:** The US trucking industry revenue is over $940 billion [^28](https://www.trucking.org/economics-and-industry-data). The `android-action-kernel` project estimates a $40+ trillion GDP from mobile-first workflows that are currently underserved by automation [^27]. +- **Pain Points Solved:** + - Reduces administrative burden on drivers, allowing them to focus on driving. + - Accelerates cash flow by submitting invoices for payment faster. + - Eliminates costly data entry errors. + - Frees up back-office staff from repetitive tasks. +- **Competition/Alternatives:** Manual entry, hiring back-office staff, or less efficient screenshot/OCR-based automation tools. + +**Technical Implementation** + +- **Accessibility Tree Requirements:** The factoring apps must have well-structured UIs with unique `resource-id` or `content-desc` attributes for input fields and buttons to ensure reliable targeting. +- **Supported Actions Needed:** `tap` (for buttons), `type` (for text fields), `navigate` (back, home), `wait`. 
+- **Android Version Compatibility:** High, as the underlying ADB and Accessibility Service tools are core to Android. Should support Android 7.0 and above to cover the vast majority of devices. +- **API Availability:** This use case thrives because these specialized factoring apps often lack public APIs for direct integration. The agent acts as the integration layer [^22][^27]. +- **Error Handling:** The agent must be able to detect when a UI element is not found, handle app crashes, and manage unexpected pop-ups (e.g., "rate our app" dialogs). +- **OTP/2FA Workarounds:** Most of these apps use standard username/password logins. If 2FA is present, a potential workaround involves reading OTP codes from system notifications. + +**Business Model & Pricing** + +- **Recommended Pricing Strategy:** A hybrid model. + - **Monthly Subscription Fee:** A base fee per user or per truck (e.g., $10-$20/month) for platform access. + - **Per-Action/Per-Invoice Fee:** A usage-based fee (e.g., $0.25 per submitted invoice) to align with value created. +- **Competitive Pricing:** This is significantly cheaper and more efficient than manual labor. Compared to other automation tools, BrowserStack costs $29/user/month for testing, and enterprise tools like TestComplete cost over $2,399/year [^25][^24]. The value proposition is strong. +- **ROI Calculation:** A fleet with 50 trucks submitting 5 invoices per week (1,000 invoices/month) could save over 160 hours of manual labor per month. At a conservative $20/hour, that's **$3,200 in monthly savings**, far exceeding the cost of the service. +- **Time to Value:** Immediate. Customers see time savings on the very first invoice automated. + +**Success Metrics & KPIs** + +- **Quantifiable Metrics:** + - **Time Reduction:** 10+ minutes reduced to ~30 seconds per invoice [^27]. + - **Cost Reduction:** 15x cheaper than screenshot methods ($0.01 vs $0.15 per action) [^27]. + - **Error Rate Reduction:** Target a >99% accuracy rate for data entry, reducing human error. +- **Expected Automation Success Rate:** Aim for >95%. +- **User Adoption:** Track the number of active users, number of invoices processed per month. + +**Validation & Traction** + +- The `android-action-kernel` open-source project has seen explosive growth and validation [^27]: + - **5 active pilot programs** with trucking companies and delivery fleets. + - **3 factoring companies** are exploring partnership opportunities. + - 4.5M+ views on X (Twitter) and 550+ GitHub stars, indicating high developer and industry interest. + +**Risks & Mitigations** + +- **Technical Risks:** + - *UI Changes:* Factoring apps could update their UI, breaking the agent. **Mitigation:** Implement a monitoring system to detect UI changes and have a process for quickly re-training the agent. Use modular test script designs [^26]. + - *App Updates:* Forced app updates could introduce breaking changes. **Mitigation:** Use a cloud device farm to test the agent against beta versions of apps and new OS releases [^26]. +- **Business Risks:** + - *Adoption:* Drivers may be hesitant to trust a new technology. **Mitigation:** Start with pilot programs (as `android-action-kernel` has done) to build trust and gather testimonials. + - *Competition:* Other RPA or AI companies could target this niche. **Mitigation:** First-mover advantage is key. Build a strong brand and deep integration with the trucking ecosystem. 
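+
+The arithmetic behind the ROI estimate above, using only the assumptions stated in the text:
+
+```python
+invoices_per_month = 50 * 5 * 4      # 50 trucks x 5 invoices/week ~= 1,000
+minutes_saved_each = 10              # 10+ min manual vs ~30 s automated
+hours_saved = invoices_per_month * minutes_saved_each / 60
+monthly_savings = hours_saved * 20   # at $20/hour
+# ~167 hours and ~$3,300; the text rounds to "over 160 hours" and "$3,200"
+```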
+ +--- + +### Use Case 2: Healthcare Automated Clinical Documentation (Virtual Scribe) + +**Executive Summary** + +Physician burnout is a critical issue in healthcare, largely driven by the administrative burden of clinical documentation. This use case leverages the agent to act as a "virtual scribe," automating the process of filling out patient records in EMR/EHR mobile apps. This directly addresses a major industry pain point, offering a powerful ROI by freeing up highly-paid clinicians' time to focus on patient care, thereby improving both efficiency and quality of care [^21]. + +**Detailed Workflow Breakdown** + +- **Current Manual Process:** + 1. A physician conducts a patient visit, taking notes by hand or typing intermittently. + 2. Post-visit, the physician spends significant time navigating the EMR mobile app (e.g., Epic MyChart/Haiku, Cerner). + 3. They manually transcribe notes, enter structured data (vitals, diagnoses), place orders, and write summaries. + - *Time/Cost:* This "pajama time" documentation can take up hours of a physician's day, contributing to burnout and reducing patient throughput [^21]. + +- **Automated Process:** + 1. An ambient AI service records and transcribes the doctor-patient conversation (with consent). + 2. An LLM processes the transcript to extract structured clinical information (symptoms, diagnoses, medications, orders). + 3. **Step 1 (Action):** The agent logs into the EMR app on a tablet or phone. + 4. **Step 2 (Action):** It navigates to the correct patient chart and relevant sections (e.g., 'Progress Note', 'Orders'). + 5. **Step 3 (Action):** The agent systematically populates the required fields with the extracted information by identifying UI elements via the accessibility tree and using `type` and `tap` commands. + 6. **Step 4 (Reasoning):** The agent saves the note as a draft for the physician's final review and signature. + - *Time/Cost:* The automated process can complete the initial draft in under a minute, saving the physician 10-20 minutes per patient encounter. + +**Market Opportunity** + +- **Target Buyer Personas:** + - Chief Medical Information Officer (CMIO) in hospitals. + - Practice Managers in private clinics and outpatient centers. + - Heads of clinical departments. +- **Market Size:** The global EHR market is valued at over $29 billion and is foundational to healthcare operations [^29](https://www.grandviewresearch.com/industry-analysis/electronic-health-records-ehr-market). The cost of physician burnout to the U.S. healthcare system is estimated at $4.6 billion annually [^30](https://www.annals.org/aim/article-abstract/2734790/estimating-attributable-cost-physician-burnout-u-s). +- **Pain Points Solved:** + - Reduces physician burnout and administrative workload. + - Increases the amount of time clinicians can spend on direct patient care. + - Improves the accuracy and completeness of medical records. + - Increases patient throughput and practice revenue. +- **Competition/Alternatives:** Human scribes (expensive and have high turnover), desktop-based EMR automation tools, and large-scale AI scribe companies like Nuance Dragon Medical [^21]. The mobile-native agent is a key differentiator. + +**Technical Implementation** + +- **Accessibility Tree Requirements:** EMR apps are complex but typically well-structured. The agent needs reliable access to `resource-id` and labels for hundreds of different fields and buttons. This is a primary challenge. 
+- **Supported Actions Needed:** `tap`, `type`, `swipe` (for scrolling), complex navigation between screens.
+- **Android Version Compatibility:** Must be compatible with enterprise-managed devices, typically running stable, slightly older versions of Android.
+- **API Availability:** EMRs like Epic and Cerner have APIs (e.g., FHIR), but they can be complex, expensive, and may not expose all UI-level functionality. The agent can automate workflows not covered by APIs or for organizations without the resources for deep API integration [^22].
+- **Error Handling:** Must be extremely robust. The agent needs to verify each data entry step, handle timeouts, and gracefully exit and flag for human review if it encounters an unexpected state.
+- **OTP/2FA Workarounds:** EMR systems have stringent security. The agent must integrate with enterprise authentication systems, potentially requiring coordination with IT for service accounts or alternative login methods.
+
+**Business Model & Pricing**
+
+- **Recommended Pricing Strategy:** Per-provider, per-month subscription model.
+  - Example: A tiered model from $199 to $499 per provider/month, based on usage and features.
+- **Competitive Pricing:** This is significantly cheaper than hiring a human scribe, which can cost $20-$25 per hour (or >$40,000/year). Nuance's enterprise solutions are also premium-priced [^21].
+- **ROI Calculation:** If a physician sees 20 patients a day and the agent saves them 10 minutes per patient, that's over 3 hours saved daily. This can be used to see more patients (increasing revenue) or reduce burnout (reducing turnover costs, which can be >$500k per physician).
+- **Time to Value:** Slower than in logistics, since an integration/setup period is required; once running, however, the value is realized daily.
+
+**Success Metrics & KPIs**
+
+- **Quantifiable Metrics:**
+  - **Time Saved:** Reduction in "pajama time" or time spent on documentation per day (target: >2 hours/day).
+  - **Note Accuracy:** Measure the percentage of fields correctly filled by the agent before physician review.
+  - **Click Reduction:** Measure the decrease in manual clicks and keyboard strokes per patient encounter.
+- **Expected Automation Success Rate:** Target >98% for draft completion.
+- **User Adoption:** Track daily active users and provider satisfaction scores (NPS).
+
+**Validation & Traction**
+
+- **Market Validation:** The success of companies like **Nuance Dragon Medical** proves the market exists and is willing to pay for solutions that reduce documentation burden [^21].
+- **Use Case Validation:** The `android-action-kernel` project explicitly lists "extracting patient data from HIPAA-locked mobile portals like Epic MyChart" as a target use case, showing alignment with this market need [^27].
+- **Curogram Research:** Outlined 35 healthcare automation workflows, with clinical documentation and charting being a primary example of a high-impact area for automation [^14].
+
+**Risks & Mitigations**
+
+- **Technical Risks:**
+  - *EMR UI Complexity:* These apps are vast and can vary between institutions. **Mitigation:** Focus on a single EMR (like Epic or Cerner) initially. Develop a robust element identification system that can handle minor UI variations.
+  - *Data Privacy (HIPAA):* Handling Protected Health Information (PHI) is a major risk. **Mitigation:** The agent must run in a secure, HIPAA-compliant environment (on-premise or secure cloud). All data must be encrypted, and strict access controls are mandatory.
+- **Business Risks:** + - *Sales Cycle:* Selling to hospitals is notoriously slow and complex. **Mitigation:** Target smaller private practices first to build case studies before approaching large hospital systems. + - *Physician Trust:* Clinicians may not trust an AI to handle patient data correctly. **Mitigation:** Implement a "human-in-the-loop" model where the agent only prepares drafts for physician review, ensuring final control always rests with the clinician. + +--- + +### Use Case 3: RPA for Legacy Mobile App Data Integration + +**Executive Summary** + +This use case targets the widespread enterprise problem of "data silos" trapped within legacy mobile applications that lack modern APIs. The agent acts as a universal adapter, mimicking human user actions to extract, input, or synchronize data, thereby integrating these older systems into modern workflows. This provides a lifeline for companies that cannot afford to replace or rebuild critical but outdated mobile applications, offering a fast and cost-effective solution to a persistent operational bottleneck [^22]. + +**Detailed Workflow Breakdown** + +- **Current Manual Process:** + 1. An employee (e.g., in a warehouse, on a sales route) uses a legacy handheld Android device to perform a task (e.g., log a sale, update inventory). + 2. The data is now stored only within that legacy app's siloed database. + 3. At the end of the day, another employee must manually run a report or re-enter the data from the legacy system into a modern ERP, CRM, or BI tool. + - *Time/Cost:* This is a slow, repetitive, and extremely error-prone process that delays access to critical business data. + +- **Automated Process:** + 1. The agent runs on a schedule (e.g., every 15 minutes) or is triggered by an event. + 2. **Step 1 (Action):** The agent opens the legacy mobile app. + 3. **Step 2 (Action/Reasoning):** It navigates through the application's menus, which may be non-standard and require the LLM to interpret based on screen text. + 4. **Step 3 (Perception):** The agent "reads" the data from the screen by parsing the accessibility tree. + 5. **Step 4 (Action):** The agent switches to a modern app (e.g., Salesforce, NetSuite) or opens a web browser to an API endpoint and inputs the extracted data. + - *Time/Cost:* The automated process runs in seconds and eliminates manual labor, providing near real-time data synchronization. + +**Market Opportunity** + +- **Target Buyer Personas:** + - IT Directors and CIOs in mid-to-large enterprises. + - Operations Managers in industries like manufacturing, logistics, and wholesale distribution. + - Business Process Automation (BPA) teams. +- **Market Size:** The global Robotic Process Automation (RPA) market is projected to reach over $13 billion by 2026, and this use case directly taps into that market by extending RPA capabilities to mobile [^31](https://www.marketsandmarkets.com/Market-Reports/robotic-process-automation-market-238229230.html). Any company that deployed custom mobile apps 5-10 years ago is a potential customer. +- **Pain Points Solved:** + - Unlocks data from legacy systems without costly development work. + - Automates manual, repetitive data entry, reducing labor costs. + - Improves data accuracy by eliminating human error. + - Enables real-time data analysis and decision-making. +- **Competition/Alternatives:** Rewriting the legacy app (very expensive), traditional RPA vendors (most lack native mobile UI automation, like Automation Anywhere [^17]), or continuing with manual processes. 
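+
+Before the implementation details below, a minimal sketch of the scheduled trigger described in the automated process above, reusing this repo's `run_agent` loop (the goal text and interval are illustrative, not a tested integration):
+
+```python
+import time
+from kernel import run_agent
+
+SYNC_GOAL = (
+    "Open the legacy inventory app, read today's stock counts, "
+    "then open the ERP app and enter the same counts."
+)
+
+while True:
+    completed = run_agent(SYNC_GOAL, max_steps=30)  # True once the model reports "done"
+    if not completed:
+        print("Sync run did not finish; flag for human review")
+    time.sleep(15 * 60)  # re-run every 15 minutes
+```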
+ +**Technical Implementation** + +- **Accessibility Tree Requirements:** The key is that even "bad" legacy apps often have some accessibility structure. The LLM's reasoning is crucial for interpreting poorly labeled elements. The agent must be resilient to non-standard UI components. +- **Supported Actions Needed:** `tap`, `type`, `long_press`, `swipe`, and the ability to read text from any element. +- **Android Version Compatibility:** Must support a wide range of older Android versions (e.g., Android 5.0+) often found on legacy hardware. +- **API Availability:** The entire premise is the lack of APIs. The agent *is* the API [^22]. +- **Error Handling:** The agent needs to be able to handle unresponsive apps, unexpected error messages, and situations where the UI state is not what it expects. It should log these events and alert a human operator. +- **OTP/2FA Workarounds:** Less common on older internal apps, but if present, would require a solution like notification reading or integration with the company's identity provider. + +**Business Model & Pricing** + +- **Recommended Pricing Strategy:** A classic RPA model: **Per-Bot/Per-Process Subscription**. + - Customers pay a monthly fee for each automated process (e.g., "$500/month for the 'Inventory Sync' bot"). This is a familiar model for enterprise buyers. +- **Competitive Pricing:** Traditional RPA licenses from vendors like UiPath can be expensive. A mobile-first RPA solution can be priced competitively to win this niche. +- **ROI Calculation:** **Prodigy's strategy highlights the key benefits [^22].** If a company has 5 employees spending 2 hours a day on manual data entry from a legacy app, that's 50 hours/week. At $25/hour, that's **$1,250 per week in labor costs saved**, making a $500/month bot subscription an easy decision. The **Sparex case study** showed how integrating siloed data led to **$5 million in annual savings** [^20]. +- **Time to Value:** Can be very fast. A proof-of-concept for a single process can be built in days or weeks, demonstrating value immediately. + +**Success Metrics & KPIs** + +- **Quantifiable Metrics:** + - **Hours Saved:** Total hours of manual work eliminated per month. + - **Error Rate Reduction:** Compare the error rate of automated entries vs. human entries. + - **Data Latency:** Reduction in time from data creation in the legacy app to availability in the modern system (e.g., from 24 hours to 15 minutes). +- **Expected Automation Success Rate:** Target >99% for structured, repetitive tasks. +- **User Adoption:** Number of processes automated, number of departments using the service. + +**Validation & Traction** + +- **Market Validation:** The entire RPA industry is built on this premise for desktop applications. **Prodigy's article "No API – No Problem"** is direct validation that this is a critical strategy and business need [^22]. +- **UiPath's mobile capabilities** show that leading RPA vendors recognize the need to extend automation to mobile, though their approach relies on Appium and may be more complex than a dedicated ADB/Accessibility agent [^10][^11]. +- **Automation Anywhere's LACK of native mobile support** highlights a significant market gap that this agent can fill [^17]. + +**Risks & Mitigations** + +- **Technical Risks:** + - *Brittle Automation:* Legacy apps can be unstable or have inconsistent UIs. **Mitigation:** Leverage the LLM's reasoning power to make the agent more adaptable. If element ID is missing, it can fall back to using text labels or positional data. 
Implement robust retry logic. + - *Device Management:* Managing a fleet of physical or emulated devices can be complex. **Mitigation:** Partner with a cloud device farm (e.g., BrowserStack, AWS Device Farm) to handle hardware and OS management, as suggested in the `android-action-kernel` roadmap [^27][^26]. +- **Business Risks:** + - *Perception as "Screen Scraping":* Some IT departments are wary of non-API automation. **Mitigation:** Position the agent as a secure, reliable "RPA for Mobile" solution. Provide detailed audit logs and security documentation. Emphasize that it's a bridge to modernization, not a hack. + +--- + +## PART 3: KEY INSIGHTS & RECOMMENDATIONS + +### Market Landscape Summary + +The Android automation market is mature in the QA/Testing space but nascent for business process automation via UI interaction. + +- **QA/Testing Tools:** The market is dominated by open-source frameworks like **Appium** (the de-facto standard) and Google's **UiAutomator** and **Espresso** [^2][^24]. Commercial offerings are primarily cloud-based device farms and management platforms like **BrowserStack** and **HeadSpin**, or all-in-one testing suites like **TestComplete** and **Katalon Studio**. +- **RPA Vendors:** Major RPA players are primarily desktop-focused. **UiPath** has extended its platform to Android by integrating Appium, requiring a complex setup of multiple third-party tools (Appium, JDK, Android SDK, NodeJS) [^10]. Crucially, **Automation Anywhere**, another market leader, offers **no native mobile automation**, relying on fragile emulator-based OCR and coordinate clicking, which highlights a major gap for a more robust solution [^17]. +- **Pricing Benchmarks:** The market has several tiers: + - **Free & Open Source:** Appium, Selenium, UiAutomator [^24]. + - **Per-User Subscription (Cloud Testing):** BrowserStack starts at **$29/user/month** [^25]. + - **Per-User Subscription (Enterprise Suites):** TestComplete starts at **$2,399/user/year**; UFT starts at **$3,200/year** [^24]. + - **Usage-Based (AI Agents):** The `android-action-kernel` project is modeling a **$0.01/action** cost [^27]. + +### Accessibility Tree Advantage + +The agent's core technical approach—using the accessibility tree instead of screenshot-based methods—is its single greatest competitive advantage. + +- **Cost & Speed:** The `android-action-kernel` project provides hard data on this advantage [^27]: + - **Cost:** **$0.01 per action**, which is **15 times cheaper** than screenshot/vision-based models priced at ~$0.15 per action. + - **Speed:** **<1 second latency** per action, which is **3-5 times faster** than the 3-5 second latency of vision models that must capture, upload, and process an image. +- **Technical Advantages:** + - **Reliability:** The accessibility tree provides a structured, semantic understanding of the UI. This is far more robust than OCR, which can fail with different fonts, colors, or resolutions. + - **Accuracy:** It can read text content directly and identify UI element types (e.g., button, input field) without guessing. + - **Data Consumption:** Transmitting a small XML/JSON file is significantly more efficient than streaming video or uploading large screenshot images. +- **Business Implications:** The dramatic reduction in cost and latency unlocks real-time, high-volume business automation use cases (like gig economy order acceptance) that would be economically or technically infeasible with slower, more expensive vision-based agents. + +### Go-to-Market Recommendations + +1. 
**Prioritized Industries:** + - **1. Logistics & Trucking:** Start here. There is clear, documented demand and market validation from the `android-action-kernel` project's pilot programs. The pain points are acute, and the ROI is easy to demonstrate. + - **2. Field Services & Warehousing:** These industries rely heavily on ruggedized Android devices and specialized apps for core operations. The UIs are typically functional and stable, making them ideal for automation. + - **3. Healthcare:** A massive opportunity, but with a longer sales cycle and higher compliance hurdles (HIPAA). Enter this market after establishing success in logistics to build credibility. + +2. **Recommended Pricing Strategy:** + - Adopt a **hybrid subscription + usage-based model**. A base platform fee provides predictable revenue, while a per-action or per-workflow fee directly ties customer costs to the value they receive. This offers an accessible entry point for SMBs while scaling for enterprise usage. + - Price aggressively based on the 15x cost advantage. A price point of **$0.02 per action** would still be significantly cheaper than vision-based competitors while providing a healthy margin. + +3. **Key Differentiators to Emphasize:** + - **Speed and Cost:** Lead with the "15x cheaper, 5x faster" message. + - **"The RPA for Mobile":** Position the agent as the missing piece in the automation landscape, filling the gap left by desktop-centric vendors like Automation Anywhere. + - **API-Free Integration:** Market the agent as a powerful tool to "unlock data from any Android app, no API needed," directly addressing the legacy system problem. + - **Reliability:** Emphasize the robustness of accessibility-tree-based automation compared to brittle screen scraping and OCR. + +--- + +### Appendix: Tool & Vendor Reference + +| Tool/Vendor | Key Feature | Pricing Model | URL | +| :--- | :--- | :--- | :--- | +| **Appium** | Open-source, cross-platform mobile automation framework. | Free | [http://appium.io/](http://appium.io/) [^24] | +| **UiAutomator** | Google's native Android UI testing framework. | Free | Included in Android SDK [^25] | +| **TestComplete** | All-in-one QA suite with AI object recognition. | Commercial (starts ~$2,399/user/year) | [https://smartbear.com/product/testcomplete/](https://smartbear.com/product/testcomplete/) [^24] | +| **BrowserStack** | Cloud platform with 3000+ real Android/iOS devices for testing. | Subscription (starts $29/user/month) | [https://www.browserstack.com/](https://www.browserstack.com/) [^25] | +| **UiPath** | Enterprise RPA platform with Appium-based mobile support. | Commercial (Enterprise licensing) | [https://www.uipath.com/](https://www.uipath.com/) [^10] | +| **Automation Anywhere** | Enterprise RPA platform. **Does not** support native mobile UI automation. | Commercial (Enterprise licensing) | [https://www.automationanywhere.com/](https://www.automationanywhere.com/) [^17] | +| **android-action-kernel** | Open-source AI agent using ADB + Accessibility Tree. | Free (Open Source) / Models a $0.01/action cost. | [https://github.com/actionstatelabs/android-action-kernel](https://github.com/actionstatelabs/android-action-kernel) [^27] | +| **Deque Axe DevTools** | Enterprise accessibility testing tool for mobile apps. | Commercial (Demo-based) | [https://www.deque.com/axe/devtools/mobile-accessibility/](https://www.deque.com/axe/devtools/mobile-accessibility/) [^23] | +| **Katalon Studio** | Free, comprehensive automation tool for Web, API, and Mobile. 
| Free / Enterprise tiers | [https://www.katalon.com/](https://www.katalon.com/) [^24] | +| **SafetyCulture** | Field service platform with mobile forms and workflow automation. | Subscription (starts $24/seat/month) | [https://safetyculture.com/](https://safetyculture.com/) [^12] | + +--- +### How this report was produced + +This report was compiled by synthesizing data from a multi-step, comprehensive research process. The process involved executing targeted web searches to gather information on the Android automation market, specific industry use cases (logistics, healthcare, field service), RPA vendor capabilities, technical frameworks, pricing models, and real-world ROI data. Information was extracted from dozens of sources, including vendor websites, technical documentation, market analyses, case studies, and open-source project repositories. All key data points, metrics, and claims have been cited with their original URL source to ensure accuracy and traceability. The final report was structured to directly address the user's request, organizing the compiled data into a use case table, in-depth analyses of top opportunities, and strategic market recommendations. \ No newline at end of file diff --git a/docs/ideas/idea-dump-3.md b/docs/ideas/idea-dump-3.md new file mode 100644 index 0000000..dd75dae --- /dev/null +++ b/docs/ideas/idea-dump-3.md @@ -0,0 +1,339 @@ +Below is a junior-engineer executable implementation guide to integrate Wispr Flow dictation into your Safety/EHS inspection autopilot (SafetyCulture / iAuditor), using ADB + accessibility tree for UI control, and Wispr Flow API for high-quality “spoken notes → clean text → type into focused field”. + +⸻ + +0) What we’re building + +Add a “Dictate Note” capability to your agent: + 1. Agent navigates SafetyCulture inspection UI (already works) + 2. When it reaches a free-text field (e.g., “Describe hazard”, “Corrective actions”, “Additional notes”), it: + • records audio (host mic or device mic—pick one) + • sends audio to Wispr Flow + • receives cleaned text + • uses existing type(text) action to enter it + +Wispr Flow supports: + • REST transcription endpoint (slower) /api + • WebSocket streaming endpoint (recommended, lower latency) + • Context fields (dictionary words, textbox contents, page content) to improve results + • Audio must be 16kHz, mono, 16-bit PCM WAV, base64-encoded; max 25MB / 6 min per request. +Sources:  + +⸻ + +1) Choose the integration mode (do this) + +Option A — REST (fastest to ship, easiest) + +Use: + • POST https://platform-api.wisprflow.ai/api/v1/dash/api with org API key auth +Sources:  + +This is the best “2–4 week pilot” choice. + +Option B — WebSocket (lower latency, more moving parts) + +Use: + • wss://platform-api.wisprflow.ai/api/v1/dash/ws?api_key=Bearer%20 + • Send messages: auth, then append packets, then commit +Sources:  + +⸻ + +2) Architecture decision (recommended for your current setup) + +Since your agent already runs from a host controlling Android via ADB, do: + +Host-recorded audio (laptop mic / USB mic) → Wispr Flow → adb input/your type(text). + +It avoids Android audio permissions + recording UI flows. + +If you must record on-device later, keep the Wispr client interface stable and swap the recorder. 
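+
+A minimal sketch of that stable seam (the Protocol and class names below are illustrative, not from the Wispr docs): define a recorder interface so the host-mic implementation from 3.2 can later be swapped for an on-device recorder without touching any Wispr Flow call sites.
+
+# recorder.py (illustrative)
+from typing import Protocol
+
+class AudioRecorder(Protocol):
+    def record_wav_16k_mono(self, seconds: float) -> bytes:
+        """Return WAV bytes: 16kHz, mono, 16-bit PCM."""
+        ...
+
+class HostMicRecorder:
+    """Host-mic capture; wraps record_wav_16k_mono from audio_capture.py (3.2)."""
+    def record_wav_16k_mono(self, seconds: float) -> bytes:
+        from audio_capture import record_wav_16k_mono
+        return record_wav_16k_mono(seconds)
+
+# Later: an OnDeviceRecorder with the same method signature can be swapped in
+# without changing the dictation glue code.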
+
+⸻
+
+3) Implementation steps (REST path) — do these in order
+
+3.1 Create module: wisprflow_client.py
+
+API contract (from docs)
+ • Endpoint: POST /api
+ • Body:
+ • audio: base64 encoded 16kHz wav
+ • language: optional list of ISO codes
+ • context: optional object (app info, dictionary words, textbox contents, content_text, etc.)
+ • Response includes text plus metadata (detected language, time, tokens)
+Sources:
+
+Code (Python)
+
+# wisprflow_client.py
+from __future__ import annotations
+
+import base64
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import requests
+
+WISPR_BASE = "https://platform-api.wisprflow.ai/api/v1/dash"
+
+@dataclass(frozen=True)
+class WisprResult:
+    id: str
+    text: str
+    detected_language: str | None
+    total_time_ms: int | None
+    generated_tokens: int | None
+
+class WisprFlowClient:
+    def __init__(self, api_key: str, timeout_s: int = 60):
+        # api_key should already include the Bearer prefix per docs examples
+        # e.g. "Bearer fl-xxxxxx"
+        self.api_key = api_key
+        self.timeout_s = timeout_s
+        self.session = requests.Session()
+        self.session.headers.update({"Authorization": api_key})
+
+    def warmup(self) -> None:
+        # GET /warmup_dash
+        url = f"{WISPR_BASE}/warmup_dash"
+        r = self.session.get(url, timeout=self.timeout_s)
+        r.raise_for_status()
+
+    def transcribe_rest(
+        self,
+        wav_bytes: bytes,
+        language: Optional[List[str]] = None,
+        context: Optional[Dict[str, Any]] = None,
+    ) -> WisprResult:
+        """
+        wav_bytes must be 16kHz, mono, 16-bit PCM WAV.
+        """
+        url = f"{WISPR_BASE}/api"
+        payload: Dict[str, Any] = {
+            "audio": base64.b64encode(wav_bytes).decode("ascii"),
+        }
+        if language:
+            payload["language"] = language
+        if context:
+            payload["context"] = context
+
+        r = self.session.post(url, json=payload, timeout=self.timeout_s)
+        r.raise_for_status()
+        data = r.json()
+        return WisprResult(
+            id=data.get("id", ""),
+            text=data.get("text", ""),
+            detected_language=data.get("detected_language"),
+            total_time_ms=data.get("total_time"),
+            generated_tokens=data.get("generated_tokens"),
+        )
+
+3.2 Create recorder: audio_capture.py (host mic)
+
+You need to produce 16kHz mono int16 WAV. Use sounddevice + wave.
+
+# audio_capture.py
+from __future__ import annotations
+
+import io
+import wave
+
+import sounddevice as sd  # NB: sd.rec returns NumPy arrays, so numpy must be installed
+
+def record_wav_16k_mono(seconds: float, sample_rate: int = 16000) -> bytes:
+    """
+    Records from default input device.
+    Returns WAV bytes: PCM 16-bit, mono, 16kHz.
+    """
+    frames = int(seconds * sample_rate)
+    audio = sd.rec(frames, samplerate=sample_rate, channels=1, dtype="int16")
+    sd.wait()
+
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)  # int16
+        wf.setframerate(sample_rate)
+        wf.writeframes(audio.tobytes())
+    return buf.getvalue()
+
+Install deps:
+
+pip install requests sounddevice numpy
+
+3.3 Build “context builder” from your accessibility tree
+
+Wispr Flow supports context fields like:
+ • app: { name, type }
+ • dictionary_context: []
+ • textbox_contents: { before_text, selected_text, after_text }
+ • content_text (preferred vs screenshot for efficiency)
+ • screenshot exists but you’re avoiding screenshots (good)
+Sources:
+
+For SafetyCulture inspections, you should pass:
+ • app.type = "other"
+ • app.name = "SafetyCulture"
+ • content_text = a pipe-joined string such as "<section name> | <question text>" (whatever you can reliably get)
+ • dictionary_context = [site names, asset IDs, common hazard words, employee names]
+ • textbox_contents from the currently focused input if accessible
+
+Example:
+
+def build_wispr_context(
+    question_text: str,
+    section_name: str | None,
+    existing_text: str | None,
+    dictionary: list[str],
+) -> dict:
+    content_parts = [p for p in [section_name, question_text] if p]
+    content_text = " | ".join(content_parts)
+
+    return {
+        "app": {"name": "SafetyCulture", "type": "other"},
+        "dictionary_context": dictionary,
+        "textbox_contents": {
+            "before_text": existing_text or "",
+            "selected_text": "",
+            "after_text": "",
+        },
+        "content_text": content_text,
+        # Do NOT send screenshot (you don’t use screenshots; keep it null/omit)
+    }
+
+3.4 Add new agent tool: dictate_and_type(...)
+
+This is the glue: focus field → record → transcribe → sanitize → type.
+
+import re
+
+def sanitize_for_adb(text: str) -> str:
+    # ADB input is fragile with some characters depending on your implementation.
+    # Keep it conservative for pilots.
+    text = text.strip()
+    text = re.sub(r"\s+", " ", text)
+    return text
+
+def dictate_and_type(
+    wispr: WisprFlowClient,
+    seconds: float,
+    question_text: str,
+    section_name: str | None,
+    existing_text: str | None,
+    dictionary: list[str],
+    type_fn,  # your existing type(text) action
+    language: list[str] | None = None,
+) -> str:
+    wav_bytes = record_wav_16k_mono(seconds)
+    ctx = build_wispr_context(question_text, section_name, existing_text, dictionary)
+    result = wispr.transcribe_rest(wav_bytes, language=language, context=ctx)
+    final_text = sanitize_for_adb(result.text)
+    if final_text:
+        type_fn(final_text)
+    return final_text
+
+3.5 Warmup call (reduces latency)
+
+Before dictation-heavy sessions (start of inspection):
+ • GET https://platform-api.wisprflow.ai/api/v1/dash/warmup_dash
+Sources:
+
+Call once when your agent boots or right before the first transcription:
+
+wispr.warmup()
+
+
+⸻
+
+4) Where to hook this into SafetyCulture workflows
+
+You want stable, repeatable trigger points:
+ • When accessibility tree indicates a focused EditText (or text input node)
+ • And the nearby labels match “Describe…”, “Notes”, “Corrective action”, etc.
+
+Pilot approach:
+ • Maintain a small allowlist of question labels that use dictation (10–30 items).
+ • Everything else stays typed/autofilled.
+
+⸻
+
+5) WebSocket plan (optional upgrade after REST works)
+
+If you upgrade to WebSockets, flow is:
+ 1. Open socket (API-key endpoint):
+wss://platform-api.wisprflow.ai/api/v1/dash/ws?api_key=Bearer%20
+ 2. Send first message auth with optional language + context
+ 3. Send repeated append with audio_packets (packets list, volumes list, packet_duration, etc.)
+ 4. Send commit { total_packets: n }
+ 5. Receive status: "text" responses, partial + final
+Sources:
+
+You can also reduce size using MessagePack binary mode (Encoding: 'msgpack' and byte_encoding: "binary") if needed.
+
+⸻
+
+6) Auth + key handling (pilot-safe)
+
+Docs describe:
+ • API-key auth (org key) and also client-side auth via generated JWT tokens
+Sources:
+
+For pilots:
+ • Keep org API key only on the host running the agent.
+ • Put it in env vars:
+ • WISPRFLOW_API_KEY="Bearer fl-xxxxx"
+
+If you later ship to customer devices directly:
+ • Build a tiny token service using POST /generate_access_token and use /client_api or client_ws.
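+
+A minimal host-side loader for that env var (a sketch; assumes the key is exported as shown above):
+
+import os
+
+def load_wispr_key() -> str:
+    key = os.environ.get("WISPRFLOW_API_KEY", "")
+    if not key:
+        raise RuntimeError("Set WISPRFLOW_API_KEY, e.g. 'Bearer fl-xxxxx'")
+    if not key.startswith("Bearer "):
+        key = f"Bearer {key}"  # docs examples include the Bearer prefix
+    return key
+
+wispr = WisprFlowClient(api_key=load_wispr_key())
+wispr.warmup()  # optional: warm up at session start (3.5)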
+ +⸻ + +7) Reliability testing plan (what the junior engineer should implement) + +Unit tests + • Audio output is WAV, 16kHz, mono, 16-bit PCM (read the WAV header) + • Context builder produces correct structure (no screenshots) + +Integration tests (Wispr Flow) + • Use 5–10 pre-recorded WAV fixtures (stored in repo) and assert non-empty text + • Ensure request stays under size/time limits (25MB / 6 min) +Sources:  + +E2E tests (SafetyCulture) + • Run 20 inspections with dictation on the same 5 text fields + • Success criteria: + • ≥90% of dictations produce non-empty text + • ≥90% of runs type into the correct field (no misfocus) + • Median dictation latency < 3s (REST) or < 1s (WS), measured from “stop recording” to “typed” (your mileage will vary) + +⸻ + +8) Privacy / compliance defaults (pilot-ready) + +Even without screenshots, you’re sending: + • Audio (may include PII) + • content_text / question prompts (might include site names) + • potentially existing field text (PII) + +Defaults: + • Redact logs: don’t persist raw audio; don’t log full transcripts in prod mode + • Store only: request id, timings, success/fail, and a short hash of transcript + • Make dictionary_context customer-controlled (no secrets) + +(Also: Wispr Flow’s docs explicitly support providing content_text as a more efficient alternative to screenshot for context.)  + +⸻ + +9) Drop-in docs file (put this in your repo) + +Create: docs/features/wisprflow-dictation.md with: + • Purpose + • REST endpoint + payload example + • “dictate_and_type” workflow + • Config knobs: dictation allowlist, languages, dictionary_context, redaction mode + • Test plan + success metrics + diff --git a/kernel.py b/kernel.py index f827897..a2f152b 100644 --- a/kernel.py +++ b/kernel.py @@ -2,23 +2,57 @@ import time import subprocess import json -from typing import Dict, Any +from typing import Dict, Any, List, Optional, Tuple +from dotenv import load_dotenv from openai import OpenAI import sanitizer +# Load environment variables from .env file +load_dotenv() + # --- CONFIGURATION --- ADB_PATH = "adb" # Ensure adb is in your PATH -MODEL = "gpt-4o" # Or "gpt-4-turbo" for faster/cheaper execution SCREEN_DUMP_PATH = "/sdcard/window_dump.xml" LOCAL_DUMP_PATH = "window_dump.xml" -client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY")) +# --- LLM PROVIDER CONFIGURATION --- +# Default: OpenRouter with openai/gpt-4o +# Override with LLM_PROVIDER=openai and OPENAI_API_KEY for direct OpenAI access +LLM_PROVIDER = os.environ.get("LLM_PROVIDER", "openrouter").lower() + +DEFAULT_MODELS = { + "openrouter": "openai/gpt-4o", + "openai": "gpt-4o", +} + +def get_llm_client_and_model() -> Tuple[OpenAI, str]: + """Returns the appropriate OpenAI-compatible client and model based on LLM_PROVIDER.""" + model = os.environ.get("LLM_MODEL", DEFAULT_MODELS.get(LLM_PROVIDER, "openai/gpt-4o")) + + if LLM_PROVIDER == "openai": + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY is required when LLM_PROVIDER=openai") + client = OpenAI(api_key=api_key) + else: # Default: openrouter + api_key = os.environ.get("OPENROUTER_API_KEY") + if not api_key: + raise ValueError("OPENROUTER_API_KEY is required (default provider is OpenRouter)") + client = OpenAI( + api_key=api_key, + base_url="https://openrouter.ai/api/v1", + ) + + return client, model + +client, MODEL = get_llm_client_and_model() -def run_adb_command(command: List[str]): +def run_adb_command(command: List[str]) -> str: """Executes a shell command via ADB.""" result = 
subprocess.run([ADB_PATH] + command, capture_output=True, text=True) - if result.stderr and "error" in result.stderr.lower(): - print(f"❌ ADB Error: {result.stderr.strip()}") + if result.returncode != 0: + error_msg = result.stderr.strip() or result.stdout.strip() + print(f"❌ ADB Error (code {result.returncode}): {error_msg}") return result.stdout.strip() def get_screen_state() -> str: @@ -39,12 +73,56 @@ def get_screen_state() -> str: elements = sanitizer.get_interactive_elements(xml_content) return json.dumps(elements, indent=2) -def execute_action(action: Dict[str, Any]): - """Executes the action decided by the LLM.""" +class GoalAchieved(Exception): + """Raised when the agent completes its goal.""" + pass + + +def validate_action(action: Dict[str, Any]) -> Optional[str]: + """Validates action schema. Returns error message if invalid, None if valid.""" + if not isinstance(action, dict): + return "Action must be a dictionary" + + act_type = action.get("action") + valid_actions = {"tap", "type", "home", "back", "wait", "done", "enter", "clear", "launch", "scroll"} + + if act_type not in valid_actions: + return f"Unknown action '{act_type}'. Must be one of: {valid_actions}" + + if act_type == "tap": + coords = action.get("coordinates") + if not isinstance(coords, (list, tuple)) or len(coords) != 2: + return "'tap' action requires 'coordinates' as [x, y]" + try: + int(coords[0]), int(coords[1]) + except (TypeError, ValueError): + return "'tap' coordinates must be integers" + + if act_type == "type": + text = action.get("text") + if not isinstance(text, str) or not text: + return "'type' action requires non-empty 'text' string" + + if act_type == "launch": + package = action.get("package") + if not isinstance(package, str) or not package: + return "'launch' action requires non-empty 'package' string" + + if act_type == "scroll": + direction = action.get("direction", "down") + if direction not in {"up", "down", "left", "right"}: + return "'scroll' direction must be one of: up, down, left, right" + + return None + + +def execute_action(action: Dict[str, Any]) -> bool: + """Executes the action decided by the LLM. 
Returns True if goal achieved."""
     act_type = action.get("action")
 
     if act_type == "tap":
-        x, y = action.get("coordinates")
+        coords = action.get("coordinates")
+        x, y = int(coords[0]), int(coords[1])
         print(f"👉 Tapping: ({x}, {y})")
         run_adb_command(["shell", "input", "tap", str(x), str(y)])
 
@@ -55,56 +133,132 @@ def execute_action(action: Dict[str, Any]):
 
     elif act_type == "home":
         print("🏠 Going Home")
-        run_adb_command(["shell", "input", "keyevent", "KEYWORDS_HOME"])
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_HOME"])
 
     elif act_type == "back":
         print("🔙 Going Back")
-        run_adb_command(["shell", "input", "keyevent", "KEYWORDS_BACK"])
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_BACK"])
 
     elif act_type == "wait":
        print("⏳ Waiting...")
        time.sleep(2)
 
+    elif act_type == "enter":
+        print("⏎ Pressing Enter")
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_ENTER"])
+
+    elif act_type == "clear":
+        print("🗑️ Clearing text field")
+        # Strategy 1: move the cursor to the end of the field, then long-press delete
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_MOVE_END"])  # Move to end
+        run_adb_command(["shell", "input", "keyevent", "--longpress", "KEYCODE_DEL"])  # Long-press delete
+        # Strategy 2 (fallback): select all (Ctrl+A), then delete; note that
+        # 'input keycombination' is only available on newer Android builds
+        run_adb_command(["shell", "input", "keycombination", "KEYCODE_CTRL_LEFT", "KEYCODE_A"])  # Select all
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_DEL"])  # Delete selection
+
+    elif act_type == "launch":
+        package = action.get("package")
+        print(f"🚀 Launching app: {package}")
+        run_adb_command(["shell", "monkey", "-p", package, "-c", "android.intent.category.LAUNCHER", "1"])
+
+    elif act_type == "scroll":
+        direction = action.get("direction", "down")
+        print(f"📜 Scrolling {direction}")
+        # Swipe through the approximate screen center
+        # (defaults assume a common 1080-wide phone resolution)
+        cx, cy = 540, 1200
+        distance = 500
+        if direction == "down":
+            run_adb_command(["shell", "input", "swipe", str(cx), str(cy), str(cx), str(cy - distance), "300"])
+        elif direction == "up":
+            run_adb_command(["shell", "input", "swipe", str(cx), str(cy - distance), str(cx), str(cy), "300"])
+        elif direction == "left":
+            run_adb_command(["shell", "input", "swipe", str(cx), str(cy), str(cx - distance), str(cy), "300"])
+        elif direction == "right":
+            run_adb_command(["shell", "input", "swipe", str(cx - distance), str(cy), str(cx), str(cy), "300"])
+
     elif act_type == "done":
         print("✅ Goal Achieved.")
-        exit(0)
+        return True
+
+    return False
 
-def get_llm_decision(goal: str, screen_context: str) -> Dict[str, Any]:
+def get_llm_decision(goal: str, screen_context: str, action_history: List[Dict[str, Any]], retry_count: int = 0) -> Dict[str, Any]:
     """Sends screen context to LLM and asks for the next move."""
-    system_prompt = """
-    You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the UI.
-    
-    You will receive:
-    1. The User's Goal.
-    2. A list of interactive UI elements (JSON) with their (x,y) center coordinates.
-    
-    You must output ONLY a valid JSON object with your next action. 
- - Available Actions: - - {"action": "tap", "coordinates": [x, y], "reason": "Why you are tapping"} - - {"action": "type", "text": "Hello World", "reason": "Why you are typing"} - - {"action": "home", "reason": "Go to home screen"} - - {"action": "back", "reason": "Go back"} - - {"action": "wait", "reason": "Wait for loading"} - - {"action": "done", "reason": "Task complete"} - - Example Output: - {"action": "tap", "coordinates": [540, 1200], "reason": "Clicking the 'Connect' button"} - """ - - response = client.chat.completions.create( - model=MODEL, - response_format={"type": "json_object"}, - messages=[ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": f"GOAL: {goal}\n\nSCREEN_CONTEXT:\n{screen_context}"} - ] - ) - - return json.loads(response.choices[0].message.content) - -def run_agent(goal: str, max_steps=10): + system_prompt = """You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the UI. + +You will receive: +1. The User's Goal. +2. A list of interactive UI elements (JSON) with their (x,y) center coordinates. +3. Your previous actions (so you don't repeat yourself). + +You must output ONLY a valid JSON object with your next action. + +Available Actions: +- {"action": "tap", "coordinates": [x, y], "reason": "Why you are tapping"} +- {"action": "type", "text": "Hello World", "reason": "Why you are typing"} +- {"action": "clear", "reason": "Clear/delete text in focused field"} +- {"action": "enter", "reason": "Press Enter/Submit key"} +- {"action": "home", "reason": "Go to home screen"} +- {"action": "back", "reason": "Go back"} +- {"action": "wait", "reason": "Wait for loading"} +- {"action": "scroll", "direction": "down", "reason": "Scroll to see more content"} +- {"action": "launch", "package": "com.example.app", "reason": "Launch an app by package name"} +- {"action": "done", "reason": "Task complete"} + +IMPORTANT WORKFLOW RULES: +1. To enter text in a search box or text field: FIRST tap the field, THEN on the next step use "type" to enter text. +2. If you already tapped a text field in your previous action, your next action should be "type" with the text you want to enter. +3. If a text field already has text and you need to replace it, use "clear" first, then "type" the new text. +4. After typing in a search field, use "enter" to submit the search. This is more reliable than tapping a search button. +5. Do NOT tap the same element repeatedly. If you tapped something and nothing changed, try a different approach. + +Example - Searching for something: +Step 1: {"action": "tap", "coordinates": [540, 100], "reason": "Tapping search box to focus it"} +Step 2: {"action": "type", "text": "pizza near me", "reason": "Typing search query"} +Step 3: {"action": "enter", "reason": "Submitting the search query"} +""" + + history_str = "" + if action_history: + history_str = "\n\nPREVIOUS_ACTIONS:\n" + json.dumps(action_history[-5:], indent=2) # Last 5 actions + + try: + response = client.chat.completions.create( + model=MODEL, + response_format={"type": "json_object"}, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": f"GOAL: {goal}\n\nSCREEN_CONTEXT:\n{screen_context}{history_str}"} + ] + ) + + content = response.choices[0].message.content + decision = json.loads(content) + + validation_error = validate_action(decision) + if validation_error: + if retry_count < 1: + print(f"⚠️ Invalid action from LLM: {validation_error}. 
Retrying...") + return get_llm_decision(goal, screen_context, action_history, retry_count + 1) + else: + raise ValueError(f"LLM returned invalid action after retry: {validation_error}") + + return decision + + except json.JSONDecodeError as e: + if retry_count < 1: + print(f"⚠️ Failed to parse LLM response as JSON: {e}. Retrying...") + return get_llm_decision(goal, screen_context, action_history, retry_count + 1) + else: + raise ValueError(f"LLM did not return valid JSON after retry: {e}") + +def run_agent(goal: str, max_steps: int = 10) -> bool: + """Runs the agent loop. Returns True if goal achieved, False if max_steps reached.""" print(f"🚀 Android Use Agent Started. Goal: {goal}") + print(f"📡 Using provider: {LLM_PROVIDER} | Model: {MODEL}") + + action_history: List[Dict[str, Any]] = [] for step in range(max_steps): print(f"\n--- Step {step + 1} ---") @@ -115,14 +269,28 @@ def run_agent(goal: str, max_steps=10): # 2. Reasoning print("🧠 Thinking...") - decision = get_llm_decision(goal, screen_context) + decision = get_llm_decision(goal, screen_context, action_history) print(f"💡 Decision: {decision.get('reason')}") + # Track action history (store a summary) + action_history.append({ + "step": step + 1, + "action": decision.get("action"), + "coordinates": decision.get("coordinates"), + "text": decision.get("text"), + "reason": decision.get("reason") + }) + # 3. Action - execute_action(decision) + goal_achieved = execute_action(decision) + if goal_achieved: + return True # Wait for UI to update time.sleep(2) + + print(f"⚠️ Max steps ({max_steps}) reached without achieving goal.") + return False if __name__ == "__main__": # Example Goal: "Open settings and turn on Wi-Fi" diff --git a/requirements.txt b/requirements.txt index 06018fe..818bf76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -openai>=1.12.0 \ No newline at end of file +openai>=1.12.0 +python-dotenv>=1.0.0 \ No newline at end of file diff --git a/use_cases/__init__.py b/use_cases/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/use_cases/safetyculture_inspection.py b/use_cases/safetyculture_inspection.py new file mode 100644 index 0000000..44864d1 --- /dev/null +++ b/use_cases/safetyculture_inspection.py @@ -0,0 +1,592 @@ +#!/usr/bin/env python3 +""" +SafetyCulture (iAuditor) Inspections Autopilot - MVP Entry Point + +This use case automates completing a specific SafetyCulture inspection template. +Run this file directly to launch the agent with the SafetyCulture-focused goal. 
+ +Prerequisites: +- Android device connected via ADB (`adb devices` shows it) +- USB debugging enabled +- SafetyCulture app installed and logged in +- The MVP template created in SafetyCulture +""" + +import os +import sys +import json +import time +import datetime +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from kernel import ( + run_adb_command, + get_screen_state, + get_llm_decision, + execute_action, + client, + MODEL, + LLM_PROVIDER, +) +import sanitizer + +# --- SAFETYCULTURE CONFIGURATION --- +SAFETYCULTURE_PACKAGE = "com.safetyculture.iauditor" + +# MVP Template Configuration +# Define the exact template and answers for reproducible automation +MVP_TEMPLATE_CONFIG = { + "template_name": "Warehouse Safety", # Update to match your actual template name + "questions": [ + { + "question": "Are all emergency exits clearly marked and easily accessible?", + "type": "choice", + "answer": "yes", + "options": ["Yes", "No", "N/A"], + "note": None, + }, + { + "question": "How would you rate the overall cleanliness and organization of the warehouse?", + "type": "choice", + "answer": "yes", + "options": ["Yes", "No", "N/A"], + "note": None, + }, + { + "question": "Describe the procedure in place for reporting any safety hazards or incidents.", + "type": "text", + "answer": "Report hazards to the supervisor immediately and record details in the incident log.", + "note": None, + }, + { + "question": "How many fire extinguishers are present in the warehouse?", + "type": "number", + "answer": "8", + "note": None, + }, + { + "question": "Is all staff provided with and properly trained on the use of personal protective equipment (PPE)?", + "type": "choice", + "answer": "yes", + "options": ["Yes", "No", "N/A"], + "note": None, + }, + { + "question": "When was the last fire drill conducted in the warehouse?", + "type": "date", + "answer": "today", + "note": None, + }, + { + "question": "Please upload a photo showing the condition of the warehouse aisles for inspection.", + "type": "media", + "answer": "skip", + "note": None, + }, + ], + "location": "", # Optional: location to select if prompted +} + +# Common popup dismiss buttons to auto-handle +# NOTE: Do NOT add "SAVE" here - it's needed for date picker and should be tapped intentionally by the agent +POPUP_DISMISS_TEXTS = [ + "Allow", + "Don't allow", + "Deny", + "No thanks", + "Not now", + "Skip", + "Close", + "Later", + "OK", + "Got it", + "Dismiss", + "Cancel", + "Maybe later", +] + +# Success detection: We need MULTIPLE anchors present to confirm the Report screen +# This avoids false positives from partial matches (e.g., "HSE Contractor Performance Report") +SUCCESS_ANCHOR_SETS = [ + # Report screen has both Download AND Share buttons + {"Download", "Share"}, + # Alternative: explicit submission confirmation + {"Inspection submitted"}, + {"Successfully submitted"}, +] + +# --- LOGGING --- +LOGS_DIR = Path(__file__).parent.parent / "logs" + + +def ensure_logs_dir(): + """Create logs directory if it doesn't exist.""" + LOGS_DIR.mkdir(exist_ok=True) + + +def create_run_log(run_id: str) -> Path: + """Create a new log file for this run.""" + ensure_logs_dir() + return LOGS_DIR / f"safetyculture_run_{run_id}.jsonl" + + +def log_step(log_path: Path, step_data: dict): + """Append a step to the run log.""" + with open(log_path, "a") as f: + f.write(json.dumps(step_data) + "\n") + + +# --- APP LAUNCH --- +def launch_app(package: str) -> bool: + """Launch an app by 
package name using ADB monkey command.""" + print(f"🚀 Launching app: {package}") + result = run_adb_command([ + "shell", "monkey", "-p", package, + "-c", "android.intent.category.LAUNCHER", "1" + ]) + success = "Events injected: 1" in result or "cmp=" in result.lower() + if success: + print(f"✅ App launched successfully") + else: + print(f"⚠️ App launch may have failed: {result}") + return success + + +# --- WAIT UNTIL --- +def wait_until_text(target_text: str, timeout_s: int = 10, case_sensitive: bool = False) -> bool: + """ + Wait until the screen contains an element with text matching target_text. + Returns True if found within timeout, False otherwise. + """ + print(f"⏳ Waiting for text: '{target_text}' (timeout: {timeout_s}s)") + start_time = time.time() + + while time.time() - start_time < timeout_s: + screen_json = get_screen_state() + try: + elements = json.loads(screen_json) + for elem in elements: + elem_text = elem.get("text", "") + if case_sensitive: + if target_text in elem_text: + print(f"✅ Found: '{target_text}'") + return True + else: + if target_text.lower() in elem_text.lower(): + print(f"✅ Found: '{target_text}'") + return True + except json.JSONDecodeError: + pass + time.sleep(1) + + print(f"⚠️ Timeout waiting for: '{target_text}'") + return False + + +def wait_until_any_text(target_texts: list, timeout_s: int = 10) -> str | None: + """ + Wait until the screen contains any of the target texts. + Returns the matched text if found, None otherwise. + """ + print(f"⏳ Waiting for any of: {target_texts} (timeout: {timeout_s}s)") + start_time = time.time() + + while time.time() - start_time < timeout_s: + screen_json = get_screen_state() + try: + elements = json.loads(screen_json) + for elem in elements: + elem_text = elem.get("text", "").lower() + for target in target_texts: + if target.lower() in elem_text: + print(f"✅ Found: '{target}'") + return target + except json.JSONDecodeError: + pass + time.sleep(1) + + print(f"⚠️ Timeout waiting for any of: {target_texts}") + return None + + +# --- POPUP HANDLER --- +def handle_popups(screen_json: str) -> bool: + """ + Scan screen for common popup dismiss buttons and tap them. + Returns True if a popup was dismissed, False otherwise. + """ + try: + elements = json.loads(screen_json) + except json.JSONDecodeError: + return False + + for elem in elements: + elem_text = elem.get("text", "").strip() + if not elem_text: + continue + + for dismiss_text in POPUP_DISMISS_TEXTS: + if elem_text.lower() == dismiss_text.lower(): + if elem.get("clickable", False): + center = elem.get("center") + if center: + print(f"🔔 Dismissing popup: '{elem_text}'") + run_adb_command(["shell", "input", "tap", str(center[0]), str(center[1])]) + time.sleep(1) + return True + return False + + +# --- TEMPLATE-DRIVEN GOAL CONSTRUCTION --- +def build_inspection_goal(config: dict) -> str: + """Build a detailed goal string from the template configuration.""" + template_name = config["template_name"] + questions = config["questions"] + location = config.get("location", "") + + question_plan = [] + for i, q in enumerate(questions, 1): + q_type = q.get("type", "choice") + answer = q.get("answer", "") + note = q.get("note") + + if q_type == "choice": + options = q.get("options", ["Yes", "No", "N/A"]) + answer_str = str(answer).strip().upper() + question_plan.append( + f" Q{i}: Choice question. Tap the '{answer_str}' button (options: {', '.join(options)})" + ) + elif q_type == "text": + question_plan.append( + f" Q{i}: Text question. 
Tap the text field (often shows 'Tap to edit'), then type: '{str(answer)}'" + ) + elif q_type == "number": + question_plan.append( + f" Q{i}: Number question. Tap the input field, then type the number: '{str(answer)}'" + ) + elif q_type == "date": + question_plan.append( + f" Q{i}: Date question. Tap 'Select Date', then tap 'SAVE' (top right) to confirm today's date" + ) + elif q_type == "media": + question_plan.append( + f" Q{i}: Media upload. Do NOT add media. If optional, leave blank and continue. If required and blocked, stop." + ) + else: + question_plan.append(f" Q{i}: Answer: '{str(answer)}'") + + if note: + question_plan.append(f" Add note: '{note}'") + + plan_str = "\n".join(question_plan) + + goal = f"""Complete a SafetyCulture inspection using this exact plan: + +1. The SafetyCulture app is now open +2. Navigate to start a new inspection using template: "{template_name}" +3. The inspection starts on a "Title Page" (Page 1/2) with pre-filled fields like "Conducted on", "Prepared by", "Location" + - Do NOT edit these fields - they are auto-filled + - Tap "Next" (bottom right corner) to go to Page 2/2 where the actual questions are +4. On Page 2/2, answer each question in order: +{plan_str} +5. After answering all questions, tap "Complete" to submit the inspection +6. Wait for confirmation that the inspection was submitted successfully + +IMPORTANT RULES: +- The Title Page (Page 1/2) has "Conducted on", "Prepared by", "Location" - SKIP these and tap "Next" +- For choice questions, look for Yes/No/N/A buttons and tap the appropriate one +- For text/number fields, tap the field first, then type the answer +- Scroll down if you don't see the next question or if you need to find the Complete button +- Do NOT take photos or add attachments - skip media upload questions entirely +- For date picker: tap "Select Date", then tap "SAVE" button (top right of dialog) to confirm +- The "Complete" button is at the bottom - you may need to scroll down to see it +- After tapping Complete, wait for the Report screen (with Download/Share buttons) before marking done +""" + return goal + + +# --- MAIN AGENT LOOP --- +def run_safetyculture_agent( + config: dict = None, + max_steps: int = 50, + auto_launch: bool = True, +) -> dict: + """ + Run the SafetyCulture inspection agent. 
+ + Returns a dict with: + - success: bool + - steps: int + - duration_s: float + - failure_reason: str | None + """ + if config is None: + config = MVP_TEMPLATE_CONFIG + + run_id = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + log_path = create_run_log(run_id) + + print(f"🚀 SafetyCulture Inspection Agent Started") + print(f"📋 Template: {config['template_name']}") + print(f"📝 Log file: {log_path}") + print(f"📡 Using provider: {LLM_PROVIDER} | Model: {MODEL}") + + start_time = time.time() + action_history = [] + result = { + "success": False, + "steps": 0, + "duration_s": 0, + "failure_reason": None, + "run_id": run_id, + } + + # Log run start + log_step(log_path, { + "event": "run_start", + "timestamp": datetime.datetime.now().isoformat(), + "template": config["template_name"], + "max_steps": max_steps, + }) + + # Step 0: Launch the app + if auto_launch: + if not launch_app(SAFETYCULTURE_PACKAGE): + result["failure_reason"] = "Failed to launch SafetyCulture app" + log_step(log_path, {"event": "error", "reason": result["failure_reason"]}) + return result + + # Wait for app to load + time.sleep(3) + + # Wait for a known home element + if not wait_until_text("Inspections", timeout_s=15): + # Try alternative home indicators + if not wait_until_text("Home", timeout_s=5): + print("⚠️ Could not confirm app loaded, continuing anyway...") + + # Build the goal + goal = build_inspection_goal(config) + + for step in range(max_steps): + print(f"\n--- Step {step + 1} ---") + result["steps"] = step + 1 + + # 1. Perception + print("👀 Scanning Screen...") + screen_context = get_screen_state() + + # 2. Handle popups before LLM decision + popup_dismissed = handle_popups(screen_context) + if popup_dismissed: + log_step(log_path, { + "step": step + 1, + "event": "popup_dismissed", + "timestamp": datetime.datetime.now().isoformat(), + }) + # Re-scan after dismissing popup + time.sleep(1) + screen_context = get_screen_state() + + # 3. Check for success anchors (require ALL anchors in a set to be present) + try: + elements = json.loads(screen_context) + # Collect all visible text on screen + all_text = set() + for elem in elements: + elem_text = elem.get("text", "").strip().lower() + if elem_text: + all_text.add(elem_text) + + # Check if any anchor set is fully satisfied + for anchor_set in SUCCESS_ANCHOR_SETS: + matched = all(any(anchor.lower() in txt for txt in all_text) for anchor in anchor_set) + if matched: + print(f"🎉 Success detected: {anchor_set}") + result["success"] = True + result["duration_s"] = time.time() - start_time + log_step(log_path, { + "step": step + 1, + "event": "success", + "anchors": list(anchor_set), + "timestamp": datetime.datetime.now().isoformat(), + }) + log_step(log_path, { + "event": "run_end", + "success": True, + "steps": result["steps"], + "duration_s": result["duration_s"], + }) + return result + except json.JSONDecodeError: + pass + + # 4. 
Reasoning
+        print("🧠 Thinking...")
+        try:
+            decision = get_llm_decision(goal, screen_context, action_history)
+        except Exception as e:
+            print(f"❌ LLM Error: {e}")
+            result["failure_reason"] = f"LLM error: {e}"
+            log_step(log_path, {"step": step + 1, "event": "error", "reason": str(e)})
+            break
+
+        print(f"💡 Decision: {decision.get('reason')}")
+
+        # Track action history
+        action_entry = {
+            "step": step + 1,
+            "action": decision.get("action"),
+            "coordinates": decision.get("coordinates"),
+            "text": decision.get("text"),
+            "reason": decision.get("reason"),
+        }
+        action_history.append(action_entry)
+
+        # Log the step (redacted)
+        log_step(log_path, {
+            "step": step + 1,
+            "timestamp": datetime.datetime.now().isoformat(),
+            "action": decision.get("action"),
+            "reason": decision.get("reason"),
+        })
+
+        # 5. Action (execute_action is already imported from kernel at module top)
+        goal_achieved = execute_action(decision)
+
+        if goal_achieved:
+            # Agent said done - verify with success anchors
+            time.sleep(2)
+            screen_context = get_screen_state()
+
+            # Check for success using anchor sets
+            try:
+                elements = json.loads(screen_context)
+                all_text = set()
+                for elem in elements:
+                    elem_text = elem.get("text", "").strip().lower()
+                    if elem_text:
+                        all_text.add(elem_text)
+
+                found_success = False
+                for anchor_set in SUCCESS_ANCHOR_SETS:
+                    matched = all(any(anchor.lower() in txt for txt in all_text) for anchor in anchor_set)
+                    if matched:
+                        found_success = True
+                        break
+
+                if found_success:
+                    result["success"] = True
+                    print("✅ Inspection submitted successfully!")
+                else:
+                    print("⚠️ Agent marked done but no success anchor found")
+                    result["failure_reason"] = "Agent marked done but submission not confirmed"
+            except json.JSONDecodeError:
+                print("⚠️ Could not verify success - screen parse error")
+                result["failure_reason"] = "Could not verify submission"
+
+            break
+
+        # Wait for UI to update
+        time.sleep(2)
+
+    if not result["success"] and not result["failure_reason"]:
+        result["failure_reason"] = f"Max steps ({max_steps}) reached"
+
+    result["duration_s"] = time.time() - start_time
+
+    log_step(log_path, {
+        "event": "run_end",
+        "success": result["success"],
+        "steps": result["steps"],
+        "duration_s": result["duration_s"],
+        "failure_reason": result["failure_reason"],
+    })
+
+    if result["success"]:
+        print(f"\n🎉 SUCCESS! Completed in {result['steps']} steps, {result['duration_s']:.1f}s")
+    else:
+        print(f"\n❌ FAILED: {result['failure_reason']}")
+
+    return result
+
+
+def run_reliability_test(num_runs: int = 20, config: dict = None) -> dict:
+    """
+    Run the inspection multiple times and report statistics.
+ """ + if config is None: + config = MVP_TEMPLATE_CONFIG + + print(f"\n{'='*60}") + print(f"🧪 RELIABILITY TEST: {num_runs} runs") + print(f"{'='*60}\n") + + results = [] + + for i in range(num_runs): + print(f"\n{'='*40}") + print(f"📊 Run {i + 1}/{num_runs}") + print(f"{'='*40}") + + result = run_safetyculture_agent(config=config) + results.append(result) + + # Brief pause between runs + if i < num_runs - 1: + print("\n⏳ Waiting 5s before next run...") + time.sleep(5) + + # Calculate statistics + successes = sum(1 for r in results if r["success"]) + success_rate = (successes / num_runs) * 100 + + durations = [r["duration_s"] for r in results if r["success"]] + median_duration = sorted(durations)[len(durations) // 2] if durations else 0 + + failure_reasons = {} + for r in results: + if not r["success"] and r["failure_reason"]: + reason = r["failure_reason"] + failure_reasons[reason] = failure_reasons.get(reason, 0) + 1 + + print(f"\n{'='*60}") + print(f"📊 RELIABILITY TEST RESULTS") + print(f"{'='*60}") + print(f"✅ Success Rate: {success_rate:.1f}% ({successes}/{num_runs})") + print(f"⏱️ Median Duration: {median_duration:.1f}s") + + if failure_reasons: + print(f"\n❌ Top Failure Reasons:") + for reason, count in sorted(failure_reasons.items(), key=lambda x: -x[1]): + print(f" - {reason}: {count} times") + + return { + "success_rate": success_rate, + "successes": successes, + "total_runs": num_runs, + "median_duration_s": median_duration, + "failure_reasons": failure_reasons, + "results": results, + } + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="SafetyCulture Inspection Autopilot") + parser.add_argument("--test", action="store_true", help="Run reliability test (20 runs)") + parser.add_argument("--runs", type=int, default=20, help="Number of runs for reliability test") + parser.add_argument("--max-steps", type=int, default=50, help="Max steps per run") + parser.add_argument("--no-launch", action="store_true", help="Skip auto-launching the app") + + args = parser.parse_args() + + if args.test: + run_reliability_test(num_runs=args.runs) + else: + run_safetyculture_agent(max_steps=args.max_steps, auto_launch=not args.no_launch)
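+
+# Example invocations (device connected via ADB, template configured above):
+#   python use_cases/safetyculture_inspection.py                    # single run
+#   python use_cases/safetyculture_inspection.py --test --runs 10   # reliability test
+#   python use_cases/safetyculture_inspection.py --no-launch        # app already open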