diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..79bd9d0
--- /dev/null
+++ b/.env.example
@@ -0,0 +1 @@
+GEMINI_API_KEY="your_api_key_here"
\ No newline at end of file
diff --git a/README.md b/README.md
index 0232fde..6b213be 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ But **real work happens on mobile devices** in places where laptops don't fit:
Watch Android Use automate an entire logistics workflow:
### Before (Manual - 10+ minutes)
+
```
1. Driver takes photo of Bill of Lading
2. Opens WhatsApp, sends to back office
@@ -62,6 +63,7 @@ Watch Android Use automate an entire logistics workflow:
```
### After (Automated - 30 seconds)
+
```python
# Driver just texts the photo. Agent does the rest.
run_agent("""
@@ -84,6 +86,7 @@ run_agent("""
### ๐ซ Computer Use (Anthropic)
+
- Requires desktop/laptop
- Takes screenshots → OCR
- Sends images to vision model
@@ -95,6 +98,7 @@ run_agent("""
|
### ✅ Android Use (This Library)
+
- Works on handheld devices
- Reads accessibility tree (XML)
- Structured data → LLM
@@ -128,13 +132,13 @@ Launched **24 hours ago** with the logistics demo:
## ๐ The Market: Mobile-First Industries
-| Industry | Why They Need This | Market Size | Current State |
-|----------|-------------------|-------------|---------------|
-| **๐ Logistics** | Drivers use factoring apps (RTS Pro, OTR Capital) in truck cabs | **$10.5T** | Manual, no laptop access |
-| **๐ Gig Economy** | Uber/Lyft/DoorDash drivers optimize between apps on phones | **$455B** | Tap manually, lose 20% earnings |
-| **๐ฆ Last-Mile** | Amazon Flex, UPS drivers scan packages on handhelds | **$500B+** | Proprietary apps, no APIs |
-| **๐๏ธ Field Services** | Techs log work orders on tablets on-site | **$200B+** | Mobile-only workflows |
-| **๐ฆ Mobile Banking** | Treasury ops, reconciliation on native banking apps | **$28T** | 2FA + biometric locks |
+| Industry | Why They Need This | Market Size | Current State |
+| --------------------- | --------------------------------------------------------------- | ----------- | ------------------------------- |
+| **๐ Logistics** | Drivers use factoring apps (RTS Pro, OTR Capital) in truck cabs | **$10.5T** | Manual, no laptop access |
+| **๐ Gig Economy** | Uber/Lyft/DoorDash drivers optimize between apps on phones | **$455B** | Tap manually, lose 20% earnings |
+| **๐ฆ Last-Mile** | Amazon Flex, UPS drivers scan packages on handhelds | **$500B+** | Proprietary apps, no APIs |
+| **๐๏ธ Field Services** | Techs log work orders on tablets on-site | **$200B+** | Mobile-only workflows |
+| **๐ฆ Mobile Banking** | Treasury ops, reconciliation on native banking apps | **$28T** | 2FA + biometric locks |
**Total: $40+ trillion in GDP from mobile-first workflows**
@@ -145,6 +149,7 @@ Browser agents can't reach these. Desktop agents don't fit. **Android Use is the
## ๐ Quick Start (60 Seconds)
### Prerequisites
+
- Python 3.10+
- Android device or emulator (USB debugging enabled)
- ADB (Android Debug Bridge)
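+
+Not sure ADB can see your device? A quick sanity check from Python (it just shells out to `adb` the same way `kernel.py` does, and assumes `adb` is on your PATH):
+
+```python
+import subprocess
+
+# Lists attached devices/emulators; your device should appear with state "device"
+print(subprocess.run(["adb", "devices"], capture_output=True, text=True).stdout)
+```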
@@ -172,6 +177,7 @@ export OPENAI_API_KEY="sk-..."
# 6. Run your first agent
python kernel.py
+# python kernel-genai.py  # For Gemini (google-genai) users
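+# kernel-genai.py reads GEMINI_API_KEY from the environment or a .env file (see .env.example)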
```
### Try It: Logistics Example
@@ -181,12 +187,13 @@ from kernel import run_agent
# Automate the workflow from the viral demo
run_agent("""
-Open WhatsApp, get the latest image,
+Open WhatsApp, get the latest image,
then open the invoice app and fill out the form
""")
```
**Other examples:**
+
- `"Accept the next DoorDash delivery and navigate to restaurant"`
- `"Scan all packages and mark them delivered in the driver app"`
- `"Check Chase mobile for today's transactions"`
@@ -196,51 +203,61 @@ then open the invoice app and fill out the form
## ๐ผ Use Cases Beyond Logistics
### ๐ Gig Economy Multi-Apping
+
**Problem:** Drivers lose 20%+ earnings manually switching between DoorDash, Uber Eats, Instacart.
```python
run_agent("Monitor all delivery apps, accept the highest paying order")
```
+
**Impact:** Instant acceptance, maximize earnings, reduce downtime.
---
### ๐ฆ Package Scanning Automation
+
**Problem:** Drivers manually scan 200+ packages/day in proprietary apps.
```python
run_agent("Scan all packages in photo and mark as loaded in Amazon Flex")
```
+
**Impact:** Bulk scanning, eliminate manual entry, speed up loading.
---
### ๐ฆ Mobile Banking Operations
+
**Problem:** Treasury teams reconcile transactions across multiple mobile banking apps.
```python
run_agent("Log into Chase mobile and export today's wire transfers")
```
+
**Impact:** Automate reconciliation, fraud detection, compliance.
---
### ๐ฅ Healthcare Mobile Workflows
+
**Problem:** Staff extract patient data from HIPAA-locked mobile portals.
```python
run_agent("Open Epic MyChart and download lab results for patient 12345")
```
+
**Impact:** Data extraction, appointment booking, records management.
---
### ๐งช Mobile App QA Testing
+
**Problem:** Manual testing of Android apps is slow and expensive.
```python
run_agent("Create account, complete onboarding, make test purchase")
```
+
**Impact:** Automated E2E testing, regression tests, CI/CD integration.
---
@@ -297,10 +314,10 @@ run_agent("Create account, complete onboarding, make test purchase")
### Why Accessibility Tree > Screenshots
-| Approach | Cost | Speed | Accuracy | Works on Device |
-|----------|------|-------|----------|----------------|
-| **Screenshots (Computer Use)** | $0.15/action | 3-5s | 70-80% | ❌ Desktop only |
-| **Accessibility Tree (Android Use)** | $0.01/action | <1s | 99%+ | ✅ Handheld devices |
+| Approach | Cost | Speed | Accuracy | Works on Device |
+| ------------------------------------ | ------------ | ----- | -------- | ------------------- |
+| **Screenshots (Computer Use)**       | $0.15/action | 3-5s  | 70-80%   | ❌ Desktop only      |
+| **Accessibility Tree (Android Use)** | $0.01/action | <1s   | 99%+     | ✅ Handheld devices  |
**Technical advantage:** Accessibility tree provides structured data (text, coordinates, hierarchy) without image encoding/OCR.
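+
+For a feel of what that structured data looks like, here is a minimal sketch of the kind of parsing the `sanitizer` module does. It assumes uiautomator's standard dump format, where each `<node>` carries `text`, `content-desc`, `clickable`, and `bounds="[x1,y1][x2,y2]"` attributes; the shipped `sanitizer.get_interactive_elements` may differ in detail.
+
+```python
+import re
+import xml.etree.ElementTree as ET
+
+def get_interactive_elements(xml_content: str) -> list[dict]:
+    """Reduce a uiautomator dump to a compact list of tappable elements."""
+    elements = []
+    for node in ET.fromstring(xml_content).iter("node"):
+        if node.get("clickable") != "true":
+            continue
+        # bounds look like "[x1,y1][x2,y2]"; hand the LLM the center point
+        x1, y1, x2, y2 = map(int, re.findall(r"\d+", node.get("bounds", "[0,0][0,0]")))
+        elements.append({
+            "text": node.get("text") or node.get("content-desc") or "",
+            "coordinates": [(x1 + x2) // 2, (y1 + y2) // 2],
+        })
+    return elements
+```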
@@ -370,24 +387,28 @@ screen_json = get_screen_state()
## ๐บ๏ธ Roadmap
### ✅ Now (MVP - 48 hours)
+
- [x] Core agent loop (perception → reasoning → action)
- [x] Accessibility tree parsing
- [x] GPT-4 integration
- [x] Basic actions (tap, type, navigate)
### ๐ง Next 2 Weeks
+
- [ ] **PyPI package:** `pip install android-use`
- [ ] **Multi-LLM support:** Claude, Gemini, Llama
- [ ] **WhatsApp integration:** Pre-built actions for messaging
- [ ] **Error recovery:** Retry logic, fallback strategies
### ๐ฎ Next 3 Months
+
- [ ] **App-specific agents:** Pre-trained for RTS Pro, OTR Capital, factoring apps
- [ ] **Cloud device farms:** Run at scale on AWS Device Farm, BrowserStack
- [ ] **Vision augmentation:** Screenshot fallback when accessibility insufficient
- [ ] **Multi-step memory:** Remember context across sessions
### ๐ Long-term Vision
+
- [ ] **Hosted Cloud API:** No-code agent execution (waitlist below)
- [ ] **Agent marketplace:** Buy/sell vertical-specific automations
- [ ] **Enterprise platform:** SOC2, audit logs, PII redaction, fleet management
@@ -400,6 +421,7 @@ screen_json = get_screen_state()
**Don't want to host it yourself?** Join the waitlist for our managed Cloud API.
**What you get:**
+
- ✅ No device setup required
- ✅ Scale to 1000s of simultaneous agents
- ✅ Pre-built integrations (WhatsApp, factoring apps, etc.)
@@ -414,6 +436,7 @@ screen_json = get_screen_state()
**Want to help build the future of mobile AI agents?**
### ๐ฅ High Priority
+
- **Logistics app templates:** RTS Pro, OTR Capital, Axle, TriumPay integrations
- **WhatsApp automation:** Message parsing, image extraction
- **Error handling:** Robustness for unreliable connections (truck cabs!)
@@ -421,6 +444,7 @@ screen_json = get_screen_state()
- **Testing:** E2E tests for common workflows
### How to Contribute
+
1. ⭐ **Star this repo** (most important!)
2. ๐ด Fork it
3. ๐ฟ Create branch: `git checkout -b feature/factoring-app-support`
@@ -461,7 +485,7 @@ Support the project
Help logistics companies find this
-[Tweet →](https://twitter.com/intent/tweet?text=๐%20Game%20changer%20for%20logistics!%20Android%20Use%20lets%20AI%20agents%20control%20native%20Android%20apps.%0A%0A✅%20Works%20in%20truck%20cabs%20(no%20laptop%20needed)%0A✅%2095%25%20cheaper%20than%20Computer%20Use%0A✅%20Automates%20factoring%20apps,%20WhatsApp,%20more%0A%0A4M%20views!%0A%0A&url=https://github.com/actionstatelabs/android-action-kernel&via=ethanjlim)
+[Tweet →]()
|
@@ -496,17 +520,19 @@ Progress:
I was interviewing truck drivers for a logistics automation project. One driver showed me his phone and said:
-> *"I have to manually type invoice data from this Bill of Lading photo into the RTS Pro app. Takes 10 minutes every delivery. I can't use a laptop because it doesn't fit in the cab."*
+> _"I have to manually type invoice data from this Bill of Lading photo into the RTS Pro app. Takes 10 minutes every delivery. I can't use a laptop because it doesn't fit in the cab."_
That's when it clicked: **AI agents exist for web and desktop, but the real economy runs on handheld devices.**
I looked at existing solutions:
+
- **Browser Use:** Only works on websites ❌
- **Computer Use:** Requires a laptop ($0.15/action, vision model) ❌
Neither solved the truck cab problem. So I built Android Use in 48 hours using Android's accessibility API.
**The result:**
+
- 95% cheaper (accessibility tree vs vision)
- 5x faster (<1s latency)
- Works on handheld devices ✅
@@ -541,6 +567,7 @@ This started as a library for developers. But based on demand, we're building:
## ๐ By the Numbers
**Since launch (24 hours ago):**
+
- ๐ **4,000,000+** views on X
- ⭐ **12** GitHub stars (help us get to 1,000!)
- ๐ฌ **150+** DMs from companies
@@ -548,6 +575,7 @@ This started as a library for developers. But based on demand, we're building:
- ๐ฆ **3** factoring company partnership discussions
**Market data:**
+
- ๐ **3.5M** truck drivers in US alone
- ๐ฆ **60M** gig economy workers globally
- ๐ฐ **$40T+** in mobile-first GDP
@@ -567,12 +595,14 @@ MIT License - see [LICENSE](LICENSE)
## ๐ Acknowledgments
Built on:
+
- [Browser Use](https://github.com/browser-use/browser-use) - Web agent inspiration
- [Anthropic Computer Use](https://www.anthropic.com/news/computer-use) - Proved UI control works
- Android Accessibility API - The enabling technology
- **The 4 million people who watched and validated this need**
Special thanks to:
+
- Truck drivers who showed me the real problem
- Early beta testers in logistics
- Everyone sharing and supporting this project
diff --git a/kernel-genai.py b/kernel-genai.py
new file mode 100644
index 0000000..834b744
--- /dev/null
+++ b/kernel-genai.py
@@ -0,0 +1,177 @@
+import os
+import time
+import subprocess
+import json
+from typing import Dict, Any, List
+from google import genai
+import sanitizer
+from dotenv import load_dotenv
+
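+# Pull GEMINI_API_KEY from a local .env file (see .env.example), if one exists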
+load_dotenv()
+
+# --- CONFIGURATION ---
+ADB_PATH = "adb" # Ensure adb is in your PATH
+MODEL = "gemini-2.5-flash" # Or another Gemini model as needed
+SCREEN_DUMP_PATH = "/sdcard/window_dump.xml"
+LOCAL_DUMP_PATH = "window_dump.xml"
+
+
+try:
+ client = genai.Client()
+except Exception as e:
+ # Handle case where API key is not set
+ print("Error: Failed to initialize Google Gen AI Client.")
+ print("Please ensure the GEMINI_API_KEY environment variable is set.")
+ exit(1)
+# ------------------------------------------------------
+
+
+def run_adb_command(command: List[str]):
+ """Executes a shell command via ADB."""
+ str_command = [str(c) for c in command]
+ result = subprocess.run([ADB_PATH] + str_command,
+ capture_output=True, text=True)
+ if result.stderr and "error" in result.stderr.lower():
+ print(f"โ ADB Error: {result.stderr.strip()}")
+ return result.stdout.strip()
+
+
+def get_screen_state() -> str:
+ """Dumps the current UI XML and returns the sanitized JSON string."""
+ # 1. Capture XML
+ run_adb_command(["shell", "uiautomator", "dump", SCREEN_DUMP_PATH])
+
+ # 2. Pull to local
+ pull_result = subprocess.run(
+ [ADB_PATH, "pull", SCREEN_DUMP_PATH, LOCAL_DUMP_PATH], capture_output=True, text=True)
+ if pull_result.returncode != 0:
+ print(f"โ ADB Pull Error: {pull_result.stderr.strip()}")
+ return "Error: Could not pull screen dump."
+
+ # 3. Read & Sanitize
+ if not os.path.exists(LOCAL_DUMP_PATH):
+ return "Error: Could not capture screen."
+
+ with open(LOCAL_DUMP_PATH, "r", encoding="utf-8") as f:
+ xml_content = f.read()
+
+ elements = sanitizer.get_interactive_elements(xml_content)
+ return json.dumps(elements, indent=2)
+
+
+def execute_action(action: Dict[str, Any]):
+ """Executes the action decided by the LLM."""
+ act_type = action.get("action")
+
+ if act_type == "tap":
+ coordinates = action.get("coordinates", [0, 0])
+ x, y = coordinates[0], coordinates[1]
+ print(f"๐ Tapping: ({x}, {y})")
+ run_adb_command(["shell", "input", "tap", str(x), str(y)])
+
+ elif act_type == "type":
+        text_to_type = action.get("text", "")
+ adb_text = text_to_type.replace(
+ " ", "%s") # ADB requires %s for spaces
+ print(f"โจ๏ธ Typing: {text_to_type}")
+ run_adb_command(["shell", "input", "text", adb_text])
+
+ elif act_type == "home":
+ print("๐ Going Home")
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_HOME"])
+
+ elif act_type == "back":
+ print("๐ Going Back")
+        run_adb_command(["shell", "input", "keyevent", "KEYCODE_BACK"])
+
+ elif act_type == "wait":
+ print("โณ Waiting...")
+ time.sleep(2)
+
+ elif act_type == "done":
+        print("✅ Goal Achieved.")
+ exit(0)
+ else:
+ print(f"โ ๏ธ Unknown action type: {act_type}")
+
+
+def get_llm_decision(goal: str, screen_context: str) -> Dict[str, Any]:
+ """Sends screen context to LLM and asks for the next move using Gemini API."""
+ system_prompt = """
+ You are an Android Driver Agent. Your job is to achieve the user's goal by navigating the UI.
+
+ You will receive:
+ 1. The User's Goal.
+ 2. A list of interactive UI elements (JSON) with their (x,y) center coordinates.
+
+ You must output ONLY a valid JSON object with your next action.
+
+ Available Actions:
+ - {"action": "tap", "coordinates": [x, y], "reason": "Why you are tapping"}
+ - {"action": "type", "text": "Hello World", "reason": "Why you are typing"}
+ - {"action": "home", "reason": "Go to home screen"}
+ - {"action": "back", "reason": "Go back"}
+ - {"action": "wait", "reason": "Wait for loading"}
+ - {"action": "done", "reason": "Task complete"}
+
+ Example Output:
+ {"action": "tap", "coordinates": [540, 1200], "reason": "Clicking the 'Connect' button"}
+ """
+
+ full_prompt = (
+ f"{system_prompt}\n\n"
+ f"GOAL: {goal}\n\n"
+ f"SCREEN_CONTEXT:\n{screen_context}"
+ )
+
+ response = client.models.generate_content(
+ model=MODEL,
+ contents=[{"role": "user", "parts": [{"text": full_prompt}]}],
+ config={
+ "response_mime_type": "application/json",
+ }
+ )
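+    # response_mime_type="application/json" asks Gemini for bare JSON (no markdown
+    # fences), so response.text should parse directly.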
+ return json.loads(response.text)
+
+
+def run_agent(goal: str, max_steps=10):
+ print(f"๐ Android Use Agent Started. Goal: {goal}")
+
+ for step in range(max_steps):
+ print(f"\n--- Step {step + 1} ---")
+
+ # 1. Perception
+ print("๐ Scanning Screen...")
+ screen_context = get_screen_state()
+
+ if screen_context.startswith("Error"):
+ print(f"โ Aborting: {screen_context}")
+ break
+
+ # 2. Reasoning
+ print("๐ง Thinking...")
+ try:
+ decision = get_llm_decision(goal, screen_context)
+ except Exception as e:
+ print(f"โ LLM Decision Error: {e}")
+ time.sleep(2)
+ continue
+
+ print(f"๐ก Decision: {decision.get('reason')}")
+
+ # 3. Action
+ execute_action(decision)
+
+ # Wait for UI to update
+ time.sleep(2)
+
+
+if __name__ == "__main__":
+ # Example Goal: "Open settings and turn on Wi-Fi"
+ GOAL = input("Enter your goal: ")
+ if not GOAL:
+ print("No goal entered. Exiting.")
+ else:
+ run_agent(GOAL)
diff --git a/kernel.py b/kernel.py
index f827897..2f9dd0f 100644
--- a/kernel.py
+++ b/kernel.py
@@ -2,7 +2,7 @@
import time
import subprocess
import json
-from typing import Dict, Any
+from typing import Dict, Any, List
from openai import OpenAI
import sanitizer
@@ -14,61 +14,67 @@
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+
def run_adb_command(command: List[str]):
"""Executes a shell command via ADB."""
- result = subprocess.run([ADB_PATH] + command, capture_output=True, text=True)
+ result = subprocess.run([ADB_PATH] + command,
+ capture_output=True, text=True)
if result.stderr and "error" in result.stderr.lower():
print(f"โ ADB Error: {result.stderr.strip()}")
return result.stdout.strip()
+
def get_screen_state() -> str:
"""Dumps the current UI XML and returns the sanitized JSON string."""
# 1. Capture XML
run_adb_command(["shell", "uiautomator", "dump", SCREEN_DUMP_PATH])
-
+
# 2. Pull to local
run_adb_command(["pull", SCREEN_DUMP_PATH, LOCAL_DUMP_PATH])
-
+
# 3. Read & Sanitize
if not os.path.exists(LOCAL_DUMP_PATH):
return "Error: Could not capture screen."
-
+
with open(LOCAL_DUMP_PATH, "r", encoding="utf-8") as f:
xml_content = f.read()
-
+
elements = sanitizer.get_interactive_elements(xml_content)
return json.dumps(elements, indent=2)
+
def execute_action(action: Dict[str, Any]):
"""Executes the action decided by the LLM."""
act_type = action.get("action")
-
+
if act_type == "tap":
x, y = action.get("coordinates")
print(f"๐ Tapping: ({x}, {y})")
run_adb_command(["shell", "input", "tap", str(x), str(y)])
-
+
elif act_type == "type":
- text = action.get("text").replace(" ", "%s") # ADB requires %s for spaces
+ text = action.get("text").replace(
+ " ", "%s") # ADB requires %s for spaces
print(f"โจ๏ธ Typing: {action.get('text')}")
run_adb_command(["shell", "input", "text", text])
-
+
elif act_type == "home":
print("๐ Going Home")
run_adb_command(["shell", "input", "keyevent", "KEYWORDS_HOME"])
-
+
elif act_type == "back":
print("๐ Going Back")
run_adb_command(["shell", "input", "keyevent", "KEYWORDS_BACK"])
-
+
elif act_type == "wait":
print("โณ Waiting...")
time.sleep(2)
-
+
elif act_type == "done":
print("✅ Goal Achieved.")
exit(0)
+
def get_llm_decision(goal: str, screen_context: str) -> Dict[str, Any]:
"""Sends screen context to LLM and asks for the next move."""
system_prompt = """
@@ -91,7 +97,7 @@ def get_llm_decision(goal: str, screen_context: str) -> Dict[str, Any]:
Example Output:
{"action": "tap", "coordinates": [540, 1200], "reason": "Clicking the 'Connect' button"}
"""
-
+
response = client.chat.completions.create(
model=MODEL,
response_format={"type": "json_object"},
@@ -100,32 +106,34 @@ def get_llm_decision(goal: str, screen_context: str) -> Dict[str, Any]:
{"role": "user", "content": f"GOAL: {goal}\n\nSCREEN_CONTEXT:\n{screen_context}"}
]
)
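+    # With response_format={"type": "json_object"}, message.content should already be valid JSON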
-
+
return json.loads(response.choices[0].message.content)
+
def run_agent(goal: str, max_steps=10):
print(f"๐ Android Use Agent Started. Goal: {goal}")
-
+
for step in range(max_steps):
print(f"\n--- Step {step + 1} ---")
-
+
# 1. Perception
print("๐ Scanning Screen...")
screen_context = get_screen_state()
-
+
# 2. Reasoning
print("๐ง Thinking...")
decision = get_llm_decision(goal, screen_context)
print(f"๐ก Decision: {decision.get('reason')}")
-
+
# 3. Action
execute_action(decision)
-
+
# Wait for UI to update
time.sleep(2)
+
if __name__ == "__main__":
# Example Goal: "Open settings and turn on Wi-Fi"
# Or your demo goal: "Find the 'Connect' button and tap it"
GOAL = input("Enter your goal: ")
- run_agent(GOAL)
\ No newline at end of file
+ run_agent(GOAL)
diff --git a/requirements.txt b/requirements.txt
index 06018fe..4201827 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
-openai>=1.12.0
\ No newline at end of file
+openai>=1.12.0
+google-genai>=1.55.0
+python-dotenv>=1.0.0
\ No newline at end of file