From 0e3744c3d4f207f419772051140425e5667f88ec Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 29 Jul 2025 15:39:04 +0200 Subject: [PATCH] update osworld docs --- experiments/osworld_docker_test.py | 37 ------------------- .../benchmarks/{setup.md => osworld.md} | 22 ++++++++++- 2 files changed, 20 insertions(+), 39 deletions(-) delete mode 100644 experiments/osworld_docker_test.py rename src/agentlab/benchmarks/{setup.md => osworld.md} (62%) diff --git a/experiments/osworld_docker_test.py b/experiments/osworld_docker_test.py deleted file mode 100644 index 3b68db5d..00000000 --- a/experiments/osworld_docker_test.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging - -from desktop_env.desktop_env import DesktopEnv - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", - handlers=[logging.StreamHandler()], -) - -example = { - "id": "94d95f96-9699-4208-98ba-3c3119edf9c2", - "instruction": "I want to install Spotify on my current system. 
Could you please help me?", - "config": [ - { - "type": "execute", - "parameters": { - "command": [ - "python", - "-c", - "import pyautogui; import time; pyautogui.click(960, 540); time.sleep(0.5);", - ] - }, - } - ], - "evaluator": { - "func": "check_include_exclude", - "result": {"type": "vm_command_line", "command": "which spotify"}, - "expected": {"type": "rule", "rules": {"include": ["spotify"], "exclude": ["not found"]}}, - }, -} - -env = DesktopEnv(action_space="pyautogui", provider_name="docker", os_type="Ubuntu") - -obs = env.reset(task_config=example) -obs, reward, done, info = env.step("pyautogui.rightClick()") -print(obs) diff --git a/src/agentlab/benchmarks/setup.md b/src/agentlab/benchmarks/osworld.md similarity index 62% rename from src/agentlab/benchmarks/setup.md rename to src/agentlab/benchmarks/osworld.md index 071670d1..d7e99d69 100644 --- a/src/agentlab/benchmarks/setup.md +++ b/src/agentlab/benchmarks/osworld.md @@ -31,7 +31,7 @@ The main entry point `experiments/run_osworld.py` is currently configured with h 2. **Environment Variables:** - `AGENTLAB_DEBUG=1`: Automatically runs the debug subset (7 tasks from `osworld_debug_task_ids.json`) -### Running OSWorld Tasks +### Task subsets We provide different subsets of tasks: @@ -42,10 +42,28 @@ We provide different subsets of tasks: ### Example Commands ```bash -# Run with default debug subset (7 tasks) +# Run with default debug subset using sequential execution in a VMware VM python experiments/run_osworld.py ``` +### Parallel Execution with Docker +To run OSWorld in parallel using Docker, ensure you have Docker installed and configured. +To install it, follow the section from the OSWorld README on [Docker setup](https://github.com/xlang-ai/OSWorld?tab=readme-ov-file#docker-server-with-kvm-support-for-better-performance). +Ensure that your Docker installation supports KVM, as OSWorld requires it for running VMs. 
+We also recommend pulling the latest Docker image for OSWorld before running the benchmark: + +```bash +docker pull happysixd/osworld-docker +``` + +After setting up Docker, you can change the `use_vmware` parameter in the script to `False` and run: + +```bash +python experiments/run_osworld.py +``` +You can control the number of parallel jobs by setting the `n_jobs` parameter in the script; the default is 4. +We recommend setting `n_jobs` to `your_number_of_cpu_cores - 2` to leave some resources for the host system and the benchmark itself. + ### Configuration Notes