diff --git a/ICML2024/script.ipynb b/ICML2024/script.ipynb deleted file mode 100644 index da53b994..00000000 --- a/ICML2024/script.ipynb +++ /dev/null @@ -1,732 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 370, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " benchmark Task group max DOM k-tokens max Pruned DOM k-tokens \\\n", - "0 miniwob [ALL TASKS] 7.586 5.468 \n", - "1 miniwob [ALL TASKS] 9.978 6.304 \n", - "2 workarena form 1337.341 1255.558 \n", - "3 workarena knowledge 72.976 60.304 \n", - "4 workarena list-filter 1442.547 1373.995 \n", - "\n", - " max AXTree k-tokens episode DOM k-tokens episode Pruned DOM k-tokens \\\n", - "0 1.009 10.597492 4.143874 \n", - "1 2.196 11.799989 6.235014 \n", - "2 56.093 2838.753426 2293.452617 \n", - "3 6.229 316.468900 179.223000 \n", - "4 78.882 2226.093472 1648.841453 \n", - "\n", - " episode AXTree k-tokens agent \n", - "0 0.721944 base_agent \n", - "1 1.186288 full_agent \n", - "2 104.499426 base_agent \n", - "3 14.687900 base_agent \n", - "4 181.179585 base_agent \n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.cm as cm\n", - "from matplotlib.colors import ListedColormap\n", - "\n", - "# Read the CSV file\n", - "df = pd.read_csv('data.csv')\n", - "\n", - "print(df.head())\n" - ] - }, - { - "cell_type": "code", - "execution_count": 371, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
benchmarkTask groupagenttypeobsvalue
0miniwobminiwob [ALL TASKS]base_agentmaxDOM k-tokens7586.0
2workarenaworkarena formbase_agentmaxDOM k-tokens1337341.0
3workarenaworkarena knowledgebase_agentmaxDOM k-tokens72976.0
4workarenaworkarena list-filterbase_agentmaxDOM k-tokens1442547.0
5workarenaworkarena list-sortbase_agentmaxDOM k-tokens1336066.0
6workarenaworkarena menubase_agentmaxDOM k-tokens270660.0
7workarenaworkarena service catalogbase_agentmaxDOM k-tokens1284051.0
8workarenaworkarena [ALL TASKS]base_agentmaxDOM k-tokens1442547.0
32miniwobminiwob [ALL TASKS]base_agentmaxAXTree k-tokens1009.0
34workarenaworkarena formbase_agentmaxAXTree k-tokens56093.0
35workarenaworkarena knowledgebase_agentmaxAXTree k-tokens6229.0
36workarenaworkarena list-filterbase_agentmaxAXTree k-tokens78882.0
37workarenaworkarena list-sortbase_agentmaxAXTree k-tokens83581.0
38workarenaworkarena menubase_agentmaxAXTree k-tokens14110.0
39workarenaworkarena service catalogbase_agentmaxAXTree k-tokens58863.0
40workarenaworkarena [ALL TASKS]base_agentmaxAXTree k-tokens83581.0
\n", - "
" - ], - "text/plain": [ - " benchmark Task group agent type obs \\\n", - "0 miniwob miniwob [ALL TASKS] base_agent max DOM k-tokens \n", - "2 workarena workarena form base_agent max DOM k-tokens \n", - "3 workarena workarena knowledge base_agent max DOM k-tokens \n", - "4 workarena workarena list-filter base_agent max DOM k-tokens \n", - "5 workarena workarena list-sort base_agent max DOM k-tokens \n", - "6 workarena workarena menu base_agent max DOM k-tokens \n", - "7 workarena workarena service catalog base_agent max DOM k-tokens \n", - "8 workarena workarena [ALL TASKS] base_agent max DOM k-tokens \n", - "32 miniwob miniwob [ALL TASKS] base_agent max AXTree k-tokens \n", - "34 workarena workarena form base_agent max AXTree k-tokens \n", - "35 workarena workarena knowledge base_agent max AXTree k-tokens \n", - "36 workarena workarena list-filter base_agent max AXTree k-tokens \n", - "37 workarena workarena list-sort base_agent max AXTree k-tokens \n", - "38 workarena workarena menu base_agent max AXTree k-tokens \n", - "39 workarena workarena service catalog base_agent max AXTree k-tokens \n", - "40 workarena workarena [ALL TASKS] base_agent max AXTree k-tokens \n", - "\n", - " value \n", - "0 7586.0 \n", - "2 1337341.0 \n", - "3 72976.0 \n", - "4 1442547.0 \n", - "5 1336066.0 \n", - "6 270660.0 \n", - "7 1284051.0 \n", - "8 1442547.0 \n", - "32 1009.0 \n", - "34 56093.0 \n", - "35 6229.0 \n", - "36 78882.0 \n", - "37 83581.0 \n", - "38 14110.0 \n", - "39 58863.0 \n", - "40 83581.0 " - ] - }, - "execution_count": 371, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 383, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 False\n", - "2 False\n", - "3 False\n", - "4 False\n", - "5 False\n", - "6 False\n", - "7 False\n", - "8 True\n", - "32 False\n", - "34 False\n", - "35 False\n", - "36 False\n", - "37 False\n", - "38 False\n", - "39 False\n", - "40 True\n", - 
"Name: Task group, dtype: bool" - ] - }, - "execution_count": 383, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_melt['Task group'] == \"workarena [ALL TASKS]\"" - ] - }, - { - "cell_type": "code", - "execution_count": 415, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " benchmark Task group agent type obs value\n", - "0 miniwob miniwob [ALL TASKS] base_agent max DOM k-tokens 7586.0\n", - "2 workarena workarena form base_agent max DOM k-tokens 1337341.0\n", - "3 workarena workarena knowledge base_agent max DOM k-tokens 72976.0\n", - "4 workarena workarena list-filter base_agent max DOM k-tokens 1442547.0\n", - "5 workarena workarena list-sort base_agent max DOM k-tokens 1336066.0\n", - " benchmark Task group agent type obs value\n", - "1 miniwob miniwob [ALL TASKS] full_agent max DOM k-tokens 9978.0\n", - "9 workarena workarena form full_agent max DOM k-tokens 1550687.0\n", - "10 workarena workarena knowledge full_agent max DOM k-tokens 178639.0\n", - "11 workarena workarena list-filter full_agent max DOM k-tokens 1593240.0\n", - "12 workarena workarena list-sort full_agent max DOM k-tokens 1493638.0\n" - ] - } - ], - "source": [ - "print(df_b.head())\n", - "print(df_f.head())" - ] - }, - { - "cell_type": "code", - "execution_count": 481, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " benchmark Task group agent type obs value\n", - "0 miniwob [ALL TASKS] base_agent max DOM k-tokens 7586.0\n", - "1 miniwob [ALL TASKS] full_agent max DOM k-tokens 9978.0\n", - "2 workarena form base_agent max DOM k-tokens 1337341.0\n", - "3 workarena knowledge base_agent max DOM k-tokens 72976.0\n", - "4 workarena list-filter base_agent max DOM k-tokens 1442547.0\n", - " benchmark Task group agent type obs value\n", - "0 miniwob miniwob [ALL TASKS] base_agent max DOM k-tokens 7586.0\n", - "1 miniwob miniwob [ALL TASKS] full_agent max DOM k-tokens 
9978.0\n", - "2 workarena workarena form base_agent max DOM k-tokens 1337341.0\n", - "3 workarena workarena knowledge base_agent max DOM k-tokens 72976.0\n", - "4 workarena workarena list-filter base_agent max DOM k-tokens 1442547.0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_38828\\1123958528.py:46: FutureWarning: \n", - "\n", - "The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.\n", - "\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_b, order=None, palette=palette, ci=None)\n", - "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_38828\\1123958528.py:50: FutureWarning: \n", - "\n", - "The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.\n", - "\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_f, order=None, palette=[(0.75, 0.75, 0.7)], ci=None)\n", - "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_38828\\1123958528.py:50: UserWarning: \n", - "The palette list has fewer values (1) than needed (7) and will cycle, which may produce an uninterpretable plot.\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_f, order=None, palette=[(0.75, 0.75, 0.7)], ci=None)\n", - "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_38828\\1123958528.py:53: FutureWarning: \n", - "\n", - "The `ci` parameter is deprecated. 
Use `errorbar=None` for the same effect.\n", - "\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_b, order=None, palette=palette, ci=None)\n" - ] - }, - { - "ename": "PermissionError", - "evalue": "[Errno 13] Permission denied: 'wa_complexity_0.pdf'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mPermissionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[481], line 68\u001b[0m\n\u001b[0;32m 64\u001b[0m \u001b[38;5;66;03m#put legend on top two rows aligned legend word on the right\u001b[39;00m\n\u001b[0;32m 66\u001b[0m legend1 \u001b[38;5;241m=\u001b[39m plt\u001b[38;5;241m.\u001b[39mlegend(loc\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mupper center\u001b[39m\u001b[38;5;124m'\u001b[39m, fontsize\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m, bbox_to_anchor\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m0.5\u001b[39m, \u001b[38;5;241m1.24\u001b[39m), ncol\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m, labels\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMiniWoB\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mknowledge\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmenu\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mservice catalog\u001b[39m\u001b[38;5;124m\"\u001b[39m,\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlist-sort\u001b[39m\u001b[38;5;124m\"\u001b[39m , \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mform\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlist-filter\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m---> 68\u001b[0m 
\u001b[43mplt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msavefig\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mwa_complexity_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mi\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m.pdf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdpi\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m300\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbbox_inches\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtight\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 69\u001b[0m plt\u001b[38;5;241m.\u001b[39msavefig(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwa_complexity_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mi\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.png\u001b[39m\u001b[38;5;124m'\u001b[39m, dpi\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m300\u001b[39m, bbox_inches\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtight\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\pyplot.py:1119\u001b[0m, in \u001b[0;36msavefig\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 1116\u001b[0m fig \u001b[38;5;241m=\u001b[39m gcf()\n\u001b[0;32m 1117\u001b[0m \u001b[38;5;66;03m# savefig default implementation has no return, so mypy is unhappy\u001b[39;00m\n\u001b[0;32m 1118\u001b[0m \u001b[38;5;66;03m# presumably this is here because subclasses can return?\u001b[39;00m\n\u001b[1;32m-> 1119\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msavefig\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# type: ignore[func-returns-value]\u001b[39;00m\n\u001b[0;32m 1120\u001b[0m fig\u001b[38;5;241m.\u001b[39mcanvas\u001b[38;5;241m.\u001b[39mdraw_idle() \u001b[38;5;66;03m# Need this if 'transparent=True', to reset colors.\u001b[39;00m\n\u001b[0;32m 1121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\figure.py:3390\u001b[0m, in \u001b[0;36mFigure.savefig\u001b[1;34m(self, fname, transparent, **kwargs)\u001b[0m\n\u001b[0;32m 3388\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ax \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maxes:\n\u001b[0;32m 3389\u001b[0m _recursively_make_axes_transparent(stack, ax)\n\u001b[1;32m-> 3390\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcanvas\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprint_figure\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\backend_bases.py:2193\u001b[0m, in \u001b[0;36mFigureCanvasBase.print_figure\u001b[1;34m(self, filename, dpi, facecolor, edgecolor, orientation, format, bbox_inches, pad_inches, bbox_extra_artists, backend, **kwargs)\u001b[0m\n\u001b[0;32m 2189\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 2190\u001b[0m \u001b[38;5;66;03m# _get_renderer may change the figure dpi (as vector formats\u001b[39;00m\n\u001b[0;32m 2191\u001b[0m \u001b[38;5;66;03m# force the figure dpi to 72), so we need to set it again here.\u001b[39;00m\n\u001b[0;32m 2192\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m 
cbook\u001b[38;5;241m.\u001b[39m_setattr_cm(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfigure, dpi\u001b[38;5;241m=\u001b[39mdpi):\n\u001b[1;32m-> 2193\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mprint_method\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2194\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2195\u001b[0m \u001b[43m \u001b[49m\u001b[43mfacecolor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfacecolor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2196\u001b[0m \u001b[43m \u001b[49m\u001b[43medgecolor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43medgecolor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2197\u001b[0m \u001b[43m \u001b[49m\u001b[43morientation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43morientation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2198\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox_inches_restore\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_bbox_inches_restore\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 2199\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2200\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 2201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m bbox_inches \u001b[38;5;129;01mand\u001b[39;00m restore_bbox:\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\backend_bases.py:2043\u001b[0m, in \u001b[0;36mFigureCanvasBase._switch_canvas_and_return_print_method..\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 2039\u001b[0m optional_kws \u001b[38;5;241m=\u001b[39m { \u001b[38;5;66;03m# Passed by print_figure for other renderers.\u001b[39;00m\n\u001b[0;32m 2040\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdpi\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfacecolor\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124medgecolor\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124morientation\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m 2041\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbbox_inches_restore\u001b[39m\u001b[38;5;124m\"\u001b[39m}\n\u001b[0;32m 2042\u001b[0m skip \u001b[38;5;241m=\u001b[39m optional_kws \u001b[38;5;241m-\u001b[39m {\u001b[38;5;241m*\u001b[39minspect\u001b[38;5;241m.\u001b[39msignature(meth)\u001b[38;5;241m.\u001b[39mparameters}\n\u001b[1;32m-> 2043\u001b[0m print_method \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mwraps(meth)(\u001b[38;5;28;01mlambda\u001b[39;00m \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: \u001b[43mmeth\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2044\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43mk\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mv\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mitems\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mnot\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mskip\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 2045\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m: \u001b[38;5;66;03m# Let third-parties do as they see fit.\u001b[39;00m\n\u001b[0;32m 
2046\u001b[0m print_method \u001b[38;5;241m=\u001b[39m meth\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\backends\\backend_pdf.py:2800\u001b[0m, in \u001b[0;36mFigureCanvasPdf.print_pdf\u001b[1;34m(self, filename, bbox_inches_restore, metadata)\u001b[0m\n\u001b[0;32m 2798\u001b[0m file \u001b[38;5;241m=\u001b[39m filename\u001b[38;5;241m.\u001b[39m_ensure_file()\n\u001b[0;32m 2799\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m-> 2800\u001b[0m file \u001b[38;5;241m=\u001b[39m \u001b[43mPdfFile\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2801\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 2802\u001b[0m file\u001b[38;5;241m.\u001b[39mnewPage(width, height)\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\backends\\backend_pdf.py:688\u001b[0m, in \u001b[0;36mPdfFile.__init__\u001b[1;34m(self, filename, metadata)\u001b[0m\n\u001b[0;32m 686\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moriginal_file_like \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 687\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtell_base \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m--> 688\u001b[0m fh, opened \u001b[38;5;241m=\u001b[39m \u001b[43mcbook\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_filehandle\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mwb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_opened\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[0;32m 
689\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m opened:\n\u001b[0;32m 690\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\cbook.py:483\u001b[0m, in \u001b[0;36mto_filehandle\u001b[1;34m(fname, flag, return_opened, encoding)\u001b[0m\n\u001b[0;32m 481\u001b[0m fh \u001b[38;5;241m=\u001b[39m bz2\u001b[38;5;241m.\u001b[39mBZ2File(fname, flag)\n\u001b[0;32m 482\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 483\u001b[0m fh \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(fname, flag, encoding\u001b[38;5;241m=\u001b[39mencoding)\n\u001b[0;32m 484\u001b[0m opened \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m 485\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(fname, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mseek\u001b[39m\u001b[38;5;124m'\u001b[39m):\n", - "\u001b[1;31mPermissionError\u001b[0m: [Errno 13] Permission denied: 'wa_complexity_0.pdf'" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", - "import random\n", - "\n", - "for i in range(0,10):\n", - " df_melt = df.melt(id_vars=['benchmark', 'Task group', 'agent'], \n", - " value_vars=['max DOM k-tokens', 'max Pruned DOM k-tokens', 'max AXTree k-tokens', 'episode DOM k-tokens', 'episode Pruned DOM k-tokens', 'episode AXTree k-tokens'], \n", - " var_name='experience', value_name='value')\n", - " #take all value and multiply by 1000\n", - " df_melt['value'] = df_melt['value'] * 1000\n", - "\n", - " df_melt[['type', 'obs']] = df_melt['experience'].str.split(' ', n=1, expand=True)\n", - " #drop columns\n", - " df_melt = df_melt.drop(columns=['experience'])\n", - " #rearraange columns\n", - " df_melt = df_melt[['benchmark', 'Task group', 'agent', 'type', 'obs', 'value']]\n", - " sorted_obs = df_melt.groupby('obs')['value'].mean().sort_values().index\n", - "\n", - " print(df_melt.head())\n", - "\n", - " #drop lines where obs contains 'pruned'\n", - " df_melt = df_melt[~df_melt['obs'].str.contains('Pruned')]\n", - "\n", - " #keeping only max \n", - " df_melt = df_melt.loc[df_melt['type'] == 'max']\n", - " #df_melt = df_melt.loc[df_melt['agent'] == 'base_agent']\n", - " df_melt['Task group'] = df_melt['benchmark'] + ' ' + df_melt['Task group']\n", - " #remove lines where task group = \"workarena [ALL TASKS]\"\n", - " df_melt = df_melt[df_melt['Task group'] != \"workarena [ALL TASKS]\"]\n", - " # Create the bar plot\n", - " # Create a sorted list of 'obs' values based on the mean 'value'\n", - "\n", - " # Change the order of 'obs' in df_melt based on sorted_obs\n", - " sns.set_style(\"darkgrid\")\n", - " np.random.seed(random.randint(0, 100))\n", - "\n", - " # Create a color palette with a random seed\n", - " palette = sns.color_palette('colorblind', n_colors=len(df_melt['Task group'].unique()))\n", - " random.shuffle(palette)\n", - " palette_grey = 
sns.color_palette('grey', n_colors=1)\n", - " print(df_melt.head())\n", - " # Sort df_melt by 'obs'\n", - " sorted_task_group = df_melt.groupby(['Task group'])['value'].mean().sort_values().index.get_level_values('Task group')\n", - " figure = plt.figure(figsize=(10, 5))\n", - " df_b = df_melt.loc[df_melt['agent'] == 'base_agent']\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_b, order=None, palette=palette, ci=None)\n", - "\n", - " # Create the bar plot with sorted 'obs' categories\n", - " df_f = df_melt.loc[df_melt['agent'] == 'full_agent']\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_f, order=None, palette=[(0.75, 0.75, 0.7)], ci=None)\n", - "\n", - " df_b = df_melt.loc[df_melt['agent'] == 'base_agent']\n", - " sns.barplot(x='obs', y='value', hue='Task group', hue_order=sorted_task_group, data=df_b, order=None, palette=palette, ci=None)\n", - "\n", - " plt.ylabel('Max number of token per step (log scale)', fontsize=16)\n", - " plt.xlabel('')\n", - " #custom x tick labels\n", - " plt.xticks([0, 1], ['Document Object Model', 'Accessibility Tree'] , fontsize=16)\n", - " plt.yticks(fontsize=16)\n", - "\n", - " #y axis log\n", - " plt.yscale('log')\n", - "\n", - " #put legend on top two rows aligned legend word on the right\n", - "\n", - " legend1 = plt.legend(loc='upper center', fontsize=16, bbox_to_anchor=(0.5, 1.24), ncol=4, labels=[\"MiniWoB\", \"knowledge\", \"menu\", \"service catalog\",\"list-sort\" , \"form\", \"list-filter\"])\n", - "\n", - " plt.savefig(f'wa_complexity_{i}.pdf', dpi=300, bbox_inches='tight')\n", - " plt.savefig(f'wa_complexity_{i}.png', dpi=300, bbox_inches='tight')\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 351, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - 
"text": [ - "C:\\Users\\Tom\\AppData\\Local\\Temp\\ipykernel_38828\\25016155.py:4: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead.\n", - " cmap = cm.get_cmap('viridis') # Replace 'viridis' with any colormap you like\n" - ] - }, - { - "ename": "ValueError", - "evalue": "shape mismatch: objects cannot be broadcast to a single shape. Mismatch is between arg 0 with shape (16,) and arg 1 with shape (4,).", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[351], line 15\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, label \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(labels):\n\u001b[0;32m 14\u001b[0m color \u001b[38;5;241m=\u001b[39m color_dict[label]\n\u001b[1;32m---> 15\u001b[0m rects1 \u001b[38;5;241m=\u001b[39m \u001b[43max\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbar\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mwidth\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mi\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mwidth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mgroup1_sorted\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup1_sorted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup1_sorted\u001b[49m\u001b[43m[\u001b[49m\u001b[43mgroup1_sorted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mlabel\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlabel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 16\u001b[0m rects2 \u001b[38;5;241m=\u001b[39m ax\u001b[38;5;241m.\u001b[39mbar(x \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m10\u001b[39m\u001b[38;5;241m*\u001b[39mwidth \u001b[38;5;241m+\u001b[39m i\u001b[38;5;241m*\u001b[39mwidth, group1_sorted[group1_sorted\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;241m4\u001b[39m]][group1_sorted[group1_sorted\u001b[38;5;241m.\u001b[39mcolumns[\u001b[38;5;241m1\u001b[39m]] \u001b[38;5;241m==\u001b[39m label], width, color\u001b[38;5;241m=\u001b[39mcolor, label\u001b[38;5;241m=\u001b[39mlabel)\n\u001b[0;32m 18\u001b[0m \u001b[38;5;66;03m# Display the legend\u001b[39;00m\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\__init__.py:1478\u001b[0m, in \u001b[0;36m_preprocess_data..inner\u001b[1;34m(ax, data, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1475\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 1476\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m 
\u001b[38;5;21minner\u001b[39m(ax, \u001b[38;5;241m*\u001b[39margs, data\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 1477\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m data \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1478\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43max\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;28;43mmap\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43msanitize_sequence\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1480\u001b[0m bound \u001b[38;5;241m=\u001b[39m new_sig\u001b[38;5;241m.\u001b[39mbind(ax, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 1481\u001b[0m auto_label \u001b[38;5;241m=\u001b[39m (bound\u001b[38;5;241m.\u001b[39marguments\u001b[38;5;241m.\u001b[39mget(label_namer)\n\u001b[0;32m 1482\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m bound\u001b[38;5;241m.\u001b[39mkwargs\u001b[38;5;241m.\u001b[39mget(label_namer))\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\matplotlib\\axes\\_axes.py:2457\u001b[0m, in \u001b[0;36mAxes.bar\u001b[1;34m(self, x, height, width, bottom, align, **kwargs)\u001b[0m\n\u001b[0;32m 2454\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m yerr \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 2455\u001b[0m yerr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_convert_dx(yerr, y0, y, 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconvert_yunits)\n\u001b[1;32m-> 2457\u001b[0m x, height, width, y, linewidth, hatch \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbroadcast_arrays\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2458\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Make args iterable too.\u001b[39;49;00m\n\u001b[0;32m 2459\u001b[0m \u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43matleast_1d\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwidth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlinewidth\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhatch\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2461\u001b[0m \u001b[38;5;66;03m# Now that units have been converted, set the tick locations.\u001b[39;00m\n\u001b[0;32m 2462\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m orientation \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvertical\u001b[39m\u001b[38;5;124m'\u001b[39m:\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\numpy\\lib\\stride_tricks.py:540\u001b[0m, in \u001b[0;36mbroadcast_arrays\u001b[1;34m(subok, *args)\u001b[0m\n\u001b[0;32m 533\u001b[0m \u001b[38;5;66;03m# nditer is not used here to avoid the limit of 32 arrays.\u001b[39;00m\n\u001b[0;32m 534\u001b[0m \u001b[38;5;66;03m# Otherwise, something like the following one-liner would suffice:\u001b[39;00m\n\u001b[0;32m 535\u001b[0m \u001b[38;5;66;03m# return np.nditer(args, flags=['multi_index', 'zerosize_ok'],\u001b[39;00m\n\u001b[0;32m 536\u001b[0m \u001b[38;5;66;03m# order='C').itviews\u001b[39;00m\n\u001b[0;32m 538\u001b[0m args \u001b[38;5;241m=\u001b[39m 
[np\u001b[38;5;241m.\u001b[39marray(_m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, subok\u001b[38;5;241m=\u001b[39msubok) \u001b[38;5;28;01mfor\u001b[39;00m _m \u001b[38;5;129;01min\u001b[39;00m args]\n\u001b[1;32m--> 540\u001b[0m shape \u001b[38;5;241m=\u001b[39m \u001b[43m_broadcast_shape\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 542\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mall\u001b[39m(array\u001b[38;5;241m.\u001b[39mshape \u001b[38;5;241m==\u001b[39m shape \u001b[38;5;28;01mfor\u001b[39;00m array \u001b[38;5;129;01min\u001b[39;00m args):\n\u001b[0;32m 543\u001b[0m \u001b[38;5;66;03m# Common case where nothing needs to be broadcasted.\u001b[39;00m\n\u001b[0;32m 544\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m args\n", - "File \u001b[1;32mc:\\Users\\Tom\\miniconda3\\envs\\ui-copilot\\Lib\\site-packages\\numpy\\lib\\stride_tricks.py:422\u001b[0m, in \u001b[0;36m_broadcast_shape\u001b[1;34m(*args)\u001b[0m\n\u001b[0;32m 417\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Returns the shape of the arrays that would result from broadcasting the\u001b[39;00m\n\u001b[0;32m 418\u001b[0m \u001b[38;5;124;03msupplied arrays against each other.\u001b[39;00m\n\u001b[0;32m 419\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 420\u001b[0m \u001b[38;5;66;03m# use the old-iterator because np.nditer does not handle size 0 arrays\u001b[39;00m\n\u001b[0;32m 421\u001b[0m \u001b[38;5;66;03m# consistently\u001b[39;00m\n\u001b[1;32m--> 422\u001b[0m b \u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbroadcast\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[38;5;241;43m32\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 423\u001b[0m \u001b[38;5;66;03m# unfortunately, it cannot handle 32 or 
more arguments directly\u001b[39;00m\n\u001b[0;32m 424\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m pos \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;241m32\u001b[39m, \u001b[38;5;28mlen\u001b[39m(args), \u001b[38;5;241m31\u001b[39m):\n\u001b[0;32m 425\u001b[0m \u001b[38;5;66;03m# ironically, np.broadcast does not properly handle np.broadcast\u001b[39;00m\n\u001b[0;32m 426\u001b[0m \u001b[38;5;66;03m# objects (it treats them as scalars)\u001b[39;00m\n\u001b[0;32m 427\u001b[0m \u001b[38;5;66;03m# use broadcasting to avoid allocating the full array\u001b[39;00m\n", - "\u001b[1;31mValueError\u001b[0m: shape mismatch: objects cannot be broadcast to a single shape. Mismatch is between arg 0 with shape (16,) and arg 1 with shape (4,)." - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "labels = group1[group1.columns[1]].unique()\n", - "\n", - "cmap = cm.get_cmap('viridis') # Replace 'viridis' with any colormap you like\n", - "\n", - "# Create a normalized range of colors\n", - "colors = cmap(np.linspace(0, 1, len(labels)))\n", - "\n", - "# Create a dictionary that maps labels to colors\n", - "color_dict = dict(zip(labels, colors))\n", - "\n", - "# Create the bars with sorted data\n", - "for i, label in enumerate(labels):\n", - " color = color_dict[label]\n", - " rects1 = ax.bar(x - 10*width + i*width, group1_sorted[group1_sorted.columns[2]][group1_sorted[group1_sorted.columns[1]] == label], width, color=color, label=label)\n", - " rects2 = ax.bar(x + 10*width + i*width, group1_sorted[group1_sorted.columns[4]][group1_sorted[group1_sorted.columns[1]] == label], width, color=color, label=label)\n", - "\n", - "# Display the legend\n", - "ax.legend()" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "benchmark\n", - "miniwob [ALL TASKS]\n", - "miniwob [ALL TASKS]\n", - "workarena form\n", - "workarena knowledge\n", - "workarena list-filter\n", - "workarena list-sort\n", - "workarena menu\n", - "workarena service catalog\n", - "workarena [ALL TASKS]\n", - "workarena form\n", - "workarena knowledge\n", - "workarena list-filter\n", - "workarena list-sort\n", - "workarena menu\n", - "workarena service catalog\n", - "workarena [ALL TASKS]\n", - "Name: Task group, dtype: object" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Split the dataframe into three groups\n", - "group1 = df[[df.columns[0], df.columns[1], df.columns[2], df.columns[3], df.columns[4], df.columns[8]]]\n", - "group2 = df[[df.columns[0], df.columns[1], df.columns[5], df.columns[6], df.columns[7], df.columns[8]]]\n", - "\n", - 
"# Sort the data by the desired column\n", - "group1_sorted = group1.sort_values(by=group1.columns[2])\n", - "\n", - "# Create positions for the bars\n", - "x = np.arange(len(group1_sorted[group1_sorted.columns[1]]))\n", - "\n", - "# Plotting group1\n", - "fig, ax = plt.subplots()\n", - "\n", - "\n", - "# Create the bars with sorted data\n", - "import pandas as pd\n", - "\n", - "# Reshape the data\n", - "group1_sorted_melt = pd.melt(group1_sorted, id_vars=[group1_sorted.columns[0], group1_sorted.columns[5]], \n", - " value_vars=[group1_sorted.columns[2], group1_sorted.columns[4]], \n", - " var_name='variable', value_name='value')\n", - "\n", - "# Create the bar plot\n", - "sns.barplot(x=group1_sorted_melt[group1_sorted.columns[5]], y='value', hue='value', data=group1_sorted_melt, palette='colorblind')\n", - "# Display the legend\n", - "handles, _ = ax.get_legend_handles_labels()\n", - "ax.legend(handles, labels.unique(), title='Labels')\n", - "\n", - "# Add labels and title\n", - "ax.set_ylabel('Count')\n", - "ax.set_title('Group 1 Bar Plot')\n", - "ax.set_xticklabels(group1[group1.columns[1]])\n", - "\n", - "# Display the plot\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Group 1 plot\n", - "fig, ax = plt.subplots()\n", - "\n", - "ax.set_yscale('log') # Apply log scale on y-axis\n", - "plt.title('Max')\n", - "plt.ylabel('Count')\n", - "plt.show()\n", - "\n", - "# Add spacing between plots\n", - "plt.subplots_adjust(hspace=0.5)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df.set_index('benchmark', inplace=True)\n", - "\n", - "# Plot the 'max DOM k-tokens' and 'max AXTree k-tokens' columns\n", - "df[['max DOM k-tokens', 'max AXTree k-tokens']].plot(kind='bar')\n", - "\n", - "# Add labels and title\n", - "plt.ylabel('k-tokens')\n", - "plt.title('Max DOM and AXTree k-tokens')\n", - "\n", - "# Display the plot\n", - "plt.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "ui-copilot", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/README.md b/README.md index 28343f8e..7472988e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ - - ![AgentLab Banner](https://github.com/user-attachments/assets/a23b3cd8-b5c4-4918-817b-654ae6468cb4) [![pypi](https://badge.fury.io/py/agentlab.svg)](https://pypi.org/project/agentlab/) @@ -10,6 +8,7 @@ [![Tests](https://github.com/ServiceNow/AgentLab/actions/workflows/unit_tests.yml/badge.svg)](https://github.com/ServiceNow/AgentLab/actions/workflows/unit_tests.yml) + [🛠️ Setup](#%EF%B8%8F-setup-agentlab)  |  [🤖 Assistant](#-ui-assistant)  |  [🚀 Launch Experiments](#-launch-experiments)  |  @@ -48,6 +47,7 @@ AgentLab Features: | [Mind2Web-live](https://huggingface.co/datasets/iMeanAI/Mind2Web-Live) (soon) | - | - | None | - | - | live web | soon | | [MiniWoB](https://miniwob.farama.org/index.html) | [setup](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/miniwob/README.md) | 125 | Medium | 10 | no | self hosted (static files) | soon | + ## 🛠️ Setup AgentLab ```bash diff --git a/src/agentlab/analyze/error_categorization.py b/src/agentlab/analyze/error_categorization.py 
deleted file mode 100644 index c6a9cd1b..00000000 --- a/src/agentlab/analyze/error_categorization.py +++ /dev/null @@ -1,89 +0,0 @@ -import re - - -def is_critical_server_error(err_msg: str, stack_trace: str, return_error_type=False): - if stack_trace is None: - return False - server_error_conditions = [ - "502 Server Error: Bad Gateway", # url is not valid or server is not ready yet - "500 Server Error: Internal Server Error", # When the server is up, but somehow unresponsive - "424 Client Error: Failed Dependency", # Request failed during generation: Server error: CUDA error: invalid argument - "403 Client Error: Forbidden", - "401 Client Error: Unauthorized", # the server was auth-token protected - "400 Client Error: Bad Request", # there's no server. (the server probably failed) sometimes only visible in stack trace? - ] - if not return_error_type: - return any(condition in stack_trace for condition in server_error_conditions) - else: - for condition in server_error_conditions: - if condition in stack_trace: - return condition - return None - - -def is_minor_server_error(err_msg: str, stack_trace: str, return_error_type=False): - if stack_trace is None: - return False - server_error_conditions = [ - "504 Server Error: Gateway Time-out", - "Server error: Out of available cache blocks", # TODO(look into) - "openai.APIConnectionError: Connection error", - ] - if not return_error_type: - return any(condition in stack_trace for condition in server_error_conditions) - else: - for condition in server_error_conditions: - if condition in stack_trace: - return condition - return None - - -def is_retry_error(err_msg: str, stack_trace: str): - """Use regex on the stack trace to detect retry errors. - - The pattern is "ValueError: Could not parse a valid value after d+ - retries." 
- - Args: - err_msg (str): The error message - stack_trace (str): The stack trace - - Returns: - bool: True if the error is a retry error, False otherwise - """ - if stack_trace is None: - return False - pattern = r"ValueError: Could not parse a valid value after \d+ retries." - return re.search(pattern, stack_trace) is not None - - -def is_input_length_error(err_msg: str, stack_trace: str): - """Use regex on the stack trace to detect input length errors. - - The patterns are "422 Client Error: Unprocessable Entity" - and also "Input validation error: `inputs` tokens + `max_new_tokens` must be <=" - - Args: - err_msg (str): The error message - stack_trace (str): The stack trace - - Returns: - bool: True if the error is an input length error, False otherwise - """ - if stack_trace is None: - return False - patterns = [ - r"422 Client Error: Unprocessable Entity", - r"Input validation error: `inputs` tokens \+ `max_new_tokens` must be <=", - ] - return any( - re.search(pattern, err_msg) or re.search(pattern, stack_trace) for pattern in patterns - ) - - -ERR_CLASS_MAP = { - "critical_server_error": is_critical_server_error, - "minor_server_error": is_minor_server_error, - "retry_error": is_retry_error, - "input_length_error": is_input_length_error, -} diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 862be27b..6dc3c3fc 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -1,12 +1,10 @@ import fnmatch -import io import json import random import re import traceback import warnings from collections import defaultdict -from datetime import datetime from logging import warn from pathlib import Path @@ -16,13 +14,7 @@ from IPython.display import display from tqdm import tqdm -from agentlab.analyze.error_categorization import ( - ERR_CLASS_MAP, - is_critical_server_error, - is_minor_server_error, -) from agentlab.experiments.exp_utils import RESULTS_DIR -from 
agentlab.utils.bootstrap import bootstrap_matrix, convert_df_to_array # TODO find a more portable way to code set_task_category_as_index at least # handle dynamic imports. We don't want to always import workarena @@ -30,11 +22,6 @@ warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning) -try: - import pyperclip -except ImportError: - pyperclip = None - pd.set_option("display.multi_sparse", False) AGENT_NAME_KEY = "agent.agent_name" @@ -224,18 +211,6 @@ def report_constant_and_variables(df, show_stack_traces=True): print(f" ...\n") -def get_bootstrap(df, metric, reduce_fn=np.nanmean, n_bootstrap=100, group_by=TASK_KEY, prior=0.5): - """Get the stratified bootstrap mean and std for the given metric.""" - grouped_df = df.reset_index(inplace=False).groupby(group_by) - array = convert_df_to_array(grouped_df, metric=metric, threshold=0.7) - if prior is not None: - prior = prior * np.ones((len(array), 1)) - array = np.concatenate([array, prior], axis=1) - - bootstrapped_values = bootstrap_matrix(array, n_bootstrap=n_bootstrap, reduce_fn=reduce_fn) - return np.nanmean(bootstrapped_values), np.nanstd(bootstrapped_values) - - def get_std_err(df, metric): """Get the standard error for a binary metric.""" # extract non missing values @@ -262,7 +237,7 @@ def get_sample_std_err(df, metric): return mean, std_err -def summarize(sub_df, use_bootstrap=False): +def summarize(sub_df): if not "cum_reward" in sub_df: record = dict( avg_reward=np.nan, @@ -279,10 +254,7 @@ def summarize(sub_df, use_bootstrap=False): if n_completed == 0: return None - if use_bootstrap: - _mean_reward, std_reward = get_bootstrap(sub_df, "cum_reward") - else: - _mean_reward, std_reward = get_std_err(sub_df, "cum_reward") + _mean_reward, std_reward = get_std_err(sub_df, "cum_reward") # sanity check, if there is an error the reward should be zero assert sub_df[sub_df["err_msg"].notnull()]["cum_reward"].sum() == 0 @@ -466,21 +438,6 @@ def _rename_bool_flags(report: pd.DataFrame, true_str="✓", 
false_str="-"): return report -def to_clipboard(df: pd.DataFrame): - """Copy the dataframe to the clipboard as a tab separated csv.""" - output = io.StringIO() - df.to_csv(output, sep="\t", index=True) - csv_string = output.getvalue() - if pyperclip is not None: - try: - pyperclip.copy(csv_string) - except Exception as e: - warn(f"Failed to copy to clipboard: {e}") - # else: - # print("pyperclip is not installed, cannot copy to clipboard.") - # return df - - def flag_report(report: pd.DataFrame, metric: str = "avg_reward", round_digits: int = 2): # for all index in the multi-index with boolean value, get the average for # True and the average for False separately. Produce a new dataframe with diff --git a/src/agentlab/analyze/plot_tools.py b/src/agentlab/analyze/plot_tools.py deleted file mode 100644 index b60f60df..00000000 --- a/src/agentlab/analyze/plot_tools.py +++ /dev/null @@ -1,409 +0,0 @@ -from pathlib import Path -from typing import List - -import joblib -import numpy as np -import pandas as pd -import seaborn as sns -from matplotlib import pyplot as plt -from matplotlib.collections import PolyCollection -from matplotlib.ticker import FormatStrFormatter -from scipy.stats import trim_mean - -from agentlab.utils.bootstrap import bootstrap_all_benchmarks - -sns.set_style("dark", {"grid.color": "0.98", "axes.facecolor": "(0.95, 0.95, 0.97)"}) - - -def load_benchmark_masks(csv_path: Path | str, benchmark_names=list[str]): - """Load benchmark masks from a csv file and convert it. - - To create the csv file simply use file/download/"...csv" from google - spreadsheet. - - Args: - csv_path (Path | str): The path to the csv file. - benchmark_names (list[str]): The names of the benchmarks to be loaded. - - Returns: - pd.DataFrame: The benchmark masks. 
- """ - mask_df = pd.read_csv(csv_path) # type: pd.DataFrame - - for benchmark_name in benchmark_names: - mask_df[benchmark_name] = mask_df[benchmark_name].apply( - lambda x: True if str(x).strip() == "x" else False - ) - - mask_df["all"] = True - benchmark_names = ["all"] + benchmark_names - - # remove the Average row - mask_df = mask_df[mask_df["Task Name"] != "ZAverage"] - mask_df = mask_df[mask_df["Task Name"].isna() == False] - - # # print the excluded tasks - # excluded_tasks = set(mask_df["Task Name"].unique()) - set(MINIWOB_TASKNAME_TO_CLASSNAME.keys()) - # print("Excluded tasks:", excluded_tasks) - - # # filter task that are not in the MINIWOB_TASKNAME_TO_CLASSNAME - # mask_df = mask_df[mask_df["Task Name"].isin(MINIWOB_TASKNAME_TO_CLASSNAME.keys())] - - # # convert task name to class name and keep all columns - mask_df["task_name"] = mask_df["Task Name"].apply(lambda x: f"miniwob.{x}") - mask_df.set_index("task_name", inplace=True) - - # extract only the benchmark columns - mask_df = mask_df[benchmark_names] - - # rename columns - mask_df.rename(rename_func, inplace=True, axis="columns") - return mask_df - - -def add_benchmark_masks(df: pd.DataFrame, benchmark_masks: pd.DataFrame): - """Add benchmark masks to the dataframe.""" - return df.merge(benchmark_masks, left_on="task_name", right_index=True) - - -def sort_agents(agents, df, metric="cum_reward"): - agent_order = ( - df.groupby("agent_model_name")[metric].mean().sort_values(ascending=False).index.tolist() - ) - return [agent for agent in agent_order if agent in agents] - - -def iqm(scores): - """Interquantile mean.""" - return trim_mean(scores, proportiontocut=0.25, axis=None) - - -# add memory from joblib -memory = joblib.Memory(location=Path().home() / "cache", verbose=1) -bootstrap_all_benchmarks_cached = memory.cache(bootstrap_all_benchmarks) - - -def bootstrap_and_plot( - df: pd.DataFrame, - benchmark_names: List[str], - metric: str = "cum_reward", - model_axis: str = "agent_model_name", - 
agent_order: List[str] | None = None, - agent_colors=None, - agent_markers=None, - repeat: int = 100, - fig_size=None, - n_legend_rows: int = 2, -): - """Add aggregated data as a new benchmark.""" - - if agent_order is None: - agent_order = sorted(df[model_axis].unique()) - print("bootstrapping") - # create a new df containing bootstrapped samples of iqm - - df_bootstrap = bootstrap_all_benchmarks( - df, metric, benchmark_cols=benchmark_names, repeat=repeat - ) - print("plotting") - - # plot results per benchmark (aggregated results is an extra benchmark) - plot_per_benchmark( - df_bootstrap, - agent_order, - model_axis, - benchmark_order=benchmark_names, - agent_colors=agent_colors, - agent_markers=agent_markers, - metric=metric, - fig_size=fig_size, - n_legend_rows=n_legend_rows, - ) - - -def remove_violin_outline(ax): - """Remove the outline of the violin plot.""" - for pc in ax.collections: - pc.set_edgecolor("none") - - -def get_violin_centers(ax: plt.Axes) -> list[float]: - """Estimate the center of violin patches from axes. - - This is a hacky way to get the center of the violin patches. - It works by averaging the x coordinates of the vertices of PolyCollection - - Args: - ax (plt.Axes): The axes containing the violin plot. - - Returns: - list[float]: The x coordinates of the centers of the violins. 
- """ - violin_centers = [] - for child in ax.get_children(): - if isinstance(child, PolyCollection): - violin_centers.append(np.mean(child.get_paths()[0].vertices[:, 0])) - - return sorted(violin_centers) - - -def associate_action_markers(agent_order): - """Associate a marker to each agent, based on action space.""" - markers = {} - for agent in agent_order: - marker = None - if "high" in agent: - marker = "^" - elif "low" in agent: - marker = "v" - elif "both" in agent: - marker = "d" - elif "code" in agent: - marker = "o" - else: - marker = "*" - - markers[agent] = marker - return markers - - -def rename_func(name): - return RENAME_DICT.get(name, name) - - -# for display names in the paper -RENAME_DICT = { - "easy (auto)": "easy", - "hard (auto)": "hard", - "long context (auto)": "long context", - "rci-agent task (47)": "RCI\nsubset", - "WebGUM task (56)": "WebGUM\nsubset", - "long context": "long context", - "pixel": "pixel", - "2d under standing": "2D\nunderstanding", - "x,y actions": "x, y\nactions", - "domain specific knowledge or exploration ?": "domain specific\nknowledge", - "long episode": "long episode", - "rapid interaction": "rapid interaction", - "action space limited": "action space\nlimited", - "requies augmented HTML and/or AXTree": "requires\naugmented HTML", -} - - -def add_markers(ax, df, model_axis, metric, agent_order, agent_markers, agent_colors): - """Add markers to the scatter plot.""" - v_centers = get_violin_centers(ax) - scatter_handles = {} - - for center_x, agent in zip(v_centers, agent_order): - median_y = df[df[model_axis] == agent][metric].mean() - - scatter_plot = ax.scatter( - center_x, - median_y, - marker=agent_markers[agent], - color=agent_colors[agent], - edgecolor="black", - label=agent, - s=60, - ) - - scatter_handles[agent] = scatter_plot - - # markers overtake the legend - legend = ax.legend( - handles=list(scatter_handles.values()), - labels=list(scatter_handles.keys()), - loc="lower center", - bbox_to_anchor=(0.5, 1), - 
ncol=len(agent_order), - title="", - ) - - # bigger markers! - for handle in legend.legendHandles: - handle.set_sizes([100]) - return scatter_handles - - -def plot_per_benchmark( - df: pd.DataFrame, - agent_order: List[str], - model_axis: str, - benchmark_order: List[str] | None = None, - metric: str = "cum_reward", - aggregated_name: str = "all", - sharey: bool = True, - inner: str = None, - fig_size: tuple = None, - n_legend_rows: int = 1, - agent_colors: dict[str, tuple] = None, - agent_markers: dict[str, str] = None, -): - """Violin plots for each benchmarks and each agents. - - Args: - df: pd.DataFrame - The input DataFrame. - agent_order: List[str] - The order of the agents in the plot. - model_axis: str - The column containing the agent names. - benchmark_order: List[str], optional - The order of the benchmarks in the plot. Defaults to alhpabetical order. - metric: str, optional - The column containing the metric to which the function will be applied. - aggregated_name: str, optional - The name of the "special" benchmark. It will be placed first and - highlighted in pale blue - sharey: bool, optional - Whether to share the y axis between plots. - inner: str, optional - The type of inner display inside of the violins. See seaborn.violinplot - fig_size: tuple, optional - The size of the figure. see matplotlib.pyplot.figure - n_legend_rows: int, optional - The number of rows in the legend. - agent_colors: dict[str, tuple], optional - A dictionary mapping agent names to colors. Defaults to seaborn's colorblind palette. - agent_markers: dict[str, str], optional - A dictionary mapping agent names to markers. Defaults to no markers. 
- """ - if benchmark_order is None: - benchmark_order = sorted(df["benchmark"].unique()) - - if fig_size is None: - fig_width = len(benchmark_order) * 2 - fig_size = (fig_width, 3) - fig, axes = plt.subplots(1, len(benchmark_order), sharey=sharey, figsize=fig_size) - - if agent_colors is None: - colors = sns.color_palette("colorblind", n_colors=len(agent_order)) - agent_colors = dict(zip(agent_order, colors)) - - if agent_markers is not None: - violon_palette = {agent: (0.8, 0.8, 0.8) for agent in agent_order} - else: - violon_palette = agent_colors - - for benchmark, ax in zip(benchmark_order, axes): - sub_df = df[df["benchmark"] == benchmark] - - sns.violinplot( - x="benchmark", - y=metric, - hue=model_axis, - data=sub_df, - hue_order=agent_order, - linewidth=0.5, - saturation=1, - scale="count", - inner=inner, - palette=violon_palette, - ax=ax, - ) - remove_violin_outline(ax) - - if agent_markers is not None: - add_markers(ax, sub_df, model_axis, metric, agent_order, agent_markers, agent_colors) - - ax.tick_params(axis="y", labelsize=18) - ax.yaxis.set_major_formatter(FormatStrFormatter("%.2f")) - ax.grid(axis="y") - - if benchmark == aggregated_name: - ax.set_facecolor("#cff6fc") - - ax.set(xlabel=None) - - if benchmark != benchmark_order[int((len(benchmark_order) - 1) / 2)]: - ax.get_legend().remove() - else: - ncols = int(np.ceil(len(agent_order) / n_legend_rows)) - - sns.move_legend(ax, loc="lower center", bbox_to_anchor=(0.5, 1), ncol=ncols, title="") - - if benchmark != benchmark_order[0]: - ax.set(ylabel=None) - - if sharey: - fig.subplots_adjust(wspace=0.02) - else: - fig.subplots_adjust(wspace=0.3) - - -@memory.cache -def modify_names(df): - """Generate names that are paper-friendly, and fix model_name and agent_name for the different formats.""" - - def process_row(row): - model_name = row.get("model_name", np.nan) - agent_name = row["agent_name"] - - if not isinstance(model_name, str) or len(model_name) == 0: - # rim's type of model extract from agent 
name - model_name = "gpt-4" if "gpt4" in agent_name else "gpt-3.5" - agent_name = agent_name.replace("_gpt4", "") - else: - # massimo's type of model - if "/" in model_name: - model_name = model_name.split("/")[-1] - model_name = model_name.replace("-Instruct-hf", "").replace("-beta", "") - - agent_name = agent_name.replace("GenericAgent_", "") - - row["agent_name"] = agent_name - row["model_name"] = model_name - row["agent_model_name"] = f"{agent_name}-{model_name}" - - return row - - return df.apply(process_row, axis=1) - - -def find_incomplete_exp(df: pd.DataFrame, agent_columns_to_show=None): - agents = df["agent_model_name"].unique() - # print(agents) - dicts = [] - for agent in agents: - agent_df = df[df["agent_model_name"] == agent] # type: pd.DataFrame - tasks = agent_df["task_name"].unique() - - n_episode = [] - for task in tasks: - task_df = agent_df[agent_df["task_name"] == task] - n_episode.append(len(task_df)) - # find missing cum_reward - missing_cum_reward = agent_df["cum_reward"].isna().sum() - # num results - num_results = len(agent_df) - - info = { - "agent": agent, - "n_tasks": len(tasks), - "max_ep_count": np.max(n_episode), - "min_ep_count": np.min(n_episode), - "mean_ep_count": np.mean(n_episode), - "unique cum_reward": str(agent_df["cum_reward"].unique()), - "missing_cum_reward": missing_cum_reward, - "total_results": num_results, - } - - if agent_columns_to_show is not None: - for col in agent_columns_to_show: - info[f"{col}-unique"] = agent_df[col].unique()[:10] - - for task in tasks: - sub_df = agent_df[agent_df["task_name"] == task] - if len(sub_df) > 10: - unique_exp_dir = sub_df["exp_dir"].unique() - print( - f" {agent} {task} {len(sub_df)} n unique expseeds {len(sub_df['exp_seed'].unique())}" - ) - for i, dir in enumerate(unique_exp_dir): - print(f" Unique exp_dir {i}: {dir}") - - dicts.append(info) - df_summary = pd.DataFrame(dicts) - # df_summary = df_summary.sort_values("max_episode_count", ascending=False) - return df_summary diff 
--git a/src/agentlab/experiments/get_ray_url.py b/src/agentlab/experiments/get_ray_url.py index b652254c..2eb5cb58 100644 --- a/src/agentlab/experiments/get_ray_url.py +++ b/src/agentlab/experiments/get_ray_url.py @@ -1,3 +1,8 @@ +"""Temporary script to get the ray dashboard url for the current experiment. + +TODO figure out a more convenient way. +""" + import ray context = ray.init(address="auto", ignore_reinit_error=True) diff --git a/src/agentlab/experiments/graph_execution_dask.py b/src/agentlab/experiments/graph_execution_dask.py deleted file mode 100644 index dc51dd51..00000000 --- a/src/agentlab/experiments/graph_execution_dask.py +++ /dev/null @@ -1,64 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError - -from contextlib import contextmanager -import threading -from dask import compute, delayed -from bgym import ExpArgs -from distributed import LocalCluster, Client -from agentlab.experiments.exp_utils import _episode_timeout - -# from agentlab.experiments.exp_utils import run_exp - - -def run_exp(exp_arg: ExpArgs, *dependencies, avg_step_timeout=60): - """Run exp_args.run() with a timeout and handle dependencies.""" - # dask can't use the timeout_manager define in exp_utils.py - # ValueError: signal only works in main thread of the main interpreter - # most alternative I try doesn't work - episode_timeout = _episode_timeout(exp_arg, avg_step_timeout=avg_step_timeout) - return exp_arg.run() - - -def make_dask_client(n_worker): - """Create a Dask client with a LocalCluster backend. - - I struggled to find an appropriate configuration. - I believe it has to do with the interplay of playwright async loop (even if - used in sync mode) and the fact that dask uses asyncio under the hood. - Making sure we use processes and 1 thread per worker seems to work. - - Args: - n_worker: int - Number of workers to create. - - Returns: - A Dask client object. 
- """ - cluster = LocalCluster( - n_workers=n_worker, - processes=True, - threads_per_worker=1, - ) - - return Client(cluster) - - -def execute_task_graph(exp_args_list: list[ExpArgs]): - """Execute a task graph in parallel while respecting dependencies.""" - exp_args_map = {exp_args.exp_id: exp_args for exp_args in exp_args_list} - - tasks = {} - - def get_task(exp_arg: ExpArgs): - if exp_arg.exp_id not in tasks: - dependencies = [get_task(exp_args_map[dep_key]) for dep_key in exp_arg.depends_on] - tasks[exp_arg.exp_id] = delayed(run_exp)(exp_arg, *dependencies) - return tasks[exp_arg.exp_id] - - for exp_arg in exp_args_list: - get_task(exp_arg) - - task_ids, task_list = zip(*tasks.items()) - results = compute(*task_list) - - return {task_id: result for task_id, result in zip(task_ids, results)} diff --git a/src/agentlab/experiments/list_openai_models.py b/src/agentlab/experiments/list_openai_models.py index 438f97bd..0c301926 100644 --- a/src/agentlab/experiments/list_openai_models.py +++ b/src/agentlab/experiments/list_openai_models.py @@ -5,8 +5,8 @@ models = OpenAI().models.list() df = pd.DataFrame([dict(model) for model in models.data]) - # Filter GPT models - df = df[df["id"].str.contains("gpt")] + # Filter GPT models or o1 models + df = df[df["id"].str.contains("gpt") | df["id"].str.contains("o1")] # Convert Unix timestamps to dates (YYYY-MM-DD) and remove time df["created"] = pd.to_datetime(df["created"], unit="s").dt.date diff --git a/src/agentlab/experiments/miniwob_tasks_all.csv b/src/agentlab/experiments/miniwob_tasks_all.csv deleted file mode 100644 index e4b3777b..00000000 --- a/src/agentlab/experiments/miniwob_tasks_all.csv +++ /dev/null @@ -1,126 +0,0 @@ -task_name,miniwob_category,comment -ascending-numbers,hidden test, -bisect-angle,original, -book-flight,original,delay -book-flight-nodelay,nodelay, -buy-ticket,hidden test, -choose-date,original,delay -choose-date-easy,debug,delay -choose-date-medium,debug,delay -choose-date-nodelay,nodelay, 
-choose-list,original, -circle-center,original, -click-button,original, -click-button-sequence,original, -click-checkboxes,original, -click-checkboxes-large,additional, -click-checkboxes-soft,additional, -click-checkboxes-transfer,additional, -click-collapsible,original,delay -click-collapsible-2,original,delay -click-collapsible-2-nodelay,nodelay, -click-collapsible-nodelay,nodelay, -click-color,original, -click-dialog,original, -click-dialog-2,original, -click-link,original, -click-menu,original, -click-menu-2,original, -click-option,original, -click-pie,original,delay -click-pie-nodelay,nodelay, -click-scroll-list,original, -click-shades,original, -click-shape,original, -click-tab,original, -click-tab-2,original, -click-tab-2-easy,debug, -click-tab-2-hard,additional, -click-tab-2-medium,debug, -click-test,original, -click-test-2,original, -click-test-transfer,debug, -click-widget,original, -copy-paste,original, -copy-paste-2,original, -count-shape,original, -count-sides,original, -daily-calendar,hidden test, -drag-box,original, -drag-circle,original, -drag-cube,original, -drag-items,original, -drag-items-grid,original, -drag-shapes,original, -drag-shapes-2,hidden test, -drag-single-shape,hidden test, -drag-sort-numbers,original, -draw-circle,hidden test, -draw-line,hidden test, -email-inbox,original, -email-inbox-delete,debug, -email-inbox-forward,debug, -email-inbox-forward-nl,additional, -email-inbox-forward-nl-turk,additional, -email-inbox-important,debug, -email-inbox-nl-turk,additional, -email-inbox-noscroll,debug, -email-inbox-reply,debug, -email-inbox-star-reply,debug, -enter-date,original, -enter-password,original, -enter-text,original, -enter-text-2,original, -enter-text-dynamic,original, -enter-time,original, -find-greatest,hidden test, -find-midpoint,original, -find-word,original, -focus-text,original, -focus-text-2,original, -form-sequence,hidden test, -form-sequence-2,hidden test, -form-sequence-3,hidden test, -generate-number,hidden test, 
-grid-coordinate,original, -guess-number,original, -highlight-text,original, -highlight-text-2,original, -hot-cold,hidden test, -identify-shape,original, -login-user,original, -login-user-popup,additional, -multi-layouts,additional, -multi-orderings,additional, -navigate-tree,original, -number-checkboxes,original, -odd-or-even,hidden test, -order-food,hidden test, -phone-book,hidden test, -read-table,original, -read-table-2,original, -resize-textarea,original, -right-angle,original, -scroll-text,original, -scroll-text-2,original, -search-engine,original, -sign-agreement,hidden test, -simple-algebra,original, -simple-arithmetic,original, -social-media,original, -social-media-all,additional, -social-media-some,additional, -stock-market,hidden test,delay -terminal,original, -text-editor,original, -text-transform,original, -tic-tac-toe,original, -unicode-test,debug, -use-autocomplete,original,delay -use-autocomplete-nodelay,nodelay, -use-colorwheel,original, -use-colorwheel-2,original, -use-slider,original, -use-slider-2,original, -use-spinner,original, -visual-addition,original, diff --git a/src/agentlab/experiments/reproduce_study.py b/src/agentlab/experiments/reproduce_study.py index 93ef07fb..8a8a7093 100644 --- a/src/agentlab/experiments/reproduce_study.py +++ b/src/agentlab/experiments/reproduce_study.py @@ -11,7 +11,7 @@ if __name__ == "__main__": - # old_study = "2024-06-03_13-53-50_final_run_workarena_L1_llama3-70b" + # replace by your study name old_study = "2024-06-03_12-28-51_final_run_miniwob_llama3-70b" study = reproduce_study(RESULTS_DIR / old_study) diff --git a/src/agentlab/experiments/view_dep_graph.py b/src/agentlab/experiments/view_dep_graph.py index 0639507b..abbf7f87 100644 --- a/src/agentlab/experiments/view_dep_graph.py +++ b/src/agentlab/experiments/view_dep_graph.py @@ -1,3 +1,6 @@ +"""Dirty script to visualize the dependency graph of a benchmark, e.g. webarena, visualwebarena, +etc. 
You may have to adjust it to make it work for you.""" + import math import bgym import matplotlib.pyplot as plt diff --git a/src/agentlab/utils/__init__.py b/src/agentlab/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/agentlab/utils/bootstrap.py b/src/agentlab/utils/bootstrap.py deleted file mode 100644 index 666ba6cc..00000000 --- a/src/agentlab/utils/bootstrap.py +++ /dev/null @@ -1,120 +0,0 @@ -import warnings -from typing import Callable, List, Optional, Union - -import numpy as np -import pandas as pd - - -def stratified_bootstrap( - df: pd.DataFrame, - group_by: Union[str, List[str]], - strat: Union[str, List[str]], - metric: str, - func: Callable, - repeat: Optional[int] = 100, - rng: np.random.Generator = np.random.default_rng(), -) -> pd.DataFrame: - """ - Perform stratified bootstrapping on a DataFrame. - - Parameters: - df (pd.DataFrame): The input DataFrame. - group_by (Union[str, List[str]]): The column(s) by which to group the DataFrame. - strat (str): The column used to define strata within each group. - metric (str): The column containing the metric to which the function will be applied. - func (Callable): The function to apply to the metric within each stratum. - repeat (int, optional): Number of bootstrap iterations. Default is 100. - rng (np.random.Generator, optional): Random number generator. Default is NumPy's random generator. - - Returns: - pd.DataFrame: A DataFrame containing bootstrapped samples with applied function. 
- """ - if group_by is None: - group_by = [] - - if isinstance(group_by, str): - group_by = [group_by] - - if isinstance(strat, str): - strat = [strat] - - # extract subset of columns - df = df[group_by + strat + [metric]] - - group = df.groupby(group_by + strat) - - df_list = [] - for i in range(repeat): - new_df = group.sample(frac=1, replace=True, random_state=rng) - series = new_df.groupby(group_by)[metric].apply(func) - sub_df = series.to_frame().reset_index() - sub_df["bootstrap_index"] = i - df_list.append(sub_df) - - new_df = pd.concat(df_list) - return new_df - - -def bootstrap_all_benchmarks( - df: pd.DataFrame, - metric: str, - benchmark_cols, - group_by=["agent_model_name"], - repeat: int = 100, -): - """Add aggregated data as a new benchmark.""" - - # create a new df containing bootstrapped samples of iqm - df_bootstrap = [] - for benchmark in benchmark_cols: - # filter sub_df assuming that the benchmark column exists and is a boolean - sub_df = df[df[benchmark]] - - bs_df = stratified_bootstrap( - sub_df, - group_by=group_by, - strat=["task_name"], - metric=metric, - func=np.mean, - repeat=repeat, - ) - bs_df["benchmark"] = benchmark - df_bootstrap.append(bs_df) - df_bootstrap = pd.concat(df_bootstrap) - return df_bootstrap - - -def bootstrap_matrix(data: np.ndarray, n_bootstrap: int, reduce_fn=np.nanmean): - n_task, n_samples = data.shape - - indices = np.random.randint(0, n_samples, (n_bootstrap, n_task, n_samples)) - - assert indices.shape == (n_bootstrap, n_task, n_samples) - bootstrapped_samples = data[np.arange(n_task)[:, None], indices] - - with warnings.catch_warnings(): - # catches means of empty slices - warnings.simplefilter("ignore", category=RuntimeWarning) - results = [reduce_fn(b_data) for b_data in bootstrapped_samples] - - return results - - -def convert_df_to_array(grouped, metric="cum_reward", threshold=0.9): - max_samples_per_task = max(len(group) for _, group in grouped) - - arr = np.zeros((len(grouped), max_samples_per_task)) - - 
for task_idx, (group_id, group) in enumerate(grouped): - # Calculate the ratio of valid values - valid_ratio = len(group) / max_samples_per_task - # if valid_ratio < threshold: - # raise ValueError( - # f"Task {group_id} has insufficient data: ratio {valid_ratio} is below the threshold {threshold}" - # ) - - # Repeat the task data cyclically to fill up to max_samples_per_task - repeated_data = np.resize(group[metric].values, max_samples_per_task) - arr[task_idx, :] = repeated_data - - return arr diff --git a/tests/experiments/test_dask.py b/tests/experiments/test_dask.py deleted file mode 100644 index 39822634..00000000 --- a/tests/experiments/test_dask.py +++ /dev/null @@ -1,41 +0,0 @@ -from agentlab.experiments.graph_execution_dask import execute_task_graph, make_dask_client -from agentlab.experiments.exp_utils import MockedExpArgs - -TASK_TIME = 3 - - -def test_execute_task_graph(): - # Define a list of ExpArgs with dependencies - exp_args_list = [ - MockedExpArgs(exp_id="task1", depends_on=[]), - MockedExpArgs(exp_id="task2", depends_on=["task1"]), - MockedExpArgs(exp_id="task3", depends_on=["task1"]), - MockedExpArgs(exp_id="task4", depends_on=["task2", "task3"]), - ] - - with make_dask_client(n_worker=5): - results = execute_task_graph(exp_args_list) - - exp_args_list = [results[task_id] for task_id in ["task1", "task2", "task3", "task4"]] - - # Verify that all tasks were executed in the proper order - assert exp_args_list[0].start_time < exp_args_list[1].start_time - assert exp_args_list[0].start_time < exp_args_list[2].start_time - assert exp_args_list[1].end_time < exp_args_list[3].start_time - assert exp_args_list[2].end_time < exp_args_list[3].start_time - - # # Verify that parallel tasks (task2 and task3) started within a short time of each other - # parallel_start_diff = abs(exp_args_list[1].start_time - exp_args_list[2].start_time) - # print(f"parallel_start_diff: {parallel_start_diff}") - # assert parallel_start_diff < 1.5 # Allow for a small delay - 
- # Ensure that the entire task graph took the expected amount of time - total_time = exp_args_list[-1].end_time - exp_args_list[0].start_time - assert ( - total_time >= TASK_TIME * 3 - ) # Since the critical path involves at least 1.5 seconds of work - - -if __name__ == "__main__": - test_execute_task_graph() - # test_add_dependencies() diff --git a/tests/utils/test_bootstrap.py b/tests/utils/test_bootstrap.py deleted file mode 100644 index 41fe6b1f..00000000 --- a/tests/utils/test_bootstrap.py +++ /dev/null @@ -1,132 +0,0 @@ -import pandas as pd -import numpy as np -from agentlab.utils.bootstrap import stratified_bootstrap, bootstrap_matrix, convert_df_to_array - - -# Generate a DataFrame to use for all unit tests -df = pd.DataFrame( - { - "agent": ["A", "A", "A", "A", "B", "B", "B", "B"], - "episode": [1, 2, 1, 2, 1, 2, 1, 2], - "task": ["T1", "T1", "T2", "T2", "T1", "T1", "T2", "T2"], - "cum_reward": [10, 11, 12, 13, 14, 15, 16, 17], - } -) - - -df_no_var = pd.DataFrame( - { - "agent": ["A", "A", "A", "A", "B", "B", "B", "B"], - "episode": [1, 2, 1, 2, 1, 2, 1, 2], - "task": ["T1", "T1", "T2", "T2", "T1", "T1", "T2", "T2"], - "cum_reward": [10, 10, 12, 12, 14, 14, 16, 16], - } -) - - -def stratified_bootstrap_simple(df: pd.DataFrame, n_bootstrap_samples): - """ - Perform stratified bootstrap sampling to calculate mean cumulative rewards. - - Parameters: - df (DataFrame): DataFrame containing the columns 'task', 'episode', 'cum_reward'. - n_bootstrap_samples (int): Number of bootstrap samples to generate. - - Returns: - np.array: An array of overall bootstrap means. 
- """ - - # Initialize an empty list to store the overall means - overall_bootstrap_means = [] - - # Perform bootstrapping - for _ in range(n_bootstrap_samples): - bootstrap_sample_rows = [] - - # Stratified sampling for each task - for task in df["task"].unique(): - task_data = df[df["task"] == task] - - # Generate bootstrap sample for this task (sampling with replacement) - bootstrap_sample = task_data.sample(n=len(task_data), replace=True) - bootstrap_sample_rows.append(bootstrap_sample) - - # Concatenate all bootstrap samples to form the overall sample - overall_bootstrap_sample = pd.concat(bootstrap_sample_rows, ignore_index=True) - - # Compute the mean cum_reward for the overall sample - overall_bootstrap_mean = overall_bootstrap_sample["cum_reward"].mean() - - # Store this mean - overall_bootstrap_means.append(overall_bootstrap_mean) - - # Convert the list of overall means to a NumPy array - return np.array(overall_bootstrap_means) - - -# Test for expected number of rows in the output DataFrame -def test_with_no_var(): - df_bootstrap = stratified_bootstrap( - df_no_var, "agent", "task", "cum_reward", np.mean, repeat=10 - ) - df_mean = df_bootstrap.groupby(["agent"])["cum_reward"].mean() - df_mean_alt = df_no_var.groupby(["agent"])["cum_reward"].mean() - - pd.testing.assert_series_equal(df_mean, df_mean_alt, rtol=1e-1, atol=0) - - -# Test for statistical properties (mean, in this case) -def test_statistical_properties(): - df_bootstrap = stratified_bootstrap(df, "agent", "task", "cum_reward", np.mean, repeat=1000) - bs_mean = df_bootstrap.groupby(["agent"])["cum_reward"].mean() - bs_std = df_bootstrap.groupby(["agent"])["cum_reward"].std() - - # alternative method for stratified bootstrap - df_bs_alt = [] - for agent in ["A", "B"]: - bs_mean_alt = stratified_bootstrap_simple(df[df["agent"] == agent], 1000) - bs_mean_df = pd.DataFrame(bs_mean_alt, columns=["cum_reward"]) - bs_mean_df["agent"] = agent - bs_mean_df["bootstrap_index"] = np.arange(1000) - 
df_bs_alt.append(bs_mean_df) - - df_bs_alt = pd.concat(df_bs_alt, ignore_index=True) - bs_mean_alt = df_bs_alt.groupby(["agent"])["cum_reward"].mean() - bs_std_alt = df_bs_alt.groupby(["agent"])["cum_reward"].std() - - # high relative tolerance because of small sample size - pd.testing.assert_series_equal(bs_mean, bs_mean_alt, rtol=1e-1, atol=0) - pd.testing.assert_series_equal(bs_std, bs_std_alt, rtol=1e-1, atol=0) - - mean_alt = df.groupby(["agent"])["cum_reward"].mean() - pd.testing.assert_series_equal(bs_mean, mean_alt, rtol=1e-1, atol=0) - - -def test_statistical_properties_array(): - arr = convert_df_to_array(df.groupby("task")) - bootstrap_results = bootstrap_matrix(arr, 1000, np.mean) - bs_mean = np.mean(bootstrap_results) - original_mean = np.mean(arr) - assert np.isclose(bs_mean, original_mean, rtol=1e-1, atol=0) - - -# Test with no variation -def test_with_no_var_array(): - df_no_var = df.copy() - df_no_var["cum_reward"] = 10 # Setting a constant value for no variation - arr_no_var = convert_df_to_array(df_no_var.groupby("task")) - bootstrap_results = bootstrap_matrix(arr_no_var, 1000, np.mean) - bs_mean = np.mean(bootstrap_results) - assert np.isclose(bs_mean, 10, rtol=1e-1, atol=0) - - -def test_convert_df_to_array(): - df_uneven = df.drop(7) - result = convert_df_to_array(df_uneven.groupby("task"), threshold=0.7) - assert result.shape == (2, 4) # 2 tasks, 4 samples each - assert np.array_equal(result[1, :], [12, 13, 16, 12]) - - -if __name__ == "__main__": - test_with_no_var_array() - test_statistical_properties_array()