diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 4e2f7976..f8dcaf23 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -48,3 +48,10 @@ ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,202 ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,workarena_l2_agent_curriculum_eval,0.4.1,2024-11-29_14-28-47,528da1f2-1949-41dc-b988-85f19f435af2,0.072,0.017,2,235/235,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,miniwob,0.13.3,2024-11-29_16-14-00,4d748972-6d35-4489-a197-138b656a7db3,0.646,0.019,0,625/625,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,becb4856fb1612f44010fe74ef8155d367ca17fc,,0.13.3,70dac253628c476aff1af6a975f27f8563453ad2, +ThibaultLSDC,GenericAgent-gpt-4o,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.005,0.003,2,213/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.002,0.002,1,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.008,0.003,1,212/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.007,0.005,8,206/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-8b-instruct,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.001,0.001,15,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.007,0.003,1,212/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,assistantbench,0.13.1,2024-11-28_19-34-58,d93a2398-2b70-41ce-b989-364fed988d73,0.009,0.005,1,214/214,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.0,32865050045c8c71df35c34ff30a6b420a4e258c, M: src/agentlab/experiments/study.py,0.13.1,None, diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index f10905f8..c091d117 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -1,3 +1,4 @@ +from abc import ABC, abstractmethod import gzip import logging import pickle @@ -405,7 +406,6 @@ def load_most_recent(root_dir: Path = None, contains=None) -> "Study": def _make_study_name(agent_names, benchmark_names, suffix=None): """Make a study name from the agent and benchmark names.""" - # extract unique agent and benchmark names agent_names = list(set(agent_names)) benchmark_names = list(set(benchmark_names))