From 1edc1b90b883063bc3f4018f21ad308aa82c0446 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 29 Nov 2024 19:28:24 +0000 Subject: [PATCH 1/7] 4o mini webarena --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index ad2bfaa8..8921d8ff 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -46,3 +46,4 @@ ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,weblinx_test,0.0.1.de ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.089,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.125,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, +ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb87-9879-4c06-aa70-00d895001156,0.174,0.013,1,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,None, From 430fe9456ba766398380454a6335f094004607af Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 29 Nov 2024 19:30:38 +0000 Subject: [PATCH 2/7] where is this gone ? --- src/agentlab/analyze/inspect_results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/analyze/inspect_results.py b/src/agentlab/analyze/inspect_results.py index 0dbd173c..23f41f9b 100644 --- a/src/agentlab/analyze/inspect_results.py +++ b/src/agentlab/analyze/inspect_results.py @@ -496,8 +496,8 @@ def display_report( if rename_bool_flags: report = _rename_bool_flags(report) - if copy_to_clipboard: - to_clipboard(report) + # if copy_to_clipboard: + # to_clipboard(report) columns = list(report.columns) From 7b224971fb7a90fb76924ca9386a1e8bf609dd2a Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 29 Nov 2024 22:32:08 +0000 Subject: [PATCH 3/7] 4o webarena --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 8921d8ff..99de34dd 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -47,3 +47,4 @@ ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,weblinx_test,0.0.1.d ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.125,0.006,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb87-9879-4c06-aa70-00d895001156,0.174,0.013,1,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,None, +ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91bb-4603-b69c-8ef8f9d57f34,0.314,0.016,3,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,430fe9456ba766398380454a6335f094004607af,,0.13.3,None, From 3f54ef13b778e69a1706c732f776147e9523ad3d Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Sat, 30 Nov 2024 00:21:28 +0000 Subject: [PATCH 4/7] claude webarena --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 99de34dd..5fbd0238 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -48,3 +48,4 @@ ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,weblinx_test,0.0.1.dev13,202 ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1.dev13,2024-11-07_21-42-30,b9451759-4f0e-492c-a3c8-fa5109d2d9b1,0.079,0.005,0,2650/2650,None,Linux (#66-Ubuntu SMP Fri Aug 30 13:56:20 UTC 2024),3.12.7,1.39.0,0.2.3,7a5b91e62056fa8fb26efdd2f64f5b25a92b817c,,0.12.0,8633c30c31e6a5a1d5122835c035aa56d18f3f0a, ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb87-9879-4c06-aa70-00d895001156,0.174,0.013,1,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,None, ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91bb-4603-b69c-8ef8f9d57f34,0.314,0.016,3,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,430fe9456ba766398380454a6335f094004607af,,0.13.3,None, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.13.3,2024-11-29_22-37-46,b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae,0.362,0.017,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,7b224971fb7a90fb76924ca9386a1e8bf609dd2a,,0.13.3,None, From 5a5b94d544424517cdd11602b27100b82e35eac0 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Sat, 30 Nov 2024 03:06:36 +0000 Subject: [PATCH 5/7] o1 mini results --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 5fbd0238..9bc7f040 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -49,3 +49,4 @@ ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,weblinx_test,0.0.1. ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb87-9879-4c06-aa70-00d895001156,0.174,0.013,1,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,b115b2716d8a6328824684a692ed642297f0b1dc,,0.13.3,None, ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91bb-4603-b69c-8ef8f9d57f34,0.314,0.016,3,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,430fe9456ba766398380454a6335f094004607af,,0.13.3,None, ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.13.3,2024-11-29_22-37-46,b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae,0.362,0.017,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,7b224971fb7a90fb76924ca9386a1e8bf609dd2a,,0.13.3,None, +ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,webarena,0.13.3,2024-11-30_00-22-44,1827983d-5e84-4b63-ad49-bf45ec2a6348,0.286,0.016,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,3f54ef13b778e69a1706c732f776147e9523ad3d,,0.13.3,None, From df7bc706f3793f47a456d1bda0485b306b8cf612 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Sun, 1 Dec 2024 01:42:30 +0000 Subject: [PATCH 6/7] 405 webarena --- reproducibility_journal.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index 9bc7f040..f7e79724 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -50,3 +50,4 @@ ThibaultLSDC,GenericAgent-gpt-4o-mini,webarena,0.13.3,2024-11-29_19-25-49,c6bdeb ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91bb-4603-b69c-8ef8f9d57f34,0.314,0.016,3,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,430fe9456ba766398380454a6335f094004607af,,0.13.3,None, ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.13.3,2024-11-29_22-37-46,b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae,0.362,0.017,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,7b224971fb7a90fb76924ca9386a1e8bf609dd2a,,0.13.3,None, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,webarena,0.13.3,2024-11-30_00-22-44,1827983d-5e84-4b63-ad49-bf45ec2a6348,0.286,0.016,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,3f54ef13b778e69a1706c732f776147e9523ad3d,,0.13.3,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,webarena,0.13.3,2024-12-01_00-04-43,aaeca13d-0cf5-444f-8445-590350b54746,0.24,0.015,9,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,5a5b94d544424517cdd11602b27100b82e35eac0,,0.13.3,None, From a95a8735942ccd215c7c522370ba55ab4f9915fb Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 3 Dec 2024 15:46:20 +0000 Subject: [PATCH 7/7] vwa + &)b wevarena --- reproducibility_journal.csv | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/reproducibility_journal.csv b/reproducibility_journal.csv index f7e79724..e73df94a 100644 --- a/reproducibility_journal.csv +++ b/reproducibility_journal.csv @@ -51,3 +51,7 @@ ThibaultLSDC,GenericAgent-gpt-4o,webarena,0.13.3,2024-11-29_22-28-32,d2eed215-91 ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta,webarena,0.13.3,2024-11-29_22-37-46,b5fc5be7-54cc-4fc1-a9ee-73447b9c3eae,0.362,0.017,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,7b224971fb7a90fb76924ca9386a1e8bf609dd2a,,0.13.3,None, ThibaultLSDC,GenericAgent-openai_o1-mini-2024-09-12,webarena,0.13.3,2024-11-30_00-22-44,1827983d-5e84-4b63-ad49-bf45ec2a6348,0.286,0.016,0,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,3f54ef13b778e69a1706c732f776147e9523ad3d,,0.13.3,None, ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-405b-instruct,webarena,0.13.3,2024-12-01_00-04-43,aaeca13d-0cf5-444f-8445-590350b54746,0.24,0.015,9,812/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,5a5b94d544424517cdd11602b27100b82e35eac0,,0.13.3,None, +ThibaultLSDC,GenericAgent-gpt-4o-mini_vision,visualwebarena,0.13.3,2024-12-02_02-54-33,8d8642d3-757a-4346-ba45-01398f85b1f4,0.169,0.012,37,909/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None, +ThibaultLSDC,GenericAgent-gpt-4o_vision,visualwebarena,0.13.3,2024-12-02_07-17-28,7fb7eac8-4bbd-4ebe-be32-15901a7678f2,0.267,0.015,65,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None, +ThibaultLSDC,GenericAgent-anthropic_claude-3.5-sonnet:beta_vision,visualwebarena,0.13.3,2024-12-02_09-11-35,22f0611d-aeea-4ee9-a533-b45442b5e080,0.21,0.013,178,910/910,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None, +ThibaultLSDC,GenericAgent-meta-llama_llama-3.1-70b-instruct,webarena,0.13.3,2024-12-02_23-18-38,fc5747bc-d998-4942-a0eb-e55a3ccc1cb3,0.184,0.014,213,811/812,None,Linux (#68-Ubuntu SMP Mon Oct 7 14:34:20 UTC 2024),3.12.7,1.39.0,0.3.1,df7bc706f3793f47a456d1bda0485b306b8cf612,,0.13.3,None,