From ec744a3a810a3f2281d6526fd2028df2740fa26d Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 11:30:14 -0700
Subject: [PATCH 01/41] half way thru tb set up

---
 .github/workflows/terminal-bench.yaml | 59 +++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 .github/workflows/terminal-bench.yaml

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
new file mode 100644
index 0000000000..d2901ac2c2
--- /dev/null
+++ b/.github/workflows/terminal-bench.yaml
@@ -0,0 +1,59 @@
+# This is a terminal-bench workflow that is manually triggered
+# Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference.
+
+name: TB workflow
+
+# Controls when the action will run. Workflow runs when manually triggered using the UI
+# or API.
+on:
+  workflow_dispatch:
+    # Inputs the workflow accepts.
+    inputs:
+      name:
+        # Friendly description to be shown in the UI instead of 'name'
+        description: 'Run terminal bench'
+        # Default value if no value is explicitly provided
+        default: 'all'
+        # Input has to be provided for the workflow to run
+        required: true
+        # The data type of the input
+        type: string
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  run-benchmark:
+    runs-on: ubuntu-latest
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.10'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r terminal-bench-test/requirements.txt
+    
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        role-to-assume: arn:aws:iam::${{ secrets.TB_ACCOUNTID }}:role/ArjunPersonal
+        aws-region: us-east-1
+       
+
+    - name: Run terminal benchmark
+      run: |
+        cd terminal-bench-test
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
+
+    - name: Upload results
+      uses: actions/upload-artifact@v4
+      with:
+        name: benchmark-results
+        path: terminal-bench-test/results/
\ No newline at end of file

From d2a605df64a743082fd2366463ccf8c12f0223ff Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:12:58 -0700
Subject: [PATCH 02/41] tb yaml

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index d2901ac2c2..c7bf5552e4 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -43,7 +43,7 @@ jobs:
     - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4
       with:
-        role-to-assume: arn:aws:iam::${{ secrets.TB_ACCOUNTID }}:role/ArjunPersonal
+        role-to-assume: arn:aws:iam::${{ secrets.AWS_ROLE }}:role/ArjunPersonal
         aws-region: us-east-1
        
 

From bea6d119da367c7269b02118d8f1dd1c559fac02 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:13:16 -0700
Subject: [PATCH 03/41] TB

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index c7bf5552e4..f60ed4e73f 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -43,7 +43,7 @@ jobs:
     - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4
       with:
-        role-to-assume: arn:aws:iam::${{ secrets.AWS_ROLE }}:role/ArjunPersonal
+        role-to-assume: arn:aws:iam::${{ secrets.AWS_TB_ROLE }}:role/ArjunPersonal
         aws-region: us-east-1
        
 

From 1c03b3203c4aa60afea2f8c8f0d295eae60e8aa8 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:37:16 -0700
Subject: [PATCH 04/41] tb bench scripts

---
 terminal-bench-test/main.py           | 47 +++++++++++++++++++++++
 terminal-bench-test/setup_amazon_q.sh | 55 +++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 terminal-bench-test/main.py
 create mode 100644 terminal-bench-test/setup_amazon_q.sh

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
new file mode 100644
index 0000000000..7efbd1cc4b
--- /dev/null
+++ b/terminal-bench-test/main.py
@@ -0,0 +1,47 @@
+import os
+import shlex
+from pathlib import Path
+
+from terminal_bench.agents.installed_agents.abstract_installed_agent import (
+    AbstractInstalledAgent,
+)
+from terminal_bench.terminal.models import TerminalCommand
+
+
+class AmazonQCLIAgent(AbstractInstalledAgent):
+
+    @staticmethod
+    def name() -> str:
+        return "Amazon Q CLI"
+
+    def __init__(self, model_name: str | None = None, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._model_name = model_name
+        self._start_url = 'https://amzn.awsapps.com/start'
+        self.region = 'us-east-1'
+
+    @property
+    def _env(self) -> dict[str, str]:
+        # SIGv4 = 1 for AWS credentials
+        env = {}
+        for q_var in ["CODEWHISPERER_TOKEN", "CODEWHISPERER_DEVICE_REGISTRATION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS", "AWS_SESSION_TOKEN", "AMAZON_Q_SIGV4"]:
+            if q_var in os.environ:
+                env[q_var] = os.environ[q_var]
+        return env
+
+    @property
+    def _install_agent_script_path(self) -> os.PathLike:
+        return Path(__file__).parent / "setup_amazon_q.sh"
+
+    def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
+        escaped_description = shlex.quote(task_description)
+        
+        return [
+        # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. 
+        # non-interactive for now --> check if needed or not
+            TerminalCommand(
+                command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}",
+                max_timeout_sec=370, 
+                block=True,
+            )
+        ]
\ No newline at end of file
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
new file mode 100644
index 0000000000..6a3e38144d
--- /dev/null
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+echo "Installing Amazon Q for command line (Linux)..."
+
+# Update package lists and install required tools
+apt-get update
+apt-get install -y curl unzip sqlite3 bc
+export AMAZON_Q_SIGV4=1
+
+# Check glibc version and architecture
+GLIBC_VERSION=$(ldd --version | head -n1 | grep -o '[0-9]\+\.[0-9]\+' | head -n1)
+ARCH=$(uname -m)
+
+echo "Detected architecture: $ARCH"
+echo "Detected glibc version: $GLIBC_VERSION"
+
+# Determine download URL based on architecture and glibc version
+if [ "$ARCH" = "x86_64" ]; then
+    if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then
+        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux.zip"
+        echo "Using standard x86_64 version"
+    else
+        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux-musl.zip"
+        echo "Using musl x86_64 version for older glibc"
+    fi
+elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
+    if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then
+        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux.zip"
+        echo "Using standard aarch64 version"
+    else
+        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux-musl.zip"
+        echo "Using musl aarch64 version for older glibc"
+    fi
+else
+    echo "Unsupported architecture: $ARCH"
+    exit 1
+fi
+
+# Download Amazon Q
+echo "Downloading from: $Q_URL"
+curl --proto '=https' --tlsv1.2 -sSf "$Q_URL" -o "q.zip"
+
+# Extract and install w/o  interactive installer
+unzip q.zip
+cp q/bin/q /usr/local/bin/q
+cp q/bin/qchat /usr/local/bin/qchat
+chmod +x /usr/local/bin/q
+chmod +x /usr/local/bin/qchat
+
+# Test qchat installation without hanging
+echo "Testing qchat version..."
+q --version
+echo "Cleaning q zip"
+rm -f q.zip

From f063ef4ed658e65adb380fd6186146ee7c36c353 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:50:38 -0700
Subject: [PATCH 05/41] trigger on push change

---
 .github/workflows/terminal-bench.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index f60ed4e73f..56f8cced7e 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -18,6 +18,8 @@ on:
         required: true
         # The data type of the input
         type: string
+    push:
+      branches: [ terminal-bench-automation ]
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:

From 62ea19127002de60f2b79e3c7e64d0e4cfcf303e Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:52:11 -0700
Subject: [PATCH 06/41] typo

---
 .github/workflows/terminal-bench.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 56f8cced7e..c93ece5c2a 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -18,8 +18,8 @@ on:
         required: true
         # The data type of the input
         type: string
-    push:
-      branches: [ terminal-bench-automation ]
+  push:
+    branches: [ terminal-bench-automation ]
 
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:

From 73873b1d74183d4977e81a8f5316e6c319dfc157 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:53:53 -0700
Subject: [PATCH 07/41] trigger push

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index c93ece5c2a..ae88a12df3 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -40,7 +40,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install -r terminal-bench-test/requirements.txt
+        pip install terminal-bench
     
     - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4

From 79651ff90d7067bd26465a311e504090408e0d27 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 10 Jul 2025 16:56:20 -0700
Subject: [PATCH 08/41] python v

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index ae88a12df3..db465fc3cc 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -35,7 +35,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
-        python-version: '3.10'
+        python-version: '3.13'
 
     - name: Install dependencies
       run: |

From 832ac1eca88b5acf128a9c3fdec99ad48ff917ca Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 10:51:26 -0700
Subject: [PATCH 09/41] run local branch instead of prod

---
 terminal-bench-test/main.py           |  2 +-
 terminal-bench-test/setup_amazon_q.sh | 54 ++-------------------------
 2 files changed, 5 insertions(+), 51 deletions(-)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 7efbd1cc4b..5cf2236e3e 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -40,7 +40,7 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
         # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. 
         # non-interactive for now --> check if needed or not
             TerminalCommand(
-                command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}",
+                command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}",
                 max_timeout_sec=370, 
                 block=True,
             )
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 6a3e38144d..fb7f9c913e 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -2,54 +2,8 @@
 set -e
 
 echo "Installing Amazon Q for command line (Linux)..."
-
-# Update package lists and install required tools
-apt-get update
-apt-get install -y curl unzip sqlite3 bc
 export AMAZON_Q_SIGV4=1
-
-# Check glibc version and architecture
-GLIBC_VERSION=$(ldd --version | head -n1 | grep -o '[0-9]\+\.[0-9]\+' | head -n1)
-ARCH=$(uname -m)
-
-echo "Detected architecture: $ARCH"
-echo "Detected glibc version: $GLIBC_VERSION"
-
-# Determine download URL based on architecture and glibc version
-if [ "$ARCH" = "x86_64" ]; then
-    if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then
-        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux.zip"
-        echo "Using standard x86_64 version"
-    else
-        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux-musl.zip"
-        echo "Using musl x86_64 version for older glibc"
-    fi
-elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then
-    if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then
-        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux.zip"
-        echo "Using standard aarch64 version"
-    else
-        Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux-musl.zip"
-        echo "Using musl aarch64 version for older glibc"
-    fi
-else
-    echo "Unsupported architecture: $ARCH"
-    exit 1
-fi
-
-# Download Amazon Q
-echo "Downloading from: $Q_URL"
-curl --proto '=https' --tlsv1.2 -sSf "$Q_URL" -o "q.zip"
-
-# Extract and install w/o  interactive installer
-unzip q.zip
-cp q/bin/q /usr/local/bin/q
-cp q/bin/qchat /usr/local/bin/qchat
-chmod +x /usr/local/bin/q
-chmod +x /usr/local/bin/qchat
-
-# Test qchat installation without hanging
-echo "Testing qchat version..."
-q --version
-echo "Cleaning q zip"
-rm -f q.zip
+ 
+# Install the Q branch code into machine + dependencies
+cd /home/runner/work/amazon-q-developer-cli/amazon-q-developer-cli/
+npm run setup

From 8a32d5618fc2f38ff74cfd64a7fc5165ec1f915f Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 11:38:09 -0700
Subject: [PATCH 10/41] unnecessary env variables removed

---
 .github/workflows/terminal-bench.yaml | 2 +-
 terminal-bench-test/main.py           | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index db465fc3cc..6ba1c91fca 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -52,7 +52,7 @@ jobs:
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
-        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
+        tb run --agent-import-path main:AmazonQCLIAgent --task-id hello-world --livestream
 
     - name: Upload results
       uses: actions/upload-artifact@v4
diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 5cf2236e3e..d5d586547b 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -20,15 +20,6 @@ def __init__(self, model_name: str | None = None, *args, **kwargs):
         self._start_url = 'https://amzn.awsapps.com/start'
         self.region = 'us-east-1'
 
-    @property
-    def _env(self) -> dict[str, str]:
-        # SIGv4 = 1 for AWS credentials
-        env = {}
-        for q_var in ["CODEWHISPERER_TOKEN", "CODEWHISPERER_DEVICE_REGISTRATION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS", "AWS_SESSION_TOKEN", "AMAZON_Q_SIGV4"]:
-            if q_var in os.environ:
-                env[q_var] = os.environ[q_var]
-        return env
-
     @property
     def _install_agent_script_path(self) -> os.PathLike:
         return Path(__file__).parent / "setup_amazon_q.sh"

From e6c6a80389395a63ae7f8eda16a5e4a1608bf01d Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 11:46:12 -0700
Subject: [PATCH 11/41] allow log inspection

---
 .github/workflows/terminal-bench.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 6ba1c91fca..e91ff63052 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -52,10 +52,11 @@ jobs:
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
-        tb run --agent-import-path main:AmazonQCLIAgent --task-id hello-world --livestream
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
 
     - name: Upload results
+      if: always()
       uses: actions/upload-artifact@v4
       with:
         name: benchmark-results
-        path: terminal-bench-test/results/
\ No newline at end of file
+        path: terminal-bench-test/runs/
\ No newline at end of file

From e47a2bff2921b13b641387293a0693455bf19384 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 11:48:32 -0700
Subject: [PATCH 12/41] implement _env

---
 terminal-bench-test/main.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index d5d586547b..444b1dd0ab 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -20,6 +20,12 @@ def __init__(self, model_name: str | None = None, *args, **kwargs):
         self._start_url = 'https://amzn.awsapps.com/start'
         self.region = 'us-east-1'
 
+    @property
+    def _env(self) -> dict[str, str]:
+        # SIGv4 = 1 for AWS credentials
+        env = {}
+        return env
+
     @property
     def _install_agent_script_path(self) -> os.PathLike:
         return Path(__file__).parent / "setup_amazon_q.sh"

From f5ed0ca19bff22ad0e8dae2fbb96139a427ac3a0 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 12:10:04 -0700
Subject: [PATCH 13/41] npm, rust, cargo, clone github directly

---
 terminal-bench-test/setup_amazon_q.sh | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index fb7f9c913e..62f5645169 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -1,9 +1,21 @@
 #!/bin/bash
 set -e
 
+# Install Node.js and npm
+curl -fsSL https://deb.nodesource.com/setup_18.x | bash -
+apt-get install -y nodejs
+
+# Install Rust and Cargo
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+source ~/.cargo/env
+
+# get the github code
 echo "Installing Amazon Q for command line (Linux)..."
+mkdir -p /app/amazon-q-developer-cli
+cd /app
 export AMAZON_Q_SIGV4=1
- 
-# Install the Q branch code into machine + dependencies
-cd /home/runner/work/amazon-q-developer-cli/amazon-q-developer-cli/
+git clone https://github.com/aws/amazon-q-developer-cli.git
+cd amazon-q-developer-cli
 npm run setup
+
+echo "Amazon Q CLI installation completed successfully"
\ No newline at end of file

From 3ec0dbebe8724140b358d07803fb9d82e213eb69 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Fri, 11 Jul 2025 12:33:55 -0700
Subject: [PATCH 14/41] Update setup_amazon_q.sh

---
 terminal-bench-test/setup_amazon_q.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 62f5645169..01a04284a6 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -1,5 +1,7 @@
 #!/bin/bash
 set -e
+apt-get update
+apt-get install -y curl wget gnupg2 software-properties-common git
 
 # Install Node.js and npm
 curl -fsSL https://deb.nodesource.com/setup_18.x | bash -
@@ -18,4 +20,4 @@ git clone https://github.com/aws/amazon-q-developer-cli.git
 cd amazon-q-developer-cli
 npm run setup
 
-echo "Amazon Q CLI installation completed successfully"
\ No newline at end of file
+echo "Amazon Q CLI installation completed successfully"

From a2e6ebeb13d2fde4a660339a948516f61a6aac0c Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Fri, 11 Jul 2025 13:54:49 -0700
Subject: [PATCH 15/41] clean up disk + check amt of free space

---
 .github/workflows/terminal-bench.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index e91ff63052..0dea01ebad 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -29,6 +29,16 @@ jobs:
       id-token: write
       contents: read
     steps:
+    - name: Cleanup and free disk space
+      run: |
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf "/usr/local/share/boost"
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /usr/share/swift
+        df -h
+
     - name: Checkout repository
       uses: actions/checkout@v4
 

From e5f51df1b3380a380a186eebadc29174a84ab2cd Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 10:09:57 -0700
Subject: [PATCH 16/41] get gcc dependencies

---
 terminal-bench-test/setup_amazon_q.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 01a04284a6..b02e81ca44 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -6,6 +6,8 @@ apt-get install -y curl wget gnupg2 software-properties-common git
 # Install Node.js and npm
 curl -fsSL https://deb.nodesource.com/setup_18.x | bash -
 apt-get install -y nodejs
+apt-get install -y build-essential gcc g++ make
+apt-get install -y pkg-config libssl-dev
 
 # Install Rust and Cargo
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y

From f53926d6e283a2944ea3723060dbfaacbb635343 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 11:04:40 -0700
Subject: [PATCH 17/41] big timeout

---
 terminal-bench-test/main.py           | 4 ++--
 terminal-bench-test/setup_amazon_q.sh | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 444b1dd0ab..0a8d5330e0 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -23,7 +23,7 @@ def __init__(self, model_name: str | None = None, *args, **kwargs):
     @property
     def _env(self) -> dict[str, str]:
         # SIGv4 = 1 for AWS credentials
-        env = {}
+        env = {"AMAZON_Q_SIGV4":1}
         return env
 
     @property
@@ -38,7 +38,7 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
         # non-interactive for now --> check if needed or not
             TerminalCommand(
                 command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}",
-                max_timeout_sec=370, 
+                max_timeout_sec=1200, 
                 block=True,
             )
         ]
\ No newline at end of file
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index b02e81ca44..646e288a4e 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -20,6 +20,4 @@ cd /app
 export AMAZON_Q_SIGV4=1
 git clone https://github.com/aws/amazon-q-developer-cli.git
 cd amazon-q-developer-cli
-npm run setup
-
 echo "Amazon Q CLI installation completed successfully"

From 803eda1d0c78ec69ff8f97a663597697d8dbe548 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 11:39:46 -0700
Subject: [PATCH 18/41] pipe config files from gh runner to docker

---
 terminal-bench-test/main.py           | 29 +++++++++++++++++++++++++++
 terminal-bench-test/setup_amazon_q.sh | 17 ++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 0a8d5330e0..1ce66c4007 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -24,6 +24,35 @@ def __init__(self, model_name: str | None = None, *args, **kwargs):
     def _env(self) -> dict[str, str]:
         # SIGv4 = 1 for AWS credentials
         env = {"AMAZON_Q_SIGV4":1}
+        aws_credentials_path = os.path.expanduser("~/.aws/credentials")
+        aws_config_path = os.path.expanduser("~/.aws/config")
+        
+        if os.path.exists(aws_credentials_path):
+            try:
+                import configparser
+                config = configparser.ConfigParser()
+                config.read(aws_credentials_path)
+                
+                if "default" in config:
+                    if "aws_access_key_id" in config["default"]:
+                        env["AWS_ACCESS_KEY_ID"] = config["default"]["aws_access_key_id"]
+                    if "aws_secret_access_key" in config["default"]:
+                        env["AWS_SECRET_ACCESS_KEY"] = config["default"]["aws_secret_access_key"]
+                    if "aws_session_token" in config["default"]:
+                        env["AWS_SESSION_TOKEN"] = config["default"]["aws_session_token"]
+            except Exception as e:
+                print(f"Warning: Failed to read AWS credentials: {e}")
+        
+        if os.path.exists(aws_config_path):
+            try:
+                import configparser
+                config = configparser.ConfigParser()
+                config.read(aws_config_path)
+                
+                if "default" in config and "region" in config["default"]:
+                    env["AWS_REGION"] = config["default"]["region"]
+            except Exception as e:
+                print(f"Warning: Failed to read AWS config: {e}")
         return env
 
     @property
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 646e288a4e..4baccad255 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -21,3 +21,20 @@ export AMAZON_Q_SIGV4=1
 git clone https://github.com/aws/amazon-q-developer-cli.git
 cd amazon-q-developer-cli
 echo "Amazon Q CLI installation completed successfully"
+
+
+# Create AWS credentials from environment variables
+mkdir -p ~/.aws
+cat > ~/.aws/credentials << EOF
+[default]
+aws_access_key_id = ${AWS_ACCESS_KEY_ID}
+aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY}
+aws_session_token = ${AWS_SESSION_TOKEN}
+EOF
+chmod 600 ~/.aws/credentials
+
+cat > ~/.aws/config << EOF
+[default]
+region = us-east-1
+EOF
+chmod 600 ~/.aws/config
\ No newline at end of file

From 8db47f75b667ad9abbf2012ba56a99e154c0c6f0 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 16:08:29 -0700
Subject: [PATCH 19/41] configure env + working with sso

---
 .github/workflows/terminal-bench.yaml | 3 +++
 terminal-bench-test/main.py           | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 0dea01ebad..c4d7de086c 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -25,6 +25,8 @@ on:
 jobs:
   run-benchmark:
     runs-on: ubuntu-latest
+    env:
+      FIGCHAT_GAMMA_ID: ${{ secrets.figchat_gamma_id }}
     permissions:
       id-token: write
       contents: read
@@ -37,6 +39,7 @@ jobs:
         sudo rm -rf "$AGENT_TOOLSDIRECTORY"
         sudo rm -rf /usr/local/lib/android
         sudo rm -rf /usr/share/swift
+        sudo apt-get clean
         df -h
 
     - name: Checkout repository
diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 1ce66c4007..1604903517 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -66,8 +66,8 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
         # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. 
         # non-interactive for now --> check if needed or not
             TerminalCommand(
-                command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}",
-                max_timeout_sec=1200, 
+                command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}",
+                max_timeout_sec=370, 
                 block=True,
             )
         ]
\ No newline at end of file

From c90896023afe07ea626d02c5b60cf78922468d9e Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 16:34:49 -0700
Subject: [PATCH 20/41] changed default

---
 terminal-bench-test/setup_amazon_q.sh | 83 ++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 21 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 4baccad255..68b6c67664 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -1,27 +1,15 @@
 #!/bin/bash
 set -e
+# Use the short hash of GITHUB_SHA when it is set; otherwise fall back to the pinned default commit hash below.
+git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"}
+git_hash=${git_hash:-"00286fbe0688fe19fef9a5149bb9f9172f96783c"}
 apt-get update
-apt-get install -y curl wget gnupg2 software-properties-common git
-
-# Install Node.js and npm
-curl -fsSL https://deb.nodesource.com/setup_18.x | bash -
-apt-get install -y nodejs
-apt-get install -y build-essential gcc g++ make
-apt-get install -y pkg-config libssl-dev
-
-# Install Rust and Cargo
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
-source ~/.cargo/env
-
-# get the github code
-echo "Installing Amazon Q for command line (Linux)..."
-mkdir -p /app/amazon-q-developer-cli
-cd /app
-export AMAZON_Q_SIGV4=1
-git clone https://github.com/aws/amazon-q-developer-cli.git
-cd amazon-q-developer-cli
-echo "Amazon Q CLI installation completed successfully"
+apt-get install -y curl wget unzip jq
 
+echo "Installing AWS CLI..."
+curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+unzip -q awscliv2.zip
+./aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli
 
 # Create AWS credentials from environment variables
 mkdir -p ~/.aws
@@ -37,4 +25,57 @@ cat > ~/.aws/config << EOF
 [default]
 region = us-east-1
 EOF
-chmod 600 ~/.aws/config
\ No newline at end of file
+chmod 600 ~/.aws/config
+
+
+# Save original credentials
+ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
+ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
+ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
+
+# Assume role and capture temporary credentials --> needed for s3 bucket access for build
+echo "Assuming role FigIoChat-S3Access-Role-Gamma..."
+TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession)
+
+# Extract and export temporary credentials -> jq is just used 
+export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
+export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
+export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
+
+# Download specific build from S3 based on commit hash
+echo "Downloading Amazon Q CLI build from S3..."
+S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
+S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
+echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"
+aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
+
+
+# Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
+echo "Extracting qchat.zip..."
+unzip -q qchat.zip
+mkdir -p /usr/local/bin
+cp -f ./qchat/qchat /usr/local/bin/qchat
+chmod +x /usr/local/bin/qchat
+ln -sf /usr/local/bin/qchat /usr/local/bin/q
+
+# Restore credentials to run Q
+export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID}
+export AWS_SECRET_ACCESS_KEY=${ORIGINAL_AWS_SECRET_ACCESS_KEY}
+export AWS_SESSION_TOKEN=${ORIGINAL_AWS_SESSION_TOKEN}
+
+cat > ~/.aws/credentials << EOF
+[default]
+aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID}
+aws_secret_access_key = ${ORIGINAL_AWS_SECRET_ACCESS_KEY}
+aws_session_token = ${ORIGINAL_AWS_SESSION_TOKEN}
+EOF
+
+echo "Cleaning q zip"
+rm -f qchat.zip
+rm -rf qchat
+
+
+
+
+
+

From fb9262dd65eee057d53a99d923ec7ccc656cfe96 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 16:35:21 -0700
Subject: [PATCH 21/41] default to latest

---
 terminal-bench-test/setup_amazon_q.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 68b6c67664..335bd0cb04 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 set -e
-# if git hash empty then set to latest automatically.
+# if git hash empty then set to latest auto
 git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"}
-git_hash=${git_hash:-"00286fbe0688fe19fef9a5149bb9f9172f96783c"}
+git_hash=${git_hash:-"latest"}
 apt-get update
 apt-get install -y curl wget unzip jq
 

From ebdd054d191d5f2aee8589a44b2b68395f6c0aea Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 16:49:30 -0700
Subject: [PATCH 22/41] fixing qchat location + forcing correct auth

---
 terminal-bench-test/main.py           |  1 +
 terminal-bench-test/setup_amazon_q.sh | 26 +++++++++++++++-----------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 1604903517..72c79f25be 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -53,6 +53,7 @@ def _env(self) -> dict[str, str]:
                     env["AWS_REGION"] = config["default"]["region"]
             except Exception as e:
                 print(f"Warning: Failed to read AWS config: {e}")
+        env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '')
         return env
 
     @property
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 335bd0cb04..5b22ab2e22 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -49,14 +49,24 @@ S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"
 aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
 
-
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
 mkdir -p /usr/local/bin
-cp -f ./qchat/qchat /usr/local/bin/qchat
-chmod +x /usr/local/bin/qchat
-ln -sf /usr/local/bin/qchat /usr/local/bin/q
+
+# Find the qchat executable -> copy
+find . -type f -name "qchat" -exec cp {} /usr/local/bin/qchat \;
+
+if [ -f "/usr/local/bin/qchat" ]; then
+    chmod +x /usr/local/bin/qchat
+    ln -sf /usr/local/bin/qchat /usr/local/bin/q
+    echo "qchat installed successfully"
+else
+    echo "ERROR: qchat executable not found in the zip file"
+    # List the contents of the extracted files to debug
+    find . -type f | grep -v "aws" | head -20
+    exit 1
+fi
 
 # Restore credentials to run Q
 export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID}
@@ -72,10 +82,4 @@ EOF
 
 echo "Cleaning q zip"
 rm -f qchat.zip
-rm -rf qchat
-
-
-
-
-
-
+rm -rf qchat
\ No newline at end of file

From 1d665f5909b54595c37c24a6ca5bb665aebdccce Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Mon, 14 Jul 2025 16:59:40 -0700
Subject: [PATCH 23/41] set env vars not just config file

---
 terminal-bench-test/setup_amazon_q.sh | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 5b22ab2e22..c49b716585 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -35,15 +35,25 @@ ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
 
 # Assume role and capture temporary credentials --> needed for s3 bucket access for build
 echo "Assuming role FigIoChat-S3Access-Role-Gamma..."
-TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession)
-
-# Extract and export temporary credentials -> jq is just used 
-export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
-export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
-export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
+if [ -n "${FIGCHAT_GAMMA_ID}" ]; then
+  TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
+  
+  # Check if role assumption was successful
+  if [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "null" ] && [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "" ]; then
+    echo "Role assumption successful, using temporary credentials"
+    export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
+    export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
+    export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
+  else
+    echo "Role assumption failed, continuing with original credentials"
+  fi
+else
+  echo "FIGCHAT_GAMMA_ID not set, skipping role assumption"
+fi
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
+# Use hardcoded bucket ID if FIGCHAT_GAMMA_ID is not set
 S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
 S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"

From 12adf8677cffc596f699d55bac92110fc0cfb287 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Tue, 15 Jul 2025 09:27:29 -0700
Subject: [PATCH 24/41] env vars all caps

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index c4d7de086c..267e0932b7 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -26,7 +26,7 @@ jobs:
   run-benchmark:
     runs-on: ubuntu-latest
     env:
-      FIGCHAT_GAMMA_ID: ${{ secrets.figchat_gamma_id }}
+      FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }}
     permissions:
       id-token: write
       contents: read

From 6888d4b30f85a7b9f4cab7309a7d2a3e780d774b Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Tue, 15 Jul 2025 09:38:03 -0700
Subject: [PATCH 25/41] confirm env variables are visible

---
 terminal-bench-test/setup_amazon_q.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index c49b716585..2fdb481050 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -34,7 +34,7 @@ ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
 ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
 
 # Assume role and capture temporary credentials --> needed for s3 bucket access for build
-echo "Assuming role FigIoChat-S3Access-Role-Gamma..."
+echo "Assuming arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma..."
 if [ -n "${FIGCHAT_GAMMA_ID}" ]; then
   TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
   

From 2c52fc1f407e5c6daccccfceb3c1f2989a632789 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Tue, 15 Jul 2025 11:36:13 -0700
Subject: [PATCH 26/41] roleName + code simplify

---
 terminal-bench-test/main.py           | 32 ++------------------
 terminal-bench-test/setup_amazon_q.sh | 42 +++++++--------------------
 2 files changed, 13 insertions(+), 61 deletions(-)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 72c79f25be..30c6baa6bb 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -23,36 +23,8 @@ def __init__(self, model_name: str | None = None, *args, **kwargs):
     @property
     def _env(self) -> dict[str, str]:
         # SIGv4 = 1 for AWS credentials
-        env = {"AMAZON_Q_SIGV4":1}
-        aws_credentials_path = os.path.expanduser("~/.aws/credentials")
-        aws_config_path = os.path.expanduser("~/.aws/config")
-        
-        if os.path.exists(aws_credentials_path):
-            try:
-                import configparser
-                config = configparser.ConfigParser()
-                config.read(aws_credentials_path)
-                
-                if "default" in config:
-                    if "aws_access_key_id" in config["default"]:
-                        env["AWS_ACCESS_KEY_ID"] = config["default"]["aws_access_key_id"]
-                    if "aws_secret_access_key" in config["default"]:
-                        env["AWS_SECRET_ACCESS_KEY"] = config["default"]["aws_secret_access_key"]
-                    if "aws_session_token" in config["default"]:
-                        env["AWS_SESSION_TOKEN"] = config["default"]["aws_session_token"]
-            except Exception as e:
-                print(f"Warning: Failed to read AWS credentials: {e}")
-        
-        if os.path.exists(aws_config_path):
-            try:
-                import configparser
-                config = configparser.ConfigParser()
-                config.read(aws_config_path)
-                
-                if "default" in config and "region" in config["default"]:
-                    env["AWS_REGION"] = config["default"]["region"]
-            except Exception as e:
-                print(f"Warning: Failed to read AWS config: {e}")
+        env = {}
+        env["AMAZON_Q_SIGV4"] = 1
         env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '')
         return env
 
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 2fdb481050..7838176e52 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -27,29 +27,18 @@ region = us-east-1
 EOF
 chmod 600 ~/.aws/config
 
-
 # Save original credentials
 ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
 ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
 ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
 
 # Assume role and capture temporary credentials --> needed for s3 bucket access for build
-echo "Assuming arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma..."
-if [ -n "${FIGCHAT_GAMMA_ID}" ]; then
-  TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
-  
-  # Check if role assumption was successful
-  if [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "null" ] && [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "" ]; then
-    echo "Role assumption successful, using temporary credentials"
-    export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
-    export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
-    export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
-  else
-    echo "Role assumption failed, continuing with original credentials"
-  fi
-else
-  echo "FIGCHAT_GAMMA_ID not set, skipping role assumption"
-fi
+echo "Assuming AWS s3 role"
+TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
+echo $TEMP_CREDENTIALS
+QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
+Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
+# export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
@@ -57,32 +46,23 @@ echo "Downloading Amazon Q CLI build from S3..."
 S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
 S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"
-aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
+AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY"\
+    aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
 
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
-mkdir -p /usr/local/bin
 
-# Find the qchat executable -> copy
-find . -type f -name "qchat" -exec cp {} /usr/local/bin/qchat \;
-
-if [ -f "/usr/local/bin/qchat" ]; then
-    chmod +x /usr/local/bin/qchat
+# move it to /usr/local/bin/qchat
+if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then
     ln -sf /usr/local/bin/qchat /usr/local/bin/q
     echo "qchat installed successfully"
 else
-    echo "ERROR: qchat executable not found in the zip file"
-    # List the contents of the extracted files to debug
-    find . -type f | grep -v "aws" | head -20
+    echo "ERROR: Failed to install qchat"
     exit 1
 fi
 
 # Restore credentials to run Q
-export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID}
-export AWS_SECRET_ACCESS_KEY=${ORIGINAL_AWS_SECRET_ACCESS_KEY}
-export AWS_SESSION_TOKEN=${ORIGINAL_AWS_SESSION_TOKEN}
-
 cat > ~/.aws/credentials << EOF
 [default]
 aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID}

From a7e3150f2cac0128354df3f411c90a2f1295270f Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Tue, 15 Jul 2025 12:07:07 -0700
Subject: [PATCH 27/41] environment variable fix + local working

---
 terminal-bench-test/main.py           |  5 ++++-
 terminal-bench-test/setup_amazon_q.sh | 19 +++----------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 30c6baa6bb..9dc1e7698d 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -25,7 +25,10 @@ def _env(self) -> dict[str, str]:
         # SIGv4 = 1 for AWS credentials
         env = {}
         env["AMAZON_Q_SIGV4"] = 1
-        env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '')
+        env["FIGCHAT_GAMMA_ID"] = os.environ.get("FIGCHAT_GAMMA_ID", '')
+        env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '')
+        env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '')
+        env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '')
         return env
 
     @property
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 7838176e52..df317a8738 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -27,18 +27,12 @@ region = us-east-1
 EOF
 chmod 600 ~/.aws/config
 
-# Save original credentials
-ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
-ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
-ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
-
 # Assume role and capture temporary credentials --> needed for s3 bucket access for build
 echo "Assuming AWS s3 role"
 TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
-echo $TEMP_CREDENTIALS
 QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
 Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
-# export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
+Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
@@ -46,9 +40,10 @@ echo "Downloading Amazon Q CLI build from S3..."
 S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
 S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"
-AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY"\
+AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
     aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
 
+
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
@@ -62,14 +57,6 @@ else
     exit 1
 fi
 
-# Restore credentials to run Q
-cat > ~/.aws/credentials << EOF
-[default]
-aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID}
-aws_secret_access_key = ${ORIGINAL_AWS_SECRET_ACCESS_KEY}
-aws_session_token = ${ORIGINAL_AWS_SESSION_TOKEN}
-EOF
-
 echo "Cleaning q zip"
 rm -f qchat.zip
 rm -rf qchat
\ No newline at end of file

From cefeaf3c3daa131b22c8515611d9e2d22b6e35fb Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Tue, 15 Jul 2025 12:40:48 -0700
Subject: [PATCH 28/41] use the correct git hash

---
 .github/workflows/terminal-bench.yaml | 11 +++++++++++
 terminal-bench-test/main.py           |  1 +
 terminal-bench-test/setup_amazon_q.sh | 20 +++++++++++++-------
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 267e0932b7..45b83f23e2 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -45,6 +45,17 @@ jobs:
     - name: Checkout repository
       uses: actions/checkout@v4
 
+    - name: Set git hash
+      run: |
+        if [ -n "$GITHUB_SHA" ]; then
+          git_hash=$(git rev-parse --short "$GITHUB_SHA")
+        else
+          git_hash="latest"
+        fi
+        # appends to github_env file
+        echo "GIT_HASH=$git_hash" >> $GITHUB_ENV
+        echo "Git hash set to: $git_hash"
+
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 9dc1e7698d..0e754e3fdf 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -29,6 +29,7 @@ def _env(self) -> dict[str, str]:
         env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '')
         env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '')
         env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '')
+        env["GIT_HASH"] = os.environ.get("GIT_HASH", '')
         return env
 
     @property
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index df317a8738..0259b3e69c 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -1,8 +1,6 @@
 #!/bin/bash
 set -e
 # if git hash empty then set to latest auto
-git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"}
-git_hash=${git_hash:-"latest"}
 apt-get update
 apt-get install -y curl wget unzip jq
 
@@ -36,19 +34,27 @@ Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
-# Use hardcoded bucket ID if FIGCHAT_GAMMA_ID is not set
 S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
-S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl"
-echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip"
+S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl"
+echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip"
+
+# Try download, fall back to latest commit hash if it fails
+# exit code = 0 means success/true in bash, all others are false
 AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
-    aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
+  aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 || {
+    echo "Falling back to latest build"
+    S3_PREFIX="main/latest/x86_64-unknown-linux-musl"
+    AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
+      aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
+  }
+
 
 
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
 
-# move it to /usr/local/bin/qchat
+# move it to /usr/local/bin/qchat for path as qchat may not work otherwise
 if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then
     ln -sf /usr/local/bin/qchat /usr/local/bin/q
     echo "qchat installed successfully"

From 7a1b06a368af976f93ce415716093c1873dcb137 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:37:05 -0700
Subject: [PATCH 29/41] larger runner for storage

---
 .github/workflows/terminal-bench.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 45b83f23e2..aae4737cdb 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -24,7 +24,7 @@ on:
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   run-benchmark:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-latest-8-cores
     env:
       FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }}
     permissions:
@@ -83,4 +83,4 @@ jobs:
       uses: actions/upload-artifact@v4
       with:
         name: benchmark-results
-        path: terminal-bench-test/runs/
\ No newline at end of file
+        path: terminal-bench-test/runs/

From 52ce30d69987c7b9e9dcc75cfe7e53a3e4b5a47f Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:42:33 -0700
Subject: [PATCH 30/41] use full hash instead of short hash

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index aae4737cdb..0cfad7cd68 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -48,7 +48,7 @@ jobs:
     - name: Set git hash
       run: |
         if [ -n "$GITHUB_SHA" ]; then
-          git_hash=$(git rev-parse --short "$GITHUB_SHA")
+          git_hash=$(git rev-parse "$GITHUB_SHA")
         else
           git_hash="latest"
         fi

From 407993c92d94f5174847186e27aed1784b90b706 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:48:54 -0700
Subject: [PATCH 31/41] fail if hash invalid

---
 terminal-bench-test/setup_amazon_q.sh | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 0259b3e69c..b3c0afdd71 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -38,17 +38,9 @@ S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
 S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip"
 
-# Try download, fall back to latest commit hash if it fails
-# exit code = 0 means success/true in bash, all others are false
+# Try download, if hash is invalid we fail.
 AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
-  aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 || {
-    echo "Falling back to latest build"
-    S3_PREFIX="main/latest/x86_64-unknown-linux-musl"
-    AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
-      aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
-  }
-
-
+  aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
 
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."
@@ -65,4 +57,4 @@ fi
 
 echo "Cleaning q zip"
 rm -f qchat.zip
-rm -rf qchat
\ No newline at end of file
+rm -rf qchat

From ea88fed5aca0586d0412e36091909dac97c757d1 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:51:26 -0700
Subject: [PATCH 32/41] Force to run on manual trigger

---
 .github/workflows/terminal-bench.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 0cfad7cd68..5a0907f1be 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -18,9 +18,7 @@ on:
         required: true
         # The data type of the input
         type: string
-  push:
-    branches: [ terminal-bench-automation ]
-
+        
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   run-benchmark:

From 5f1e553ac7e160262465742798ffe72c25696b30 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Wed, 16 Jul 2025 10:21:16 -0700
Subject: [PATCH 33/41] responding to PR comments

---
 .github/workflows/terminal-bench.yaml | 24 +++++++++++-----------
 terminal-bench-test/main.py           | 29 ++++++++++++++-------------
 terminal-bench-test/setup_amazon_q.sh |  5 ++---
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 5a0907f1be..162504d7b1 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -1,34 +1,32 @@
 # This is a terminal-bench workflow that is manually triggered
 # Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference 
 
-name: TB workflow
+name: Terminal-Bench
 
 # Controls when the action will run. Workflow runs when manually triggered using the UI
-# or API.
 on:
   workflow_dispatch:
-    # Inputs the workflow accepts.
     inputs:
       name:
-        # Friendly description to be shown in the UI instead of 'name'
-        description: 'Run terminal bench'
-        # Default value if no value is explicitly provided
+        description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' 
         default: 'all'
-        # Input has to be provided for the workflow to run
         required: true
-        # The data type of the input
         type: string
         
-# A workflow run is made up of one or more jobs that can run sequentially or in parallel
 jobs:
   run-benchmark:
+    # avoids disk storage issues
     runs-on: ubuntu-latest-8-cores
+    # makes these env vars available in main.py
     env:
-      FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }}
+      CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }}
+      CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }}
     permissions:
       id-token: write
       contents: read
     steps:
+
+    # clear unnecessary storage to ensure docker containers have space
     - name: Cleanup and free disk space
       run: |
         sudo rm -rf /usr/share/dotnet
@@ -43,6 +41,7 @@ jobs:
     - name: Checkout repository
       uses: actions/checkout@v4
 
+    # Captures git hash of branch to query specific S3 bucket
     - name: Set git hash
       run: |
         if [ -n "$GITHUB_SHA" ]; then
@@ -64,18 +63,19 @@ jobs:
         python -m pip install --upgrade pip
         pip install terminal-bench
     
+    # OIDC enabled for github for ArjunPersonal
     - name: Configure AWS credentials
       uses: aws-actions/configure-aws-credentials@v4
       with:
-        role-to-assume: arn:aws:iam::${{ secrets.AWS_TB_ROLE }}:role/ArjunPersonal
+        role-to-assume: ${{ secrets.AWS_TB_ROLE }}
         aws-region: us-east-1
-       
 
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
         tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
 
+    # uploads results if run fails as well to allow for easy log inspection
     - name: Upload results
       if: always()
       uses: actions/upload-artifact@v4
diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py
index 0e754e3fdf..2d2a3bdd3f 100644
--- a/terminal-bench-test/main.py
+++ b/terminal-bench-test/main.py
@@ -14,22 +14,24 @@ class AmazonQCLIAgent(AbstractInstalledAgent):
     def name() -> str:
         return "Amazon Q CLI"
 
-    def __init__(self, model_name: str | None = None, *args, **kwargs):
+    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self._model_name = model_name
-        self._start_url = 'https://amzn.awsapps.com/start'
-        self.region = 'us-east-1'
 
+    """
+    Makes necessary env vars available in docker containers
+    """
     @property
     def _env(self) -> dict[str, str]:
         # SIGv4 = 1 for AWS credentials
-        env = {}
-        env["AMAZON_Q_SIGV4"] = 1
-        env["FIGCHAT_GAMMA_ID"] = os.environ.get("FIGCHAT_GAMMA_ID", '')
-        env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '')
-        env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '')
-        env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '')
-        env["GIT_HASH"] = os.environ.get("GIT_HASH", '')
+        env = {
+            "AMAZON_Q_SIGV4": 1,
+            "AWS_ACCESS_KEY_ID": os.environ.get("AWS_ACCESS_KEY_ID", ''),
+            "AWS_SECRET_ACCESS_KEY": os.environ.get("AWS_SECRET_ACCESS_KEY", ''),
+            "AWS_SESSION_TOKEN": os.environ.get("AWS_SESSION_TOKEN", ''),
+            "GIT_HASH": os.environ.get("GIT_HASH", ''),
+            "CHAT_DOWNLOAD_ROLE_ARN": os.environ.get("CHAT_DOWNLOAD_ROLE_ARN", ''),
+            "CHAT_BUILD_BUCKET_NAME": os.environ.get("CHAT_BUILD_BUCKET_NAME", '')
+        }
         return env
 
     @property
@@ -40,11 +42,10 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]:
         escaped_description = shlex.quote(task_description)
         
         return [
-        # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. 
-        # non-interactive for now --> check if needed or not
+            # Run qchat non-interactively and block until it finishes, with a 30 min (1800 s) max timeout. qchat is used because of SigV4.
             TerminalCommand(
                 command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}",
-                max_timeout_sec=370, 
+                max_timeout_sec=1800, 
                 block=True,
             )
         ]
\ No newline at end of file
diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index b3c0afdd71..fa83d1aeb5 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -27,20 +27,19 @@ chmod 600 ~/.aws/config
 
 # Assume role and capture temporary credentials --> needed for s3 bucket access for build
 echo "Assuming AWS s3 role"
-TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}')
+TEMP_CREDENTIALS=$(aws sts assume-role --role-arn "${CHAT_DOWNLOAD_ROLE_ARN}" --role-session-name S3AccessSession 2>/dev/null || echo '{}')  # ARN quoted so it is always passed as one argument
 QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId')
 Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey')
 Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
-S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1"
 S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl"
 echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip"
 
 # Try download, if hash is invalid we fail.
 AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \
-  aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1
+  aws s3 cp "s3://${CHAT_BUILD_BUCKET_NAME}/${S3_PREFIX}/qchat.zip" ./qchat.zip --region us-east-1  # URI quoted to prevent word splitting/globbing
 
 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code
 echo "Extracting qchat.zip..."

From 80be0628a61a470ae1ce16446e1bb4079cb1eb2d Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 17 Jul 2025 09:44:51 -0700
Subject: [PATCH 34/41] take git hash as user input to avoid confusion

---
 .github/workflows/terminal-bench.yaml | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 162504d7b1..9491d719cc 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -7,20 +7,21 @@ name: Terminal-Bench
 on:
   workflow_dispatch:
     inputs:
-      name:
-        description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' 
-        default: 'all'
+      git_commit_hash:
+        description: 'Input git commit hash to run TB on'
         required: true
+        default: 'latest'
         type: string
-        
+       
 jobs:
   run-benchmark:
     # avoids disk storage issues
-    runs-on: ubuntu-latest-8-cores
+    runs-on: ubuntu-latest
     # makes these env vars available in main.py
     env:
       CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }}
       CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }}
+      GIT_HASH: ${{ github.event.inputs.git_commit_hash }}
     permissions:
       id-token: write
       contents: read
@@ -41,18 +42,6 @@ jobs:
     - name: Checkout repository
       uses: actions/checkout@v4
 
-    # Captures git hash of branch to query specific S3 bucket
-    - name: Set git hash
-      run: |
-        if [ -n "$GITHUB_SHA" ]; then
-          git_hash=$(git rev-parse "$GITHUB_SHA")
-        else
-          git_hash="latest"
-        fi
-        # appends to github_env file
-        echo "GIT_HASH=$git_hash" >> $GITHUB_ENV
-        echo "Git hash set to: $git_hash"
-
     - name: Set up Python
       uses: actions/setup-python@v4
       with:

From 8e7b75991329381f9330057f8bd223e6908f6007 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 17 Jul 2025 10:04:45 -0700
Subject: [PATCH 35/41] description

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 9491d719cc..116be7a22c 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -8,7 +8,7 @@ on:
   workflow_dispatch:
     inputs:
       git_commit_hash:
-        description: 'Input git commit hash to run TB on'
+        description: 'Input git commit hash to run TB on (must exist on S3)'
         required: true
         default: 'latest'
         type: string

From fd9a09ba8040c5cf0a0356f1e3a54fe0215d98da Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Tue, 5 Aug 2025 16:54:55 -0700
Subject: [PATCH 36/41] change n and allow clean up

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index eb8ff688c8..33f3a2f539 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -61,7 +61,7 @@ jobs:
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
-        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --n-tasks=20 --cleanup
 
     # uploads results if run fails as well to allow for easy log inspection
     - name: Upload results

From 93c312d82a45c694264137ec2c69911422902271 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Wed, 6 Aug 2025 09:47:13 -0700
Subject: [PATCH 37/41] Update terminal-bench.yaml

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index 33f3a2f539..ac2ed44b4e 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -61,7 +61,7 @@ jobs:
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
-        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --n-tasks=20 --cleanup
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup
 
     # uploads results if run fails as well to allow for easy log inspection
     - name: Upload results

From 4f4643bd6215df59e222a9615ddb361fa8d4c009 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Wed, 6 Aug 2025 16:34:44 -0700
Subject: [PATCH 38/41] Update setup_amazon_q.sh to use gnu instead of musl

---
 terminal-bench-test/setup_amazon_q.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index fa83d1aeb5..08b12d4564 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -34,7 +34,7 @@ Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken')
 
 # Download specific build from S3 based on commit hash
 echo "Downloading Amazon Q CLI build from S3..."
-S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl"
+S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-gnu"
 echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip"
 
 # Try download, if hash is invalid we fail.

From b1bdfeeec423e655dc615027f44ce6ff8e7c5b95 Mon Sep 17 00:00:00 2001
From: arjun37602 <33012834+arjun37602@users.noreply.github.com>
Date: Thu, 7 Aug 2025 09:32:43 -0700
Subject: [PATCH 39/41] set n tasks

---
 .github/workflows/terminal-bench.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml
index ac2ed44b4e..bc14b69734 100644
--- a/.github/workflows/terminal-bench.yaml
+++ b/.github/workflows/terminal-bench.yaml
@@ -61,7 +61,7 @@ jobs:
     - name: Run terminal benchmark
       run: |
         cd terminal-bench-test
-        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup
+        tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup --n-tasks=5
 
     # uploads results if run fails as well to allow for easy log inspection
     - name: Upload results

From f0ca4153580b6330af2891b42acd18c518fc27a0 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 7 Aug 2025 09:46:30 -0700
Subject: [PATCH 40/41] mimic official installation and use gnu

---
 terminal-bench-test/setup_amazon_q.sh | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 08b12d4564..2c89cdf63b 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -45,15 +45,28 @@ AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
 
-# move it to /usr/local/bin/qchat for path as qchat may not work otherwise
-if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then
+# Debug: Show extracted structure
+echo "Extracted contents:"
+ls -la
+
+# Extract and install mimicing the official release
+if [ -d "q" ] && [ -f "q/bin/qchat" ]; then
+    cp q/bin/qchat /usr/local/bin/qchat
+    cp q/bin/q /usr/local/bin/q 2>/dev/null || ln -sf /usr/local/bin/qchat /usr/local/bin/q
+    chmod +x /usr/local/bin/qchat /usr/local/bin/q
+    echo "qchat installed successfully"
+elif [ -f "qchat" ]; then
+    # Fallback if it's just the binary
+    cp qchat /usr/local/bin/qchat
     ln -sf /usr/local/bin/qchat /usr/local/bin/q
+    chmod +x /usr/local/bin/qchat
     echo "qchat installed successfully"
 else
-    echo "ERROR: Failed to install qchat"
+    echo "ERROR: qchat not found in expected locations"
+    ls -la
     exit 1
 fi
 
 echo "Cleaning q zip"
 rm -f qchat.zip
-rm -rf qchat
+rm -rf q qchat

From 77eb58de5cb643bc6642a10dc8e1bfb0c1adbb01 Mon Sep 17 00:00:00 2001
From: Arjun Balaji <arjbal@amazon.com>
Date: Thu, 7 Aug 2025 09:57:03 -0700
Subject: [PATCH 41/41] run chat_cli executable

---
 terminal-bench-test/setup_amazon_q.sh | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh
index 2c89cdf63b..6cb580e502 100644
--- a/terminal-bench-test/setup_amazon_q.sh
+++ b/terminal-bench-test/setup_amazon_q.sh
@@ -45,24 +45,16 @@ AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY
 echo "Extracting qchat.zip..."
 unzip -q qchat.zip
 
-# Debug: Show extracted structure
-echo "Extracted contents:"
-ls -la
+# Install the extracted binary: the zip ships the executable as chat_cli, so it is
+# copied to /usr/local/bin/qchat (what `qchat` runs) and /usr/local/bin/q is symlinked to it.
 
-# Extract and install mimicing the official release
-if [ -d "q" ] && [ -f "q/bin/qchat" ]; then
-    cp q/bin/qchat /usr/local/bin/qchat
-    cp q/bin/q /usr/local/bin/q 2>/dev/null || ln -sf /usr/local/bin/qchat /usr/local/bin/q
-    chmod +x /usr/local/bin/qchat /usr/local/bin/q
-    echo "qchat installed successfully"
-elif [ -f "qchat" ]; then
-    # Fallback if it's just the binary
-    cp qchat /usr/local/bin/qchat
+if [ -f "chat_cli" ]; then
+    cp chat_cli /usr/local/bin/qchat || { echo "ERROR: Failed to install qchat"; exit 1; }  # never report success after a failed copy
     ln -sf /usr/local/bin/qchat /usr/local/bin/q
     chmod +x /usr/local/bin/qchat
     echo "qchat installed successfully"
 else
-    echo "ERROR: qchat not found in expected locations"
+    echo "ERROR: chat_cli executable not found"
     ls -la
     exit 1
 fi