From ec744a3a810a3f2281d6526fd2028df2740fa26d Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 11:30:14 -0700 Subject: [PATCH 01/41] half way thru tb set up --- .github/workflows/terminal-bench.yaml | 59 +++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/terminal-bench.yaml diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml new file mode 100644 index 0000000000..d2901ac2c2 --- /dev/null +++ b/.github/workflows/terminal-bench.yaml @@ -0,0 +1,59 @@ +# This is a terminal-bench workflow that is manually triggered +# Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference + +name: TB workflow + +# Controls when the action will run. Workflow runs when manually triggered using the UI +# or API. +on: + workflow_dispatch: + # Inputs the workflow accepts. + inputs: + name: + # Friendly description to be shown in the UI instead of 'name' + description: 'Run terminal bench' + # Default value if no value is explicitly provided + default: 'all' + # Input has to be provided for the workflow to run + required: true + # The data type of the input + type: string + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + run-benchmark: + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r terminal-bench-test/requirements.txt + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::${{ secrets.TB_ACCOUNTID }}:role/ArjunPersonal + aws-region: us-east-1 + + + - name: Run terminal benchmark + run: | + cd terminal-bench-test + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head + + - name: Upload results + uses: actions/upload-artifact@v4 + with: + name: benchmark-results + path: terminal-bench-test/results/ \ No newline at end of file From d2a605df64a743082fd2366463ccf8c12f0223ff Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:12:58 -0700 Subject: [PATCH 02/41] tb yaml --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index d2901ac2c2..c7bf5552e4 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -43,7 +43,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ secrets.TB_ACCOUNTID }}:role/ArjunPersonal + role-to-assume: arn:aws:iam::${{ secrets.AWS_ROLE }}:role/ArjunPersonal aws-region: us-east-1 From bea6d119da367c7269b02118d8f1dd1c559fac02 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:13:16 -0700 Subject: [PATCH 03/41] TB --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index c7bf5552e4..f60ed4e73f 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -43,7 +43,7 @@ jobs: - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ROLE }}:role/ArjunPersonal + role-to-assume: arn:aws:iam::${{ secrets.AWS_TB_ROLE }}:role/ArjunPersonal aws-region: us-east-1 From 1c03b3203c4aa60afea2f8c8f0d295eae60e8aa8 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:37:16 -0700 Subject: [PATCH 04/41] tb bench scripts --- terminal-bench-test/main.py | 47 +++++++++++++++++++++++ terminal-bench-test/setup_amazon_q.sh | 55 +++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 terminal-bench-test/main.py create mode 100644 terminal-bench-test/setup_amazon_q.sh diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py new file mode 100644 index 0000000000..7efbd1cc4b --- /dev/null +++ b/terminal-bench-test/main.py @@ -0,0 +1,47 @@ +import os +import shlex +from pathlib import Path + +from terminal_bench.agents.installed_agents.abstract_installed_agent import ( + AbstractInstalledAgent, +) +from terminal_bench.terminal.models import TerminalCommand + + +class AmazonQCLIAgent(AbstractInstalledAgent): + + @staticmethod + def name() -> str: + return "Amazon Q CLI" + + def __init__(self, model_name: str | None = None, *args, **kwargs): + super().__init__(*args, **kwargs) + self._model_name = model_name + self._start_url = 'https://amzn.awsapps.com/start' + self.region = 'us-east-1' + + @property + def _env(self) -> dict[str, str]: + # SIGv4 = 1 for AWS credentials + env = {} + for q_var in ["CODEWHISPERER_TOKEN", "CODEWHISPERER_DEVICE_REGISTRATION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS", "AWS_SESSION_TOKEN", "AMAZON_Q_SIGV4"]: + if q_var in os.environ: + env[q_var] = os.environ[q_var] + return env + + @property + def _install_agent_script_path(self) -> os.PathLike: + return Path(__file__).parent / "setup_amazon_q.sh" + + def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]: + escaped_description = shlex.quote(task_description) + + return [ + # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. + # non-interactive for now --> check if needed or not + TerminalCommand( + command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}", + max_timeout_sec=370, + block=True, + ) + ] \ No newline at end of file diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh new file mode 100644 index 0000000000..6a3e38144d --- /dev/null +++ b/terminal-bench-test/setup_amazon_q.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + +echo "Installing Amazon Q for command line (Linux)..." + +# Update package lists and install required tools +apt-get update +apt-get install -y curl unzip sqlite3 bc +export AMAZON_Q_SIGV4=1 + +# Check glibc version and architecture +GLIBC_VERSION=$(ldd --version | head -n1 | grep -o '[0-9]\+\.[0-9]\+' | head -n1) +ARCH=$(uname -m) + +echo "Detected architecture: $ARCH" +echo "Detected glibc version: $GLIBC_VERSION" + +# Determine download URL based on architecture and glibc version +if [ "$ARCH" = "x86_64" ]; then + if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then + Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux.zip" + echo "Using standard x86_64 version" + else + Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux-musl.zip" + echo "Using musl x86_64 version for older glibc" + fi +elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then + if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then + Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux.zip" + echo "Using standard aarch64 version" + else + Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux-musl.zip" + echo "Using musl aarch64 version for older glibc" + fi +else + echo "Unsupported architecture: $ARCH" + exit 1 +fi + +# Download Amazon Q +echo "Downloading from: $Q_URL" +curl --proto '=https' --tlsv1.2 -sSf "$Q_URL" -o "q.zip" + +# Extract and install w/o interactive installer +unzip q.zip +cp q/bin/q /usr/local/bin/q +cp q/bin/qchat /usr/local/bin/qchat +chmod +x /usr/local/bin/q +chmod +x /usr/local/bin/qchat + +# Test qchat installation without hanging +echo "Testing qchat version..." +q --version +echo "Cleaning q zip" +rm -f q.zip From f063ef4ed658e65adb380fd6186146ee7c36c353 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:50:38 -0700 Subject: [PATCH 05/41] trigger on push change --- .github/workflows/terminal-bench.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index f60ed4e73f..56f8cced7e 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -18,6 +18,8 @@ on: required: true # The data type of the input type: string + push: + branches: [ terminal-bench-automation ] # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: From 62ea19127002de60f2b79e3c7e64d0e4cfcf303e Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:52:11 -0700 Subject: [PATCH 06/41] typo --- .github/workflows/terminal-bench.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 56f8cced7e..c93ece5c2a 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -18,8 +18,8 @@ on: required: true # The data type of the input type: string - push: - branches: [ terminal-bench-automation ] + push: + branches: [ terminal-bench-automation ] # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: From 73873b1d74183d4977e81a8f5316e6c319dfc157 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:53:53 -0700 Subject: [PATCH 07/41] trigger push --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index c93ece5c2a..ae88a12df3 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -40,7 +40,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r terminal-bench-test/requirements.txt + pip install terminal-bench - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 From 79651ff90d7067bd26465a311e504090408e0d27 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 10 Jul 2025 16:56:20 -0700 Subject: [PATCH 08/41] python v --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index ae88a12df3..db465fc3cc 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -35,7 +35,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.13' - name: Install dependencies run: | From 832ac1eca88b5acf128a9c3fdec99ad48ff917ca Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 10:51:26 -0700 Subject: [PATCH 09/41] run local branch instead of prod --- terminal-bench-test/main.py | 2 +- terminal-bench-test/setup_amazon_q.sh | 54 ++------------------------- 2 files changed, 5 insertions(+), 51 deletions(-) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 7efbd1cc4b..5cf2236e3e 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -40,7 +40,7 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]: # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. # non-interactive for now --> check if needed or not TerminalCommand( - command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}", + command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}", max_timeout_sec=370, block=True, ) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 6a3e38144d..fb7f9c913e 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -2,54 +2,8 @@ set -e echo "Installing Amazon Q for command line (Linux)..." - -# Update package lists and install required tools -apt-get update -apt-get install -y curl unzip sqlite3 bc export AMAZON_Q_SIGV4=1 - -# Check glibc version and architecture -GLIBC_VERSION=$(ldd --version | head -n1 | grep -o '[0-9]\+\.[0-9]\+' | head -n1) -ARCH=$(uname -m) - -echo "Detected architecture: $ARCH" -echo "Detected glibc version: $GLIBC_VERSION" - -# Determine download URL based on architecture and glibc version -if [ "$ARCH" = "x86_64" ]; then - if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then - Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux.zip" - echo "Using standard x86_64 version" - else - Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-x86_64-linux-musl.zip" - echo "Using musl x86_64 version for older glibc" - fi -elif [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then - if [ "$(echo "$GLIBC_VERSION >= 2.34" | bc -l 2>/dev/null || echo 0)" = "1" ]; then - Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux.zip" - echo "Using standard aarch64 version" - else - Q_URL="https://desktop-release.q.us-east-1.amazonaws.com/latest/q-aarch64-linux-musl.zip" - echo "Using musl aarch64 version for older glibc" - fi -else - echo "Unsupported architecture: $ARCH" - exit 1 -fi - -# Download Amazon Q -echo "Downloading from: $Q_URL" -curl --proto '=https' --tlsv1.2 -sSf "$Q_URL" -o "q.zip" - -# Extract and install w/o interactive installer -unzip q.zip -cp q/bin/q /usr/local/bin/q -cp q/bin/qchat /usr/local/bin/qchat -chmod +x /usr/local/bin/q -chmod +x /usr/local/bin/qchat - -# Test qchat installation without hanging -echo "Testing qchat version..." -q --version -echo "Cleaning q zip" -rm -f q.zip + +# Install the Q branch code into machine + dependencies +cd /home/runner/work/amazon-q-developer-cli/amazon-q-developer-cli/ +npm run setup From 8a32d5618fc2f38ff74cfd64a7fc5165ec1f915f Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 11:38:09 -0700 Subject: [PATCH 10/41] unnecessary env variables removed --- .github/workflows/terminal-bench.yaml | 2 +- terminal-bench-test/main.py | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index db465fc3cc..6ba1c91fca 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -52,7 +52,7 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head + tb run --agent-import-path main:AmazonQCLIAgent --task-id hello-world --livestream - name: Upload results uses: actions/upload-artifact@v4 diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 5cf2236e3e..d5d586547b 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -20,15 +20,6 @@ def __init__(self, model_name: str | None = None, *args, **kwargs): self._start_url = 'https://amzn.awsapps.com/start' self.region = 'us-east-1' - @property - def _env(self) -> dict[str, str]: - # SIGv4 = 1 for AWS credentials - env = {} - for q_var in ["CODEWHISPERER_TOKEN", "CODEWHISPERER_DEVICE_REGISTRATION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS", "AWS_SESSION_TOKEN", "AMAZON_Q_SIGV4"]: - if q_var in os.environ: - env[q_var] = os.environ[q_var] - return env - @property def _install_agent_script_path(self) -> os.PathLike: return Path(__file__).parent / "setup_amazon_q.sh" From e6c6a80389395a63ae7f8eda16a5e4a1608bf01d Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 11:46:12 -0700 Subject: [PATCH 11/41] allow log inspection --- .github/workflows/terminal-bench.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 6ba1c91fca..e91ff63052 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -52,10 +52,11 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --task-id hello-world --livestream + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head - name: Upload results + if: always() uses: actions/upload-artifact@v4 with: name: benchmark-results - path: terminal-bench-test/results/ \ No newline at end of file + path: terminal-bench-test/runs/ \ No newline at end of file From e47a2bff2921b13b641387293a0693455bf19384 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 11:48:32 -0700 Subject: [PATCH 12/41] implement _env --- terminal-bench-test/main.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index d5d586547b..444b1dd0ab 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -20,6 +20,12 @@ def __init__(self, model_name: str | None = None, *args, **kwargs): self._start_url = 'https://amzn.awsapps.com/start' self.region = 'us-east-1' + @property + def _env(self) -> dict[str, str]: + # SIGv4 = 1 for AWS credentials + env = {} + return env + @property def _install_agent_script_path(self) -> os.PathLike: return Path(__file__).parent / "setup_amazon_q.sh" From f5ed0ca19bff22ad0e8dae2fbb96139a427ac3a0 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 12:10:04 -0700 Subject: [PATCH 13/41] npm, rust, cargo, clone github directly --- terminal-bench-test/setup_amazon_q.sh | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index fb7f9c913e..62f5645169 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -1,9 +1,21 @@ #!/bin/bash set -e +# Install Node.js and npm +curl -fsSL https://deb.nodesource.com/setup_18.x | bash - +apt-get install -y nodejs + +# Install Rust and Cargo +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +source ~/.cargo/env + +# get the github code echo "Installing Amazon Q for command line (Linux)..." +mkdir -p /app/amazon-q-developer-cli +cd /app export AMAZON_Q_SIGV4=1 - -# Install the Q branch code into machine + dependencies -cd /home/runner/work/amazon-q-developer-cli/amazon-q-developer-cli/ +git clone https://github.com/aws/amazon-q-developer-cli.git +cd amazon-q-developer-cli npm run setup + +echo "Amazon Q CLI installation completed successfully" \ No newline at end of file From 3ec0dbebe8724140b358d07803fb9d82e213eb69 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Fri, 11 Jul 2025 12:33:55 -0700 Subject: [PATCH 14/41] Update setup_amazon_q.sh --- terminal-bench-test/setup_amazon_q.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 62f5645169..01a04284a6 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -1,5 +1,7 @@ #!/bin/bash set -e +apt-get update +apt-get install -y curl wget gnupg2 software-properties-common git # Install Node.js and npm curl -fsSL https://deb.nodesource.com/setup_18.x | bash - @@ -18,4 +20,4 @@ git clone https://github.com/aws/amazon-q-developer-cli.git cd amazon-q-developer-cli npm run setup -echo "Amazon Q CLI installation completed successfully" \ No newline at end of file +echo "Amazon Q CLI installation completed successfully" From a2e6ebeb13d2fde4a660339a948516f61a6aac0c Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Fri, 11 Jul 2025 13:54:49 -0700 Subject: [PATCH 15/41] clean up disk + check amt of free space --- .github/workflows/terminal-bench.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index e91ff63052..0dea01ebad 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -29,6 +29,16 @@ jobs: id-token: write contents: read steps: + - name: Cleanup and free disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/swift + df -h + - name: Checkout repository uses: actions/checkout@v4 From e5f51df1b3380a380a186eebadc29174a84ab2cd Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 10:09:57 -0700 Subject: [PATCH 16/41] get gcc dependencies --- terminal-bench-test/setup_amazon_q.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 01a04284a6..b02e81ca44 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -6,6 +6,8 @@ apt-get install -y curl wget gnupg2 software-properties-common git # Install Node.js and npm curl -fsSL https://deb.nodesource.com/setup_18.x | bash - apt-get install -y nodejs +apt-get install -y build-essential gcc g++ make +apt-get install -y pkg-config libssl-dev # Install Rust and Cargo curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y From f53926d6e283a2944ea3723060dbfaacbb635343 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 11:04:40 -0700 Subject: [PATCH 17/41] big timeout --- terminal-bench-test/main.py | 4 ++-- terminal-bench-test/setup_amazon_q.sh | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 444b1dd0ab..0a8d5330e0 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -23,7 +23,7 @@ def __init__(self, model_name: str | None = None, *args, **kwargs): @property def _env(self) -> dict[str, str]: # SIGv4 = 1 for AWS credentials - env = {} + env = {"AMAZON_Q_SIGV4":1} return env @property @@ -38,7 +38,7 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]: # non-interactive for now --> check if needed or not TerminalCommand( command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}", - max_timeout_sec=370, + max_timeout_sec=1200, block=True, ) ] \ No newline at end of file diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index b02e81ca44..646e288a4e 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -20,6 +20,4 @@ cd /app export AMAZON_Q_SIGV4=1 git clone https://github.com/aws/amazon-q-developer-cli.git cd amazon-q-developer-cli -npm run setup - echo "Amazon Q CLI installation completed successfully" From 803eda1d0c78ec69ff8f97a663597697d8dbe548 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 11:39:46 -0700 Subject: [PATCH 18/41] pipe config files from gh runner to docker --- terminal-bench-test/main.py | 29 +++++++++++++++++++++++++++ terminal-bench-test/setup_amazon_q.sh | 17 ++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 0a8d5330e0..1ce66c4007 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -24,6 +24,35 @@ def __init__(self, model_name: str | None = None, *args, **kwargs): def _env(self) -> dict[str, str]: # SIGv4 = 1 for AWS credentials env = {"AMAZON_Q_SIGV4":1} + aws_credentials_path = os.path.expanduser("~/.aws/credentials") + aws_config_path = os.path.expanduser("~/.aws/config") + + if os.path.exists(aws_credentials_path): + try: + import configparser + config = configparser.ConfigParser() + config.read(aws_credentials_path) + + if "default" in config: + if "aws_access_key_id" in config["default"]: + env["AWS_ACCESS_KEY_ID"] = config["default"]["aws_access_key_id"] + if "aws_secret_access_key" in config["default"]: + env["AWS_SECRET_ACCESS_KEY"] = config["default"]["aws_secret_access_key"] + if "aws_session_token" in config["default"]: + env["AWS_SESSION_TOKEN"] = config["default"]["aws_session_token"] + except Exception as e: + print(f"Warning: Failed to read AWS credentials: {e}") + + if os.path.exists(aws_config_path): + try: + import configparser + config = configparser.ConfigParser() + config.read(aws_config_path) + + if "default" in config and "region" in config["default"]: + env["AWS_REGION"] = config["default"]["region"] + except Exception as e: + print(f"Warning: Failed to read AWS config: {e}") return env @property diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 646e288a4e..4baccad255 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -21,3 +21,20 @@ export AMAZON_Q_SIGV4=1 git clone https://github.com/aws/amazon-q-developer-cli.git cd amazon-q-developer-cli echo "Amazon Q CLI installation completed successfully" + + +# Create AWS credentials from environment variables +mkdir -p ~/.aws +cat > ~/.aws/credentials << EOF +[default] +aws_access_key_id = ${AWS_ACCESS_KEY_ID} +aws_secret_access_key = ${AWS_SECRET_ACCESS_KEY} +aws_session_token = ${AWS_SESSION_TOKEN} +EOF +chmod 600 ~/.aws/credentials + +cat > ~/.aws/config << EOF +[default] +region = us-east-1 +EOF +chmod 600 ~/.aws/config \ No newline at end of file From 8db47f75b667ad9abbf2012ba56a99e154c0c6f0 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 16:08:29 -0700 Subject: [PATCH 19/41] configure env + working with sso --- .github/workflows/terminal-bench.yaml | 3 +++ terminal-bench-test/main.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 0dea01ebad..c4d7de086c 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -25,6 +25,8 @@ on: jobs: run-benchmark: runs-on: ubuntu-latest + env: + FIGCHAT_GAMMA_ID: ${{ secrets.figchat_gamma_id }} permissions: id-token: write contents: read @@ -37,6 +39,7 @@ jobs: sudo rm -rf "$AGENT_TOOLSDIRECTORY" sudo rm -rf /usr/local/lib/android sudo rm -rf /usr/share/swift + sudo apt-get clean df -h - name: Checkout repository diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 1ce66c4007..1604903517 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -66,8 +66,8 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]: # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. # non-interactive for now --> check if needed or not TerminalCommand( - command=f"cargo run --bin chat_cli -- chat --no-interactive --trust-all-tools {escaped_description}", - max_timeout_sec=1200, + command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}", + max_timeout_sec=370, block=True, ) ] \ No newline at end of file From c90896023afe07ea626d02c5b60cf78922468d9e Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 16:34:49 -0700 Subject: [PATCH 20/41] changed default --- terminal-bench-test/setup_amazon_q.sh | 83 ++++++++++++++++++++------- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 4baccad255..68b6c67664 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -1,27 +1,15 @@ #!/bin/bash set -e +# if git hash empty then set to latest automatically. +git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"} +git_hash=${git_hash:-"00286fbe0688fe19fef9a5149bb9f9172f96783c"} apt-get update -apt-get install -y curl wget gnupg2 software-properties-common git - -# Install Node.js and npm -curl -fsSL https://deb.nodesource.com/setup_18.x | bash - -apt-get install -y nodejs -apt-get install -y build-essential gcc g++ make -apt-get install -y pkg-config libssl-dev - -# Install Rust and Cargo -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -source ~/.cargo/env - -# get the github code -echo "Installing Amazon Q for command line (Linux)..." -mkdir -p /app/amazon-q-developer-cli -cd /app -export AMAZON_Q_SIGV4=1 -git clone https://github.com/aws/amazon-q-developer-cli.git -cd amazon-q-developer-cli -echo "Amazon Q CLI installation completed successfully" +apt-get install -y curl wget unzip jq +echo "Installing AWS CLI..." +curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +unzip -q awscliv2.zip +./aws/install --bin-dir /usr/local/bin --install-dir /usr/local/aws-cli # Create AWS credentials from environment variables mkdir -p ~/.aws @@ -37,4 +25,57 @@ cat > ~/.aws/config << EOF [default] region = us-east-1 EOF -chmod 600 ~/.aws/config \ No newline at end of file +chmod 600 ~/.aws/config + + +# Save original credentials +ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} +ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} +ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} + +# Assume role and capture temporary credentials --> needed for s3 bucket access for build +echo "Assuming role FigIoChat-S3Access-Role-Gamma..." +TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession) + +# Extract and export temporary credentials -> jq is just used +export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') +export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') +export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') + +# Download specific build from S3 based on commit hash +echo "Downloading Amazon Q CLI build from S3..." +S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" +S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" +echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" +aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 + + +# Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code +echo "Extracting qchat.zip..." +unzip -q qchat.zip +mkdir -p /usr/local/bin +cp -f ./qchat/qchat /usr/local/bin/qchat +chmod +x /usr/local/bin/qchat +ln -sf /usr/local/bin/qchat /usr/local/bin/q + +# Restore credentials to run Q +export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID} +export AWS_SECRET_ACCESS_KEY=${ORIGINAL_AWS_SECRET_ACCESS_KEY} +export AWS_SESSION_TOKEN=${ORIGINAL_AWS_SESSION_TOKEN} + +cat > ~/.aws/credentials << EOF +[default] +aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID} +aws_secret_access_key = ${ORIGINAL_AWS_SECRET_ACCESS_KEY} +aws_session_token = ${ORIGINAL_AWS_SESSION_TOKEN} +EOF + +echo "Cleaning q zip" +rm -f qchat.zip +rm -rf qchat + + + + + + From fb9262dd65eee057d53a99d923ec7ccc656cfe96 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 16:35:21 -0700 Subject: [PATCH 21/41] default to latest --- terminal-bench-test/setup_amazon_q.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 68b6c67664..335bd0cb04 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -1,8 +1,8 @@ #!/bin/bash set -e -# if git hash empty then set to latest automatically. +# if git hash empty then set to latest auto git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"} -git_hash=${git_hash:-"00286fbe0688fe19fef9a5149bb9f9172f96783c"} +git_hash=${git_hash:-"latest"} apt-get update apt-get install -y curl wget unzip jq From ebdd054d191d5f2aee8589a44b2b68395f6c0aea Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 16:49:30 -0700 Subject: [PATCH 22/41] fixing qchat location + forcing correct auth --- terminal-bench-test/main.py | 1 + terminal-bench-test/setup_amazon_q.sh | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 1604903517..72c79f25be 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -53,6 +53,7 @@ def _env(self) -> dict[str, str]: env["AWS_REGION"] = config["default"]["region"] except Exception as e: print(f"Warning: Failed to read AWS config: {e}") + env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '') return env @property diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 335bd0cb04..5b22ab2e22 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -49,14 +49,24 @@ S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 - # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." unzip -q qchat.zip mkdir -p /usr/local/bin -cp -f ./qchat/qchat /usr/local/bin/qchat -chmod +x /usr/local/bin/qchat -ln -sf /usr/local/bin/qchat /usr/local/bin/q + +# Find the qchat executable -> copy +find . -type f -name "qchat" -exec cp {} /usr/local/bin/qchat \; + +if [ -f "/usr/local/bin/qchat" ]; then + chmod +x /usr/local/bin/qchat + ln -sf /usr/local/bin/qchat /usr/local/bin/q + echo "qchat installed successfully" +else + echo "ERROR: qchat executable not found in the zip file" + # List the contents of the extracted files to debug + find . -type f | grep -v "aws" | head -20 + exit 1 +fi # Restore credentials to run Q export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID} @@ -72,10 +82,4 @@ EOF echo "Cleaning q zip" rm -f qchat.zip -rm -rf qchat - - - - - - +rm -rf qchat \ No newline at end of file From 1d665f5909b54595c37c24a6ca5bb665aebdccce Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Mon, 14 Jul 2025 16:59:40 -0700 Subject: [PATCH 23/41] set env vars not just config file --- terminal-bench-test/setup_amazon_q.sh | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 5b22ab2e22..c49b716585 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -35,15 +35,25 @@ ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} # Assume role and capture temporary credentials --> needed for s3 bucket access for build echo "Assuming role FigIoChat-S3Access-Role-Gamma..." -TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession) - -# Extract and export temporary credentials -> jq is just used -export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') -export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') -export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') +if [ -n "${FIGCHAT_GAMMA_ID}" ]; then + TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') + + # Check if role assumption was successful + if [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "null" ] && [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "" ]; then + echo "Role assumption successful, using temporary credentials" + export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') + else + echo "Role assumption failed, continuing with original credentials" + fi +else + echo "FIGCHAT_GAMMA_ID not set, skipping role assumption" +fi # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." +# Use hardcoded bucket ID if FIGCHAT_GAMMA_ID is not set S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" From 12adf8677cffc596f699d55bac92110fc0cfb287 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Tue, 15 Jul 2025 09:27:29 -0700 Subject: [PATCH 24/41] env vars all caps --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index c4d7de086c..267e0932b7 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -26,7 +26,7 @@ jobs: run-benchmark: runs-on: ubuntu-latest env: - FIGCHAT_GAMMA_ID: ${{ secrets.figchat_gamma_id }} + FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }} permissions: id-token: write contents: read From 6888d4b30f85a7b9f4cab7309a7d2a3e780d774b Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Tue, 15 Jul 2025 09:38:03 -0700 Subject: [PATCH 25/41] confirm env vairables are visible --- terminal-bench-test/setup_amazon_q.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index c49b716585..2fdb481050 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -34,7 +34,7 @@ ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} # Assume role and capture temporary credentials --> needed for s3 bucket access for build -echo "Assuming role FigIoChat-S3Access-Role-Gamma..." +echo "Assuming arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma..." if [ -n "${FIGCHAT_GAMMA_ID}" ]; then TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') From 2c52fc1f407e5c6daccccfceb3c1f2989a632789 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Tue, 15 Jul 2025 11:36:13 -0700 Subject: [PATCH 26/41] roleName + code simplify --- terminal-bench-test/main.py | 32 ++------------------ terminal-bench-test/setup_amazon_q.sh | 42 +++++++-------------------- 2 files changed, 13 insertions(+), 61 deletions(-) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 72c79f25be..30c6baa6bb 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -23,36 +23,8 @@ def __init__(self, model_name: str | None = None, *args, **kwargs): @property def _env(self) -> dict[str, str]: # SIGv4 = 1 for AWS credentials - env = {"AMAZON_Q_SIGV4":1} - aws_credentials_path = os.path.expanduser("~/.aws/credentials") - aws_config_path = os.path.expanduser("~/.aws/config") - - if os.path.exists(aws_credentials_path): - try: - import configparser - config = configparser.ConfigParser() - config.read(aws_credentials_path) - - if "default" in config: - if "aws_access_key_id" in config["default"]: - env["AWS_ACCESS_KEY_ID"] = config["default"]["aws_access_key_id"] - if "aws_secret_access_key" in config["default"]: - env["AWS_SECRET_ACCESS_KEY"] = config["default"]["aws_secret_access_key"] - if "aws_session_token" in config["default"]: - env["AWS_SESSION_TOKEN"] = config["default"]["aws_session_token"] - except Exception as e: - print(f"Warning: Failed to read AWS credentials: {e}") - - if os.path.exists(aws_config_path): - try: - import configparser - config = configparser.ConfigParser() - config.read(aws_config_path) - - if "default" in config and "region" in config["default"]: - env["AWS_REGION"] = config["default"]["region"] - except Exception as e: - print(f"Warning: Failed to read AWS config: {e}") + env = {} + env["AMAZON_Q_SIGV4"] = 1 env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '') return env diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 2fdb481050..7838176e52 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -27,29 +27,18 @@ region = us-east-1 EOF chmod 600 ~/.aws/config - # Save original credentials ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} # Assume role and capture temporary credentials --> needed for s3 bucket access for build -echo "Assuming arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma..." -if [ -n "${FIGCHAT_GAMMA_ID}" ]; then - TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') - - # Check if role assumption was successful - if [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "null" ] && [ "$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId' 2>/dev/null)" != "" ]; then - echo "Role assumption successful, using temporary credentials" - export AWS_ACCESS_KEY_ID=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') - else - echo "Role assumption failed, continuing with original credentials" - fi -else - echo "FIGCHAT_GAMMA_ID not set, skipping role assumption" -fi +echo "Assuming AWS s3 role" +TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') +echo $TEMP_CREDENTIALS +QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') +Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') +# export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." @@ -57,32 +46,23 @@ echo "Downloading Amazon Q CLI build from S3..." S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" -aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 +AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY"\ + aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." unzip -q qchat.zip -mkdir -p /usr/local/bin -# Find the qchat executable -> copy -find . -type f -name "qchat" -exec cp {} /usr/local/bin/qchat \; - -if [ -f "/usr/local/bin/qchat" ]; then - chmod +x /usr/local/bin/qchat +# move it to /usr/local/bin/qchat +if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then ln -sf /usr/local/bin/qchat /usr/local/bin/q echo "qchat installed successfully" else - echo "ERROR: qchat executable not found in the zip file" - # List the contents of the extracted files to debug - find . -type f | grep -v "aws" | head -20 + echo "ERROR: Failed to install qchat" exit 1 fi # Restore credentials to run Q -export AWS_ACCESS_KEY_ID=${ORIGINAL_AWS_ACCESS_KEY_ID} -export AWS_SECRET_ACCESS_KEY=${ORIGINAL_AWS_SECRET_ACCESS_KEY} -export AWS_SESSION_TOKEN=${ORIGINAL_AWS_SESSION_TOKEN} - cat > ~/.aws/credentials << EOF [default] aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID} From a7e3150f2cac0128354df3f411c90a2f1295270f Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Tue, 15 Jul 2025 12:07:07 -0700 Subject: [PATCH 27/41] environment variable fix + local working --- terminal-bench-test/main.py | 5 ++++- terminal-bench-test/setup_amazon_q.sh | 19 +++---------------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 30c6baa6bb..9dc1e7698d 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -25,7 +25,10 @@ def _env(self) -> dict[str, str]: # SIGv4 = 1 for AWS credentials env = {} env["AMAZON_Q_SIGV4"] = 1 - env['FIGCHAT_GAMMA_ID'] = os.environ.get('FIGCHAT_GAMMA_ID', '') + env["FIGCHAT_GAMMA_ID"] = os.environ.get("FIGCHAT_GAMMA_ID", '') + env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '') + env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '') + env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '') return env @property diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 7838176e52..df317a8738 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -27,18 +27,12 @@ region = us-east-1 EOF chmod 600 ~/.aws/config -# Save original credentials -ORIGINAL_AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} -ORIGINAL_AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} -ORIGINAL_AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN} - # Assume role and capture temporary credentials --> needed for s3 bucket access for build echo "Assuming AWS s3 role" TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') -echo $TEMP_CREDENTIALS QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') -# export AWS_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') +Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." @@ -46,9 +40,10 @@ echo "Downloading Amazon Q CLI build from S3..." S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" -AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY"\ +AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 + # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." unzip -q qchat.zip @@ -62,14 +57,6 @@ else exit 1 fi -# Restore credentials to run Q -cat > ~/.aws/credentials << EOF -[default] -aws_access_key_id = ${ORIGINAL_AWS_ACCESS_KEY_ID} -aws_secret_access_key = ${ORIGINAL_AWS_SECRET_ACCESS_KEY} -aws_session_token = ${ORIGINAL_AWS_SESSION_TOKEN} -EOF - echo "Cleaning q zip" rm -f qchat.zip rm -rf qchat \ No newline at end of file From cefeaf3c3daa131b22c8515611d9e2d22b6e35fb Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Tue, 15 Jul 2025 12:40:48 -0700 Subject: [PATCH 28/41] use the correct git hash --- .github/workflows/terminal-bench.yaml | 11 +++++++++++ terminal-bench-test/main.py | 1 + terminal-bench-test/setup_amazon_q.sh | 20 +++++++++++++------- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 267e0932b7..45b83f23e2 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -45,6 +45,17 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + - name: Set git hash + run: | + if [ -n "$GITHUB_SHA" ]; then + git_hash=$(git rev-parse --short "$GITHUB_SHA") + else + git_hash="latest" + fi + # appends to github_env file + echo "GIT_HASH=$git_hash" >> $GITHUB_ENV + echo "Git hash set to: $git_hash" + - name: Set up Python uses: actions/setup-python@v4 with: diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 9dc1e7698d..0e754e3fdf 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -29,6 +29,7 @@ def _env(self) -> dict[str, str]: env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '') env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '') env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '') + env["GIT_HASH"] = os.environ.get("GIT_HASH", '') return env @property diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index df317a8738..0259b3e69c 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -1,8 +1,6 @@ #!/bin/bash set -e # if git hash empty then set to latest auto -git_hash=${GITHUB_SHA:+"$(git rev-parse --short "$GITHUB_SHA")"} -git_hash=${git_hash:-"latest"} apt-get update apt-get install -y curl wget unzip jq @@ -36,19 +34,27 @@ Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." -# Use hardcoded bucket ID if FIGCHAT_GAMMA_ID is not set S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" -S3_PREFIX="main/${git_hash}/x86_64-unknown-linux-musl" -echo "Downloading qchat.zip from s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip" +S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl" +echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip" + +# Try download, fall back to latest commit hash if it fails +# exit code = 0 means success/true in bash, all others are false AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ - aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 + aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 || { + echo "Falling back to latest build" + S3_PREFIX="main/latest/x86_64-unknown-linux-musl" + AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ + aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 + } + # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." unzip -q qchat.zip -# move it to /usr/local/bin/qchat +# move it to /usr/local/bin/qchat for path as qchat may not work otherwise if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then ln -sf /usr/local/bin/qchat /usr/local/bin/q echo "qchat installed successfully" From 7a1b06a368af976f93ce415716093c1873dcb137 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:37:05 -0700 Subject: [PATCH 29/41] larger runner for storage --- .github/workflows/terminal-bench.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 45b83f23e2..aae4737cdb 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -24,7 +24,7 @@ on: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: run-benchmark: - runs-on: ubuntu-latest + runs-on: ubuntu-latest-8-cores env: FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }} permissions: @@ -83,4 +83,4 @@ jobs: uses: actions/upload-artifact@v4 with: name: benchmark-results - path: terminal-bench-test/runs/ \ No newline at end of file + path: terminal-bench-test/runs/ From 52ce30d69987c7b9e9dcc75cfe7e53a3e4b5a47f Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:42:33 -0700 Subject: [PATCH 30/41] use full hash instead of short hash --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index aae4737cdb..0cfad7cd68 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -48,7 +48,7 @@ jobs: - name: Set git hash run: | if [ -n "$GITHUB_SHA" ]; then - git_hash=$(git rev-parse --short "$GITHUB_SHA") + git_hash=$(git rev-parse "$GITHUB_SHA") else git_hash="latest" fi From 407993c92d94f5174847186e27aed1784b90b706 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:48:54 -0700 Subject: [PATCH 31/41] fail if hash invalid --- terminal-bench-test/setup_amazon_q.sh | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 0259b3e69c..b3c0afdd71 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -38,17 +38,9 @@ S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip" -# Try download, fall back to latest commit hash if it fails -# exit code = 0 means success/true in bash, all others are false +# Try download, if hash is invalid we fail. AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ - aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 || { - echo "Falling back to latest build" - S3_PREFIX="main/latest/x86_64-unknown-linux-musl" - AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ - aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 - } - - + aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." @@ -65,4 +57,4 @@ fi echo "Cleaning q zip" rm -f qchat.zip -rm -rf qchat \ No newline at end of file +rm -rf qchat From ea88fed5aca0586d0412e36091909dac97c757d1 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Tue, 15 Jul 2025 15:51:26 -0700 Subject: [PATCH 32/41] Force to run on manual trigger --- .github/workflows/terminal-bench.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 0cfad7cd68..5a0907f1be 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -18,9 +18,7 @@ on: required: true # The data type of the input type: string - push: - branches: [ terminal-bench-automation ] - + # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: run-benchmark: From 5f1e553ac7e160262465742798ffe72c25696b30 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Wed, 16 Jul 2025 10:21:16 -0700 Subject: [PATCH 33/41] responding to PR comments --- .github/workflows/terminal-bench.yaml | 24 +++++++++++----------- terminal-bench-test/main.py | 29 ++++++++++++++------------- terminal-bench-test/setup_amazon_q.sh | 5 ++--- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 5a0907f1be..162504d7b1 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -1,34 +1,32 @@ # This is a terminal-bench workflow that is manually triggered # Template taken from https://github.com/actions/starter-workflows/blob/main/automation/manual.yml for reference -name: TB workflow +name: Terminal-Bench # Controls when the action will run. Workflow runs when manually triggered using the UI -# or API. on: workflow_dispatch: - # Inputs the workflow accepts. inputs: name: - # Friendly description to be shown in the UI instead of 'name' - description: 'Run terminal bench' - # Default value if no value is explicitly provided + description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' default: 'all' - # Input has to be provided for the workflow to run required: true - # The data type of the input type: string -# A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: run-benchmark: + # avoids disk storage issues runs-on: ubuntu-latest-8-cores + # makes these env vars available in main.py env: - FIGCHAT_GAMMA_ID: ${{ secrets.FIGCHAT_GAMMA_ID }} + CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }} + CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }} permissions: id-token: write contents: read steps: + + # clear unnecessary storage to ensure docker containers have space - name: Cleanup and free disk space run: | sudo rm -rf /usr/share/dotnet @@ -43,6 +41,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 + # Captures git hash of branch to query specific S3 bucket - name: Set git hash run: | if [ -n "$GITHUB_SHA" ]; then @@ -64,18 +63,19 @@ jobs: python -m pip install --upgrade pip pip install terminal-bench + # OIDC enabled for github for ArjunPersonal - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_TB_ROLE }}:role/ArjunPersonal + role-to-assume: ${{ secrets.AWS_TB_ROLE }} aws-region: us-east-1 - - name: Run terminal benchmark run: | cd terminal-bench-test tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head + # uploads results if run fails as well to allow for easy log inspection - name: Upload results if: always() uses: actions/upload-artifact@v4 diff --git a/terminal-bench-test/main.py b/terminal-bench-test/main.py index 0e754e3fdf..2d2a3bdd3f 100644 --- a/terminal-bench-test/main.py +++ b/terminal-bench-test/main.py @@ -14,22 +14,24 @@ class AmazonQCLIAgent(AbstractInstalledAgent): def name() -> str: return "Amazon Q CLI" - def __init__(self, model_name: str | None = None, *args, **kwargs): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._model_name = model_name - self._start_url = 'https://amzn.awsapps.com/start' - self.region = 'us-east-1' + """ + Makes necessary env vars available in docker containers + """ @property def _env(self) -> dict[str, str]: # SIGv4 = 1 for AWS credentials - env = {} - env["AMAZON_Q_SIGV4"] = 1 - env["FIGCHAT_GAMMA_ID"] = os.environ.get("FIGCHAT_GAMMA_ID", '') - env["AWS_ACCESS_KEY_ID"] = os.environ.get("AWS_ACCESS_KEY_ID", '') - env["AWS_SECRET_ACCESS_KEY"] = os.environ.get("AWS_SECRET_ACCESS_KEY", '') - env["AWS_SESSION_TOKEN"] = os.environ.get("AWS_SESSION_TOKEN", '') - env["GIT_HASH"] = os.environ.get("GIT_HASH", '') + env = { + "AMAZON_Q_SIGV4": 1, + "AWS_ACCESS_KEY_ID": os.environ.get("AWS_ACCESS_KEY_ID", ''), + "AWS_SECRET_ACCESS_KEY": os.environ.get("AWS_SECRET_ACCESS_KEY", ''), + "AWS_SESSION_TOKEN": os.environ.get("AWS_SESSION_TOKEN", ''), + "GIT_HASH": os.environ.get("GIT_HASH", ''), + "CHAT_DOWNLOAD_ROLE_ARN": os.environ.get("CHAT_DOWNLOAD_ROLE_ARN", ''), + "CHAT_BUILD_BUCKET_NAME": os.environ.get("CHAT_BUILD_BUCKET_NAME", '') + } return env @property @@ -40,11 +42,10 @@ def _run_agent_commands(self, task_description: str) -> list[TerminalCommand]: escaped_description = shlex.quote(task_description) return [ - # q chat with 30 min max timeout and also we wait on input. Using qchat cuz sigv4. - # non-interactive for now --> check if needed or not + # q chat with 30 min max timeout and also we wait on input. Using qchat because of sigv4. TerminalCommand( command=f"qchat chat --no-interactive --trust-all-tools {escaped_description}", - max_timeout_sec=370, + max_timeout_sec=1800, block=True, ) ] \ No newline at end of file diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index b3c0afdd71..fa83d1aeb5 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -27,20 +27,19 @@ chmod 600 ~/.aws/config # Assume role and capture temporary credentials --> needed for s3 bucket access for build echo "Assuming AWS s3 role" -TEMP_CREDENTIALS=$(aws sts assume-role --role-arn arn:aws:iam::${FIGCHAT_GAMMA_ID}:role/FigIoChat-S3Access-Role-Gamma --role-session-name S3AccessSession 2>/dev/null || echo '{}') +TEMP_CREDENTIALS=$(aws sts assume-role --role-arn ${CHAT_DOWNLOAD_ROLE_ARN} --role-session-name S3AccessSession 2>/dev/null || echo '{}') QCHAT_ACCESSKEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.AccessKeyId') Q_SECRET_ACCESS_KEY=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SecretAccessKey') Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." -S3_BUCKET="fig-io-chat-build-output-${FIGCHAT_GAMMA_ID}-us-east-1" S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl" echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip" # Try download, if hash is invalid we fail. AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY" AWS_SESSION_TOKEN="$Q_SESSION_TOKEN" \ - aws s3 cp s3://${S3_BUCKET}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 + aws s3 cp s3://${CHAT_BUILD_BUCKET_NAME}/${S3_PREFIX}/qchat.zip ./qchat.zip --region us-east-1 # Handle the zip file, copy the qchat executable to /usr/local/bin + symlink from old code echo "Extracting qchat.zip..." From 80be0628a61a470ae1ce16446e1bb4079cb1eb2d Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 17 Jul 2025 09:44:51 -0700 Subject: [PATCH 34/41] take git hash as user input to avoid confusion --- .github/workflows/terminal-bench.yaml | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 162504d7b1..9491d719cc 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -7,20 +7,21 @@ name: Terminal-Bench on: workflow_dispatch: inputs: - name: - description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' - default: 'all' + git_commit_hash: + description: 'Input git commit hash to run TB on' required: true + default: 'latest' type: string - + jobs: run-benchmark: # avoids disk storage issues - runs-on: ubuntu-latest-8-cores + runs-on: ubuntu-latest # makes these env vars available in main.py env: CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }} CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }} + GIT_HASH: ${{ github.event.inputs.git_commit_hash }} permissions: id-token: write contents: read @@ -41,18 +42,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - # Captures git hash of branch to query specific S3 bucket - - name: Set git hash - run: | - if [ -n "$GITHUB_SHA" ]; then - git_hash=$(git rev-parse "$GITHUB_SHA") - else - git_hash="latest" - fi - # appends to github_env file - echo "GIT_HASH=$git_hash" >> $GITHUB_ENV - echo "Git hash set to: $git_hash" - - name: Set up Python uses: actions/setup-python@v4 with: From 8e7b75991329381f9330057f8bd223e6908f6007 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 17 Jul 2025 10:04:45 -0700 Subject: [PATCH 35/41] description --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 9491d719cc..116be7a22c 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -8,7 +8,7 @@ on: workflow_dispatch: inputs: git_commit_hash: - description: 'Input git commit hash to run TB on' + description: 'Input git commit hash to run TB on (must exist on S3)' required: true default: 'latest' type: string From fd9a09ba8040c5cf0a0356f1e3a54fe0215d98da Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Tue, 5 Aug 2025 16:54:55 -0700 Subject: [PATCH 36/41] change n and allow clean up --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index eb8ff688c8..33f3a2f539 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -61,7 +61,7 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --n-tasks=20 --cleanup # uploads results if run fails as well to allow for easy log inspection - name: Upload results From 93c312d82a45c694264137ec2c69911422902271 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Wed, 6 Aug 2025 09:47:13 -0700 Subject: [PATCH 37/41] Update terminal-bench.yaml --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index 33f3a2f539..ac2ed44b4e 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -61,7 +61,7 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --n-tasks=20 --cleanup + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup # uploads results if run fails as well to allow for easy log inspection - name: Upload results From 4f4643bd6215df59e222a9615ddb361fa8d4c009 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:34:44 -0700 Subject: [PATCH 38/41] Update setup_amazon_q.sh to use gnu instead of musl --- terminal-bench-test/setup_amazon_q.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index fa83d1aeb5..08b12d4564 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -34,7 +34,7 @@ Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." -S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl" +S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-gnu" echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip" # Try download, if hash is invalid we fail. From b1bdfeeec423e655dc615027f44ce6ff8e7c5b95 Mon Sep 17 00:00:00 2001 From: arjun37602 <33012834+arjun37602@users.noreply.github.com> Date: Thu, 7 Aug 2025 09:32:43 -0700 Subject: [PATCH 39/41] set n tasks --- .github/workflows/terminal-bench.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index ac2ed44b4e..bc14b69734 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -61,7 +61,7 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup --n-tasks=5 # uploads results if run fails as well to allow for easy log inspection - name: Upload results From f0ca4153580b6330af2891b42acd18c518fc27a0 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 7 Aug 2025 09:46:30 -0700 Subject: [PATCH 40/41] mimic official installation and use gnu --- terminal-bench-test/setup_amazon_q.sh | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 08b12d4564..2c89cdf63b 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -45,15 +45,28 @@ AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY echo "Extracting qchat.zip..." unzip -q qchat.zip -# move it to /usr/local/bin/qchat for path as qchat may not work otherwise -if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then +# Debug: Show extracted structure +echo "Extracted contents:" +ls -la + +# Extract and install mimicing the official release +if [ -d "q" ] && [ -f "q/bin/qchat" ]; then + cp q/bin/qchat /usr/local/bin/qchat + cp q/bin/q /usr/local/bin/q 2>/dev/null || ln -sf /usr/local/bin/qchat /usr/local/bin/q + chmod +x /usr/local/bin/qchat /usr/local/bin/q + echo "qchat installed successfully" +elif [ -f "qchat" ]; then + # Fallback if it's just the binary + cp qchat /usr/local/bin/qchat ln -sf /usr/local/bin/qchat /usr/local/bin/q + chmod +x /usr/local/bin/qchat echo "qchat installed successfully" else - echo "ERROR: Failed to install qchat" + echo "ERROR: qchat not found in expected locations" + ls -la exit 1 fi echo "Cleaning q zip" rm -f qchat.zip -rm -rf qchat +rm -rf q qchat From 77eb58de5cb643bc6642a10dc8e1bfb0c1adbb01 Mon Sep 17 00:00:00 2001 From: Arjun Balaji Date: Thu, 7 Aug 2025 09:57:03 -0700 Subject: [PATCH 41/41] run chat_cli executable --- terminal-bench-test/setup_amazon_q.sh | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index 2c89cdf63b..6cb580e502 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -45,24 +45,16 @@ AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY echo "Extracting qchat.zip..." unzip -q qchat.zip -# Debug: Show extracted structure -echo "Extracted contents:" -ls -la +# Extract and install - the executable is named chat_cli +# qchat → runs /usr/local/bin/qchat directly → which is the chat_cli binary -# Extract and install mimicing the official release -if [ -d "q" ] && [ -f "q/bin/qchat" ]; then - cp q/bin/qchat /usr/local/bin/qchat - cp q/bin/q /usr/local/bin/q 2>/dev/null || ln -sf /usr/local/bin/qchat /usr/local/bin/q - chmod +x /usr/local/bin/qchat /usr/local/bin/q - echo "qchat installed successfully" -elif [ -f "qchat" ]; then - # Fallback if it's just the binary - cp qchat /usr/local/bin/qchat +if [ -f "chat_cli" ]; then + cp chat_cli /usr/local/bin/qchat ln -sf /usr/local/bin/qchat /usr/local/bin/q chmod +x /usr/local/bin/qchat echo "qchat installed successfully" else - echo "ERROR: qchat not found in expected locations" + echo "ERROR: chat_cli executable not found" ls -la exit 1 fi