diff --git a/.github/workflows/terminal-bench.yaml b/.github/workflows/terminal-bench.yaml index dd6efd57e2..bc14b69734 100644 --- a/.github/workflows/terminal-bench.yaml +++ b/.github/workflows/terminal-bench.yaml @@ -7,12 +7,11 @@ name: Terminal-Bench on: workflow_dispatch: inputs: - name: - description: 'Run terminal-bench workflow to test Q CLI in real terminal environments.' - default: 'all' + git_commit_hash: + description: 'Input git commit hash to run TB on (must exist on S3)' required: true + default: 'latest' type: string - jobs: run-benchmark: # avoids disk storage issues @@ -21,6 +20,7 @@ jobs: env: CHAT_DOWNLOAD_ROLE_ARN: ${{ secrets.CHAT_DOWNLOAD_ROLE_ARN }} CHAT_BUILD_BUCKET_NAME: ${{ secrets.CHAT_BUILD_BUCKET_NAME }} + GIT_HASH: ${{ github.event.inputs.git_commit_hash }} permissions: id-token: write contents: read @@ -41,18 +41,6 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - # Captures git hash of branch to query specific S3 bucket - - name: Set git hash - run: | - if [ -n "$GITHUB_SHA" ]; then - git_hash=$(git rev-parse "$GITHUB_SHA") - else - git_hash="latest" - fi - # appends to github_env file - echo "GIT_HASH=$git_hash" >> $GITHUB_ENV - echo "Git hash set to: $git_hash" - - name: Set up Python uses: actions/setup-python@v4 with: @@ -73,7 +61,7 @@ jobs: - name: Run terminal benchmark run: | cd terminal-bench-test - tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head + tb run --agent-import-path main:AmazonQCLIAgent --dataset-name terminal-bench-core --dataset-version head --cleanup --n-tasks=5 # uploads results if run fails as well to allow for easy log inspection - name: Upload results diff --git a/terminal-bench-test/setup_amazon_q.sh b/terminal-bench-test/setup_amazon_q.sh index fa83d1aeb5..6cb580e502 100644 --- a/terminal-bench-test/setup_amazon_q.sh +++ b/terminal-bench-test/setup_amazon_q.sh @@ -34,7 +34,7 @@ Q_SESSION_TOKEN=$(echo $TEMP_CREDENTIALS | jq -r '.Credentials.SessionToken') # Download specific build from S3 based on commit hash echo "Downloading Amazon Q CLI build from S3..." -S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-musl" +S3_PREFIX="main/${GIT_HASH}/x86_64-unknown-linux-gnu" echo "Downloading qchat.zip from s3://.../${S3_PREFIX}/qchat.zip" # Try download, if hash is invalid we fail. @@ -45,15 +45,20 @@ AWS_ACCESS_KEY_ID="$QCHAT_ACCESSKEY" AWS_SECRET_ACCESS_KEY="$Q_SECRET_ACCESS_KEY echo "Extracting qchat.zip..." unzip -q qchat.zip -# move it to /usr/local/bin/qchat for path as qchat may not work otherwise -if cp qchat /usr/local/bin/ && chmod +x /usr/local/bin/qchat; then +# Extract and install - the executable is named chat_cli +# qchat → runs /usr/local/bin/qchat directly → which is the chat_cli binary + +if [ -f "chat_cli" ]; then + cp chat_cli /usr/local/bin/qchat ln -sf /usr/local/bin/qchat /usr/local/bin/q + chmod +x /usr/local/bin/qchat echo "qchat installed successfully" else - echo "ERROR: Failed to install qchat" + echo "ERROR: chat_cli executable not found" + ls -la exit 1 fi echo "Cleaning q zip" rm -f qchat.zip -rm -rf qchat +rm -rf q qchat