diff --git a/.github/authorized_workflow_users.txt b/.github/authorized_workflow_users.txt
new file mode 100644
index 000000000..a0d07ccc0
--- /dev/null
+++ b/.github/authorized_workflow_users.txt
@@ -0,0 +1,3 @@
+rlakhtakia
+liu-cong
+kfswain
diff --git a/.github/workflows/e2e-prefill-heavy-gke.yaml b/.github/workflows/e2e-prefill-heavy-gke.yaml
new file mode 100644
index 000000000..7adeaa564
--- /dev/null
+++ b/.github/workflows/e2e-prefill-heavy-gke.yaml
@@ -0,0 +1,339 @@
+name: GKE Prefill Heavy Test
+
+on:
+  # Runs with a PR comment /run-gke-prefill-heavy
+  issue_comment:
+    types: [created]
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'Pull-request number or branch name to test'
+        required: true
+        default: 'main'
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  # Authorization Job: Ensures only authorized users can execute workflow
+  # Note, even if user checks out branch to modify access, user will need to provide correct secret keys to deploy to GCP.
+  check_access:
+    runs-on: ubuntu-latest
+
+    if: |
+      (github.event_name == 'issue_comment' &&
+      github.event.issue.pull_request &&
+      contains(github.event.comment.body, '/run-gke-prefill-heavy')) || github.event_name == 'workflow_dispatch'
+
+    outputs:
+      authorized: ${{ steps.auth_logic.outputs.authorized }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Authorization Logic
+        id: auth_logic
+        shell: bash
+        env:
+          GH_TOKEN: ${{ github.token }}  # lets the gh CLI call below authenticate
+        run: |
+          authorized='false'
+          auth_file=".github/authorized_workflow_users.txt"
+          user=""
+          role=""
+
+          if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
+            user="${{ github.event.comment.user.login }}"
+            role="${{ github.event.comment.author_association }}"
+
+            # issue_comment payloads only carry links under issue.pull_request (no base ref), so
+            # ask the REST API for the PR's base branch instead of reading it from the event.
+            pr_api_path="repos/${{ github.repository }}/pulls/${{ github.event.issue.number }}"
+            base_ref="$(gh api "$pr_api_path" --jq '.base.ref')"
+            if [[ "$base_ref" != "main" ]]; then
+              echo "PR base is not 'main'."
+              echo "authorized=false" >> "$GITHUB_OUTPUT"
+              exit 1
+            fi
+
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            user="${{ github.actor }}"
+          fi
+
+          # NOTE(review): author_association is one of OWNER, MEMBER, COLLABORATOR, CONTRIBUTOR,
+          # FIRST_TIME_CONTRIBUTOR, FIRST_TIMER, MANNEQUIN or NONE -- never MAINTAINER -- so only
+          # OWNER can match here. Confirm which roles are meant to be allowed.
+          if [[ "$role" == "OWNER" || "$role" == "MAINTAINER" ]]; then
+            echo "User authorized by role: $role"
+            authorized='true'
+
+          elif grep -Fxq "$user" "$auth_file"; then
+            echo "User authorized by file lookup: $auth_file"
+            authorized='true'
+          fi
+
+          echo "authorized=$authorized" >> "$GITHUB_OUTPUT"
+
+  deploy_and_validate:
+    needs: [check_access]
+    if: |
+      (github.event_name == 'workflow_dispatch' || github.event_name == 'issue_comment') &&
+      needs.check_access.outputs.authorized == 'true'
+
+    name: Test on ${{ matrix.accelerator.name }}
+    runs-on: ubuntu-latest
+    # Name bash explicitly: GitHub then runs every step with `-eo pipefail`, so a failing
+    # command on the left of a pipe (e.g. `kubectl apply ... | tee`) fails the step.
+    defaults:
+      run:
+        shell: bash
+
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        accelerator:
+          - name: GPU
+
+    env:
+      GCP_PROJECT_ID: llm-d-scale
+      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
+      GKE_CLUSTER_ZONE: us-east5
+      NAMESPACE: igw-prefill-heavy
+      GATEWAY: gke-l7-regional-external-managed
+      GATEWAY_TYPE: gke
+      PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || github.event.number || 'actions' }}
+      HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      MODEL: meta-llama/Llama-3.1-8B-Instruct
+      GSA_EMAIL: ${{ secrets.GCS_WORKLOAD_SA }}
+      GCS_BUCKET: igw-e2e-benchmark-results
+      KSA_NAME: igw-e2e-benchmark-sa
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        env:
+          # Comment-triggered runs have no workflow_dispatch inputs; fall back to the PR number
+          # so the PR that was commented on is the one that gets checked out.
+          PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number }}
+        shell: bash
+        run: |
+          echo "PR_OR_BRANCH=${PR_OR_BRANCH:-actions}" >> "$GITHUB_ENV"
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          elif [[ "${{ github.event_name }}" = "pull_request" ]]; then
+            echo "PR_OR_BRANCH=${{ github.event.pull_request.number }}" >> "$GITHUB_ENV"
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          git fetch origin pull/"$PR_OR_BRANCH"/head:pr-"$PR_OR_BRANCH"
+          git checkout pr-"$PR_OR_BRANCH"
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: git checkout "$PR_OR_BRANCH"
+
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
+        with:
+          credentials_json: ${{ secrets.GCP_SA_KEY }}
+
+      - name: Set up gcloud CLI and kubectl
+        uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
+        with:
+          project_id: ${{ env.GCP_PROJECT_ID }}
+          install_components: 'kubectl,gke-gcloud-auth-plugin'
+
+      - name: Get GKE credentials
+        run: |
+          gcloud container clusters get-credentials "${{ env.GKE_CLUSTER_NAME }}" --zone "${{ env.GKE_CLUSTER_ZONE }}"
+
+      - name: Create namespace
+        run: |
+          kubectl create namespace "${NAMESPACE}" || echo "Namespace already exists"
+
+      - name: Create hf-token secret
+        run: |
+          # HF_TOKEN comes from the job-level env, keeping the secret out of the script text.
+          kubectl create secret generic hf-token \
+            --from-literal="token=${HF_TOKEN}" \
+            --namespace "${NAMESPACE}" \
+            --dry-run=client -o yaml | kubectl apply -f -
+
+      - name: Create and Annotate KSA for Workload Identity
+        run: |
+          kubectl create serviceaccount "$KSA_NAME" --namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
+          kubectl annotate serviceaccount "$KSA_NAME" \
+            "iam.gke.io/gcp-service-account=$GSA_EMAIL" \
+            --overwrite \
+            --namespace "${NAMESPACE}"
+
+      - name: Deploy Model Server and CRDs
+        run: |
+          cd config/manifests/vllm
+          echo "Deploying Model Server..."
+          kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log  # -a: one log shared by all deploy steps
+          echo "Installing CRDs"
+          kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml
+          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
+
+      - name: Deploy InferencePool and Endpoint Picker Extension
+        run: |
+          export IGW_CHART_VERSION=v1.1.0
+          helm install vllm-llama3-8b-instruct \
+            --namespace "$NAMESPACE" \
+            --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
+            --set "provider.name=$GATEWAY_TYPE" \
+            --version "$IGW_CHART_VERSION" \
+            oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-prefill-heavy-deployment.log  # append, do not truncate the model-server output
+          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
+
+      - name: Wait for all pods to be ready
+        run: |
+          kubectl wait pod \
+            --for=condition=Ready \
+            --all \
+            -n "${NAMESPACE}" \
+            --timeout=25m
+          echo "✅ All pods are ready."
+          kubectl get pods -n "${NAMESPACE}"
+
+      - name: Deploy Gateway
+        run: |
+          GATEWAY_NAME=inference-gateway
+          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found
+          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found
+          echo "Deploying Gateway..."
+          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/gateway.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
+          echo "Deploying HTTPRoute..."
+          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/httproute.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
+          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log
+
+      - name: Wait for gateway to be ready
+        run: |
+          GATEWAY_NAME=inference-gateway
+          kubectl wait "gateway/${GATEWAY_NAME}" \
+            --for=condition=Programmed=True \
+            -n "${NAMESPACE}" \
+            --timeout=500s
+          echo "✅ Gateway is ready."
+          kubectl get gateway -n "${NAMESPACE}"
+
+      - name: Show deployment status
+        run: |
+          echo "=== Deployments ==="
+          kubectl get deployments -n "${NAMESPACE}"
+          echo ""
+          echo "=== Pods ==="
+          kubectl get pods -n "${NAMESPACE}"
+          echo ""
+          echo "=== Services ==="
+          kubectl get svc -n "${NAMESPACE}"
+          echo ""
+          echo "=== Helm releases ==="
+          helm list -n "${NAMESPACE}" || true
+          echo ""
+          echo "=== Inference Pools ==="
+          kubectl get inferencepools -n "${NAMESPACE}" || true
+          echo ""
+          echo "=== HTTPRoutes ==="
+          kubectl get httproutes -n "${NAMESPACE}" -o yaml || true
+          echo ""
+          echo "=== Gateway ==="
+          kubectl get Gateway -n "${NAMESPACE}" || true
+          echo ""
+
+      - name: Verify installation and run validation test
+        run: |
+          cd .github/scripts/e2e
+          ./e2e-validate.sh -n "${NAMESPACE}" -v -m "${MODEL}"
+
+      - name: Run benchmarking test
+        run: |
+          TIMESTAMP=$(date +"%Y-%m-%d-%H-%M-%S")
+          cd benchmarking/single-workload
+          host="${GATEWAY_HOST:-$(kubectl get gateway -n "$NAMESPACE" \
+            -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)}"
+          if [[ -z "$host" ]]; then
+            echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
+            exit 1
+          fi
+          port=80
+          svc_host="${host}:${port}"
+          helm install prefill-heavy-benchmark ../inference-perf/ -f prefill-heavy-values.yaml \
+            --namespace "${NAMESPACE}" \
+            --create-namespace \
+            --set token.hfToken="${HF_TOKEN}" \
+            --set "config.server.base_url=http://${svc_host}" \
+            --set "job.serviceAccountName=$KSA_NAME" \
+            --set "job.image.tag=latest" \
+            --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
+            --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${TIMESTAMP}" \
+            --set "gcsPath=gs://${GCS_BUCKET}/datasets/billsum_conversations.json" \
+            --set "config.data.path=/gcsDataset/gcs-dataset.json" \
+            --set-string 'job.resources.limits.nvidia\.com/gpu=1'
+
+      - name: Wait for benchmarking job to finish
+        run: |
+          job_name=prefill-heavy-benchmark-inference-perf-job
+          TIMEOUT_DURATION="7200s"
+          if ! kubectl wait --for=condition=complete job/"$job_name" -n "$NAMESPACE" --timeout="$TIMEOUT_DURATION"; then
+            echo "Error: Benchmark job $job_name did not complete successfully within $TIMEOUT_DURATION." >&2
+            echo "--- Job Description ---" >&2
+            kubectl describe job "$job_name" -n "$NAMESPACE" >&2
+            echo "--- Pod Logs (Last 50 lines) ---" >&2
+            kubectl logs -l job-name="$job_name" -n "$NAMESPACE" --all-containers=true --tail 50 >&2
+            exit 1
+          fi
+          echo "✅ Benchmarking Job Completed."
+
+      - name: Collect and upload Kubernetes pod logs
+        if: always()
+        run: |
+          mkdir -p pod-logs-inference-prefill-heavy
+          cd pod-logs-inference-prefill-heavy
+          echo "Fetching ${NAMESPACE} pods log..."
+          kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
+            | xargs -I{} sh -c 'kubectl logs --all-containers=true -n "${NAMESPACE}" {} > "{}.log" 2>&1'
+          echo "Fetching ${NAMESPACE} pods descriptions..."
+          kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" \
+            | xargs -I{} sh -c 'kubectl describe pod -n "${NAMESPACE}" {} > "{}-describe.log" 2>&1'
+          mv ~/igw-prefill-heavy-deployment.log . || true
+          mv ~/install-deps.log . || true
+
+      - name: Upload pod logs as artifact
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: igw-pod-logs-inference-prefill-heavy-${{ matrix.accelerator.name }}
+          path: pod-logs-inference-prefill-heavy
+
+      - name: Send Google Chat notification on failure
+        if: failure()
+        uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
+        with:
+          webhookUrl: ${{ secrets.GOOGLE_CHAT_WEBHOOK }}
+          jobStatus: ${{ job.status }}
+          title: '${{ github.workflow }} - ${{ matrix.accelerator.name }}'
+
+      - name: Cleanup deployment
+        if: always()
+        run: |
+          GATEWAY_NAME=inference-gateway
+          helm uninstall vllm-llama3-8b-instruct -n "${NAMESPACE}" --ignore-not-found
+          helm uninstall prefill-heavy-benchmark -n "${NAMESPACE}" --ignore-not-found
+          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found
+          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found
diff --git a/.github/workflows/e2e-prefix-cache-aware-gke.yaml b/.github/workflows/e2e-prefix-cache-aware-gke.yaml
index f5db63580..ca344cd14 100644
--- a/.github/workflows/e2e-prefix-cache-aware-gke.yaml
+++ b/.github/workflows/e2e-prefix-cache-aware-gke.yaml
@@ -16,24 +16,74 @@ permissions:
   contents: read
 
 jobs:
+  # Authorization Job: Ensures only authorized users can execute workflow
+  # Note, even if user checks out branch to modify access, user will need to provide correct secret keys to deploy to GCP.
+  check_access:
+    runs-on: ubuntu-latest
+
+    if: |
+      (github.event_name == 'issue_comment' &&
+      github.event.issue.pull_request &&
+      contains(github.event.comment.body, '/run-gke-prefix-cache')) || github.event_name == 'workflow_dispatch'
+
+    outputs:
+      authorized: ${{ steps.auth_logic.outputs.authorized }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Authorization Logic
+        id: auth_logic
+        shell: bash
+        env:
+          GH_TOKEN: ${{ github.token }}  # lets the gh CLI call below authenticate
+        run: |
+          authorized='false'
+          auth_file=".github/authorized_workflow_users.txt"
+          user=""
+          role=""
+
+          if [[ "${{ github.event_name }}" == "issue_comment" ]]; then
+            user="${{ github.event.comment.user.login }}"
+            role="${{ github.event.comment.author_association }}"
+
+            # issue_comment payloads only carry links under issue.pull_request (no base ref), so
+            # ask the REST API for the PR's base branch instead of reading it from the event.
+            pr_api_path="repos/${{ github.repository }}/pulls/${{ github.event.issue.number }}"
+            base_ref="$(gh api "$pr_api_path" --jq '.base.ref')"
+            if [[ "$base_ref" != "main" ]]; then
+              echo "PR base is not 'main'."
+              echo "authorized=false" >> "$GITHUB_OUTPUT"
+              exit 1
+            fi
+
+          elif [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then
+            user="${{ github.actor }}"
+          fi
+
+          # NOTE(review): author_association is one of OWNER, MEMBER, COLLABORATOR, CONTRIBUTOR,
+          # FIRST_TIME_CONTRIBUTOR, FIRST_TIMER, MANNEQUIN or NONE -- never MAINTAINER -- so only
+          # OWNER can match here. Confirm which roles are meant to be allowed.
+          if [[ "$role" == "OWNER" || "$role" == "MAINTAINER" ]]; then
+            echo "User authorized by role: $role"
+            authorized='true'
+
+          elif grep -Fxq "$user" "$auth_file"; then
+            echo "User authorized by file lookup: $auth_file"
+            authorized='true'
+          fi
+
+          echo "authorized=$authorized" >> "$GITHUB_OUTPUT"
+
   deploy_and_validate:
-    if: >
-      github.event_name == 'workflow_dispatch' ||
-      (
-        github.event_name == 'issue_comment' &&
-        github.event.issue.pull_request &&
-        github.event.issue.pull_request.base.ref == 'main' &&
-        contains(github.event.comment.body, '/run-gke-prefix-cache')
-        &&
-        (
-          github.event.comment.author_association == 'OWNER' ||
-          github.event.comment.author_association == 'MEMBER' ||
-          github.event.comment.author_association == 'COLLABORATOR'
-        )
-      )
+    needs: [check_access]
+    if: |
+      (github.event_name == 'workflow_dispatch' || github.event_name == 'issue_comment') &&
+      needs.check_access.outputs.authorized == 'true'
+
     name: Test on ${{ matrix.accelerator.name }}
     runs-on: ubuntu-latest
-
     strategy:
       fail-fast: false
       max-parallel: 1
@@ -221,7 +262,7 @@ jobs:
             --set hfToken="${HF_TOKEN}" \
             --set "config.server.base_url=http://${svc_host}" \
             --set "job.serviceAccountName=$KSA_NAME" \
-            --set "job.image.tag=v0.2.0" \
+            --set "job.image.tag=latest" \
             --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
             --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${TIMESTAMP}" \
             --set-string 'job.resources.limits.nvidia\.com/gpu=1'
diff --git a/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md b/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md
index 99ada4b2d..79ac08b41 100644
--- a/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md
+++ b/site-src/performance/benchmark/advanced-configs/prefill-heavy-workload.md
@@ -126,3 +126,17 @@ helm uninstall prefill-heavy
 ## Post Benchmark Analysis
 
 Follow the benchmarking guide instructions to [compare benchmark results](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#analyze-the-results).
+
+
+## Running E2E Tests
+
+The following E2E test runs on GitHub using GitHub Actions.
+
+> If you have `MAINTAINER` access or above, you can trigger the workflow run from the GitHub Actions page, or by leaving a comment on the PR.
+
+> Please make sure there are no other GKE tests of the same type running at the same time, as they can interfere with each other.
+
+
+| Test name | Link | PR comment trigger |
+| :--- | :--- | :--- |
+| GKE Prefill Heavy Test | https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/.github/workflows/e2e-prefill-heavy-gke.yaml | /run-gke-prefill-heavy |
diff --git a/site-src/performance/benchmark/advanced-configs/prefix-cache-aware.md b/site-src/performance/benchmark/advanced-configs/prefix-cache-aware.md
index 0b35c7cd5..50c5854a2 100644
--- a/site-src/performance/benchmark/advanced-configs/prefix-cache-aware.md
+++ b/site-src/performance/benchmark/advanced-configs/prefix-cache-aware.md
@@ -137,3 +137,16 @@ helm uninstall my-low-cache-release
 ## Post Benchmark Analysis
 
 Follow the benchmarking guide instructions to [compare benchmark results](https://gateway-api-inference-extension.sigs.k8s.io/performance/benchmark/#analyze-the-results).
+
+## Running E2E Tests
+
+The following E2E test runs on GitHub using GitHub Actions.
+
+> If you have `MAINTAINER` access or above, you can trigger the workflow run from the GitHub Actions page, or by leaving a comment on the PR.
+
+> Please make sure there are no other GKE tests of the same type running at the same time, as they can interfere with each other.
+
+
+| Test name | Link | PR comment trigger |
+| :--- | :--- | :--- |
+| GKE Prefix Cache Aware Test | https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/.github/workflows/e2e-prefix-cache-aware-gke.yaml | /run-gke-prefix-cache |