# Workflow header: display name, triggers, and default token permissions.
name: GKE Prefill Heavy Test

on:
  # Runs with a PR comment /run-gke-prefill-heavy
  # (the comment body and author are filtered by the job-level `if:`).
  issue_comment:
    types: [created]
  # Manual run against an explicit PR number or branch.
  workflow_dispatch:
    inputs:
      pr_or_branch:
        description: 'Pull-request number or branch name to test'
        required: true
        # No surrounding whitespace: this value is passed verbatim to git.
        default: 'main'
        type: string

# The workflow token only needs to read repository contents.
permissions:
  contents: read
17+
jobs:
  # Deploys a vLLM model server, an InferencePool and a GKE Gateway into a
  # fixed namespace, validates the installation, runs the prefill-heavy
  # benchmark as a Helm-installed job, collects logs, and cleans up.
  deploy_and_validate:
    # Runs on manual dispatch, or on a PR comment containing
    # /run-gke-prefill-heavy written by an OWNER, MEMBER or COLLABORATOR.
    # The issue_comment payload exposes `issue.pull_request` only as a set of
    # links (no `base` object), so a base-branch comparison here can never be
    # true and must not be part of this expression.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (
      github.event_name == 'issue_comment' &&
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '/run-gke-prefill-heavy') &&
      (
      github.event.comment.author_association == 'OWNER' ||
      github.event.comment.author_association == 'MEMBER' ||
      github.event.comment.author_association == 'COLLABORATOR'
      )
      )
    name: Test on ${{ matrix.accelerator.name }}
    runs-on: ubuntu-latest

    defaults:
      run:
        # An explicit `shell: bash` makes the runner start bash with
        # `-eo pipefail`, so a failing kubectl/helm on the left-hand side of
        # `| tee` fails the step instead of being masked by tee's exit status.
        shell: bash

    strategy:
      fail-fast: false
      max-parallel: 1
      matrix:
        accelerator:
          - name: GPU

    env:
      GCP_PROJECT_ID: llm-d-scale
      GKE_CLUSTER_NAME: llm-d-e2e-us-east5
      # Holds a region name; passed to gcloud via --location below.
      GKE_CLUSTER_ZONE: us-east5
      NAMESPACE: igw-prefill-heavy
      GATEWAY: gke-l7-regional-external-managed
      GATEWAY_TYPE: gke
      # Dispatch input, else the commented PR number, else the pull_request
      # event number, else the literal fallback 'actions'.
      PR_OR_BRANCH: ${{ github.event.inputs.pr_or_branch || github.event.issue.number || github.event.number || 'actions' }}
      HF_TOKEN: ${{ secrets.HF_TOKEN }}
      MODEL: meta-llama/Llama-3.1-8B-Instruct
      GSA_EMAIL: ${{ secrets.GCS_WORKLOAD_SA }}
      GCS_BUCKET: igw-e2e-benchmark-results
      KSA_NAME: igw-e2e-benchmark-sa

    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          persist-credentials: false

      # Classifies the job-level PR_OR_BRANCH: an all-digit value is treated
      # as a pull-request number, anything else as a branch name. The value
      # is deliberately NOT overridden here, so comment-triggered runs keep
      # the PR number resolved in the job-level env.
      - name: Determine if pr_or_branch is a PR number
        id: check_pr
        run: |
          if [[ "${PR_OR_BRANCH}" =~ ^[0-9]+$ ]]; then
            echo "is_pr=true" >> "${GITHUB_OUTPUT}"
          else
            echo "is_pr=false" >> "${GITHUB_OUTPUT}"
          fi

      - name: Fetch and checkout PR
        if: steps.check_pr.outputs.is_pr == 'true'
        run: |
          git fetch origin pull/"${PR_OR_BRANCH}"/head:pr-"${PR_OR_BRANCH}"
          git checkout pr-"${PR_OR_BRANCH}"

      # The initial checkout only contains the ref that triggered the run, so
      # fall back to fetching the requested branch when it is not available
      # locally.
      - name: Checkout branch
        if: steps.check_pr.outputs.is_pr == 'false'
        run: |
          if ! git checkout "${PR_OR_BRANCH}"; then
            git fetch --depth=1 origin \
              "+refs/heads/${PR_OR_BRANCH}:refs/remotes/origin/${PR_OR_BRANCH}"
            git checkout -B "${PR_OR_BRANCH}" "refs/remotes/origin/${PR_OR_BRANCH}"
          fi

      - name: Authenticate to Google Cloud
        id: auth
        uses: google-github-actions/auth@b7593ed2efd1c1617e1b0254da33b86225adb2a5
        with:
          credentials_json: ${{ secrets.GCP_SA_KEY }}

      - name: Set up gcloud CLI and kubectl
        uses: google-github-actions/setup-gcloud@cb1e50a9932213ecece00a606661ae9ca44f3397
        with:
          project_id: ${{ env.GCP_PROJECT_ID }}
          install_components: 'kubectl,gke-gcloud-auth-plugin'

      # --location accepts either a zone or a region.
      - name: Get GKE credentials
        run: |
          gcloud container clusters get-credentials "${GKE_CLUSTER_NAME}" --location "${GKE_CLUSTER_ZONE}"

      # Idempotent create: succeeds when the namespace already exists but
      # still fails the step on any other error.
      - name: Create namespace
        run: |
          kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -

      # The token is read from the HF_TOKEN environment variable rather than
      # being expanded into the script text.
      - name: Create hf-token secret
        run: |
          kubectl create secret generic hf-token \
            --from-literal="token=${HF_TOKEN}" \
            --namespace "${NAMESPACE}" \
            --dry-run=client -o yaml | kubectl apply -f -

      - name: Create and Annotate KSA for Workload Identity
        run: |
          kubectl create serviceaccount "${KSA_NAME}" --namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
          kubectl annotate serviceaccount "${KSA_NAME}" \
            "iam.gke.io/gcp-service-account=${GSA_EMAIL}" \
            --overwrite \
            --namespace "${NAMESPACE}"

      # All deployment steps append (tee -a) to one shared log file that is
      # moved into the artifact directory by the log-collection step.
      - name: Deploy Model Server and CRDs
        run: |
          cd config/manifests/vllm
          echo "Deploying Model Server..."
          kubectl apply -f gpu-deployment.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "Installing CRDs"
          kubectl apply -f https://github.com/kubernetes-sigs/gateway-api-inference-extension/releases/download/v1.1.0/manifests.yaml
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Deploy InferencePool and Endpoint Picker Extension
        run: |
          export IGW_CHART_VERSION=v1.1.0
          helm install vllm-llama3-8b-instruct \
            --namespace "${NAMESPACE}" \
            --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \
            --set provider.name="${GATEWAY_TYPE}" \
            --version "${IGW_CHART_VERSION}" \
            oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool | tee -a ~/igw-prefill-heavy-deployment.log
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Wait for all pods to be ready
        run: |
          kubectl wait pod \
            --for=condition=Ready \
            --all \
            -n "${NAMESPACE}" \
            --timeout=25m
          echo "✅ All pods are ready."
          kubectl get pods -n "${NAMESPACE}"

      # Removes any existing route/gateway first, then applies fresh ones.
      - name: Deploy Gateway
        run: |
          GATEWAY_NAME=inference-gateway
          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found
          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found
          echo "Deploying Gateway..."
          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/gateway.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "Deploying HTTPRoute..."
          kubectl apply -f https://raw.githubusercontent.com/kubernetes-sigs/gateway-api-inference-extension/refs/tags/v1.1.0/config/manifests/gateway/gke/httproute.yaml -n "${NAMESPACE}" | tee -a ~/igw-prefill-heavy-deployment.log
          echo "---------------------------------------" >> ~/igw-prefill-heavy-deployment.log

      - name: Wait for gateway to be ready
        run: |
          GATEWAY_NAME=inference-gateway
          kubectl wait "gateway/${GATEWAY_NAME}" \
            --for=condition=Programmed=True \
            -n "${NAMESPACE}" \
            --timeout=500s
          echo "✅ Gateway is ready."
          kubectl get gateway -n "${NAMESPACE}"

      # Diagnostic dump only; the optional resource kinds are best-effort.
      - name: Show deployment status
        run: |
          echo "=== Deployments ==="
          kubectl get deployments -n "${NAMESPACE}"
          echo ""
          echo "=== Pods ==="
          kubectl get pods -n "${NAMESPACE}"
          echo ""
          echo "=== Services ==="
          kubectl get svc -n "${NAMESPACE}"
          echo ""
          echo "=== Helm releases ==="
          helm list -n "${NAMESPACE}" || true
          echo ""
          echo "=== Inference Pools ==="
          kubectl get inferencepools -n "${NAMESPACE}" || true
          echo ""
          echo "=== HTTPRoutes ==="
          kubectl get httproutes -n "${NAMESPACE}" -o yaml || true
          echo ""
          echo "=== Gateway ==="
          kubectl get Gateway -n "${NAMESPACE}" || true
          echo ""

      - name: Verify installation and run validation test
        run: |
          cd .github/scripts/e2e
          ./e2e-validate.sh -n "${NAMESPACE}" -v -m "${MODEL}"

      # Installs the benchmark chart pointed at the Gateway address; results
      # are written under <namespace>/<timestamp> in the GCS bucket.
      - name: Run benchmarking test
        run: |
          TIMESTAMP=$(date +"%Y-%m-%d-%H-%M-%S")
          cd benchmarking/single-workload
          # GATEWAY_HOST, when set, overrides address discovery.
          host="${GATEWAY_HOST:-$(kubectl get gateway -n "$NAMESPACE" \
            -o jsonpath='{.items[0].status.addresses[0].value}' 2>/dev/null || true)}"
          if [[ -z "$host" ]]; then
            echo "Error: could not discover a Gateway address in namespace '$NAMESPACE'." >&2
            exit 1
          fi
          port=80
          svc_host="${host}:${port}"
          helm install prefill-heavy-benchmark ../inference-perf/ -f prefill-heavy-values.yaml \
            --namespace "${NAMESPACE}" \
            --create-namespace \
            --set token.hfToken="${HF_TOKEN}" \
            --set "config.server.base_url=http://${svc_host}" \
            --set "job.serviceAccountName=$KSA_NAME" \
            --set "job.image.tag=v0.2.0" \
            --set "config.storage.google_cloud_storage.bucket_name=${GCS_BUCKET}" \
            --set "config.storage.google_cloud_storage.path=${NAMESPACE}/${TIMESTAMP}" \
            --set "gcsPath=gs://${GCS_BUCKET}/datasets/billsum_conversations.json" \
            --set "config.data.path=/gcsDataset/gcs-dataset.json" \
            --set-string 'job.resources.limits.nvidia\.com/gpu=1'

      # Blocks until the job reports `complete` or the timeout elapses; on
      # failure prints the job description and recent pod logs.
      - name: Wait for benchmarking job to finish
        run: |
          job_name=prefill-heavy-benchmark-inference-perf-job
          TIMEOUT_DURATION="7200s"
          if ! kubectl wait --for=condition=complete job/"$job_name" -n "$NAMESPACE" --timeout="$TIMEOUT_DURATION"; then
            echo "Error: Benchmark job $job_name did not complete successfully within $TIMEOUT_DURATION." >&2
            echo "--- Job Description ---" >&2
            kubectl describe job "$job_name" -n "$NAMESPACE" >&2
            echo "--- Pod Logs (Last 50 lines) ---" >&2
            kubectl logs -l job-name="$job_name" -n "$NAMESPACE" --all-containers=true --tail 50 >&2
            exit 1
          fi
          echo "✅ Benchmarking Job Completed."

      # Best-effort: a pod whose logs cannot be read must not prevent the
      # remaining logs and descriptions from being collected.
      - name: Collect and upload Kubernetes pod logs
        if: always()
        run: |
          mkdir -p pod-logs-inference-prefill-heavy
          cd pod-logs-inference-prefill-heavy
          pods="$(kubectl get pods -n "${NAMESPACE}" --no-headers -o custom-columns=":metadata.name" || true)"
          echo "Fetching ${NAMESPACE} pods log..."
          for pod in ${pods}; do
            kubectl logs --all-containers=true -n "${NAMESPACE}" "${pod}" > "${pod}.log" 2>&1 || true
          done
          echo "Fetching ${NAMESPACE} pods descriptions..."
          for pod in ${pods}; do
            kubectl describe pod -n "${NAMESPACE}" "${pod}" > "${pod}-describe.log" 2>&1 || true
          done
          mv ~/igw-prefill-heavy-deployment.log . || true
          mv ~/install-deps.log . || true

      - name: Upload pod logs as artifact
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: igw-pod-logs-inference-prefill-heavy-${{ matrix.accelerator.name }}
          path: pod-logs-inference-prefill-heavy

      - name: Send Google Chat notification on failure
        if: failure()
        uses: SimonScholz/google-chat-action@3b3519e5102dba8aa5046fd711c4b553586409bb
        with:
          webhookUrl: ${{ secrets.GOOGLE_CHAT_WEBHOOK }}
          jobStatus: ${{ job.status }}
          title: '${{ github.workflow }} - ${{ matrix.accelerator.name }}'

      # Every cleanup command is attempted even if an earlier one fails; the
      # step still exits non-zero when any of them failed.
      - name: Cleanup deployment
        if: always()
        run: |
          GATEWAY_NAME=inference-gateway
          rc=0
          helm uninstall vllm-llama3-8b-instruct -n "${NAMESPACE}" --ignore-not-found || rc=$?
          helm uninstall prefill-heavy-benchmark -n "${NAMESPACE}" --ignore-not-found || rc=$?
          kubectl delete httproute llm-route -n "${NAMESPACE}" --ignore-not-found || rc=$?
          kubectl delete gateway "${GATEWAY_NAME}" -n "${NAMESPACE}" --ignore-not-found || rc=$?
          exit "${rc}"
0 commit comments