From b4c36ea8db60ff09bda1dd9b14f1e3937932ab53 Mon Sep 17 00:00:00 2001
From: Dorin Geman
Date: Thu, 5 Feb 2026 15:14:40 +0200
Subject: [PATCH] fix: build vLLM from source for ARM64 CUDA 13 (NVIDIA DGX)

The prebuilt vLLM ARM64 wheels are ABI-incompatible with the PyTorch
CUDA 13 nightly builds.

For ARM64 with CUDA 13 (e.g., NVIDIA DGX GB300 Blackwell, DGX GB200):
- Install CUDA toolkit 13.0 for compilation
- Use PyTorch nightly with cu130 support
- Build vLLM from source to ensure ABI compatibility

Add a VLLM_ARM64_BUILD_FROM_SOURCE build arg (default: true) to allow
opting out of source builds for faster build times on non-CUDA 13
systems.

Also:
- Update the AMD64 wheel path to manylinux_2_35 (required for cu130)
- Bump vLLM to 0.15.1

Signed-off-by: Dorin Geman
---
 Dockerfile | 34 +++++++++++++++++++++++++++++++---
 Makefile   |  5 +++--
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b20d796e..c6b2d94c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -85,26 +85,54 @@ ENTRYPOINT ["/app/model-runner"]
 
 # --- vLLM variant ---
 FROM llamacpp AS vllm
-ARG VLLM_VERSION=0.12.0
+ARG VLLM_VERSION=0.15.1
 ARG VLLM_CUDA_VERSION=cu130
 ARG VLLM_PYTHON_TAG=cp38-abi3
 ARG TARGETARCH
+# Build vLLM from source on ARM64 for CUDA 13 compatibility (e.g., NVIDIA DGX).
+# Set to "false" to use prebuilt wheels instead (faster build, but may not work on CUDA 13).
+ARG VLLM_ARM64_BUILD_FROM_SOURCE=true
 
 USER root
 
-RUN apt update && apt install -y python3 python3-venv python3-dev curl ca-certificates build-essential && rm -rf /var/lib/apt/lists/*
+# Install build dependencies, including the CUDA toolkit for compiling vLLM from source on ARM64.
+# Note: the base image already has the CUDA repo configured, so cuda-toolkit can be installed directly.
+RUN apt update && apt install -y \
+    python3 python3-venv python3-dev \
+    curl ca-certificates build-essential \
+    git cmake ninja-build \
+    && if [ "$(uname -m)" = "aarch64" ] && [ "$VLLM_ARM64_BUILD_FROM_SOURCE" = "true" ]; then \
+        apt install -y cuda-toolkit-13-0; \
+    fi \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set CUDA paths for ARM64 builds
+ENV PATH=/usr/local/cuda-13.0/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:$LD_LIBRARY_PATH
 
 RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env
 
 USER modelrunner
 
 # Install uv and vLLM as modelrunner user
+# For AMD64: use prebuilt CUDA 13 wheels (PyTorch pulled in as a dependency)
+# For ARM64 with VLLM_ARM64_BUILD_FROM_SOURCE=true: build from source against PyTorch nightly
+# For ARM64 with VLLM_ARM64_BUILD_FROM_SOURCE=false: use the prebuilt wheel (old behavior)
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
     && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
     && if [ "$TARGETARCH" = "amd64" ]; then \
-        WHEEL_ARCH="manylinux_2_31_x86_64"; \
+        WHEEL_ARCH="manylinux_2_35_x86_64"; \
         WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
         ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
+    elif [ "$VLLM_ARM64_BUILD_FROM_SOURCE" = "true" ]; then \
+        ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python \
+            torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130 \
+        && git clone --depth 1 --branch v${VLLM_VERSION} https://github.com/vllm-project/vllm.git /tmp/vllm \
+        && cd /tmp/vllm \
+        && /opt/vllm-env/bin/python use_existing_torch.py \
+        && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python -r requirements/build.txt \
+        && VLLM_TARGET_DEVICE=cuda ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python . --no-build-isolation \
+        && rm -rf /tmp/vllm; \
     else \
         ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
     fi
diff --git a/Makefile b/Makefile
index d39f2437..79251441 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ DOCKER_TARGET ?= final-llamacpp
 PORT := 8080
 MODELS_PATH := $(shell pwd)/models-store
 LLAMA_ARGS ?=
+EXTRA_DOCKER_BUILD_ARGS ?=
 DOCKER_BUILD_ARGS := \
 	--load \
 	--platform linux/$(shell docker version --format '{{.Server.Arch}}') \
@@ -84,11 +85,11 @@ lint:
 
 # Build Docker image
 docker-build:
-	docker buildx build $(DOCKER_BUILD_ARGS) .
+	docker buildx build $(DOCKER_BUILD_ARGS) $(EXTRA_DOCKER_BUILD_ARGS) .
 
 # Build multi-platform Docker image
 docker-build-multiplatform:
-	docker buildx build --platform linux/amd64,linux/arm64 $(DOCKER_BUILD_ARGS) .
+	docker buildx build --platform linux/amd64,linux/arm64 $(DOCKER_BUILD_ARGS) $(EXTRA_DOCKER_BUILD_ARGS) .
 
 # Run in Docker container with TCP port access and mounted model storage
 docker-run: docker-build
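
Usage sketch, assuming the Makefile defaults shown above (the DOCKER_TARGET for the vLLM variant is outside this hunk and is left at its default here):

    # Opt out of the ARM64 source build and fall back to the prebuilt wheel
    # (faster build, but may hit the CUDA 13 ABI incompatibility described above)
    make docker-build EXTRA_DOCKER_BUILD_ARGS="--build-arg VLLM_ARM64_BUILD_FROM_SOURCE=false"

    # Illustrative direct buildx invocation against the vllm stage from the Dockerfile
    docker buildx build --target vllm --build-arg VLLM_ARM64_BUILD_FROM_SOURCE=false .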