diff --git a/Dockerfile b/Dockerfile
index b20d796e..c6b2d94c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -85,26 +85,54 @@ ENTRYPOINT ["/app/model-runner"]
 # --- vLLM variant ---
 FROM llamacpp AS vllm
-ARG VLLM_VERSION=0.12.0
+ARG VLLM_VERSION=0.15.1
 ARG VLLM_CUDA_VERSION=cu130
 ARG VLLM_PYTHON_TAG=cp38-abi3
 ARG TARGETARCH
+# Build vLLM from source on ARM64 for CUDA 13 compatibility (e.g., NVIDIA DGX).
+# Set to "false" to use prebuilt wheels instead (faster build, but may not work on CUDA 13).
+ARG VLLM_ARM64_BUILD_FROM_SOURCE=true
 
 USER root
-RUN apt update && apt install -y python3 python3-venv python3-dev curl ca-certificates build-essential && rm -rf /var/lib/apt/lists/*
+# Install build dependencies including CUDA toolkit for compiling vLLM from source on ARM64
+# Note: Base image already has CUDA repo configured, just install cuda-toolkit directly
+RUN apt update && apt install -y \
+        python3 python3-venv python3-dev \
+        curl ca-certificates build-essential \
+        git cmake ninja-build \
+    && if [ "$(uname -m)" = "aarch64" ] && [ "$VLLM_ARM64_BUILD_FROM_SOURCE" = "true" ]; then \
+        apt install -y cuda-toolkit-13-0; \
+    fi \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set CUDA paths for ARM64 builds
+ENV PATH=/usr/local/cuda-13.0/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda-13.0/lib64:$LD_LIBRARY_PATH
 
 RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env
 
 USER modelrunner
 
 # Install uv and vLLM as modelrunner user
+# For AMD64: Use prebuilt CUDA 13 wheels (PyTorch pulled as dependency)
+# For ARM64 with VLLM_ARM64_BUILD_FROM_SOURCE=true: Build from source against PyTorch nightly
+# For ARM64 with VLLM_ARM64_BUILD_FROM_SOURCE=false: Use prebuilt wheel (old behavior)
 RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
     && ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
     && if [ "$TARGETARCH" = "amd64" ]; then \
-        WHEEL_ARCH="manylinux_2_31_x86_64"; \
+        WHEEL_ARCH="manylinux_2_35_x86_64"; \
         WHEEL_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}%2B${VLLM_CUDA_VERSION}-${VLLM_PYTHON_TAG}-${WHEEL_ARCH}.whl"; \
         ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "$WHEEL_URL"; \
+    elif [ "$VLLM_ARM64_BUILD_FROM_SOURCE" = "true" ]; then \
+        ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python \
+            torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu130 \
+        && git clone --depth 1 --branch v${VLLM_VERSION} https://github.com/vllm-project/vllm.git /tmp/vllm \
+        && cd /tmp/vllm \
+        && /opt/vllm-env/bin/python use_existing_torch.py \
+        && ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python -r requirements/build.txt \
+        && VLLM_TARGET_DEVICE=cuda ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python . --no-build-isolation \
+        && rm -rf /tmp/vllm; \
     else \
         ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"; \
     fi
 
diff --git a/Makefile b/Makefile
index d39f2437..79251441 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ DOCKER_TARGET ?= final-llamacpp
 PORT := 8080
 MODELS_PATH := $(shell pwd)/models-store
 LLAMA_ARGS ?=
+EXTRA_DOCKER_BUILD_ARGS ?=
 DOCKER_BUILD_ARGS := \
 	--load \
 	--platform linux/$(shell docker version --format '{{.Server.Arch}}') \
@@ -84,11 +85,11 @@ lint:
 
 # Build Docker image
 docker-build:
-	docker buildx build $(DOCKER_BUILD_ARGS) .
+	docker buildx build $(DOCKER_BUILD_ARGS) $(EXTRA_DOCKER_BUILD_ARGS) .
 
 # Build multi-platform Docker image
 docker-build-multiplatform:
-	docker buildx build --platform linux/amd64,linux/arm64 $(DOCKER_BUILD_ARGS) .
+	docker buildx build --platform linux/amd64,linux/arm64 $(DOCKER_BUILD_ARGS) $(EXTRA_DOCKER_BUILD_ARGS) .
 
 # Run in Docker container with TCP port access and mounted model storage
 docker-run: docker-build
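With these changes, extra buildx flags can be threaded through the Makefile. A minimal usage sketch (only the docker-build target shown in this diff is assumed), disabling the ARM64 source build in favor of the prebuilt wheel:

    # Pass a build arg through the new EXTRA_DOCKER_BUILD_ARGS hook
    make docker-build EXTRA_DOCKER_BUILD_ARGS='--build-arg VLLM_ARM64_BUILD_FROM_SOURCE=false'

The override only matters when the vLLM stage is built; the default (true) keeps the CUDA 13-compatible source build on ARM64.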