Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
name: Image
on:
push:
branches: [ "main", "release" ]
pull_request:
branches: [ "main", "release" ]
branches: [ "main" ]

jobs:
build:
Expand All @@ -12,6 +10,15 @@ jobs:
- uses: actions/checkout@v4
- name: "Free up disk space"
uses: ./.github/actions/free-up-disk-space
- name: Build image
- name: Build NVCR dev Image
run: |
docker build -t fms-hf-tuning:dev . -f build/Dockerfile
docker build -t fms-hf-tuning:main-nvcr-latest . -f build/nvcr.Dockerfile
- name: Login to Quay.io
  # Only log in on push events. This workflow also runs on pull_request,
  # and runs triggered from forks do not receive repository secrets, so the
  # login would fail there.
  if: github.event_name == 'push'
  uses: docker/login-action@v3
  with:
    registry: quay.io
    username: ${{ secrets.QUAY_USERNAME }}
    password: ${{ secrets.QUAY_ROBOT_TOKEN }}
- name: Push docker image for every commit to Quay.io as dev images
  if: github.event_name == 'push'
  run: |
    set -euo pipefail
    # NOTE(review): QUAY_REPOSITORY is not defined in the visible part of
    # this workflow; it must be provided as workflow/job-level `env`.
    # Fail with a clear message instead of pushing to "/fms-hf-tuning:...".
    : "${QUAY_REPOSITORY:?QUAY_REPOSITORY must be set, e.g. quay.io/<org>}"
    # The build step tags the image without a registry prefix, so it has to
    # be retagged with the full remote name before it can be pushed.
    docker tag fms-hf-tuning:main-nvcr-latest "${QUAY_REPOSITORY}/fms-hf-tuning:main-nvcr-latest"
    docker push "${QUAY_REPOSITORY}/fms-hf-tuning:main-nvcr-latest"
32 changes: 32 additions & 0 deletions .github/workflows/release-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Builds the UBI9 production image for the `release` branch.
# Pull requests only build the image; pushes also publish it to Quay.io.
name: Image
on:
  push:
    branches: ["release"]
  pull_request:
    branches: ["release"]

env:
  # Registry namespace the image is pushed to (e.g. quay.io/<org>).
  # NOTE(review): the original workflow read `env.QUAY_REPOSITORY` without
  # defining it anywhere. It is sourced here from a repository/organization
  # configuration variable of the same name -- confirm that variable exists.
  QUAY_REPOSITORY: ${{ vars.QUAY_REPOSITORY }}

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: "Free up disk space"
        uses: ./.github/actions/free-up-disk-space
      - name: Build UBI9 Prod Image
        run: |
          docker build \
            -t fms-hf-tuning:ubi9-latest \
            -t fms-hf-tuning:release-ubi9-latest \
            -f build/Dockerfile .
      - name: Login to Quay.io
        # Secrets are only needed (and only reliably available) on push.
        if: github.event_name == 'push'
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
      - name: Push docker image to Quay.io
        if: github.event_name == 'push'
        run: |
          set -euo pipefail
          # Fail early with a clear message rather than pushing to
          # "/fms-hf-tuning:<tag>" when the repository is not configured.
          : "${QUAY_REPOSITORY:?QUAY_REPOSITORY must be set, e.g. quay.io/<org>}"
          # The build step tags the images without a registry prefix, so each
          # one is retagged with its full remote name before being pushed.
          for tag in ubi9-latest release-ubi9-latest; do
            docker tag "fms-hf-tuning:${tag}" "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
            docker push "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
          done
42 changes: 42 additions & 0 deletions .github/workflows/staging-image.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Builds the NVCR-based image for version tags and published releases and
# pushes it to Quay.io under a version-specific tag and a moving
# `staging-nvcr-latest` tag.
name: dev-image
on:
  push:
    tags:
      - 'v*.*.*'
  release:
    types: [published]

env:
  # Registry namespace the image is pushed to (e.g. quay.io/<org>).
  # NOTE(review): the original workflow read `env.QUAY_REPOSITORY` without
  # defining it anywhere. It is sourced here from a repository/organization
  # configuration variable of the same name -- confirm that variable exists.
  QUAY_REPOSITORY: ${{ vars.QUAY_REPOSITORY }}

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: "Free up disk space"
        uses: ./.github/actions/free-up-disk-space
      - name: Determine image tag
        id: tag
        # Context values are passed through environment variables instead of
        # being interpolated into the script text, so a crafted tag name
        # cannot inject shell commands.
        env:
          EVENT_NAME: ${{ github.event_name }}
          RELEASE_TAG: ${{ github.event.release.tag_name }}
          REF_TYPE: ${{ github.ref_type }}
        run: |
          if [ "${EVENT_NAME}" = "release" ]; then
            TAG="${RELEASE_TAG}"
          elif [ "${REF_TYPE}" = "tag" ]; then
            TAG="${GITHUB_REF_NAME}"
          else
            TAG="dev"
          fi
          # Exported via GITHUB_ENV so the following steps see IMAGE_TAG.
          echo "IMAGE_TAG=${TAG}" >> "${GITHUB_ENV}"
      - name: Build image
        run: |
          docker build \
            -t "fms-hf-tuning:${IMAGE_TAG}-nvcr" \
            -t fms-hf-tuning:staging-nvcr-latest \
            -f build/nvcr.Dockerfile .
      - name: Login to Quay.io
        uses: docker/login-action@v3
        with:
          registry: quay.io
          username: ${{ secrets.QUAY_USERNAME }}
          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
      - name: Push docker image to Quay.io
        run: |
          set -euo pipefail
          # Fail early with a clear message rather than pushing to
          # "/fms-hf-tuning:<tag>" when the repository is not configured.
          : "${QUAY_REPOSITORY:?QUAY_REPOSITORY must be set, e.g. quay.io/<org>}"
          # The build step tags the images without a registry prefix, so each
          # one is retagged with its full remote name before being pushed.
          for tag in staging-nvcr-latest "${IMAGE_TAG}-nvcr"; do
            docker tag "fms-hf-tuning:${tag}" "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
            docker push "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
          done
53 changes: 33 additions & 20 deletions build/nvcr.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,36 @@ ARG NVCR_IMAGE_VERSION=25.02-py3
# This is based on what is inside the NVCR image already
ARG PYTHON_VERSION=3.12

## Base Layer ##################################################################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS dev
######################## BUILDER ########################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS builder

ARG USER=root
ARG USER_UID=0
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning

ARG ENABLE_FMS_ACCELERATION=true
ARG ENABLE_AIM=true
ARG ENABLE_MLFLOW=true
ARG ENABLE_SCANNER=true
ARG ENABLE_AIM=false
ARG ENABLE_MLFLOW=false
ARG ENABLE_SCANNER=false
ARG ENABLE_CLEARML=true
ARG ENABLE_TRITON_KERNELS=true
ARG ENABLE_MAMBA_SUPPORT=true

# Ensure mamba_ssm is always built from source
ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm

RUN python -m pip install --upgrade pip

# upgrade torch as the base layer contains only torch 2.7
RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
RUN python -m pip install --upgrade pip && \
pip install --upgrade setuptools && \
pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128

# Install main package + flash attention
COPY . ${SOURCE_DIR}
RUN cd ${SOURCE_DIR}

RUN pip install --no-cache-dir ${SOURCE_DIR}
RUN pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn]
RUN pip install --no-cache-dir ${SOURCE_DIR} && \
pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn] && \
pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba]

# Optional extras
RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
Expand All @@ -61,6 +61,12 @@ RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
python -m fms_acceleration.cli install fms_acceleration_odm; \
fi

RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
fi
RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
fi
RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[aim]; \
fi
Expand All @@ -70,15 +76,22 @@ RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
fi
RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
fi
RUN if [[ "${ENABLE_MAMBA_SUPPORT}" == "true" ]]; then \
pip install --no-cache-dir ${SOURCE_DIR}[mamba]; \
fi
RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
fi

# cleanup
RUN rm -rf /root/.cache /tmp/* /opt/pytorch

######################## RUNTIME ########################
FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION}

# ARG values go out of scope at the end of the stage that declares them, so
# the builder's WORKDIR/SOURCE_DIR are declared again here with the same
# defaults. Without this, ${WORKDIR} and ${SOURCE_DIR} expand to empty strings
# in the instructions below.
ARG WORKDIR=/app
ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning

WORKDIR ${WORKDIR}

# Copy only Python site-packages + app
# NOTE(review): the builder installs the flash-attn extra with
# `pip install --user`, which targets the user site directory rather than
# dist-packages, so anything installed that way is not copied here -- confirm
# it is still importable in the runtime image.
COPY --from=builder /usr/local/lib/python3.12/dist-packages \
    /usr/local/lib/python3.12/dist-packages
COPY --from=builder ${SOURCE_DIR} ${SOURCE_DIR}

# Runtime cleanup
RUN rm -rf /opt/pytorch /root/.cache /tmp/*

RUN chmod -R g+rwX $WORKDIR /tmp
RUN mkdir -p /.cache && chmod -R 777 /.cache
Expand Down