From d4999cb9be3525dac7945443967635a2683f2965 Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Thu, 18 Dec 2025 13:07:29 +0530 Subject: [PATCH 1/4] change the image build and push workflows for various branches Signed-off-by: Dushyant Behl --- .github/workflows/dev-image.yaml | 24 ++++++++++++++++++++++++ .github/workflows/format.yml | 4 ++-- .github/workflows/image.yaml | 19 +++++++++++++++---- .github/workflows/release-image.yaml | 28 ++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/dev-image.yaml create mode 100644 .github/workflows/release-image.yaml diff --git a/.github/workflows/dev-image.yaml b/.github/workflows/dev-image.yaml new file mode 100644 index 000000000..527e713b3 --- /dev/null +++ b/.github/workflows/dev-image.yaml @@ -0,0 +1,24 @@ +name: dev-image +on: + push: + branches: [ "staging" ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: "Free up disk space" + uses: ./.github/actions/free-up-disk-space + - name: Build image + run: | + docker build -t fms-hf-tuning-staging:latest . -f build/nvcr.Dockerfile + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Push docker image to Quay.io + run: | + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning-staging:latest \ No newline at end of file diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 3122efb0e..0dd24e0ce 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -16,9 +16,9 @@ name: Format on: push: - branches: [ "main", "release" ] + branches: [ "main", "release", "staging" ] pull_request: - branches: [ "main", "release" ] + branches: [ "main", "release", "staging" ] jobs: lint: diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 4cee55e7b..e4063fdb9 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -1,9 +1,9 @@ name: Image on: push: - branches: [ "main", "release" ] + branches: [ "main" ] pull_request: - branches: [ "main", "release" ] + branches: [ "main" ] jobs: build: @@ -12,6 +12,17 @@ jobs: - uses: actions/checkout@v4 - name: "Free up disk space" uses: ./.github/actions/free-up-disk-space - - name: Build image + - name: Build NVCR dev Image run: | - docker build -t fms-hf-tuning:dev . -f build/Dockerfile + docker build -t fms-hf-tuning-dev:latest . -f build/nvcr.Dockerfile + - name: Login to Quay.io + if: github.event_name == 'push' + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Push docker image for every commit to Quay.io as dev images + if: github.event_name == 'push' + run: | + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning-dev:latest \ No newline at end of file diff --git a/.github/workflows/release-image.yaml b/.github/workflows/release-image.yaml new file mode 100644 index 000000000..04e6c189e --- /dev/null +++ b/.github/workflows/release-image.yaml @@ -0,0 +1,28 @@ +name: Image +on: + push: + branches: [ "release" ] + pull_request: + branches: [ "release" ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: "Free up disk space" + uses: ./.github/actions/free-up-disk-space + - name: Build UBI9 Prod Image + run: | + docker build -t fms-hf-tuning:latest . -f build/Dockerfile + - name: Login to Quay.io + if: github.event_name == 'push' + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Push docker image to Quay.io + if: github.event_name == 'push' + run: | + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:latest \ No newline at end of file From cbc406ff53a1f6484b625a4150493739e8a0bb6b Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Thu, 18 Dec 2025 18:48:08 +0530 Subject: [PATCH 2/4] push staging images on tags/release change image names to be standards fix nvcr dockerfile Signed-off-by: Dushyant Behl --- .github/workflows/dev-image.yaml | 24 ---------------- .github/workflows/image.yaml | 4 +-- .github/workflows/release-image.yaml | 8 ++++-- .github/workflows/staging-image.yaml | 42 ++++++++++++++++++++++++++++ build/nvcr.Dockerfile | 1 + 5 files changed, 51 insertions(+), 28 deletions(-) delete mode 100644 .github/workflows/dev-image.yaml create mode 100644 .github/workflows/staging-image.yaml diff --git a/.github/workflows/dev-image.yaml b/.github/workflows/dev-image.yaml deleted file mode 100644 index 527e713b3..000000000 --- a/.github/workflows/dev-image.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name: dev-image -on: - push: - branches: [ "staging" ] - -jobs: - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: "Free up disk space" - uses: ./.github/actions/free-up-disk-space - - name: Build image - run: | - docker build -t fms-hf-tuning-staging:latest . -f build/nvcr.Dockerfile - - name: Login to Quay.io - uses: docker/login-action@v3 - with: - registry: quay.io - username: ${{ secrets.QUAY_USERNAME }} - password: ${{ secrets.QUAY_ROBOT_TOKEN }} - - name: Push docker image to Quay.io - run: | - docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning-staging:latest \ No newline at end of file diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index e4063fdb9..0afbf4dbe 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -14,7 +14,7 @@ jobs: uses: ./.github/actions/free-up-disk-space - name: Build NVCR dev Image run: | - docker build -t fms-hf-tuning-dev:latest . -f build/nvcr.Dockerfile + docker build -t fms-hf-tuning:main-nvcr-latest . -f build/nvcr.Dockerfile - name: Login to Quay.io if: github.event_name == 'push' uses: docker/login-action@v3 @@ -25,4 +25,4 @@ jobs: - name: Push docker image for every commit to Quay.io as dev images if: github.event_name == 'push' run: | - docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning-dev:latest \ No newline at end of file + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:main-nvcr-latest \ No newline at end of file diff --git a/.github/workflows/release-image.yaml b/.github/workflows/release-image.yaml index 04e6c189e..c4bd5a978 100644 --- a/.github/workflows/release-image.yaml +++ b/.github/workflows/release-image.yaml @@ -14,7 +14,10 @@ jobs: uses: ./.github/actions/free-up-disk-space - name: Build UBI9 Prod Image run: | - docker build -t fms-hf-tuning:latest . -f build/Dockerfile + docker build \ + -t fms-hf-tuning:ubi9-latest \ + -t fms-hf-tuning:release-ubi9-latest \ + -f build/Dockerfile . - name: Login to Quay.io if: github.event_name == 'push' uses: docker/login-action@v3 @@ -25,4 +28,5 @@ jobs: - name: Push docker image to Quay.io if: github.event_name == 'push' run: | - docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:latest \ No newline at end of file + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:ubi9-latest + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:release-ubi9-latest \ No newline at end of file diff --git a/.github/workflows/staging-image.yaml b/.github/workflows/staging-image.yaml new file mode 100644 index 000000000..99eb6060e --- /dev/null +++ b/.github/workflows/staging-image.yaml @@ -0,0 +1,42 @@ +name: dev-image +on: + push: + tags: + - 'v*.*.*' + release: + types: [published] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: "Free up disk space" + uses: ./.github/actions/free-up-disk-space + - name: Determine image tag + id: tag + run: | + if [ "${{ github.event_name }}" = "release" ]; then + TAG="${{ github.event.release.tag_name }}" + elif [ "${{ github.ref_type }}" = "tag" ]; then + TAG="${GITHUB_REF_NAME}" + else + TAG="dev" + fi + echo "IMAGE_TAG=$TAG" >> $GITHUB_ENV + - name: Build image + run: | + docker build \ + -t fms-hf-tuning:${IMAGE_TAG}-nvcr \ + -t fms-hf-tuning:staging-nvcr-latest \ + -f build/nvcr.Dockerfile . + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_ROBOT_TOKEN }} + - name: Push docker image to Quay.io + run: | + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:staging-nvcr-latest + docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:${IMAGE_TAG}-nvcr \ No newline at end of file diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile index 1ee94d5e2..d703b045c 100644 --- a/build/nvcr.Dockerfile +++ b/build/nvcr.Dockerfile @@ -48,6 +48,7 @@ RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index COPY . ${SOURCE_DIR} RUN cd ${SOURCE_DIR} +RUN pip install --upgrade pip setuptools wheel RUN pip install --no-cache-dir ${SOURCE_DIR} RUN pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn] From 8cc8a0c75ec8574ae39e7afe1e10917954458b4d Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Thu, 18 Dec 2025 18:50:27 +0530 Subject: [PATCH 3/4] remove staging branch Signed-off-by: Dushyant Behl --- .github/workflows/format.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 0dd24e0ce..3122efb0e 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -16,9 +16,9 @@ name: Format on: push: - branches: [ "main", "release", "staging" ] + branches: [ "main", "release" ] pull_request: - branches: [ "main", "release", "staging" ] + branches: [ "main", "release" ] jobs: lint: From 5ed66b9c6d042905275e4b591a79c373aac6dbd1 Mon Sep 17 00:00:00 2001 From: Dushyant Behl Date: Fri, 19 Dec 2025 00:39:45 +0530 Subject: [PATCH 4/4] update nvcr dockefile to multistage to save space Signed-off-by: Dushyant Behl --- .github/workflows/image.yaml | 4 --- build/nvcr.Dockerfile | 54 ++++++++++++++++++++++-------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 0afbf4dbe..06edf85ed 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -2,8 +2,6 @@ name: Image on: push: branches: [ "main" ] - pull_request: - branches: [ "main" ] jobs: build: @@ -16,13 +14,11 @@ jobs: run: | docker build -t fms-hf-tuning:main-nvcr-latest . -f build/nvcr.Dockerfile - name: Login to Quay.io - if: github.event_name == 'push' uses: docker/login-action@v3 with: registry: quay.io username: ${{ secrets.QUAY_USERNAME }} password: ${{ secrets.QUAY_ROBOT_TOKEN }} - name: Push docker image for every commit to Quay.io as dev images - if: github.event_name == 'push' run: | docker push ${{ env.QUAY_REPOSITORY }}/fms-hf-tuning:main-nvcr-latest \ No newline at end of file diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile index d703b045c..713988fcc 100644 --- a/build/nvcr.Dockerfile +++ b/build/nvcr.Dockerfile @@ -20,8 +20,8 @@ ARG NVCR_IMAGE_VERSION=25.02-py3 # This is based on what is inside the NVCR image already ARG PYTHON_VERSION=3.12 -## Base Layer ################################################################## -FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS dev +######################## BUILDER ######################## +FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS builder ARG USER=root ARG USER_UID=0 @@ -29,28 +29,27 @@ ARG WORKDIR=/app ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning ARG ENABLE_FMS_ACCELERATION=true -ARG ENABLE_AIM=true -ARG ENABLE_MLFLOW=true -ARG ENABLE_SCANNER=true +ARG ENABLE_AIM=false +ARG ENABLE_MLFLOW=false +ARG ENABLE_SCANNER=false ARG ENABLE_CLEARML=true ARG ENABLE_TRITON_KERNELS=true -ARG ENABLE_MAMBA_SUPPORT=true # Ensures to always build mamba_ssm from source ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm -RUN python -m pip install --upgrade pip - # upgrade torch as the base layer contains only torch 2.7 -RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128 +RUN python -m pip install --upgrade pip && \ + pip install --upgrade setuptools && \ + pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128 # Install main package + flash attention COPY . ${SOURCE_DIR} RUN cd ${SOURCE_DIR} -RUN pip install --upgrade pip setuptools wheel -RUN pip install --no-cache-dir ${SOURCE_DIR} -RUN pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn] +RUN pip install --no-cache-dir ${SOURCE_DIR} && \ + pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn] && \ + pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba] # Optional extras RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \ @@ -62,6 +61,12 @@ RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \ python -m fms_acceleration.cli install fms_acceleration_odm; \ fi +RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \ + pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \ + fi +RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \ + pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \ + fi RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \ pip install --no-cache-dir ${SOURCE_DIR}[aim]; \ fi @@ -71,15 +76,22 @@ RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \ RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \ pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \ fi -RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \ - fi -RUN if [[ "${ENABLE_MAMBA_SUPPORT}" == "true" ]]; then \ - pip install --no-cache-dir ${SOURCE_DIR}[mamba]; \ - fi -RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \ - pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \ - fi + +# cleanup +RUN rm -rf /root/.cache /tmp/* /opt/pytorch + +######################## RUNTIME ######################## +FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} + +WORKDIR ${WORKDIR} + +# Copy only Python site-packages + app +COPY --from=builder /usr/local/lib/python3.12/dist-packages \ + /usr/local/lib/python3.12/dist-packages +COPY --from=builder ${SOURCE_DIR} ${SOURCE_DIR} + +# Runtime cleanup +RUN rm -rf /opt/pytorch /root/.cache /tmp/* RUN chmod -R g+rwX $WORKDIR /tmp RUN mkdir -p /.cache && chmod -R 777 /.cache