diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index 4cee55e7b..06edf85ed 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -1,9 +1,7 @@
 name: Image
 on:
   push:
-    branches: [ "main", "release" ]
-  pull_request:
-    branches: [ "main", "release" ]
+    branches: [ "main" ]
 
 jobs:
   build:
@@ -12,6 +10,23 @@ jobs:
       - uses: actions/checkout@v4
       - name: "Free up disk space"
         uses: ./.github/actions/free-up-disk-space
-      - name: Build image
+      - name: Build NVCR dev Image
         run: |
-          docker build -t fms-hf-tuning:dev . -f build/Dockerfile
+          docker build -t fms-hf-tuning:main-nvcr-latest . -f build/nvcr.Dockerfile
+      - name: Login to Quay.io
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
+      - name: Push docker image for every commit to Quay.io as dev images
+        env:
+          # Registry namespace (e.g. quay.io/<org>), read from a repository
+          # variable because this workflow defines no QUAY_REPOSITORY itself.
+          QUAY_REPOSITORY: ${{ vars.QUAY_REPOSITORY }}
+        run: |
+          : "${QUAY_REPOSITORY:?repository variable QUAY_REPOSITORY is not set}"
+          # The build step only creates a local tag; add the registry-qualified
+          # name before pushing.
+          docker tag fms-hf-tuning:main-nvcr-latest "${QUAY_REPOSITORY}/fms-hf-tuning:main-nvcr-latest"
+          docker push "${QUAY_REPOSITORY}/fms-hf-tuning:main-nvcr-latest"
diff --git a/.github/workflows/release-image.yaml b/.github/workflows/release-image.yaml
new file mode 100644
index 000000000..c4bd5a978
--- /dev/null
+++ b/.github/workflows/release-image.yaml
@@ -0,0 +1,41 @@
+name: Release Image
+on:
+  push:
+    branches: [ "release" ]
+  pull_request:
+    branches: [ "release" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Free up disk space"
+        uses: ./.github/actions/free-up-disk-space
+      - name: Build UBI9 Prod Image
+        run: |
+          docker build \
+            -t fms-hf-tuning:ubi9-latest \
+            -t fms-hf-tuning:release-ubi9-latest \
+            -f build/Dockerfile .
+      - name: Login to Quay.io
+        if: github.event_name == 'push'
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
+      - name: Push docker image to Quay.io
+        if: github.event_name == 'push'
+        env:
+          # Registry namespace (e.g. quay.io/<org>), read from a repository
+          # variable because this workflow defines no QUAY_REPOSITORY itself.
+          QUAY_REPOSITORY: ${{ vars.QUAY_REPOSITORY }}
+        run: |
+          : "${QUAY_REPOSITORY:?repository variable QUAY_REPOSITORY is not set}"
+          # The build step only creates local tags; add the registry-qualified
+          # names before pushing.
+          for tag in ubi9-latest release-ubi9-latest; do
+            docker tag "fms-hf-tuning:${tag}" "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
+            docker push "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
+          done
diff --git a/.github/workflows/staging-image.yaml b/.github/workflows/staging-image.yaml
new file mode 100644
index 000000000..99eb6060e
--- /dev/null
+++ b/.github/workflows/staging-image.yaml
@@ -0,0 +1,55 @@
+name: Staging Image
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+  release:
+    types: [published]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: "Free up disk space"
+        uses: ./.github/actions/free-up-disk-space
+      - name: Determine image tag
+        id: tag
+        env:
+          # Handed over through the environment instead of being expanded into
+          # the script text, so a crafted tag name cannot inject shell commands.
+          RELEASE_TAG: ${{ github.event.release.tag_name }}
+        run: |
+          if [ "${GITHUB_EVENT_NAME}" = "release" ]; then
+            TAG="${RELEASE_TAG}"
+          elif [ "${GITHUB_REF_TYPE}" = "tag" ]; then
+            TAG="${GITHUB_REF_NAME}"
+          else
+            TAG="dev"
+          fi
+          echo "IMAGE_TAG=$TAG" >> "$GITHUB_ENV"
+      - name: Build image
+        run: |
+          docker build \
+            -t "fms-hf-tuning:${IMAGE_TAG}-nvcr" \
+            -t fms-hf-tuning:staging-nvcr-latest \
+            -f build/nvcr.Dockerfile .
+      - name: Login to Quay.io
+        uses: docker/login-action@v3
+        with:
+          registry: quay.io
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_ROBOT_TOKEN }}
+      - name: Push docker image to Quay.io
+        env:
+          # Registry namespace (e.g. quay.io/<org>), read from a repository
+          # variable because this workflow defines no QUAY_REPOSITORY itself.
+          QUAY_REPOSITORY: ${{ vars.QUAY_REPOSITORY }}
+        run: |
+          : "${QUAY_REPOSITORY:?repository variable QUAY_REPOSITORY is not set}"
+          # The build step only creates local tags; add the registry-qualified
+          # names before pushing.
+          for tag in staging-nvcr-latest "${IMAGE_TAG}-nvcr"; do
+            docker tag "fms-hf-tuning:${tag}" "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
+            docker push "${QUAY_REPOSITORY}/fms-hf-tuning:${tag}"
+          done
diff --git a/build/nvcr.Dockerfile b/build/nvcr.Dockerfile
index 1ee94d5e2..713988fcc 100644
--- a/build/nvcr.Dockerfile
+++ b/build/nvcr.Dockerfile
@@ -20,8 +20,8 @@ ARG NVCR_IMAGE_VERSION=25.02-py3
 # This is based on what is inside the NVCR image already
 ARG PYTHON_VERSION=3.12
 
-## Base Layer ##################################################################
-FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS dev
+######################## BUILDER ########################
+FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION} AS builder
 
 ARG USER=root
 ARG USER_UID=0
@@ -29,27 +29,29 @@ ARG WORKDIR=/app
 ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning
 
 ARG ENABLE_FMS_ACCELERATION=true
-ARG ENABLE_AIM=true
-ARG ENABLE_MLFLOW=true
-ARG ENABLE_SCANNER=true
+ARG ENABLE_AIM=false
+ARG ENABLE_MLFLOW=false
+ARG ENABLE_SCANNER=false
 ARG ENABLE_CLEARML=true
 ARG ENABLE_TRITON_KERNELS=true
-ARG ENABLE_MAMBA_SUPPORT=true
 
 # Ensures to always build mamba_ssm from source
 ENV PIP_NO_BINARY=mamba-ssm,mamba_ssm
 
-RUN python -m pip install --upgrade pip
-
 # upgrade torch as the base layer contains only torch 2.7
-RUN pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
+RUN python -m pip install --upgrade pip && \
+    pip install --upgrade setuptools && \
+    pip install --upgrade --force-reinstall torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu128
 
 # Install main package + flash attention
 COPY . ${SOURCE_DIR}
 RUN cd ${SOURCE_DIR}
 
-RUN pip install --no-cache-dir ${SOURCE_DIR}
-RUN pip install --user --no-build-isolation ${SOURCE_DIR}[flash-attn]
+# Install system-wide (no --user): the runtime stage copies only dist-packages,
+# so anything placed under ~/.local would be left behind in the builder.
+RUN pip install --no-cache-dir ${SOURCE_DIR} && \
+    pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[flash-attn] && \
+    pip install --no-cache-dir --no-build-isolation ${SOURCE_DIR}[mamba]
 
 # Optional extras
 RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
@@ -61,6 +63,12 @@ RUN if [[ "${ENABLE_FMS_ACCELERATION}" == "true" ]]; then \
     python -m fms_acceleration.cli install fms_acceleration_odm; \
     fi
 
+RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
+    pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
+    fi
+RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
+    pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
+    fi
 RUN if [[ "${ENABLE_AIM}" == "true" ]]; then \
     pip install --no-cache-dir ${SOURCE_DIR}[aim]; \
     fi
@@ -70,15 +78,27 @@ RUN if [[ "${ENABLE_MLFLOW}" == "true" ]]; then \
 RUN if [[ "${ENABLE_SCANNER}" == "true" ]]; then \
     pip install --no-cache-dir ${SOURCE_DIR}[scanner-dev]; \
     fi
-RUN if [[ "${ENABLE_CLEARML}" == "true" ]]; then \
-    pip install --no-cache-dir ${SOURCE_DIR}[clearml]; \
-    fi
-RUN if [[ "${ENABLE_MAMBA_SUPPORT}" == "true" ]]; then \
-    pip install --no-cache-dir ${SOURCE_DIR}[mamba]; \
-    fi
-RUN if [[ "${ENABLE_TRITON_KERNELS}" == "true" ]]; then \
-    pip install --no-cache-dir "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"; \
-    fi
+
+# cleanup
+RUN rm -rf /root/.cache /tmp/* /opt/pytorch
+
+######################## RUNTIME ########################
+FROM nvcr.io/nvidia/pytorch:${NVCR_IMAGE_VERSION}
+
+# ARG values are scoped to a single build stage, so the paths declared in the
+# builder stage must be declared again before this stage can expand them.
+ARG WORKDIR=/app
+ARG SOURCE_DIR=${WORKDIR}/fms-hf-tuning
+
+WORKDIR ${WORKDIR}
+
+# Copy only Python site-packages + app
+COPY --from=builder /usr/local/lib/python3.12/dist-packages \
+    /usr/local/lib/python3.12/dist-packages
+COPY --from=builder ${SOURCE_DIR} ${SOURCE_DIR}
+
+# Runtime cleanup
+RUN rm -rf /opt/pytorch /root/.cache /tmp/*
 
 RUN chmod -R g+rwX $WORKDIR /tmp
 RUN mkdir -p /.cache && chmod -R 777 /.cache