Split build/Dockerfile into a shared files-common stage plus separate dev and
release stages. Review fixes folded into the added lines: the [aim] extra is
installed from /tmp/bdist_name (previously the nonexistent /tmpbdist_name), and
the dev stage switches to root before creating /.aim_profile.

diff --git a/build/Dockerfile b/build/Dockerfile
index 6944e42cb..d07d91acc 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -19,6 +19,7 @@ ARG USER=tuning
 ARG USER_UID=1000
 ARG PYTHON_VERSION=3.11
 ARG WHEEL_VERSION=""
+ARG ENABLE_AIM=false
 
 ## Base Layer ##################################################################
 FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} as base
@@ -42,13 +43,6 @@ ENV LANG=C.UTF-8 \
 RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \
     chmod g+rx /home/${USER}
 
-## Used as base of the Release stage to removed unrelated the packages and CVEs
-FROM base as release-base
-
-# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
-RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts
-
-
 ## CUDA Base ###################################################################
 FROM base as cuda-base
 
@@ -98,20 +92,31 @@ RUN dnf config-manager \
 
 ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
 
-FROM cuda-devel as python-installations
+## Python dep management / common files for dev & release #####
+FROM cuda-devel as files-common
 
 ARG WHEEL_VERSION
 ARG USER
 ARG USER_UID
-
-## Enable Aimstack if requested via ENABLE_AIM set to "true"
-ARG ENABLE_AIM=false
+ARG ENABLE_AIM
 
 RUN dnf install -y git && \
     # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies
     # Twistlock detects it as H severity: Private keys stored in image
     rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \
     dnf clean all
+
+# /app scripts and permission management
+RUN mkdir /app && \
+    chown -R $USER:0 /app /tmp && \
+    chmod -R g+rwX /app /tmp
+COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
+COPY build/utils.py /app/build/
+RUN chmod +x /app/accelerate_launch.py
+
+RUN mkdir /.cache && \
+    chmod -R 777 /.cache
+
 USER ${USER}
 WORKDIR /tmp
 RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
@@ -127,50 +132,97 @@ RUN if [[ -z "${WHEEL_VERSION}" ]]; \
     fi && \
     ls /tmp/*.whl >/tmp/bdist_name
 
+## Stages for dev images ######################################
+FROM files-common as dev
+ARG USER
+ARG USER_UID
+ARG ENABLE_AIM
+
+# Install from the wheel / optional deps, pytest, etc
+RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
+    python -m pip install --user wheel tox pytest && \
+    python -m pip install --user "$(head /tmp/bdist_name)" && \
+    python -m pip install --user "$(head /tmp/bdist_name)[flash-attn]" && \
+    python -m pip install --user "$(head /tmp/bdist_name)[dev]" && \
+    if [[ "${ENABLE_AIM}" == "true" ]]; then \
+        python -m pip install --user "$(head /tmp/bdist_name)[aim]"; \
+    fi && \
+    python -m pip uninstall wheel build -y && \
+    rm $(head /tmp/bdist_name) /tmp/bdist_name
+
+# files-common leaves USER set to ${USER}; creating /.aim_profile needs root
+USER root
+RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
+        touch /.aim_profile && \
+        chmod -R 777 /.aim_profile; \
+    fi
+
+# Create the directory for vscode-server; this directory has to be pre-created
+# such that the user can write to it, otherwise we can't attach a vscode instance
+# to it.
+RUN mkdir -p /app/.vscode-server && \
+    chown $USER:0 /app/.vscode-server
+
+WORKDIR /app
+USER ${USER}
+
+# Unit tests, build infrastructure, common scripts
+COPY --from=files-common /app/ /app/
+COPY --from=files-common /.cache/ /.cache/
+COPY tests /app/tests
+COPY tox.ini /app/
+COPY Makefile /app/
+COPY scripts /app/scripts
+
+ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
+ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"
+ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages:/app"
+
+## Stages for release images ##################################
+FROM files-common as python-rel-installations
+ARG USER
+ARG USER_UID
+ARG ENABLE_AIM
+
 # Install from the wheel
+# TODO - probably a good idea to install most stuff in common and copy it out
+# in both this stage and dev.
 RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \
     python -m pip install --user wheel && \
-    python -m pip install --user "$(head bdist_name)" && \
-    python -m pip install --user "$(head bdist_name)[flash-attn]" && \
+    python -m pip install --user "$(head /tmp/bdist_name)" && \
+    python -m pip install --user "$(head /tmp/bdist_name)[flash-attn]" && \
     if [[ "${ENABLE_AIM}" == "true" ]]; then \
-        python -m pip install --user "$(head bdist_name)[aim]"; \
+        python -m pip install --user "$(head /tmp/bdist_name)[aim]"; \
     fi && \
     # Clean up the wheel module. It's only needed by flash-attn install
     python -m pip uninstall wheel build -y && \
     # Cleanup the bdist whl file
     rm $(head bdist_name) /tmp/bdist_name
 
-## Final image ################################################
-FROM release-base as release
+FROM base as release
 ARG USER
+ARG ENABLE_AIM
 ARG PYTHON_VERSION
 
-RUN mkdir -p /licenses
-COPY LICENSE /licenses/
-
-RUN mkdir /app && \
-    chown -R $USER:0 /app /tmp && \
-    chmod -R g+rwX /app /tmp
+# Removes the python3.9 code to eliminate possible CVEs. Also removes dnf
+RUN rpm -e $(dnf repoquery python3-* -q --installed) dnf python3 yum crypto-policies-scripts
 
-# Need a better way to address these hacks
 RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
         touch /.aim_profile && \
         chmod -R 777 /.aim_profile; \
     fi
-RUN mkdir /.cache && \
-    chmod -R 777 /.cache
-
-# Copy scripts and default configs
-COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
-COPY build/utils.py /app/build/
-RUN chmod +x /app/accelerate_launch.py
 
-ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
-ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"
+RUN mkdir -p /licenses
+COPY LICENSE /licenses/
 
 WORKDIR /app
 USER ${USER}
-COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local
+COPY --from=python-rel-installations /home/${USER}/.local /home/${USER}/.local
+COPY --from=files-common /app/ /app/
+COPY --from=files-common /.cache/ /.cache/
+
+ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
+ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"
 ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages"
 
 CMD [ "python", "/app/accelerate_launch.py" ]