Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 56 additions & 5 deletions .github/workflows/UploadDockerImages.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# This workflow builds and pushes MaxText images for both TPU and GPU devices.
# It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch.

name: Build Images
name: Build and Test Images

on:
schedule:
Expand All @@ -32,6 +32,11 @@ on:
- all
- tpu
- gpu
for_dev_test:
description: 'For development test purpose. All images will be added a -test suffix'
required: false
type: boolean
default: false

permissions:
contents: read
Expand All @@ -42,6 +47,7 @@ jobs:
outputs:
maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }}
image_date: ${{ steps.vars.outputs.image_date }}
image_suffix: ${{ steps.vars.outputs.image_suffix }}
steps:
- name: Checkout MaxText
uses: actions/checkout@v5
Expand All @@ -55,6 +61,13 @@ jobs:
# Image date
echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT

# If for_dev_test is true, set suffix to -test, otherwise empty
if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then
echo "image_suffix=-test" >> $GITHUB_OUTPUT
else
echo "image_suffix=" >> $GITHUB_OUTPUT
fi

tpu-pre-training:
name: ${{ matrix.image_name }}
needs: setup
Expand All @@ -72,7 +85,7 @@ jobs:
dockerfile: ./dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
Expand All @@ -96,14 +109,13 @@ jobs:
dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}
base_image: gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ needs.setup.outputs.image_date }}
is_post_training: true

gpu-pre-training:
name: ${{ matrix.image_name }}
Expand All @@ -122,9 +134,48 @@ jobs:
dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile
uses: ./.github/workflows/build_and_push_docker_image.yml
with:
image_name: ${{ matrix.image_name }}
image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }}
device: ${{ matrix.device }}
build_mode: ${{ matrix.build_mode }}
dockerfile: ${{ matrix.dockerfile }}
maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }}
image_date: ${{ needs.setup.outputs.image_date }}

# TEST JOBS
pre-training-tpu-tests:
needs: [setup, tpu-pre-training]
strategy:
fail-fast: false
matrix:
image: [maxtext_jax_stable, maxtext_jax_nightly]
uses: ./.github/workflows/test_and_tag_docker_image.yml
with:
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
image_date: ${{ needs.setup.outputs.image_date }}
test_mode: tpu-pre-training

post-training-tpu-tests:
needs: [setup, tpu-post-training]
strategy:
fail-fast: false
matrix:
image: [maxtext_post_training_stable, maxtext_post_training_nightly]
uses: ./.github/workflows/test_and_tag_docker_image.yml
with:
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
image_date: ${{ needs.setup.outputs.image_date }}
test_mode: tpu-post-training


pre-training-gpu-tests:
needs: [setup, gpu-pre-training]
strategy:
fail-fast: false
matrix:
image: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly]
uses: ./.github/workflows/test_and_tag_docker_image.yml
with:
image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }}
image_date: ${{ needs.setup.outputs.image_date }}
test_mode: gpu-pre-training

31 changes: 12 additions & 19 deletions .github/workflows/build_and_push_docker_image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,6 @@ on:
required: false
type: string
default: ''
is_post_training:
required: false
type: boolean
default: false

permissions:
contents: read
Expand Down Expand Up @@ -82,7 +78,9 @@ jobs:
ref: ${{ inputs.maxtext_sha }}

- name: Checkout post-training dependencies
if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly'
if: |
steps.check.outputs.should_run == 'true' &&
contains(inputs.image_name, 'post_training_nightly')
run: |
git clone https://github.com/google/tunix.git ./tunix
git clone https://github.com/vllm-project/vllm.git ./vllm
Expand Down Expand Up @@ -110,8 +108,7 @@ jobs:
push: true
context: .
file: ${{ inputs.dockerfile }}
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest
cache-from: type=gha
tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }}
outputs: type=image,compression=zstd,force-compression=true
build-args: |
DEVICE=${{ inputs.device }}
Expand All @@ -126,23 +123,19 @@ jobs:
shell: bash
run: |
SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}"

# Add date tag
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet
TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}"

# Convert date to YYYYMMDD format
clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8)

# Add MaxText tag
maxtext_hash=$(git rev-parse --short HEAD)
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet

# Add post-training dependencies tags
if [ "${{ inputs.is_post_training }}" == "true" ]; then
for dir in tunix vllm tpu-inference; do
if [ -d "./$dir" ]; then
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
fi
done
fi
for dir in tunix vllm tpu-inference; do
if [ -d "./$dir" ]; then
dir_hash=$(git -C "$dir" rev-parse --short HEAD)
gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet
fi
done
Loading
Loading