diff --git a/.github/workflows/UploadDockerImages.yml b/.github/workflows/UploadDockerImages.yml index 145de13008..7c6a8b07db 100644 --- a/.github/workflows/UploadDockerImages.yml +++ b/.github/workflows/UploadDockerImages.yml @@ -15,7 +15,7 @@ # This workflow builds and pushes MaxText images for both TPU and GPU devices. # It runs automatically daily at 12am UTC, on Pull Requests, or manually via Workflow Dispatch. -name: Build Images +name: Build and Test Images on: schedule: @@ -32,6 +32,11 @@ on: - all - tpu - gpu + for_dev_test: + description: 'For development test purpose. All images will be added a -test suffix' + required: false + type: boolean + default: false permissions: contents: read @@ -42,6 +47,7 @@ jobs: outputs: maxtext_sha: ${{ steps.vars.outputs.maxtext_sha }} image_date: ${{ steps.vars.outputs.image_date }} + image_suffix: ${{ steps.vars.outputs.image_suffix }} steps: - name: Checkout MaxText uses: actions/checkout@v5 @@ -55,6 +61,13 @@ jobs: # Image date echo "image_date=$(date +%Y-%m-%d)" >> $GITHUB_OUTPUT + # If for_dev_test is true, set suffix to -test, otherwise empty + if [[ "${{ github.event.inputs.for_dev_test }}" == "true" ]]; then + echo "image_suffix=-test" >> $GITHUB_OUTPUT + else + echo "image_suffix=" >> $GITHUB_OUTPUT + fi + tpu-pre-training: name: ${{ matrix.image_name }} needs: setup @@ -72,7 +85,7 @@ jobs: dockerfile: ./dependencies/dockerfiles/maxtext_tpu_dependencies.Dockerfile uses: ./.github/workflows/build_and_push_docker_image.yml with: - image_name: ${{ matrix.image_name }} + image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }} device: ${{ matrix.device }} build_mode: ${{ matrix.build_mode }} dockerfile: ${{ matrix.dockerfile }} @@ -96,14 +109,13 @@ jobs: dockerfile: ./dependencies/dockerfiles/maxtext_post_training_local_dependencies.Dockerfile uses: ./.github/workflows/build_and_push_docker_image.yml with: - image_name: ${{ matrix.image_name }} + image_name: ${{ matrix.image_name }}${{ 
needs.setup.outputs.image_suffix }} device: ${{ matrix.device }} build_mode: ${{ matrix.build_mode }} dockerfile: ${{ matrix.dockerfile }} maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }} image_date: ${{ needs.setup.outputs.image_date }} base_image: gcr.io/tpu-prod-env-multipod/maxtext_jax_stable:${{ needs.setup.outputs.image_date }} - is_post_training: true gpu-pre-training: name: ${{ matrix.image_name }} @@ -122,9 +134,48 @@ jobs: dockerfile: ./dependencies/dockerfiles/maxtext_gpu_dependencies.Dockerfile uses: ./.github/workflows/build_and_push_docker_image.yml with: - image_name: ${{ matrix.image_name }} + image_name: ${{ matrix.image_name }}${{ needs.setup.outputs.image_suffix }} device: ${{ matrix.device }} build_mode: ${{ matrix.build_mode }} dockerfile: ${{ matrix.dockerfile }} maxtext_sha: ${{ needs.setup.outputs.maxtext_sha }} image_date: ${{ needs.setup.outputs.image_date }} + + # TEST JOBS + pre-training-tpu-tests: + needs: [setup, tpu-pre-training] + strategy: + fail-fast: false + matrix: + image: [maxtext_jax_stable, maxtext_jax_nightly] + uses: ./.github/workflows/test_and_tag_docker_image.yml + with: + image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }} + image_date: ${{ needs.setup.outputs.image_date }} + test_mode: tpu-pre-training + + post-training-tpu-tests: + needs: [setup, tpu-post-training] + strategy: + fail-fast: false + matrix: + image: [maxtext_post_training_stable, maxtext_post_training_nightly] + uses: ./.github/workflows/test_and_tag_docker_image.yml + with: + image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }} + image_date: ${{ needs.setup.outputs.image_date }} + test_mode: tpu-post-training + + + pre-training-gpu-tests: + needs: [setup, gpu-pre-training] + strategy: + fail-fast: false + matrix: + image: [maxtext_gpu_jax_stable, maxtext_gpu_jax_nightly] + uses: ./.github/workflows/test_and_tag_docker_image.yml + with: + image_name: ${{ matrix.image }}${{ needs.setup.outputs.image_suffix }} + 
image_date: ${{ needs.setup.outputs.image_date }} + test_mode: gpu-pre-training + diff --git a/.github/workflows/build_and_push_docker_image.yml b/.github/workflows/build_and_push_docker_image.yml index 515bec2129..a94b3dae30 100644 --- a/.github/workflows/build_and_push_docker_image.yml +++ b/.github/workflows/build_and_push_docker_image.yml @@ -41,10 +41,6 @@ on: required: false type: string default: '' - is_post_training: - required: false - type: boolean - default: false permissions: contents: read @@ -82,7 +78,9 @@ jobs: ref: ${{ inputs.maxtext_sha }} - name: Checkout post-training dependencies - if: steps.check.outputs.should_run == 'true' && inputs.image_name == 'maxtext_post_training_nightly' + if: | + steps.check.outputs.should_run == 'true' && + contains(inputs.image_name, 'post_training_nightly') run: | git clone https://github.com/google/tunix.git ./tunix git clone https://github.com/vllm-project/vllm.git ./vllm @@ -110,8 +108,7 @@ jobs: push: true context: . file: ${{ inputs.dockerfile }} - tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:latest - cache-from: type=gha + tags: gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }} outputs: type=image,compression=zstd,force-compression=true build-args: | DEVICE=${{ inputs.device }} @@ -126,23 +123,19 @@ jobs: shell: bash run: | SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}" - - # Add date tag - gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet + TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}" # Convert date to YYYYMMDD format clean_date=$(echo "${{ inputs.image_date }}" | sed 's/[-:]//g' | cut -c1-8) # Add MaxText tag maxtext_hash=$(git rev-parse --short HEAD) - gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet + gcloud container images add-tag "$TEMP_IMG" 
"$SOURCE_IMAGE:maxtext_${maxtext_hash}_${clean_date}" --quiet # Add post-training dependencies tags - if [ "${{ inputs.is_post_training }}" == "true" ]; then - for dir in tunix vllm tpu-inference; do - if [ -d "./$dir" ]; then - dir_hash=$(git -C "$dir" rev-parse --short HEAD) - gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet - fi - done - fi + for dir in tunix vllm tpu-inference; do + if [ -d "./$dir" ]; then + dir_hash=$(git -C "$dir" rev-parse --short HEAD) + gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${dir}_${dir_hash}_${clean_date}" --quiet + fi + done diff --git a/.github/workflows/build_and_test_maxtext.yml b/.github/workflows/build_and_test_maxtext.yml index 9a2d778bb6..bb15bf71d7 100644 --- a/.github/workflows/build_and_test_maxtext.yml +++ b/.github/workflows/build_and_test_maxtext.yml @@ -113,72 +113,47 @@ jobs: with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:${{ matrix.image_type }} cloud_runner: linux-x86-ct6e-180-4tpu maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} - maxtext_cpu_unit_tests: - needs: build_and_upload_maxtext_package + tpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml strategy: - fail-fast: false # don't cancel all jobs on failure - matrix: - image_type: ["py312"] - worker_group: [1, 2] + fail-fast: false + matrix: + flavor: [tpu-unit, tpu-integration] + uses: ./.github/workflows/run_tests_coordinator.yml with: - device_type: cpu - device_name: X64 - cloud_runner: linux-x86-n2-16 - image_type: ${{ matrix.image_type }} - pytest_marker: 'cpu_only' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: ${{ matrix.flavor }} + 
base_image: maxtext-unit-test-tpu:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} - worker_group: ${{ matrix.worker_group }} - total_workers: 2 maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - maxtext_tpu_unit_tests: - needs: build_and_upload_maxtext_package + gpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml strategy: - fail-fast: false - matrix: - image_type: ["py312"] + fail-fast: false + matrix: + flavor: [gpu-unit, gpu-integration] + uses: ./.github/workflows/run_tests_coordinator.yml with: - device_type: tpu - device_name: v6e-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: ${{ matrix.flavor }} + base_image: maxtext-unit-test-cuda12:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - maxtext_tpu_integration_tests: - needs: build_and_upload_maxtext_package + cpu-tests: + needs: [build_and_upload_maxtext_package] if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] + uses: ./.github/workflows/run_tests_coordinator.yml with: - device_type: tpu - device_name: v6e-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and integration_test' - xla_python_client_mem_fraction: 0.75 - tf_force_gpu_allow_growth: false - container_resource_option: "--privileged" + flavor: cpu-unit + base_image: maxtext-unit-test-tpu:py312 is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ 
needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} @@ -188,14 +163,12 @@ jobs: uses: ./.github/workflows/run_pathways_tests.yml strategy: fail-fast: false - matrix: - image_type: ["py312"] with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:py312 cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and not integration_test' + pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" @@ -208,71 +181,26 @@ jobs: uses: ./.github/workflows/run_pathways_tests.yml strategy: fail-fast: false - matrix: - image_type: ["py312"] with: device_type: tpu device_name: v6e-4 - image_type: ${{ matrix.image_type }} + base_image: maxtext-unit-test-tpu:py312 cloud_runner: linux-x86-ct6e-180-4tpu - pytest_marker: 'not cpu_only and not gpu_only and integration_test' + pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training' xla_python_client_mem_fraction: 0.75 tf_force_gpu_allow_growth: false container_resource_option: "--privileged" is_scheduled_run: ${{ github.event_name == 'schedule' }} maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - maxtext_gpu_unit_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - cuda: ["cuda12"] - with: - device_type: ${{ matrix.cuda }} - device_name: a100-40gb-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and not integration_test' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" 
- is_scheduled_run: ${{ github.event_name == 'schedule' }} - maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - - maxtext_gpu_integration_tests: - needs: build_and_upload_maxtext_package - if: needs.doc_only_check.outputs.run_tests == 'true' - uses: ./.github/workflows/run_tests_against_package.yml - strategy: - fail-fast: false - matrix: - image_type: ["py312"] - cuda: ["cuda12"] - with: - device_type: ${{ matrix.cuda }} - device_name: a100-40gb-4 - image_type: ${{ matrix.image_type }} - cloud_runner: linux-x86-a2-48-a100-4gpu - pytest_marker: 'not cpu_only and not tpu_only and integration_test' - xla_python_client_mem_fraction: 0.65 - tf_force_gpu_allow_growth: true - container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged" - is_scheduled_run: ${{ github.event_name == 'schedule' }} - maxtext_sha: ${{ needs.build_and_upload_maxtext_package.outputs.maxtext_sha }} - all_tests_passed: name: All Required Tests Passed - needs: [doc_only_check, build_and_upload_maxtext_package, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests] + needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests] if: always() runs-on: ubuntu-latest steps: - name: Check test results run: | - # If doc-only, all tests should be skipped if [ "${{ needs.doc_only_check.outputs.run_tests }}" == "false" ]; then echo "Documentation-only changes detected, tests were skipped" exit 0 @@ -280,13 +208,11 @@ jobs: # Otherwise, check that build and all tests passed or were skipped echo "Build result: ${{ needs.build_and_upload_maxtext_package.result }}" - echo "CPU tests: ${{ needs.maxtext_cpu_unit_tests.result }}" - echo "TPU tests: ${{ needs.maxtext_tpu_unit_tests.result }}" - echo "TPU integration: ${{ needs.maxtext_tpu_integration_tests.result 
}}" - echo "TPU pathways: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}" - echo "TPU pathways integration: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}" - echo "GPU tests: ${{ needs.maxtext_gpu_unit_tests.result }}" - echo "GPU integration: ${{ needs.maxtext_gpu_integration_tests.result }}" + echo "TPU Tests (Matrix) result: ${{ needs.tpu-tests.result }}" + echo "GPU Tests (Matrix) result: ${{ needs.gpu-tests.result }}" + echo "CPU Tests (Matrix) result: ${{ needs.cpu-tests.result }}" + echo "Pathways Unit result: ${{ needs.maxtext_tpu_pathways_unit_tests.result }}" + echo "Pathways Integration result: ${{ needs.maxtext_tpu_pathways_integration_tests.result }}" # Fail only if any job failed or was cancelled (skipped is OK) if [ "${{ contains(needs.*.result, 'failure') }}" == "true" ] || [ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]; then @@ -323,14 +249,14 @@ jobs: notify_failure: name: Notify failed build # creates an issue or modifies last open existing issue for failed build - needs: [maxtext_jupyter_notebooks, maxtext_cpu_unit_tests, maxtext_tpu_unit_tests, maxtext_tpu_integration_tests, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests, maxtext_gpu_unit_tests, maxtext_gpu_integration_tests] + needs: [tpu-tests, gpu-tests, cpu-tests, maxtext_jupyter_notebooks, maxtext_tpu_pathways_unit_tests, maxtext_tpu_pathways_integration_tests] if: ${{ always() }} runs-on: ubuntu-latest permissions: issues: write steps: - - name: Check whether one of the jobs failed - if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null && github.event_name != 'workflow_dispatch' }} - uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0 - with: - github-token: ${{ secrets.GITHUB_TOKEN }} + - name: Check whether one of the jobs failed + if: ${{ contains(needs.*.result, 'failure') && github.event_name == 'schedule' }} + uses: 
jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/run_jupyter_notebooks.yml b/.github/workflows/run_jupyter_notebooks.yml index b9af2b74d1..7fcb71b746 100644 --- a/.github/workflows/run_jupyter_notebooks.yml +++ b/.github/workflows/run_jupyter_notebooks.yml @@ -25,15 +25,20 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string cloud_runner: required: false type: string maxtext_sha: - required: true + required: false type: string + # Flag to skip source checkout and wheel installation + maxtext_installed: + required: false + type: boolean + default: false secrets: HF_TOKEN: required: true @@ -44,17 +49,20 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} steps: - name: Checkout MaxText + if: ${{ !inputs.maxtext_installed }} uses: actions/checkout@v5 with: ref: ${{ inputs.maxtext_sha }} - name: Download the MaxText wheel + if: ${{ !inputs.maxtext_installed }} uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 with: name: maxtext-wheel - name: Install MaxText and Dependencies + if: ${{ !inputs.maxtext_installed }} shell: bash run: | python3 -m uv venv --seed @@ -65,10 +73,6 @@ jobs: uv pip install ${maxtext_wheel}[${MAXTEXT_PACKAGE_EXTRA}] --resolution=lowest uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt - # Install dependencies for running notebooks - uv pip install papermill ipykernel ipywidgets - .venv/bin/python3 -m ipykernel install --user 
--name maxtext_venv - # Install Tunix for post-training notebooks git clone https://github.com/google/tunix uv pip install ./tunix @@ -90,9 +94,24 @@ jobs: PYTHONPATH: "${{ github.workspace }}/src" HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | + if [ "${{ inputs.maxtext_installed }}" == "true" ]; then + # Move to the directory where code is baked into the image. See the Dockerfile. + # This is necessary because GHA sets an empty workspace by default. + cd /deps + PYTHON_EXE="python3" + PAPERMILL_EXE="papermill" + else + PYTHON_EXE=".venv/bin/python3" + PAPERMILL_EXE=".venv/bin/papermill" + fi + MAXTEXT_REPO_ROOT=$(pwd) MAXTEXT_NOTEBOOKS_ROOT="$MAXTEXT_REPO_ROOT/src/maxtext/examples" + # Install dependencies for running notebooks + $PYTHON_EXE -m pip install papermill ipykernel ipywidgets + $PYTHON_EXE -m ipykernel install --user --name maxtext_venv + for notebook in "$MAXTEXT_NOTEBOOKS_ROOT"/{sft,rl}*.ipynb; do filename=$(basename "$notebook") output_name="${filename%.ipynb}_output.ipynb" @@ -101,7 +120,7 @@ jobs: echo "Running $filename ..." 
echo "------------------------------------------------------" - .venv/bin/papermill "$notebook" "$output_name" -k maxtext_venv + $PAPERMILL_EXE "$notebook" "$output_name" -k maxtext_venv done - name: Record Commit IDs shell: bash diff --git a/.github/workflows/run_pathways_tests.yml b/.github/workflows/run_pathways_tests.yml index 08ab9eab32..868b43a195 100644 --- a/.github/workflows/run_pathways_tests.yml +++ b/.github/workflows/run_pathways_tests.yml @@ -25,8 +25,8 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string pytest_marker: required: true @@ -61,7 +61,7 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-tpu:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} env: XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }} TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }} diff --git a/.github/workflows/run_tests_against_package.yml b/.github/workflows/run_tests_against_package.yml index 7ad07d1c17..2d54be868d 100644 --- a/.github/workflows/run_tests_against_package.yml +++ b/.github/workflows/run_tests_against_package.yml @@ -25,12 +25,15 @@ on: device_name: required: true type: string - image_type: - required: false + base_image: + required: true type: string pytest_marker: required: true type: string + pytest_extra_args: + required: false + type: string pytest_addopts: required: false type: string @@ -59,8 +62,14 @@ on: type: number default: 1 maxtext_sha: - required: true + description: 'Git SHA to checkout if MaxText is not pre-installed' + required: false type: string + # Flag to skip source checkout and wheel installation + maxtext_installed: + description: 'If false, maxtext_sha must be provided for 
checkout' + type: boolean + default: false permissions: contents: read @@ -68,7 +77,7 @@ jobs: run: runs-on: ${{ inputs.cloud_runner != '' && inputs.cloud_runner || fromJson(format('["self-hosted", "{0}", "{1}"]', inputs.device_type, inputs.device_name)) }} container: - image: gcr.io/tpu-prod-env-multipod/maxtext-unit-test-${{ inputs.device_type == 'cpu' && 'tpu' || inputs.device_type }}:${{ inputs.image_type != '' && inputs.image_type }} + image: gcr.io/tpu-prod-env-multipod/${{ inputs.base_image }} env: XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }} TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }} @@ -78,14 +87,17 @@ jobs: options: ${{ inputs.container_resource_option }} steps: - name: Checkout MaxText + if: ${{ !inputs.maxtext_installed }} uses: actions/checkout@v5 with: ref: ${{ inputs.maxtext_sha }} - name: Download the maxtext wheel + if: ${{ !inputs.maxtext_installed }} uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: name: maxtext-wheel - name: Install the maxtext wheel + if: ${{ !inputs.maxtext_installed }} shell: bash run: | python3 -m uv venv --seed @@ -95,12 +107,27 @@ jobs: uv pip install -r src/install_maxtext_extra_deps/extra_deps_from_github.txt python3 --version python3 -m pip freeze - uv pip install pytest-cov - name: Copy test assets files + if: ${{ !inputs.maxtext_installed }} run : gcloud storage cp gs://maxtext-test-assets/* tests/assets - name: Run Tests shell: bash run: | + # Determine environment and entry directory + if [ "${{ inputs.maxtext_installed }}" == "true" ]; then + # Move to the directory where code is baked into the image. See the Dockerfile. 
+ cd /deps + PYTHON_EXE="python3" + # Disable coverage flags when testing against a pre-installed package + PYTEST_COV_ARGS="" + else + # Use the local virtual environment created in Step 3 + PYTHON_EXE=".venv/bin/python3" + # Ensure pytest-cov is available and enable coverage flags + $PYTHON_EXE -m pip install --quiet pytest-cov + PYTEST_COV_ARGS="--cov=src/MaxText --cov=maxtext --cov-report=xml --cov-report=term" + fi + if [ "${{ inputs.is_scheduled_run }}" == "true" ]; then FINAL_PYTEST_MARKER="${{ inputs.pytest_marker }}" else @@ -116,25 +143,25 @@ jobs: export LIBTPU_INIT_ARGS='--xla_tpu_scoped_vmem_limit_kib=65536' fi if [ "${{ inputs.total_workers }}" -gt 1 ]; then - .venv/bin/python3 -m pip install --quiet pytest-split pytest-xdist + $PYTHON_EXE -m pip install --quiet pytest-split pytest-xdist SPLIT_ARGS="--splits ${{ inputs.total_workers }} --group ${{ inputs.worker_group }} -n auto" else SPLIT_ARGS="" fi # TODO: Fix the skipped tests and remove the deselect flags - .venv/bin/python3 -m pytest ${{ inputs.pytest_addopts }} \ + $PYTHON_EXE -m pytest ${{ inputs.pytest_addopts }} \ -v \ -m "${FINAL_PYTEST_MARKER}" \ --durations=0 \ --deselect "tests/unit/tokenizer_test.py::TokenizerTest::test_detokenize" \ - --cov=MaxText \ - --cov=maxtext \ - --cov-report=xml \ - --cov-report=term \ - $SPLIT_ARGS + $PYTEST_COV_ARGS \ + $SPLIT_ARGS \ + ${{inputs.pytest_extra_args}} + env: PYTHONPATH: "${{ github.workspace }}/src" - name: Upload results to Codecov + if: ${{ !inputs.maxtext_installed }} # Skip code coverage upload for maxtext image testing uses: codecov/codecov-action@v5 continue-on-error: true with: diff --git a/.github/workflows/run_tests_coordinator.yml b/.github/workflows/run_tests_coordinator.yml new file mode 100644 index 0000000000..8751c9bd1b --- /dev/null +++ b/.github/workflows/run_tests_coordinator.yml @@ -0,0 +1,130 @@ +# Copyright 2023-2026 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file defines a module for running tests against the built maxtext package or +# pre-built images. It can run unit tests and integration tests. + +name: MaxText Test Coordinator + +on: + workflow_call: + inputs: + flavor: + description: > + Test flavor (tpu-unit, tpu-integration, post-training-tpu-unit, + post-training-tpu-integration, gpu-unit, gpu-integration, cpu-unit) + required: true + type: string + base_image: + description: 'The docker image to run tests against' + required: true + type: string + is_scheduled_run: + required: false + type: boolean + default: false + maxtext_sha: + description: 'Git SHA to checkout if MaxText is not pre-installed' + required: false + type: string + maxtext_installed: + description: 'If false, maxtext_sha must be provided for checkout' + type: boolean + default: false + +permissions: + contents: read + +jobs: + execute-test-package: + name: ${{ inputs.flavor }} + strategy: + fail-fast: false + matrix: + worker_group: ${{ fromJSON(contains(inputs.flavor, 'cpu') && '[1, 2]' || '[1]') }} + + uses: ./.github/workflows/run_tests_against_package.yml + with: + # Infrastructure Mapping + device_type: >- + ${{ fromJSON('{ + "tpu-unit": "tpu", + "tpu-integration": "tpu", + "post-training-tpu-unit": "tpu", + "post-training-tpu-integration": "tpu", + "gpu-unit": "cuda12", + "gpu-integration": "cuda12", + "cpu-unit": "cpu" + }')[inputs.flavor] }} + + device_name: >- + ${{ fromJSON('{ + "tpu-unit": "v6e-4", + "tpu-integration": "v6e-4", + "post-training-tpu-unit": "v6e-4", + 
"post-training-tpu-integration": "v6e-4", + "gpu-unit": "a100-40gb-4", + "gpu-integration": "a100-40gb-4", + "cpu-unit": "X64" + }')[inputs.flavor] }} + + cloud_runner: >- + ${{ fromJSON('{ + "tpu-unit": "linux-x86-ct6e-180-4tpu", + "tpu-integration": "linux-x86-ct6e-180-4tpu", + "post-training-tpu-unit": "linux-x86-ct6e-180-4tpu", + "post-training-tpu-integration": "linux-x86-ct6e-180-4tpu", + "gpu-unit": "linux-x86-a2-48-a100-4gpu", + "gpu-integration": "linux-x86-a2-48-a100-4gpu", + "cpu-unit": "linux-x86-n2-16" + }')[inputs.flavor] }} + # Pytest Marker Mapping + pytest_marker: >- + ${{ fromJSON('{ + "tpu-unit": "not cpu_only and not gpu_only and not integration_test", + "tpu-integration": "not cpu_only and not gpu_only and integration_test", + "post-training-tpu-unit": "not cpu_only and not gpu_only and not integration_test", + "post-training-tpu-integration": "not cpu_only and not gpu_only and integration_test", + "gpu-unit": "not cpu_only and not tpu_only and not integration_test", + "gpu-integration": "not cpu_only and not tpu_only and integration_test", + "cpu-unit": "cpu_only" + }')[inputs.flavor] }} + + pytest_extra_args: >- + ${{ fromJSON('{ + "tpu-unit": "--ignore=tests/unit/post_train", + "tpu-integration": "--ignore=tests/unit/post_train", + "post-training-tpu-unit": "", + "post-training-tpu-integration": "", + "gpu-unit": "--ignore=tests/unit/post_train", + "gpu-integration": "--ignore=tests/unit/post_train", + "cpu-unit": "--ignore=tests/unit/post_train" + }')[inputs.flavor] }} + + # Resource Scaling + xla_python_client_mem_fraction: "${{ contains(inputs.flavor, 'gpu') && '0.65' || '0.75' }}" + tf_force_gpu_allow_growth: "${{ contains(inputs.flavor, 'gpu') && 'true' || 'false' }}" + + container_resource_option: >- + ${{ contains(inputs.flavor, 'gpu') + && '--shm-size 2g --runtime=nvidia --gpus all --privileged' + || '--privileged' }} + + # Metadata + base_image: ${{ inputs.base_image }} + is_scheduled_run: ${{ inputs.is_scheduled_run }} + 
maxtext_installed: ${{ inputs.maxtext_installed }} + worker_group: ${{ matrix.worker_group }} + total_workers: ${{ contains(inputs.flavor, 'cpu') && 2 || 1 }} + maxtext_sha: ${{ inputs.maxtext_sha }} \ No newline at end of file diff --git a/.github/workflows/test_and_tag_docker_image.yml b/.github/workflows/test_and_tag_docker_image.yml new file mode 100644 index 0000000000..12fe1faf8f --- /dev/null +++ b/.github/workflows/test_and_tag_docker_image.yml @@ -0,0 +1,93 @@ +# Copyright 2025 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This workflow will test and tag MaxText Docker image to GCR. 
+name: Test and Tag MaxText Docker Images + +on: + workflow_call: + inputs: + image_name: + required: true + type: string + test_mode: + description: "Test mode (tpu-pre-training, tpu-post-training, gpu-pre-training)" + required: true + type: string + image_date: + required: true + type: string + +permissions: + contents: read + +jobs: + test: + strategy: + fail-fast: false + matrix: + flavor: >- + ${{ fromJSON('{ + "gpu-pre-training": ["gpu-unit", "gpu-integration"], + "tpu-post-training": ["post-training-tpu-unit", "post-training-tpu-integration"], + "tpu-pre-training": ["tpu-unit", "tpu-integration", "cpu-unit"] + }')[inputs.test_mode] }} + uses: ./.github/workflows/run_tests_coordinator.yml + with: + flavor: ${{ matrix.flavor }} + base_image: ${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }} + is_scheduled_run: true + maxtext_installed: true + + notebook-test: + if: inputs.test_mode == 'tpu-post-training' + uses: ./.github/workflows/run_jupyter_notebooks.yml + with: + device_type: tpu + device_name: v6e-4 + base_image: ${{ inputs.image_name }}:${{ inputs.image_date }}-build-${{ github.run_id }} + cloud_runner: linux-x86-ct6e-180-4tpu + maxtext_installed: true + secrets: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + + tagging: + needs: [test, notebook-test] + if: | + always() && + needs.test.result == 'success' && + (needs.notebook-test.result == 'success' || needs.notebook-test.result == 'skipped') + runs-on: ubuntu-latest + container: google/cloud-sdk:524.0.0 + steps: + - name: Configure Docker + run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q + + - name: Create Production Tags + shell: bash + run: | + SOURCE_IMAGE="gcr.io/tpu-prod-env-multipod/${{ inputs.image_name }}" + TEMP_IMG="$SOURCE_IMAGE:${{ inputs.image_date }}-build-${{ github.run_id }}" + + # Validate existence first + gcloud container images describe "$TEMP_IMG" > /dev/null + + # 1. 
Date Tag + gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:${{ inputs.image_date }}" --quiet + + # 2. Latest Tag + gcloud container images add-tag "$TEMP_IMG" "$SOURCE_IMAGE:latest" --quiet + + # 3. Clean up Temporary Tag + gcloud container images untag "$TEMP_IMG" --quiet \ No newline at end of file diff --git a/tests/unit/post_train/README.md b/tests/unit/post_train/README.md new file mode 100644 index 0000000000..d53ce2a134 --- /dev/null +++ b/tests/unit/post_train/README.md @@ -0,0 +1,19 @@ + + +This folder contains tests that can only be run with post-training dependencies. +For unit tests run in a pre-training environment, the tests under this folder should +be ignored. diff --git a/tests/unit/sft_hooks_test.py b/tests/unit/post_train/sft_hooks_test.py similarity index 100% rename from tests/unit/sft_hooks_test.py rename to tests/unit/post_train/sft_hooks_test.py