From 8211d8cd8bb14a8ae9645336f429642a417d6250 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Mon, 23 Jun 2025 13:41:57 -0400 Subject: [PATCH] ci: Switch to python 3.12 as default target Signed-off-by: Ihar Hrachyshka --- .../workflows/e2e-nvidia-l40s-x4-py312.yml | 221 ------------------ .github/workflows/e2e-nvidia-l40s-x4-sdk.yml | 14 +- .github/workflows/e2e-nvidia-l40s-x4.yml | 4 +- .github/workflows/lint.yml | 4 +- .github/workflows/smoke-py312.yaml | 141 ----------- .github/workflows/smoke.yaml | 4 +- docs/ci.md | 4 +- tox.ini | 2 +- 8 files changed, 16 insertions(+), 378 deletions(-) delete mode 100644 .github/workflows/e2e-nvidia-l40s-x4-py312.yml delete mode 100644 .github/workflows/smoke-py312.yaml diff --git a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml deleted file mode 100644 index e78dce4d..00000000 --- a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -name: E2E (NVIDIA L40S x4) (python 3.12) - -on: - schedule: - - cron: '0 16 * * *' # Runs at 4PM UTC every day - workflow_dispatch: - inputs: - pr_or_branch: - description: 'pull request number or branch name' - required: true - default: 'main' - -env: - TMPDIR: /home/tmp - -jobs: - start-large-ec2-runner: - runs-on: ubuntu-latest - outputs: - label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} - ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} - ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} - steps: - - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: instructlab/ci-actions - # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents - path: ci-actions - ref: release-v0.1 - sparse-checkout: | - actions/launch-ec2-runner-with-fallback - - - name: Launch EC2 Runner with Fallback - id: launch-ec2-instance-with-fallback - uses: ./ci-actions/actions/launch-ec2-runner-with-fallback - env: - TMPDIR: "/tmp" - with: - aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - regions_config: > - [ - { - "region": "us-east-2", - "subnets": { - "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", - "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", - "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" - }, - "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", - "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" - }, - { - "region": "us-east-1", - "subnets": { - "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", - "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", - "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", - "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", - "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", - "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" - }, - "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", - "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" - } - ] - try_spot_instance_first: false - ec2_instance_type: g6e.12xlarge - aws_resource_tags: > - [ - {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, - {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, - {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, - {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} - ] - - e2e-large-test: - needs: - - start-large-ec2-runner - runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} - - permissions: - pull-requests: write - - steps: - - name: Checkout instructlab/training - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: "instructlab/training" - path: "training" - # https://github.com/actions/checkout/issues/249 - fetch-depth: 0 - - - name: Run e2e tests - uses: ./training/.github/actions/run-e2e - with: - python-version: 3.12 - gh-token: ${{ secrets.GITHUB_TOKEN }} - hf-token: ${{ secrets.HF_TOKEN }} - openai-api-key: ${{ secrets.OPENAI_API_KEY }} - son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} - - stop-large-ec2-runner: - needs: - - start-large-ec2-runner - - e2e-large-test - runs-on: ubuntu-latest - if: ${{ always() }} - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} - - - name: Stop EC2 runner - uses: machulav/ec2-github-runner@8b37f736c69ba6af391e437447d3c07548478d78 # v2.4.0 - with: - mode: stop - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - label: ${{ needs.start-large-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} - - loss-graphs: - needs: - - stop-large-ec2-runner - runs-on: ubuntu-latest - if: ${{ always() }} - steps: - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ vars.AWS_REGION }} - - - name: Download loss data Phase 1 - id: phase-1-download-logs - uses: actions/download-artifact@v4 - with: - name: phase-1-training-log.jsonl - path: downloaded-data - - - name: Download loss data Phase 2 - id: phase-2-download-logs - uses: actions/download-artifact@v4 - with: - name: phase-2-training-log.jsonl - path: downloaded-data - - - name: Checkout instructlab/training - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: "instructlab/training" - path: "training" - fetch-depth: 0 - - - name: Install dependencies - working-directory: ./training - run: | - python -m pip install --upgrade pip - pip install -r requirements-dev.txt -c constraints-dev.txt - - - name: Try to upload Phase 1 to s3 - id: phase-1-upload-s3 - continue-on-error: true - run: | - python training/scripts/create-loss-graph.py \ - --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \ - --output-file "./phase-1-test.md" \ - --phase "1" \ - --aws-region "${{ vars.AWS_REGION }}" \ - --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ - --base-branch "${GITHUB_REF##*/}" \ - --head-sha "${{ github.sha }}" \ - --pr-number "${{ github.event.number }}" \ - --origin-repository "${{ github.repository }}" - - - name: Try to upload Phase 2 to s3 - id: phase-2-upload-s3 - continue-on-error: true - run: | - python training/scripts/create-loss-graph.py \ - --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \ - --output-file "./phase-2-test.md" \ - --phase "2" \ - --aws-region "${{ vars.AWS_REGION }}" \ - --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ - --base-branch "${GITHUB_REF##*/}" \ - --head-sha "${{ github.sha }}" \ - --pr-number "${{ github.event.number }}" \ - --origin-repository "${{ github.repository }}" - - - name: Check Phase 1 S3 upload status for success - if: steps.phase-1-upload-s3.outcome == 'success' - run: | - echo "Uploaded Phase 1 loss graph to S3." - cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}" - - - name: Check Phase 2 S3 upload status for success - if: steps.phase-2-upload-s3.outcome == 'success' - run: | - echo "Uploaded Phase 2 loss graph to S3." - cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}" - - - name: Check Phase 1 S3 upload status for failure - if: steps.phase-1-upload-s3.outcome == 'failure' - run: | - echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate." - echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" - - - name: Check Phase 2 S3 upload status for failure - if: steps.phase-2-upload-s3.outcome == 'failure' - run: | - echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate." - echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" diff --git a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml index 3f416642..2b22237b 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml @@ -105,7 +105,7 @@ jobs: run: | cat /etc/os-release mkdir -p "${TMPDIR}" - sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel + sudo dnf install -y gcc gcc-c++ make git python3.12 python3.12-devel - name: Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -130,18 +130,18 @@ jobs: export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" export PATH="$PATH:$CUDA_HOME/bin" nvidia-smi - python3.11 -m venv --upgrade-deps venv + python3.12 -m venv --upgrade-deps venv . venv/bin/activate pip install instructlab pip install instructlab[cuda] pip install vllm - python3.11 -m pip install packaging wheel setuptools-scm + python3.12 -m pip install packaging wheel setuptools-scm pip install . pip install .[cuda] - python3.11 -m pip uninstall -y flash-attn - python3.11 -m pip cache purge - python3.11 -m pip install ninja - MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation + python3.12 -m pip uninstall -y flash-attn + python3.12 -m pip cache purge + python3.12 -m pip install ninja + MAX_JOBS=8 python3.12 -m pip install flash-attn --no-build-isolation - name: Check disk before tests run: | diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index f464783c..43ad281e 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -name: E2E (NVIDIA L40S x4) (python 3.11) +name: E2E (NVIDIA L40S x4) (python 3.12) on: schedule: @@ -98,7 +98,7 @@ jobs: - name: Run e2e tests uses: ./training/.github/actions/run-e2e with: - python-version: 3.11 + python-version: 3.12 gh-token: ${{ secrets.GITHUB_TOKEN }} hf-token: ${{ secrets.HF_TOKEN }} openai-api-key: ${{ secrets.OPENAI_API_KEY }} diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 2c8df16d..042360ee 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -65,10 +65,10 @@ jobs: # https://github.com/actions/checkout/issues/249 fetch-depth: 0 - - name: Setup Python 3.11 + - name: Setup Python 3.12 uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: 3.11 + python-version: 3.12 cache: pip cache-dependency-path: | **/pyproject.toml diff --git a/.github/workflows/smoke-py312.yaml b/.github/workflows/smoke-py312.yaml deleted file mode 100644 index 400146d1..00000000 --- a/.github/workflows/smoke-py312.yaml +++ /dev/null @@ -1,141 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -name: "Run smoke tests via Tox::pytest (python 3.12)" -# These tests will be long running and require accelerated hardware. - -on: - workflow_dispatch: - inputs: - branch: - type: string - default: main - # using this rather than pull_request because this workflow - # needs to run in the context of the base branch (main) and - # access the repo's secrets to start the AWS instances. - pull_request_target: - branches: - - main - - release-* - paths: - # note this should match the merging criteria in 'mergify.yml' - - "**.py" - - "tox.ini" - - "pyproject.toml" - - "requirements-dev.txt" - - "requirements-cuda.txt" - - "constraints-dev.txt" - -permissions: - contents: read - -defaults: - run: - shell: bash - -env: - ec2_runner_variant: "g6e.12xlarge" # 4x L40s - -jobs: - start-large-ec2-runner: - runs-on: ubuntu-latest - outputs: - label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }} - ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }} - ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }} - steps: - - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - repository: instructlab/ci-actions - # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents - path: ci-actions - ref: release-v0.1 - sparse-checkout: | - actions/launch-ec2-runner-with-fallback - - - name: Launch EC2 Runner with Fallback - id: launch-ec2-instance-with-fallback - uses: ./ci-actions/actions/launch-ec2-runner-with-fallback - env: - TMPDIR: "/tmp" - with: - aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - regions_config: > - [ - { - "region": "us-east-2", - "subnets": { - "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}", - "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}", - "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}" - }, - "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}", - "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}" - }, - { - "region": "us-east-1", - "subnets": { - "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}", - "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}", - "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}", - "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}", - "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}", - "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}" - }, - "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}", - "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}" - } - ] - try_spot_instance_first: false - ec2_instance_type: g6e.12xlarge - aws_resource_tags: > - [ - {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"}, - {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, - {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, - {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} - ] - - run-smoke-tests: - needs: - - start-large-ec2-runner - runs-on: ${{needs.start-large-ec2-runner.outputs.label}} - # It is important that this job has no write permissions and has - # no access to any secrets. This part is where we are running - # untrusted code from PRs. - permissions: {} - steps: - - name: "Checkout code" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - ref: ${{inputs.branch}} - - - name: Run smoke tests - uses: ./.github/actions/run-smoke - with: - python-version: 3.12 - - stop-large-ec2-runner: - needs: - - start-large-ec2-runner - - run-smoke-tests - runs-on: ubuntu-latest - if: ${{ always() }} - steps: - - name: "Configure AWS credentials" - uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }} - - - name: "Stop EC2 runner" - uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 - with: - mode: stop - github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - label: ${{ needs.start-large-ec2-runner.outputs.label }} - ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml index bedeeb3b..aab51fa9 100644 --- a/.github/workflows/smoke.yaml +++ b/.github/workflows/smoke.yaml @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -name: "Run smoke tests via Tox::pytest (python 3.11)" +name: "Run smoke tests via Tox::pytest (python 3.12)" # These tests will be long running and require accelerated hardware. on: @@ -118,7 +118,7 @@ jobs: - name: Run smoke tests uses: ./.github/actions/run-smoke with: - python-version: 3.11 + python-version: 3.12 stop-large-ec2-runner: needs: diff --git a/docs/ci.md b/docs/ci.md index fdd0fb6d..a46efddb 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -10,7 +10,7 @@ All unit tests currently live in the `tests/unit` directory and are run with [py To run the unit tests, you can run `tox -e py3-unit`. -In CI, the tests are run with Python 3.11 - 3.13 on Ubuntu and MacOS runners - you can see the details [in the unit tests workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/unit.yaml). +In CI, the tests are run on Ubuntu runners - you can see the details [in the unit tests workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/unit.yaml). ## Smoke tests @@ -20,7 +20,7 @@ There is also a Shell-based smoke test script that can be found at `tests/smoket To run the smoke tests, you can run `tox -e py3-smoke` -In CI, the smoke tests are run with Python 3.11 on CentOS runners - you can see the details [in the smoke workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/smoke.yaml). +In CI, the smoke tests are run on CentOS runners - you can see the details [in the smoke workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/smoke.yaml). ## End-to-end (E2E) tests diff --git a/tox.ini b/tox.ini index 0794c417..f65d746b 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ install_command = pip install \ {opts} {packages} [testenv:py3] -basepython = python3.11 +basepython = python3.12 [testenv:py3-unit] description = run unit tests with pytest