From 8211d8cd8bb14a8ae9645336f429642a417d6250 Mon Sep 17 00:00:00 2001
From: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
Date: Mon, 23 Jun 2025 13:41:57 -0400
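Subject: [PATCH] ci: Switch to Python 3.12 as default target

The e2e, smoke and lint workflows, the SDK e2e install steps and the
tox py3 environment now target Python 3.12 instead of 3.11. The
dedicated py312 copies of the e2e and smoke workflows are removed
because the default workflows now run on 3.12. docs/ci.md no longer
names specific Python versions and defers to the workflow files.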

Signed-off-by: Ihar Hrachyshka <ihar.hrachyshka@gmail.com>
---
 .../workflows/e2e-nvidia-l40s-x4-py312.yml    | 221 ------------------
 .github/workflows/e2e-nvidia-l40s-x4-sdk.yml  |  14 +-
 .github/workflows/e2e-nvidia-l40s-x4.yml      |   4 +-
 .github/workflows/lint.yml                    |   4 +-
 .github/workflows/smoke-py312.yaml            | 141 -----------
 .github/workflows/smoke.yaml                  |   4 +-
 docs/ci.md                                    |   4 +-
 tox.ini                                       |   2 +-
 8 files changed, 16 insertions(+), 378 deletions(-)
 delete mode 100644 .github/workflows/e2e-nvidia-l40s-x4-py312.yml
 delete mode 100644 .github/workflows/smoke-py312.yaml

diff --git a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml b/.github/workflows/e2e-nvidia-l40s-x4-py312.yml
deleted file mode 100644
index e78dce4d..00000000
--- a/.github/workflows/e2e-nvidia-l40s-x4-py312.yml
+++ /dev/null
@@ -1,221 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-name: E2E (NVIDIA L40S x4) (python 3.12)
-
-on:
-  schedule:
-    - cron: '0 16 * * *' # Runs at 4PM UTC every day
-  workflow_dispatch:
-    inputs:
-      pr_or_branch:
-        description: 'pull request number or branch name'
-        required: true
-        default: 'main'
-
-env:
-  TMPDIR: /home/tmp
-
-jobs:
-  start-large-ec2-runner:
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
-      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
-      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
-    steps:
-      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: instructlab/ci-actions
-          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
-          path: ci-actions
-          ref: release-v0.1
-          sparse-checkout: |
-            actions/launch-ec2-runner-with-fallback
-
-      - name: Launch EC2 Runner with Fallback
-        id: launch-ec2-instance-with-fallback
-        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
-        env:
-          TMPDIR: "/tmp"
-        with:
-          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          regions_config: >
-            [
-              {
-                "region": "us-east-2",
-                "subnets": {
-                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
-                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
-                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
-                },
-                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
-                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
-              },
-              {
-                "region": "us-east-1",
-                "subnets": {
-                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
-                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
-                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
-                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
-                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
-                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
-                },
-                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
-                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
-              }
-            ]
-          try_spot_instance_first: false
-          ec2_instance_type: g6e.12xlarge
-          aws_resource_tags: >
-            [
-              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
-              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
-              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
-            ]
-
-  e2e-large-test:
-    needs:
-      - start-large-ec2-runner
-    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
-
-    permissions:
-      pull-requests: write
-
-    steps:
-      - name: Checkout instructlab/training
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: "instructlab/training"
-          path: "training"
-            # https://github.com/actions/checkout/issues/249
-          fetch-depth: 0
-
-      - name: Run e2e tests
-        uses: ./training/.github/actions/run-e2e
-        with:
-          python-version: 3.12
-          gh-token: ${{ secrets.GITHUB_TOKEN }}
-          hf-token: ${{ secrets.HF_TOKEN }}
-          openai-api-key: ${{ secrets.OPENAI_API_KEY }}
-          son-of-jeeves-discord-webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }}
-
-  stop-large-ec2-runner:
-    needs:
-      - start-large-ec2-runner
-      - e2e-large-test
-    runs-on: ubuntu-latest
-    if: ${{ always() }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
-
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@8b37f736c69ba6af391e437447d3c07548478d78 # v2.4.0
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-large-ec2-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
-
-  loss-graphs:
-    needs:
-      - stop-large-ec2-runner
-    runs-on: ubuntu-latest
-    if: ${{ always() }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ vars.AWS_REGION }}
-
-      - name: Download loss data Phase 1
-        id: phase-1-download-logs
-        uses: actions/download-artifact@v4
-        with:
-          name: phase-1-training-log.jsonl
-          path: downloaded-data
-
-      - name: Download loss data Phase 2
-        id: phase-2-download-logs
-        uses: actions/download-artifact@v4
-        with:
-          name: phase-2-training-log.jsonl
-          path: downloaded-data
-
-      - name: Checkout instructlab/training
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: "instructlab/training"
-          path: "training"
-          fetch-depth: 0
-
-      - name: Install dependencies
-        working-directory: ./training
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-dev.txt -c constraints-dev.txt
-
-      - name: Try to upload Phase 1 to s3
-        id: phase-1-upload-s3
-        continue-on-error: true
-        run: |
-          python training/scripts/create-loss-graph.py  \
-            --log-file "${{ steps.phase-1-download-logs.outputs.download-path }}/phase-1-training-log.jsonl" \
-            --output-file "./phase-1-test.md" \
-            --phase "1" \
-            --aws-region "${{ vars.AWS_REGION }}" \
-            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${GITHUB_REF##*/}" \
-            --head-sha "${{ github.sha }}" \
-            --pr-number "${{ github.event.number }}" \
-            --origin-repository "${{ github.repository }}"
-
-      - name: Try to upload Phase 2 to s3
-        id: phase-2-upload-s3
-        continue-on-error: true
-        run: |
-          python training/scripts/create-loss-graph.py  \
-            --log-file "${{ steps.phase-2-download-logs.outputs.download-path }}/phase-2-training-log.jsonl" \
-            --output-file "./phase-2-test.md" \
-            --phase "2" \
-            --aws-region "${{ vars.AWS_REGION }}" \
-            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${GITHUB_REF##*/}" \
-            --head-sha "${{ github.sha }}" \
-            --pr-number "${{ github.event.number }}" \
-            --origin-repository "${{ github.repository }}"
-
-      - name: Check Phase 1 S3 upload status for success
-        if: steps.phase-1-upload-s3.outcome == 'success'
-        run: |
-          echo "Uploaded Phase 1 loss graph to S3."
-          cat ./phase-1-test.md >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check Phase 2 S3 upload status for success
-        if: steps.phase-2-upload-s3.outcome == 'success'
-        run: |
-          echo "Uploaded Phase 2 loss graph to S3."
-          cat ./phase-2-test.md >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check Phase 1 S3 upload status for failure
-        if: steps.phase-1-upload-s3.outcome == 'failure'
-        run: |
-          echo "::warning::Failed to upload Phase 1 loss graph to S3. This won't block the workflow, but you may want to investigate."
-          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check Phase 2 S3 upload status for failure
-        if: steps.phase-2-upload-s3.outcome == 'failure'
-        run: |
-          echo "::warning::Failed to upload Phase 2 loss graph to S3. This won't block the workflow, but you may want to investigate."
-          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
diff --git a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
index 3f416642..2b22237b 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4-sdk.yml
@@ -105,7 +105,7 @@ jobs:
         run: |
           cat /etc/os-release
           mkdir -p "${TMPDIR}"
-          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+          sudo dnf install -y gcc gcc-c++ make git python3.12 python3.12-devel
   
       - name: Checkout
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -130,18 +130,18 @@ jobs:
           export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
           export PATH="$PATH:$CUDA_HOME/bin"
           nvidia-smi
-          python3.11 -m venv --upgrade-deps venv
+          python3.12 -m venv --upgrade-deps venv
           . venv/bin/activate
           pip install instructlab
           pip install instructlab[cuda]
           pip install vllm
-          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.12 -m pip install packaging wheel setuptools-scm
           pip install .
           pip install .[cuda]
-          python3.11 -m pip uninstall -y flash-attn
-          python3.11 -m pip cache purge
-          python3.11 -m pip install ninja
-          MAX_JOBS=8 python3.11 -m pip install flash-attn --no-build-isolation
+          python3.12 -m pip uninstall -y flash-attn
+          python3.12 -m pip cache purge
+          python3.12 -m pip install ninja
+          MAX_JOBS=8 python3.12 -m pip install flash-attn --no-build-isolation
 
       - name: Check disk before tests
         run: |
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index f464783c..43ad281e 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA L40S x4) (python 3.11)
+name: E2E (NVIDIA L40S x4) (python 3.12)
 
 on:
   schedule:
@@ -98,7 +98,7 @@ jobs:
       - name: Run e2e tests
         uses: ./training/.github/actions/run-e2e
         with:
-          python-version: 3.11
+          python-version: "3.12"  # quoted: a bare 3.x is a YAML float (3.10 -> 3.1)
           gh-token: ${{ secrets.GITHUB_TOKEN }}
           hf-token: ${{ secrets.HF_TOKEN }}
           openai-api-key: ${{ secrets.OPENAI_API_KEY }}
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 2c8df16d..042360ee 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -65,10 +65,10 @@ jobs:
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
 
-      - name: Setup Python 3.11
+      - name: Setup Python 3.12
         uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
         with:
-          python-version: 3.11
+          python-version: "3.12"  # quoted: a bare 3.x is a YAML float (3.10 -> 3.1)
           cache: pip
           cache-dependency-path: |
             **/pyproject.toml
diff --git a/.github/workflows/smoke-py312.yaml b/.github/workflows/smoke-py312.yaml
deleted file mode 100644
index 400146d1..00000000
--- a/.github/workflows/smoke-py312.yaml
+++ /dev/null
@@ -1,141 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-name: "Run smoke tests via Tox::pytest (python 3.12)"
-# These tests will be long running and require accelerated hardware.
-
-on:
-  workflow_dispatch:
-    inputs:
-      branch:
-        type: string
-        default: main
-  # using this rather than pull_request because this workflow
-  # needs to run in the context of the base branch (main) and
-  # access the repo's secrets to start the AWS instances.
-  pull_request_target:
-    branches:
-      - main
-      - release-*
-    paths:
-      # note this should match the merging criteria in 'mergify.yml'
-      - "**.py"
-      - "tox.ini"
-      - "pyproject.toml"
-      - "requirements-dev.txt"
-      - "requirements-cuda.txt"
-      - "constraints-dev.txt"
-
-permissions:
-  contents: read
-
-defaults:
-  run:
-    shell: bash
-
-env:
-  ec2_runner_variant: "g6e.12xlarge" # 4x L40s
-
-jobs:
-  start-large-ec2-runner:
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
-      ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
-      ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
-    steps:
-      - name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          repository: instructlab/ci-actions
-          # clone the "ci-actions" repo to a local directory called "ci-actions", instead of overwriting the current WORKDIR contents
-          path: ci-actions
-          ref: release-v0.1
-          sparse-checkout: |
-            actions/launch-ec2-runner-with-fallback
-
-      - name: Launch EC2 Runner with Fallback
-        id: launch-ec2-instance-with-fallback
-        uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
-        env:
-          TMPDIR: "/tmp"
-        with:
-          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          regions_config: >
-            [
-              {
-                "region": "us-east-2",
-                "subnets": {
-                  "us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
-                  "us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
-                  "us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
-                },
-                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
-                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
-              },
-              {
-                "region": "us-east-1",
-                "subnets": {
-                  "us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
-                  "us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
-                  "us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
-                  "us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
-                  "us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
-                  "us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
-                },
-                "ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
-                "security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
-              }
-            ]
-          try_spot_instance_first: false
-          ec2_instance_type: g6e.12xlarge
-          aws_resource_tags: >
-            [
-              {"Key": "Name", "Value": "instructlab-training-ci-github-large-runner"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
-              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
-              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
-            ]
-
-  run-smoke-tests:
-    needs:
-      - start-large-ec2-runner
-    runs-on: ${{needs.start-large-ec2-runner.outputs.label}}
-    # It is important that this job has no write permissions and has
-    # no access to any secrets. This part is where we are running
-    # untrusted code from PRs.
-    permissions: {}
-    steps:
-      - name: "Checkout code"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-          ref: ${{inputs.branch}}
-
-      - name: Run smoke tests
-        uses: ./.github/actions/run-smoke
-        with:
-          python-version: 3.12
-
-  stop-large-ec2-runner:
-    needs:
-      - start-large-ec2-runner
-      - run-smoke-tests
-    runs-on: ubuntu-latest
-    if: ${{ always() }}
-    steps:
-      - name: "Configure AWS credentials"
-        uses: "aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df" # v4.2.1
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
-
-      - name: "Stop EC2 runner"
-        uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-large-ec2-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
index bedeeb3b..aab51fa9 100644
--- a/.github/workflows/smoke.yaml
+++ b/.github/workflows/smoke.yaml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: "Run smoke tests via Tox::pytest (python 3.11)"
+name: "Run smoke tests via Tox::pytest (python 3.12)"
 # These tests will be long running and require accelerated hardware.
 
 on:
@@ -118,7 +118,7 @@ jobs:
       - name: Run smoke tests
         uses: ./.github/actions/run-smoke
         with:
-          python-version: 3.11
+          python-version: "3.12"  # quoted: a bare 3.x is a YAML float (3.10 -> 3.1)
 
   stop-large-ec2-runner:
     needs:
diff --git a/docs/ci.md b/docs/ci.md
index fdd0fb6d..a46efddb 100644
--- a/docs/ci.md
+++ b/docs/ci.md
@@ -10,7 +10,7 @@ All unit tests currently live in the `tests/unit` directory and are run with [py
 
 To run the unit tests, you can run `tox -e py3-unit`.
 
-In CI, the tests are run with Python 3.11 - 3.13 on Ubuntu and MacOS runners - you can see the details [in the unit tests workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/unit.yaml).
+In CI, the tests are run on Ubuntu runners - you can see the details [in the unit tests workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/unit.yaml).
 
 ## Smoke tests
 
@@ -20,7 +20,7 @@ There is also a Shell-based smoke test script that can be found at `tests/smoket
 
 To run the smoke tests, you can run `tox -e py3-smoke`
 
-In CI, the smoke tests are run with Python 3.11 on CentOS runners - you can see the details [in the smoke workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/smoke.yaml).
+In CI, the smoke tests are run on CentOS runners - you can see the details [in the smoke workflow file](https://github.com/instructlab/training/blob/main/.github/workflows/smoke.yaml).
 
 ## End-to-end (E2E) tests
 
diff --git a/tox.ini b/tox.ini
index 0794c417..f65d746b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -15,7 +15,7 @@ install_command = pip install \
                   {opts} {packages}
 
 [testenv:py3]
-basepython = python3.11
+basepython = python3.12
 
 [testenv:py3-unit]
 description = run unit tests with pytest