Skip to content

Continue refactor of pyspark tests (#12007) #313

Continue refactor of pyspark tests (#12007)

Continue refactor of pyspark tests (#12007) #313

Workflow file for this run

# Main CI workflow; triggered by every push and every pull request.
name: XGBoost CI
on:
  - push
  - pull_request
permissions:
  # Read-only repository access, enough to fetch code (actions/checkout).
  contents: read
concurrency:
  # One group per pull request number, falling back to the ref for pushes;
  # a newer run in the same group cancels the one still in progress.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
env:
  # "PR-<number>" when a pull request number is present, else the ref name.
  BRANCH_NAME: >-
    ${{ github.event.pull_request.number && 'PR-' }}${{ github.event.pull_request.number || github.ref_name }}
jobs:
ci-configure:
  # Logs in to Amazon ECR and resolves the CI image tag, then publishes both
  # as job outputs for the container-based jobs that list this job in `needs`.
  name: Configure variables for CI
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=linux-amd64-cpu
    - tag=main-ci-configure
  outputs:
    docker_registry: ${{ steps.login-ecr.outputs.registry }}
    docker_username: ${{ steps.login-ecr.outputs.docker_username_492475357299_dkr_ecr_us_west_2_amazonaws_com }}
    docker_password: ${{ steps.login-ecr.outputs.docker_password_492475357299_dkr_ecr_us_west_2_amazonaws_com }}
    image_tag: ${{ steps.get-image-tag.outputs.image_tag }}
  steps:
    - id: login-ecr
      name: Login to Amazon ECR
      uses: aws-actions/amazon-ecr-login@v2.0.1
      with:
        registries: '492475357299'
        # NOTE(review): masking is disabled, presumably so the password can be
        # forwarded through the job outputs above -- confirm.
        mask-password: 'false'
    - uses: actions/checkout@v6.0.1
    - id: get-image-tag
      name: Get image tag
      # The sourced script is expected to define IMAGE_TAG, which is echoed
      # and then exported as the `image_tag` step output.
      run: |
        source ops/pipeline/get-image-tag.sh
        echo "Using image tag $IMAGE_TAG"
        echo "image_tag=$IMAGE_TAG" >> "$GITHUB_OUTPUT"
build-cpu:
  # Builds and tests the CPU targets inside the xgb-ci.cpu container, once per
  # matrix variant (default and sanitizer).
  name: Build CPU (${{ matrix.variant }})
  needs: ci-configure
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=linux-amd64-cpu
    - tag=main-build-cpu-${{ matrix.variant }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - variant: default
          build_suite: cpu
          # Default build doesn't need privileged mode
          # Using --init as harmless default (proper signal handling)
          container_options: "--init"
        - variant: sanitizer
          build_suite: cpu-sanitizer
          # Sanitizer needs privileged for: sysctl vm.mmap_rnd_bits=28
          # See https://github.com/google/sanitizers/issues/1614
          container_options: "--privileged"
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/xgb-ci.cpu:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
    options: ${{ matrix.container_options }}
  steps:
    - uses: actions/checkout@v6.0.1
      with:
        submodules: "true"
    # Remove default build config to ensure CMake-configured header is used
    - name: Remove default build config
      run: rm -fv dmlc-core/include/dmlc/build_config_default.h
    - name: Configure the system for sanitizers
      if: matrix.variant == 'sanitizer'
      # Sanitizer settings are exported through GITHUB_ENV for later steps.
      # The redirection target is quoted, matching the quoted
      # "$GITHUB_OUTPUT" used in ci-configure.
      run: |
        echo "ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer" >> "$GITHUB_ENV"
        echo "ASAN_OPTIONS=symbolize=1" >> "$GITHUB_ENV"
        echo "UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log" >> "$GITHUB_ENV"
        # Work around https://github.com/google/sanitizers/issues/1614
        sysctl vm.mmap_rnd_bits=28
    - uses: dmlc/xgboost-devops/actions/sccache@main
      with:
        cache-key-prefix: ${{ github.job }}-${{ matrix.build_suite }}
    - name: Build and test
      run: bash ops/pipeline/build-cpu.sh ${{ matrix.build_suite }}
    - run: sccache --show-stats
build-cuda:
  # Builds XGBoost with CUDA for every (CUDA version, architecture) pair in
  # the matrix, then stashes the gtest binary and the raw wheel under a
  # per-run S3 prefix for the downstream audit and test jobs.
  name: Build CUDA ${{ matrix.cuda_version }} (${{ matrix.arch }})
  needs: ci-configure
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-build-cuda${{ matrix.cuda_version }}-${{ matrix.arch }}
    - extras=s3-cache
  strategy:
    fail-fast: false
    matrix:
      include:
        # CUDA 12
        - cuda_version: 12
          arch: aarch64
          runner: linux-arm64-cpu
          image_repo: xgb-ci.gpu_build_rockylinux8_aarch64
          use_rmm: 0
          use_federated: 1
        - cuda_version: 12
          arch: x86_64
          runner: linux-amd64-cpu
          image_repo: xgb-ci.gpu_build_rockylinux8
          use_rmm: 0
          use_federated: 1
        # CUDA 13
        - cuda_version: 13
          arch: aarch64
          runner: linux-arm64-cpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
          use_rmm: 0
          use_federated: 0
        - cuda_version: 13
          arch: x86_64
          runner: linux-amd64-cpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
          use_rmm: 0
          use_federated: 0
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/${{ matrix.image_repo }}:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  steps:
    - uses: runs-on/action@v2
    - uses: actions/checkout@v6.0.1
      with:
        submodules: "true"
    - uses: dmlc/xgboost-devops/actions/sccache@main
      with:
        # Include the architecture so the aarch64 and x86_64 legs of the same
        # CUDA version do not share a single cache key prefix.
        cache-key-prefix: ${{ github.job }}-${{ matrix.cuda_version }}-${{ matrix.arch }}
    - run: >-
        bash ops/pipeline/build-cuda.sh
        --cuda-version ${{ matrix.cuda_version }}
        --use-rmm ${{ matrix.use_rmm }}
        --use-federated ${{ matrix.use_federated }}
    - run: sccache --show-stats
    - name: Stash files
      # The prefix mirrors the `artifact_from` values used by the test jobs.
      run: |
        python3 ops/pipeline/manage-artifacts.py upload \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/build-cuda${{ matrix.cuda_version }}-${{ matrix.arch }} \
          build/testxgboost python-package/dist/*.whl
audit-cuda-wheel:
  # Downloads the raw wheel stashed by build-cuda, runs the audit script on it
  # inside the manylinux_2_28 container, and stashes the resulting wheel for
  # the Python test jobs.
  #
  # NOTE(review): unlike the other jobs that read env.RUNS_ON_S3_BUCKET_CACHE,
  # this job has neither the `extras=s3-cache` label nor a `runs-on/action`
  # step -- confirm the variable is populated here.
  name: Audit CUDA ${{ matrix.cuda_version }} wheel for manylinux_2_28_${{ matrix.arch }}
  needs: [ci-configure, build-cuda]
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-audit-cuda${{ matrix.cuda_version }}-wheel-${{ matrix.arch }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # CUDA 12
        - cuda_version: 12
          cuda_variant_flag: ""
          arch: aarch64
          runner: linux-arm64-cpu
        - cuda_version: 12
          cuda_variant_flag: ""
          arch: x86_64
          runner: linux-amd64-cpu
        # CUDA 13
        - cuda_version: 13
          cuda_variant_flag: "--cuda-variant cuda13"
          arch: aarch64
          runner: linux-arm64-cpu
        - cuda_version: 13
          cuda_variant_flag: "--cuda-variant cuda13"
          arch: x86_64
          runner: linux-amd64-cpu
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/xgb-ci.manylinux_2_28_${{ matrix.arch }}:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  steps:
    - uses: actions/checkout@v6.0.1
    - name: Pick Python
      # GITHUB_PATH takes one directory per line and prepends it to PATH for
      # later steps, so only the interpreter directory is written (not the
      # whole colon-joined PATH).
      run: |
        echo "/opt/python/cp310-cp310/bin" >> "$GITHUB_PATH"
    - name: Install dependencies
      run: |
        pip install awscli wheel auditwheel pydistcheck
    - name: Unstash raw wheel
      run: |
        mkdir -p python-package/dist
        python3 ops/pipeline/manage-artifacts.py download \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/build-cuda${{ matrix.cuda_version }}-${{ matrix.arch }} \
          --dest-dir python-package/dist \
          *.whl
    # cuda_variant_flag is empty for CUDA 12, so nothing extra is passed there.
    - run: bash ops/pipeline/audit-cuda-wheel.sh ${{ matrix.arch }} ${{ matrix.cuda_variant_flag }}
    - name: Stash files
      run: |
        python3 ops/pipeline/manage-artifacts.py upload \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/audit-cuda${{ matrix.cuda_version }}-wheel-${{ matrix.arch }} \
          python-package/dist/*.whl
build-cuda-with-rmm:
  # CUDA 12 build with RMM and federated support switched on; only the gtest
  # binary is stashed, for the gpu-rmm suite of test-cpp-gpu.
  name: Build CUDA with RMM
  needs: ci-configure
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/xgb-ci.gpu_build_rockylinux8:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=linux-amd64-cpu
    - tag=main-build-cuda-with-rmm
    - extras=s3-cache
  steps:
    - uses: runs-on/action@v2
    - uses: actions/checkout@v6.0.1
      with:
        submodules: 'true'
    - uses: dmlc/xgboost-devops/actions/sccache@main
    # A single command line, written out with shell continuations.
    - run: |-
        bash ops/pipeline/build-cuda.sh \
          --cuda-version 12 \
          --use-rmm 1 \
          --use-federated 1
    - run: sccache --show-stats
    - name: Stash files
      run: |
        python3 ops/pipeline/manage-artifacts.py upload \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/build-cuda-with-rmm \
          build/testxgboost
build-python-wheels-cpu:
  # Builds the CPU-only wheel (xgboost-cpu) for each manylinux target and
  # architecture. This job runs directly on the runner (no `container:`) and
  # logs in to the Docker registry itself.
  name: Build CPU wheel (xgboost-cpu) for ${{ matrix.manylinux_target }}_${{ matrix.arch }}
  runs-on:
    # Same label form as every other RunsOn job in this workflow.
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-build-python-wheels-cpu-${{ matrix.manylinux_target }}-${{ matrix.arch }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - manylinux_target: manylinux_2_28
          arch: aarch64
          runner: linux-arm64-cpu
        - manylinux_target: manylinux_2_28
          arch: x86_64
          runner: linux-amd64-cpu
  steps:
    - uses: actions/checkout@v6.0.1
      with:
        submodules: "true"
    - name: Log into Docker registry (AWS ECR)
      run: bash ops/pipeline/login-docker-registry.sh
    - run: |
        bash ops/pipeline/build-python-wheels-cpu.sh \
          ${{ matrix.manylinux_target }} ${{ matrix.arch }}
build-gpu-rpkg:
  # Builds the GPU-enabled R package tarball; the tarball is published to the
  # nightly bucket only for master and release_* branches.
  name: Build GPU-enabled R package
  needs: ci-configure
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/xgb-ci.gpu_build_r_rockylinux8:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=linux-amd64-cpu
    - tag=main-build-gpu-rpkg
    - extras=s3-cache
  steps:
    - uses: runs-on/action@v2
    # Registers the workspace as a git safe.directory before checkout.
    - name: Trust git cloning project sources
      run: |
        git config --global --add safe.directory "${GITHUB_WORKSPACE}"
    - uses: actions/checkout@v6.0.1
      with:
        submodules: 'true'
    - uses: dmlc/xgboost-devops/actions/sccache@main
      with:
        cache-key-prefix: ${{ github.job }}
    - run: bash ops/pipeline/build-gpu-rpkg.sh
    - run: sccache --show-stats
    - name: Upload R package tarball
      # Publish only from master or a release_* branch.
      if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release_')
      run: |
        python3 ops/pipeline/manage-artifacts.py upload \
          --s3-bucket xgboost-nightly-builds \
          --prefix ${BRANCH_NAME}/${GITHUB_SHA} --make-public \
          xgboost_r_gpu_linux.tar.gz
test-cpp-gpu:
  # Runs the Google Test binary produced by build-cuda / build-cuda-with-rmm
  # on GPU runners. Each matrix entry selects the runner, the container image,
  # the stashed artifact to download, and extra arguments for the binary.
  name: >-
    Google Test (C++) CUDA ${{ matrix.cuda_version }}
    (${{ matrix.suite }}, ${{ matrix.runner }})
  needs: [ci-configure, build-cuda, build-cuda-with-rmm]
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-test-cpp-gpu-cuda${{ matrix.cuda_version }}-${{ matrix.suite }}-${{ matrix.arch }}
    - extras=s3-cache
  timeout-minutes: 30
  strategy:
    fail-fast: false
    matrix:
      include:
        # CUDA 12 tests
        # Note: --gpus all provides GPU access.
        # NOTE(review): every entry also passes --privileged; confirm whether
        # it is actually required for these tests.
        - cuda_version: 12
          suite: gpu
          arch: x86_64
          runner: linux-amd64-gpu
          image_repo: xgb-ci.gpu
          artifact_from: build-cuda12-x86_64
          container_options: "--gpus all --privileged"
          test_args: ""
        - cuda_version: 12
          suite: gpu
          arch: aarch64
          runner: linux-arm64-gpu
          image_repo: xgb-ci.gpu_aarch64
          artifact_from: build-cuda12-aarch64
          container_options: "--gpus all --privileged"
          test_args: ""
        - cuda_version: 12
          suite: gpu-rmm
          arch: x86_64
          runner: linux-amd64-gpu
          image_repo: xgb-ci.gpu
          artifact_from: build-cuda-with-rmm
          container_options: "--gpus all --privileged"
          test_args: "--use-rmm-pool"
        - cuda_version: 12
          suite: mgpu
          arch: x86_64
          runner: linux-amd64-mgpu
          image_repo: xgb-ci.gpu
          artifact_from: build-cuda12-x86_64
          # mgpu needs --shm-size for NCCL shared memory communication
          container_options: "--gpus all --shm-size=4g --privileged"
          # The pattern is shell-quoted because test_args is pasted verbatim
          # into the run script; unquoted, the shell could glob-expand it.
          test_args: "--gtest_filter='*MGPU*'"
        # CUDA 13 tests
        - cuda_version: 13
          suite: gpu
          arch: x86_64
          runner: linux-amd64-gpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
          artifact_from: build-cuda13-x86_64
          container_options: "--gpus all --privileged"
          test_args: ""
        - cuda_version: 13
          suite: gpu
          arch: aarch64
          runner: linux-arm64-gpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
          artifact_from: build-cuda13-aarch64
          container_options: "--gpus all --privileged"
          test_args: ""
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/${{ matrix.image_repo }}:${{ needs.ci-configure.outputs.image_tag }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
    options: ${{ matrix.container_options }}
  steps:
    - uses: runs-on/action@v2
    - uses: actions/checkout@v6.0.1
    - name: Unstash gtest
      # The downloaded binary is made executable explicitly before it is run.
      run: |
        python3 ops/pipeline/manage-artifacts.py download \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
          --dest-dir build \
          testxgboost
        chmod +x build/testxgboost
    - name: Run Google Tests (${{ matrix.suite }})
      run: build/testxgboost ${{ matrix.test_args }}
test-python-wheel-gpu:
  # Installs the audited CUDA wheel on GPU runners and runs the Python test
  # suites. The CUDA 12 `gpu` leg additionally trains a model and uploads it
  # as the `cross-platform-model` artifact.
  name: Python tests CUDA ${{ matrix.cuda_version }} (${{ matrix.description }})
  needs:
    - ci-configure
    - audit-cuda-wheel
  timeout-minutes: 60
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-test-python-wheel-cuda${{ matrix.cuda_version }}-${{ matrix.description }}
    - extras=s3-cache
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/${{ matrix.image_repo }}:${{ needs.ci-configure.outputs.image_tag }}
    options: ${{ matrix.container_options }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  strategy:
    fail-fast: false
    matrix:
      include:
        # CUDA 12, x86_64: single GPU
        - description: GPU-x86_64-CUDA-12
          cuda_version: 12
          suite: gpu
          runner: linux-amd64-gpu
          image_repo: xgb-ci.gpu
          artifact_from: audit-cuda12-wheel-x86_64
          container_options: '--gpus all --privileged'
        # CUDA 12, x86_64: multi-GPU
        - description: Multi-GPU-x86_64-CUDA-12
          cuda_version: 12
          suite: mgpu
          runner: linux-amd64-mgpu
          image_repo: xgb-ci.gpu
          artifact_from: audit-cuda12-wheel-x86_64
          # mgpu needs --shm-size for NCCL shared memory communication
          container_options: '--gpus all --shm-size=4g --privileged'
        # CUDA 12, aarch64
        - description: GPU-arm64-CUDA-12
          cuda_version: 12
          suite: gpu-arm64
          runner: linux-arm64-gpu
          image_repo: xgb-ci.gpu_aarch64
          artifact_from: audit-cuda12-wheel-aarch64
          container_options: '--gpus all --privileged'
        # CUDA 13, x86_64
        - description: GPU-x86_64-CUDA-13
          cuda_version: 13
          suite: gpu
          runner: linux-amd64-gpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8
          artifact_from: audit-cuda13-wheel-x86_64
          container_options: '--gpus all --privileged'
        # CUDA 13, aarch64
        - description: GPU-arm64-CUDA-13
          cuda_version: 13
          suite: gpu-arm64
          runner: linux-arm64-gpu
          image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
          artifact_from: audit-cuda13-wheel-aarch64
          container_options: '--gpus all --privileged'
  steps:
    - uses: runs-on/action@v2
    - uses: actions/checkout@v6.0.1
    - name: Unstash Python wheel
      run: |
        python3 ops/pipeline/manage-artifacts.py download \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
          --dest-dir wheelhouse \
          *.whl
    - name: Run Python tests (${{ matrix.description }})
      # A single command line, written out with shell continuations.
      run: |-
        bash ops/pipeline/test-python-wheel.sh \
          --suite ${{ matrix.suite }} \
          --cuda-version ${{ matrix.cuda_version }}
    # Cross-platform model: in this matrix the condition below matches only
    # the CUDA 12 x86_64 single-GPU entry.
    - name: Train cross-platform test model
      if: matrix.cuda_version == 12 && matrix.suite == 'gpu'
      shell: bash -l {0}
      run: |
        source activate gpu_test
        python tests/cross-platform/test_cross_platform_model.py \
          --train --model-path cross_platform_model.ubj
    - name: Upload cross-platform model artifact
      if: matrix.cuda_version == 12 && matrix.suite == 'gpu'
      uses: actions/upload-artifact@v6.0.0
      with:
        name: cross-platform-model
        path: cross_platform_model.ubj
        retention-days: 1
test-python-wheel-cpu:
  # Runs the CPU Python test suites on CPU runners, using the audited CUDA 12
  # wheel for each architecture.
  name: Python tests CPU (${{ matrix.description }})
  needs:
    - ci-configure
    - audit-cuda-wheel
  timeout-minutes: 60
  runs-on:
    - runs-on=${{ github.run_id }}
    - runner=${{ matrix.runner }}
    - tag=main-test-python-wheel-cpu-${{ matrix.description }}
    - extras=s3-cache
  container:
    image: ${{ needs.ci-configure.outputs.docker_registry }}/${{ matrix.image_repo }}:${{ needs.ci-configure.outputs.image_tag }}
    options: ${{ matrix.container_options }}
    credentials:
      username: ${{ needs.ci-configure.outputs.docker_username }}
      password: ${{ needs.ci-configure.outputs.docker_password }}
  strategy:
    fail-fast: false
    # Uses the wheel from cuda12 for tests.
    matrix:
      include:
        - description: CPU-amd64
          suite: cpu
          runner: linux-amd64-cpu
          image_repo: xgb-ci.cpu
          artifact_from: audit-cuda12-wheel-x86_64
          container_options: '--init'
        - description: CPU-arm64
          suite: cpu-arm64
          runner: linux-arm64-cpu
          image_repo: xgb-ci.cpu_aarch64
          artifact_from: audit-cuda12-wheel-aarch64
          container_options: '--init'
  steps:
    - uses: runs-on/action@v2
    - uses: actions/checkout@v6.0.1
    - name: Unstash Python wheel
      run: |
        python3 ops/pipeline/manage-artifacts.py download \
          --s3-bucket ${{ env.RUNS_ON_S3_BUCKET_CACHE }} \
          --prefix cache/${{ github.run_id }}/${{ matrix.artifact_from }} \
          --dest-dir wheelhouse \
          *.whl
    - name: Run Python tests (${{ matrix.description }})
      run: bash ops/pipeline/test-python-wheel.sh --suite ${{ matrix.suite }}
python-wheels-macos:
  # Builds the macOS wheel on GitHub-hosted runners for both platform ids,
  # verifies that it installs, and uploads it as a workflow artifact. The
  # wheel is also published to the nightly bucket on master / release_*
  # branches.
  name: Build macOS wheel (${{ matrix.platform_id }})
  runs-on: ${{ matrix.os }}
  defaults:
    run:
      shell: bash -l {0}
  strategy:
    fail-fast: false
    matrix:
      include:
        - os: macos-15-intel
          platform_id: macosx_x86_64
        - os: macos-14
          platform_id: macosx_arm64
  steps:
    - uses: actions/checkout@v6.0.1
      with:
        submodules: 'true'
    - name: Set up homebrew
      uses: Homebrew/actions/setup-homebrew@13341b4d5e459a98bbe0b122b12c11bf90518cc8
    - name: Install libomp
      run: brew install libomp
    - uses: dmlc/xgboost-devops/actions/miniforge-setup@main
      with:
        environment-name: minimal
        environment-file: ops/conda_env/minimal.yml
    - name: Build wheels
      run: bash ops/pipeline/build-python-wheels-macos.sh ${{ matrix.platform_id }} ${{ github.sha }}
    - name: Verify wheel can be installed
      run: |
        python -m pip install -vvv wheelhouse/*.whl
    - name: Upload wheel artifact
      uses: actions/upload-artifact@v6.0.0
      with:
        name: python-wheel-${{ matrix.platform_id }}
        path: wheelhouse/*.whl
        retention-days: 1
    - name: Upload Python wheel to S3
      # Publish only from master or a release_* branch. startsWith (rather
      # than contains) anchors the match at the start of the ref, the same
      # gate used for the R package upload in build-gpu-rpkg.
      if: github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release_')
      run: |
        python ops/pipeline/manage-artifacts.py upload \
          --s3-bucket xgboost-nightly-builds \
          --prefix ${{ env.BRANCH_NAME }}/${{ github.sha }} --make-public \
          wheelhouse/*.whl
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
test-cross-platform-inference:
  # Installs the macOS wheel built by python-wheels-macos and runs inference
  # with the model trained in test-python-wheel-gpu. The matrix covers both
  # the Intel and the arm64 macOS runners, so the job name carries the
  # platform id rather than a fixed "Apple Silicon" label.
  name: Cross-platform inference test (${{ matrix.platform_id }})
  needs: [test-python-wheel-gpu, python-wheels-macos]
  runs-on: ${{ matrix.os }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - os: macos-15-intel
          platform_id: macosx_x86_64
        - os: macos-14
          platform_id: macosx_arm64
  defaults:
    run:
      shell: bash -l {0}
  steps:
    - uses: actions/checkout@v6.0.1
    - uses: dmlc/xgboost-devops/actions/miniforge-setup@main
      with:
        environment-name: macos_test
        environment-file: ops/conda_env/minimal.yml
    - run: conda install scikit-learn numpy -y
    - name: Download macOS wheel artifact
      uses: actions/download-artifact@v7.0.0
      with:
        # Same artifact name that python-wheels-macos uploads for this
        # platform id.
        name: python-wheel-${{ matrix.platform_id }}
        path: wheelhouse
    - name: Install XGBoost wheel
      run: |
        python -m pip install -v wheelhouse/*.whl
    - name: Download cross-platform model artifact
      uses: actions/download-artifact@v7.0.0
      with:
        name: cross-platform-model
        path: .
    - name: Run cross-platform inference test
      run: |
        python tests/cross-platform/test_cross_platform_model.py \
          --inference --model-path cross_platform_model.ubj