diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml
index ab85796ecb..c50aefd5af 100644
--- a/.github/workflows/production.yml
+++ b/.github/workflows/production.yml
@@ -109,41 +109,51 @@ jobs:
           # Checkout full history is required to shallow cloning while mark HEAD as "grafted". This breaks remote
           # tracking thereby making it impossible to detect whether a commit is contained in upstream main.
           fetch-depth: 0
-      - name: Run benchmarks
+      - name: Start container
         run: |
           SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
-          echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV
-
+          CONTAINER_NAME="${SLURM_JOB_NAME}"
+          SRUN_CONTAINER_OPTS="--container-name=${CONTAINER_NAME} \
+            --container-mounts=/mnt/data/artifacts:/mnt/data/artifacts,${{ github.workspace }}:/root/workspace,${HOME}/.cache/uv:/root/.cache/uv \
+            --no-container-mount-home \
+            --container-workdir=/root/workspace"
           SLURM_ENV_VARS="NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY}"
           if [[ "${{ github.repository }}" == 'Genesis-Embodied-AI/Genesis' && "${{ github.ref }}" == 'refs/heads/main' ]] ; then
             SLURM_ENV_VARS="${SLURM_ENV_VARS},WANDB_API_KEY"
           fi
 
-          srun \
-            --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
-            --container-mounts=\
-          "${HOME}/.venv":/root/.venv,\
-          /mnt/data/artifacts:/mnt/data/artifacts,\
-          "${{ github.workspace }}":/root/workspace \
-            --no-container-mount-home --container-workdir=/root/workspace \
-            --export=${SLURM_ENV_VARS} \
-            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive --time="${TIMEOUT_MINUTES}" \
-            --job-name=${SLURM_JOB_NAME} \
-            bash -e -s << 'EOF'
-          # sudo apt update
-          # sudo apt install -y tmate
-          # tmate -S /tmp/tmate.sock new-session -d
-          # tmate -S /tmp/tmate.sock wait tmate-ready
-          # tmate -S /tmp/tmate.sock display -p '#{tmate_ssh}'
+          JOBID_FIFO="${{ github.workspace }}/.slurm_job_id_fifo"
+          [[ -e "$JOBID_FIFO" ]] && rm -f "$JOBID_FIFO"
+          mkfifo "$JOBID_FIFO"
+          salloc --job-name="${SLURM_JOB_NAME}" \
+            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
+            --time="${TIMEOUT_MINUTES}" \
+            bash -c "echo \$SLURM_JOB_ID > $JOBID_FIFO; sleep ${TIMEOUT_MINUTES}m" &
+          SLURM_JOB_ID=$(cat "$JOBID_FIFO")
+          rm -f "$JOBID_FIFO"
+          SRUN_COMMON="--overlap --jobid=${SLURM_JOB_ID} ${SRUN_CONTAINER_OPTS} --export=${SLURM_ENV_VARS}"
+          srun --jobid=${SLURM_JOB_ID} \
+            --container-image=/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh \
+            ${SRUN_CONTAINER_OPTS} \
+            --export=${SLURM_ENV_VARS} \
+            echo "Container ready"
 
-          source /root/.venv/bin/activate
-          pip install --no-input ".[dev,render]"
+          echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV
+          echo "SRUN_COMMON=${SRUN_COMMON}" >> "$GITHUB_ENV"
 
-          pytest --print -x -m "benchmarks" ./tests
-          cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
+      - name: Build
+        run: |
+          srun ${SRUN_COMMON} \
+            bash .github/workflows/scripts/production_build.sh
 
-          # tmate -S /tmp/tmate.sock wait tmate-exit
+      - name: Run benchmarks
+        run: |
+          srun ${SRUN_COMMON} bash -e -s <<'EOF'
+          source /venv/bin/activate
+          pytest --print -x -m "benchmarks" ./tests
+          cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
           EOF
+
       - name: Kill srun job systematically
         if: always()
         run: |
diff --git a/.github/workflows/scripts/production_build.sh b/.github/workflows/scripts/production_build.sh
new file mode 100644
index 0000000000..e1f093e7b4
--- /dev/null
+++ b/.github/workflows/scripts/production_build.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -ex
+
+# sudo apt update
+# sudo apt install -y tmate
+# tmate -S /tmp/tmate.sock new-session -d
+# tmate -S /tmp/tmate.sock wait tmate-ready
+# tmate -S /tmp/tmate.sock display -p '#{tmate_ssh}'
+
+pwd
+ls
+
+curl -LsSf https://astral.sh/uv/install.sh | sh
+which uv
+uv --version
+
+VENV_DIR="/venv"
+
+if [[ -d "${VENV_DIR}" ]]; then
+    # Remove existing venv (use unique temp name to avoid collisions)
+    temp_dir="/_venv_$(date +%s)_$$"
+    mv "${VENV_DIR}" "$temp_dir" 2>/dev/null || rm -rf "${VENV_DIR}"
+    rm -rf "$temp_dir" &
+fi
+
+
+uv venv --python '3.10' --allow-existing ${VENV_DIR}
+source "${VENV_DIR}/bin/activate"
+uv pip install ".[dev,render]"
+uv pip install torch
+
+# tmate -S /tmp/tmate.sock wait tmate-exit