@@ -109,62 +109,56 @@ jobs:
           fetch-depth : 0
           # Checking out the full history is required because shallow cloning marks HEAD as "grafted". This breaks
           # remote tracking, thereby making it impossible to detect whether a commit is contained in upstream main.
-      - name : Allocate SLURM slot (single allocation for all steps)
+      - name : Allocate SLURM slot and start container session
         run : |
           SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
+          CONTAINER_NAME="genesis-${SLURM_JOB_NAME}"
           echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV
-          salloc \
-            --job-name="${SLURM_JOB_NAME}" \
-            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
-            --time="${TIMEOUT_MINUTES}" \
-            bash -c "sleep ${TIMEOUT_MINUTES}m" &
+          echo "CONTAINER_NAME=${CONTAINER_NAME}" >> $GITHUB_ENV
+
+          salloc --job-name="${SLURM_JOB_NAME}" \
+            --partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
+            --time="${TIMEOUT_MINUTES}" \
+            bash -c "sleep ${TIMEOUT_MINUTES}m" &
           echo "SLURM_ALLOC_PID=$!" >> $GITHUB_ENV
+
           for i in $(seq 1 30); do
             SLURM_JOB_ID="$(squeue --noheader -o '%A' --name "${SLURM_JOB_NAME}" | head -n1)"
-            if [ -n "${SLURM_JOB_ID}" ]; then
-              echo "SLURM_JOB_ID=${SLURM_JOB_ID}" >> $GITHUB_ENV
-              break
-            fi
+            test -n "$SLURM_JOB_ID" && break
             sleep 3
           done
-          if [ -z "${SLURM_JOB_ID}" ]; then
-            echo "Failed to obtain SLURM_JOB_ID within timeout" >&2
-            exit 1
-          fi
-
-      - name : Set common srun args
-        run : |
-          SRUN_COMMON=(
-            "--jobid=${SLURM_JOB_ID}"
-            "--container-image=/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh"
-            "--container-mounts=/mnt/data/artifacts:/mnt/data/artifacts,${{ github.workspace }}:/root/workspace,${HOME}/.cache/pip:/root/.cache/pip"
-            "--no-container-mount-home"
-            "--container-workdir=/root/workspace"
-            "--export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY}"
-          )
-          echo "SRUN_COMMON=${SRUN_COMMON[*]}" >> $GITHUB_ENV
-
-      - name : Build
+          echo "SLURM_JOB_ID=${SLURM_JOB_ID}" >> $GITHUB_ENV
+
+          # Start the container once and keep it alive
+          srun --jobid="${SLURM_JOB_ID}" \
+            --container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
+            --container-name="${CONTAINER_NAME}" \
+            --no-container-mount-home \
+            --container-workdir=/root/workspace \
+            --export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY} \
+            sleep "${TIMEOUT_MINUTES}m" &
+          echo "BASE_SRUN_PID=$!" >> $GITHUB_ENV
+
+      - name : Build (attach to same container)
         run : |
-          srun ${SRUN_COMMON} bash .github/workflows/scripts/production_build.sh
+          srun --jobid="${SLURM_JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
+            bash .github/workflows/scripts/production_build.sh
 
-      - name : Test
+      - name : Test (attach to same container; reuses /tmp/venv created in Build)
         run : |
-          srun ${SRUN_COMMON} bash -e -s << 'EOF'
-            source /tmp/venv/bin/activate
-            pytest --print -x -m "benchmarks" ./tests
-            cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
-          EOF
+          srun --jobid="${SLURM_JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
+            bash -e -s <<'EOF'
+          source /tmp/venv/bin/activate
+          pytest --print -x -m "benchmarks" ./tests
+          cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
+          EOF
 
       - name : Cleanup allocation
         if : always()
         run : |
-          if [ -n "${SLURM_JOB_ID}" ]; then
-            scancel "${SLURM_JOB_ID}" || true
-          fi
-          if [ -n "${SLURM_ALLOC_PID}" ]; then
-            kill "${SLURM_ALLOC_PID}" || true
-          fi
+          kill "${BASE_SRUN_PID}" 2>/dev/null || true
+          scancel "${SLURM_JOB_ID}" 2>/dev/null || true
+          kill "${SLURM_ALLOC_PID}" 2>/dev/null || true
       - name : Display benchmark stats
         run : |
           cat "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
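
Each run : block above passes state to later steps through $GITHUB_ENV: any name=value line appended to that file becomes an environment variable for every subsequent step in the same job, which is how SLURM_JOB_NAME, SLURM_JOB_ID, CONTAINER_NAME, SLURM_ALLOC_PID, and BASE_SRUN_PID survive from the allocation step into Build, Test, and Cleanup. A minimal illustration (the variable name is arbitrary):

# earlier step
echo "MY_JOB_ID=12345" >> "$GITHUB_ENV"   # visible to every later step in this job

# later step, separate run : block
echo "job id recorded earlier: ${MY_JOB_ID}"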
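
The allocation step implements a reusable pattern: one background salloc holds the resources, a named container is started once and kept alive by a long sleep, and every later step attaches to it with srun --overlap --container-name, so state created in Build (e.g. /tmp/venv) is still present in Test. Below is a minimal standalone sketch of that lifecycle, assuming a SLURM cluster with the pyxis/enroot plugin (the --container-* flags); the partition, GPU count, and image path mirror the workflow, while the timeout value, the echo placeholders for the Build/Test bodies, and the short startup sleep are illustrative simplifications, not part of the actual CI.

#!/usr/bin/env bash
# Sketch: allocate once, start one named container, attach many steps, tear down.
set -euo pipefail

TIMEOUT_MINUTES=60                            # illustrative; the workflow reads this from env
IMAGE="/mnt/data/images/genesis-vX.Y.sqsh"    # placeholder image version
JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
CONTAINER_NAME="genesis-${JOB_NAME}"

# 1) Hold an allocation in the background; the inner sleep keeps it alive until timeout.
salloc --job-name="${JOB_NAME}" \
  --partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
  --time="${TIMEOUT_MINUTES}" \
  bash -c "sleep ${TIMEOUT_MINUTES}m" &
ALLOC_PID=$!

# 2) Poll squeue until the allocation is visible and a job ID can be captured.
JOB_ID=""
for _ in $(seq 1 30); do
  JOB_ID="$(squeue --noheader -o '%A' --name "${JOB_NAME}" | head -n1)"
  test -n "${JOB_ID}" && break
  sleep 3
done
test -n "${JOB_ID}" || { echo "failed to obtain a SLURM job ID" >&2; exit 1; }

# 3) Create a named container inside the job and keep it alive with a long sleep.
srun --jobid="${JOB_ID}" \
  --container-image="${IMAGE}" \
  --container-name="${CONTAINER_NAME}" \
  --no-container-mount-home \
  --container-workdir=/root/workspace \
  sleep "${TIMEOUT_MINUTES}m" &
BASE_SRUN_PID=$!
sleep 10   # simplification: give the container step a moment to start before attaching

# 4) Attach additional steps to the same job and container. --overlap lets them share
#    the resources already held by the keep-alive step; --container-name without
#    --container-image reuses the existing container filesystem, so anything written
#    in one step (such as a /tmp/venv) is still there in the next.
srun --jobid="${JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
  bash -c 'echo "build step would run here"'
srun --jobid="${JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
  bash -c 'echo "test step would run here"'

# 5) Tear down: stop the keep-alive step, cancel the allocation, reap the background salloc.
kill "${BASE_SRUN_PID}" 2>/dev/null || true
scancel "${JOB_ID}" 2>/dev/null || true
kill "${ALLOC_PID}" 2>/dev/null || true

Killing the keep-alive srun before scancel mirrors the order of the Cleanup step above; the final kill reaps the background salloc so the calling shell exits cleanly.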