Skip to content

Commit 06ce7d9

Browse files
committed
use salloc and overlapped
1 parent 6152ca7 commit 06ce7d9

File tree

1 file changed

+35
-41
lines changed

1 file changed

+35
-41
lines changed

.github/workflows/production.yml

Lines changed: 35 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -109,62 +109,56 @@ jobs:
109109
fetch-depth: 0
110110
# Checking out the full history is required because shallow cloning marks HEAD as "grafted". This breaks remote
111111
# tracking, thereby making it impossible to detect whether a commit is contained in upstream main.
112-
- name: Allocate SLURM slot (single allocation for all steps)
112+
- name: Allocate SLURM slot and start container session
113113
run: |
114114
SLURM_JOB_NAME="$(uuidgen)_$(date +%Y%m%d_%H%M%S)"
115+
CONTAINER_NAME="genesis-${SLURM_JOB_NAME}"
115116
echo "SLURM_JOB_NAME=${SLURM_JOB_NAME}" >> $GITHUB_ENV
116-
salloc \
117-
--job-name="${SLURM_JOB_NAME}" \
118-
--partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
119-
--time="${TIMEOUT_MINUTES}" \
120-
bash -c "sleep ${TIMEOUT_MINUTES}m" &
117+
echo "CONTAINER_NAME=${CONTAINER_NAME}" >> $GITHUB_ENV
118+
119+
salloc --job-name="${SLURM_JOB_NAME}" \
120+
--partition=hpc-mid --nodes=1 --gpus=8 --exclusive \
121+
--time="${TIMEOUT_MINUTES}" \
122+
bash -c "sleep ${TIMEOUT_MINUTES}m" &
121123
echo "SLURM_ALLOC_PID=$!" >> $GITHUB_ENV
124+
122125
for i in $(seq 1 30); do
123126
SLURM_JOB_ID="$(squeue --noheader -o '%A' --name "${SLURM_JOB_NAME}" | head -n1)"
124-
if [ -n "${SLURM_JOB_ID}" ]; then
125-
echo "SLURM_JOB_ID=${SLURM_JOB_ID}" >> $GITHUB_ENV
126-
break
127-
fi
127+
test -n "$SLURM_JOB_ID" && break
128128
sleep 3
129129
done
130-
if [ -z "${SLURM_JOB_ID}" ]; then
131-
echo "Failed to obtain SLURM_JOB_ID within timeout" >&2
132-
exit 1
133-
fi
134-
135-
- name: Set common srun args
136-
run: |
137-
SRUN_COMMON=(
138-
"--jobid=${SLURM_JOB_ID}"
139-
"--container-image=/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh"
140-
"--container-mounts=/mnt/data/artifacts:/mnt/data/artifacts,${{ github.workspace }}:/root/workspace,${HOME}/.cache/pip:/root/.cache/pip"
141-
"--no-container-mount-home"
142-
"--container-workdir=/root/workspace"
143-
"--export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY}"
144-
)
145-
echo "SRUN_COMMON=${SRUN_COMMON[*]}" >> $GITHUB_ENV
146-
147-
- name: Build
130+
echo "SLURM_JOB_ID=${SLURM_JOB_ID}" >> $GITHUB_ENV
131+
132+
# Start the container once and keep it alive
133+
srun --jobid="${SLURM_JOB_ID}" \
134+
--container-image="/mnt/data/images/genesis-v${GENESIS_IMAGE_VER}.sqsh" \
135+
--container-name="${CONTAINER_NAME}" \
136+
--no-container-mount-home \
137+
--container-workdir=/root/workspace \
138+
--export=NVIDIA_DRIVER_CAPABILITIES=all,BASH_ENV=/root/.bashrc,HF_TOKEN,GS_ENABLE_NDARRAY=${GS_ENABLE_NDARRAY} \
139+
sleep "${TIMEOUT_MINUTES}m" &
140+
echo "BASE_SRUN_PID=$!" >> $GITHUB_ENV
141+
142+
- name: Build (attach to same container)
148143
run: |
149-
srun ${SRUN_COMMON} bash .github/workflows/scripts/production_build.sh
144+
srun --jobid="${SLURM_JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
145+
bash .github/workflows/scripts/production_build.sh
150146
151-
- name: Test
147+
- name: Test (attach to same container; reuses /tmp/venv created in Build)
152148
run: |
153-
srun ${SRUN_COMMON} bash -e -s << 'EOF'
154-
source /tmp/venv/bin/activate
155-
pytest --print -x -m "benchmarks" ./tests
156-
cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
157-
EOF
149+
srun --jobid="${SLURM_JOB_ID}" --overlap --container-name="${CONTAINER_NAME}" \
150+
bash -e -s <<'EOF'
151+
source /tmp/venv/bin/activate
152+
pytest --print -x -m "benchmarks" ./tests
153+
cat speed_test*.txt > "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"
154+
EOF
158155
159156
- name: Cleanup allocation
160157
if: always()
161158
run: |
162-
if [ -n "${SLURM_JOB_ID}" ]; then
163-
scancel "${SLURM_JOB_ID}" || true
164-
fi
165-
if [ -n "${SLURM_ALLOC_PID}" ]; then
166-
kill "${SLURM_ALLOC_PID}" || true
167-
fi
159+
kill "${BASE_SRUN_PID}" 2>/dev/null || true
160+
scancel "${SLURM_JOB_ID}" 2>/dev/null || true
161+
kill "${SLURM_ALLOC_PID}" 2>/dev/null || true
168162
- name: Display benchmark stats
169163
run: |
170164
cat "/mnt/data/artifacts/speed_test_${SLURM_JOB_NAME}.txt"

0 commit comments

Comments
 (0)