Skip to content

Commit 8d75aab

Browse files
authored
Improve SSH into runner (#42695)
better

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
1 parent ff13eb6 commit 8d75aab

File tree

1 file changed

+62
-0
lines changed

1 file changed

+62
-0
lines changed

.github/workflows/ssh-runner.yml

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,38 @@ jobs:
8080
run: |
8181
nvidia-smi
8282
83+
# Expose `python` / `pip` as aliases of `python3` / `pip3` for interactive use.
- name: Create python alias
  run: |
    # Resolve both targets before linking. If a lookup prints nothing,
    # `ln -sf  /usr/local/bin/pip` would be called with a single operand and
    # silently create a dangling link in the current directory instead of
    # failing. A failed assignment from `$(...)` aborts the step under the
    # runner's `-e` shell, so a missing interpreter is reported immediately.
    python3_path="$(command -v python3)"
    pip3_path="$(command -v pip3)"
    ln -sf "$python3_path" /usr/local/bin/python
    ln -sf "$pip3_path" /usr/local/bin/pip
    echo "✅ python -> python3 symlink created"
88+
89+
# psutil is installed for the memory monitor (per the step name).
- name: Install psutil for memory monitor
  run: |
    # --break-system-packages: presumably required because the image's system
    # Python is marked as externally managed (PEP 668) - confirm against image.
    pip install --break-system-packages psutil
92+
93+
# Fetch memory_monitor.py from the `utility_scripts` branch into /transformers.
- name: Download memory monitor script
  working-directory: /transformers
  run: |
    apt-get update && apt-get install -y curl
    # --fail: exit non-zero on an HTTP error instead of saving the error page
    #         as memory_monitor.py and reporting success.
    # --location: follow redirects; --retry: ride out transient network errors.
    # --silent --show-error: no progress meter in the log, but keep errors.
    curl --fail --location --silent --show-error --retry 3 \
      --output memory_monitor.py \
      https://raw.githubusercontent.com/huggingface/transformers/refs/heads/utility_scripts/utils/memory_monitor.py
98+
99+
# Launch memory_monitor.py in the background, logging to memory_monitor.log and
# recording its PID in memory_monitor.pid. Best-effort: never fails the job.
- name: Start memory monitor
  working-directory: /transformers
  continue-on-error: true  # Don't fail workflow if monitor has issues
  run: |
    python3 memory_monitor.py --threshold 90 --interval 1 > memory_monitor.log 2>&1 &
    monitor_pid=$!
    echo "$monitor_pid" > memory_monitor.pid
    echo "Memory monitor started with PID $monitor_pid"
    # Give it a moment to start
    sleep 2
    # Verify the exact process we launched is still alive. `kill -0` delivers
    # no signal; it only checks that the PID exists. Unlike grepping `ps aux`
    # by name, this cannot match an unrelated process and does not depend on
    # `ps` being installed in the image.
    if kill -0 "$monitor_pid" 2>/dev/null; then
      echo "Memory monitor is running (PID $monitor_pid)"
    else
      echo "Warning: memory monitor may not be running"
      # Show the monitor's own output so the reason for the early exit is
      # visible in the job log.
      tail -n 20 memory_monitor.log || true
    fi
110+
111+
# Convenience tools for people who SSH into the runner.
- name: Install utilities
  run: |
    # No `apt-get update` here: this relies on the package index refreshed by
    # the earlier "Download memory monitor script" step.
    apt-get install --yes nano
114+
83115
- name: Store Slack infos
84116
# The SSH session can be enabled dynamically if the workflow fails, so we store the Slack info here to be able to retrieve it during the waitforssh step.
85117
shell: bash
@@ -92,6 +124,36 @@ jobs:
92124
echo "$github_actor"
93125
echo "github_actor=$github_actor" >> $GITHUB_ENV
94126
127+
# Write /root/.env_setup and hook it into root's bash startup files so that
# SSH sessions start with the CI environment variables set and in /transformers.
- name: Setup automatic environment for SSH login
  run: |
    # Shared environment file. The heredoc delimiter is quoted, so nothing in
    # the body is expanded now; `$HF_HUB_READ_TOKEN` and `$(pwd)` are evaluated
    # each time the file is sourced.
    cat > /root/.env_setup << 'EOF'
    # Auto-setup (non-sensitive vars)
    export HF_HOME=/mnt/cache
    export TRANSFORMERS_IS_CI=yes
    export OMP_NUM_THREADS=8
    export MKL_NUM_THREADS=8
    export RUN_SLOW=yes
    export TF_FORCE_GPU_ALLOW_GROWTH=true
    export CUDA_VISIBLE_DEVICES=0,1

    cd /transformers 2>/dev/null || true

    # Remind user to set token if needed
    if [ -z "$HF_HUB_READ_TOKEN" ]; then
      echo "⚠️ HF_HUB_READ_TOKEN not set. Set it with:"
      echo " export HF_HUB_READ_TOKEN=hf_xxxxx"
    else
      echo "✅ HF_HUB_READ_TOKEN is set"
    fi

    echo "📁 Working directory: $(pwd)"
    EOF

    # Hook the file into both startup files so it is picked up whether bash
    # reads .bash_profile or .bashrc.
    for rc_file in /root/.bash_profile /root/.bashrc; do
      echo 'source /root/.env_setup' >> "$rc_file"
    done
156+
95157
- name: Store Slack infos
96158
# The SSH session can be enabled dynamically if the workflow fails, so we store the Slack info here to be able to retrieve it during the waitforssh step.
97159
shell: bash

0 commit comments

Comments
 (0)