Skip to content

Commit 2c94587

Browse files
committed
PBS scheduling and monitoring script for Titan
1 parent d584582 commit 2c94587

File tree

1 file changed

+78
-0
lines changed

1 file changed

+78
-0
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python
2+
import subprocess
3+
from subprocess import Popen
4+
from time import sleep
5+
6+
def checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount,
                     user='alexeys', pollInterval=500, submitDelay=10):
    """Submit one PBS job per GPU count, one at a time, until the sweep is done.

    Starting at ``nextGPUcount`` and advancing by ``GPUstep``, a job script is
    generated with ``createOneConfig`` and handed to ``qsub`` for every GPU
    count that does not exceed ``maxGPUcount``. Before each submission the
    queue is polled with ``qstat -u <user>``; any output at all is treated as
    "a job of this user is still queued or running" and the function waits.

    This is written as a loop rather than by self-recursion: the recursive
    form added one stack frame per poll and would hit the interpreter's
    recursion limit during a long queue wait.

    Args:
        configBaseName: prefix of the generated job script file names.
        nextGPUcount: GPU count of the first job to submit.
        GPUstep: increment applied to the GPU count after each submission.
        maxGPUcount: largest GPU count (inclusive) that is still submitted.
        user: user name passed to ``qstat -u``.
        pollInterval: seconds to sleep while the queue is not empty.
        submitDelay: seconds to sleep after a submission before polling
            again, so the new job has time to show up in ``qstat``.

    Returns:
        None.

    Raises:
        ValueError: if there is something to submit but ``GPUstep`` is not
            positive, which would resubmit jobs forever.
        subprocess.CalledProcessError: if ``qstat`` exits with a non-zero
            status (propagated from ``subprocess.check_output``).
    """
    if nextGPUcount > maxGPUcount:
        return

    if GPUstep <= 0:
        raise ValueError(
            "GPUstep must be positive, got {!r}".format(GPUstep))

    while nextGPUcount <= maxGPUcount:
        queueListing = subprocess.check_output(['qstat', '-u', user])
        if queueListing:
            # Something of ours is still in the queue: wait and poll again.
            sleep(pollInterval)
            continue

        # Queue is empty: generate the job script and submit it.
        nextConfigName = createOneConfig(configBaseName, nextGPUcount)
        print("Submitting next PBS job {} to run on {} GPUs".format(configBaseName, nextGPUcount))
        print("qsub " + nextConfigName)
        # Argument list instead of a shell string, so the file name is never
        # re-parsed by a shell.
        Popen(["qsub", nextConfigName]).wait()

        # Move on to the next point of the sweep.
        nextGPUcount += GPUstep
        sleep(submitDelay)
23+
24+
25+
def createOneConfig(configBaseName, GPUcount, walltime='1:30:00'):
    """Write a PBS job script for ``GPUcount`` nodes and return its file name.

    The script is written to ``<configBaseName><GPUcount>.cmd`` (an existing
    file of that name is overwritten). It requests ``GPUcount`` nodes, sets
    up the CUDA / CuDNN / Anaconda environment, removes the contents of the
    model checkpoint directory and launches ``mpi_learn.py`` through
    ``aprun`` with one process per node.

    Args:
        configBaseName: prefix (optionally including a directory) of the
            script file name.
        GPUcount: number of nodes requested and number of ``aprun`` ranks.
        walltime: value of the ``#PBS -l walltime=`` request. The required
            time depends a lot on the number of GPUs (original note:
            1900s/1epoch at 50, 2350s/1epoch at 4), so callers may override
            the default.

    Returns:
        The name of the file that was written.
    """
    gpus = str(GPUcount)
    configFullName = configBaseName + gpus + ".cmd"

    scriptLines = [
        # PBS resource request header.
        '#!/bin/bash\n',
        '#PBS -A FUS117\n',
        '#PBS -l walltime=' + str(walltime) + '\n',
        '#PBS -l nodes=' + gpus + '\n',
        '##PBS -l procs=1\n',
        '##PBS -l gres=atlas1%atlas2\n',
        '\n\n',
        # Working directory.
        'export HOME=/ccs/proj/fus117/\n',
        'cd $HOME/PPPL/plasma-python/examples\n',
        '\n\n',
        # Compiler environment.
        'source $MODULESHOME/init/bash\n',
        'module switch PrgEnv-pgi PrgEnv-gnu\n',
        '\n\n',
        # CUDA toolkit.
        'module load cudatoolkit\n',
        'export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH\n',
        '\n\n',
        # CuDNN paths.
        '#This block is CuDNN module\n',
        'export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH\n',
        'export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH\n',
        'export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64\n',
        'export INCLUDE=$INCLUDE:$HOME/cuda/include\n',
        'export CPATH=$CPATH:$HOME/cuda/include\n',
        'export FFLAGS=$FFLAGS:$HOME/cuda/include\n',
        'export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64\n',
        'export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include\n',
        'export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include\n',
        'export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include\n',
        'export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include\n',
        '\n\n',
        # Anaconda environment.
        '#This sets new home and Anaconda module\n',
        'export PATH=$HOME/anaconda2/bin:$PATH\n',
        'export LD_LIBRARY_PATH=$HOME/anaconda2/lib:$LD_LIBRARY_PATH\n',
        'source activate PPPL\n',
        '\n\n',
        'PYTHON=`which python`\n',
        'echo $PYTHON\n',
        '\n\n',
        # MPI / CUDA runtime settings.
        'export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH\n',
        'export MPICH_RDMA_ENABLED_CUDA=1\n',
        '\n\n',
        # Clear old checkpoints, then launch one rank per node.
        'rm $HOME/tigress/alexeys/model_checkpoints/*\n',
        'aprun -n' + gpus + ' -N1 $PYTHON mpi_learn.py\n',
    ]

    # One buffered write instead of dozens of tiny ones.
    with open(configFullName, "w") as f:
        f.write(''.join(scriptLines))

    return configFullName
72+
73+
if __name__ == '__main__':
    # Sweep settings: the first job uses 50 GPUs, each following job adds 50
    # more, and nothing above 101 GPUs is submitted.
    configBaseName = "FRNN_Titan"
    nextGPUcount, GPUstep, maxGPUcount = 50, 50, 101
    checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount)

0 commit comments

Comments
 (0)