#!/usr/bin/env python
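"""Chain-submit PBS jobs for an FRNN scaling sweep on Titan.

Polls ``qstat`` until the user's queue is empty, then writes a batch script
for the next GPU count in the sweep and submits it with ``qsub``.
"""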
from __future__ import print_function

import subprocess
from time import sleep

def checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount):
    """Submit one PBS job per GPU count, waiting for the queue to drain in between."""
    while nextGPUcount <= maxGPUcount:
        # Any qstat output for this user means a job is still queued or running.
        job_is_running = subprocess.check_output(['qstat', '-u', 'alexeys'])
        if len(job_is_running) > 0:
            # Poll the queue again in 500 seconds.
            sleep(500)
        else:
            # The queue is empty: write the next batch script and submit it.
            nextConfigName = createOneConfig(configBaseName, nextGPUcount)
            print("Submitting next PBS job {} to run on {} GPUs".format(nextConfigName, nextGPUcount))
            print("qsub " + nextConfigName)
            subprocess.check_call(['qsub', nextConfigName])
            # Advance to the next GPU count in the sweep.
            nextGPUcount += GPUstep
            sleep(10)


def createOneConfig(configBaseName, GPUcount):
    """Write a PBS batch script requesting GPUcount Titan nodes (one GPU per node)."""
    configFullName = configBaseName + str(GPUcount) + ".cmd"
    # FIXME: the walltime depends strongly on the GPU count:
    # roughly 1900 s/epoch at 50 GPUs versus 2350 s/epoch at 4.
    script = """#!/bin/bash
#PBS -A FUS117
#PBS -l walltime=1:30:00
#PBS -l nodes={GPUcount}
##PBS -l procs=1
##PBS -l gres=atlas1%atlas2


export HOME=/lustre/atlas/proj-shared/fus117
cd $HOME/PPPL/plasma-python/examples


source $MODULESHOME/init/bash
module switch PrgEnv-pgi PrgEnv-gnu


module load cudatoolkit
export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH


# This block sets up cuDNN
export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH
export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64
export INCLUDE=$INCLUDE:$HOME/cuda/include
export CPATH=$CPATH:$HOME/cuda/include
export FFLAGS=$FFLAGS:$HOME/cuda/include
export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64
export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include
export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include
export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include
export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include


# Put Anaconda on the PATH and activate the PPPL environment
export PATH=$HOME/anaconda2/bin:$PATH
export LD_LIBRARY_PATH=$HOME/anaconda2/lib:$LD_LIBRARY_PATH
source activate PPPL


PYTHON=`which python`
echo $PYTHON


export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
export MPICH_RDMA_ENABLED_CUDA=1


# Clear old checkpoints, then launch one MPI rank per node
rm $HOME/tigress/alexeys/model_checkpoints/*
aprun -n{GPUcount} -N1 $PYTHON mpi_learn.py
""".format(GPUcount=GPUcount)
    with open(configFullName, "w") as f:
        f.write(script)

    return configFullName
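

# For illustration: with the default configBaseName "FRNN_Titan" and
# GPUcount=50, createOneConfig writes FRNN_Titan50.cmd, whose PBS header is:
#
#   #!/bin/bash
#   #PBS -A FUS117
#   #PBS -l walltime=1:30:00
#   #PBS -l nodes=50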


if __name__ == '__main__':
    # Sweep the GPU count from 50 up to the 101-GPU cap in steps of 50,
    # i.e. submit one job at 50 GPUs and one at 100.
    nextGPUcount = 50
    GPUstep = 50
    maxGPUcount = 101
    configBaseName = "FRNN_Titan"
    checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount)
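
# Usage sketch, assuming this file is saved as schedule_titan_jobs.py (a
# hypothetical name, not from the repo) on a login node with qstat and qsub
# on PATH:
#
#   nohup python schedule_titan_jobs.py > schedule.log 2>&1 &
#
# The process then blocks in checkAndSchedule until the sweep completes.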