import subprocess
from subprocess import Popen
from time import sleep
55
def checkAndSchedule(configBaseName, gpuNodeCountGrid, nextGPUNodeCount):
    """Submit one batch job per entry of ``gpuNodeCountGrid``, one at a time.

    Starting at index ``nextGPUNodeCount``, the function repeatedly polls
    ``squeue -u alexeys``.  While the listing still mentions that user, it
    sleeps 500 seconds and polls again.  Once the user no longer appears, it
    builds a config for the current grid entry with ``createOneConfig``,
    submits it with ``sbatch``, advances to the next entry and pauses
    10 seconds before polling again.

    Args:
        configBaseName: Base name forwarded to ``createOneConfig`` and
            echoed in the progress message.
        gpuNodeCountGrid: Sequence of GPU counts to run, in submission order.
        nextGPUNodeCount: Index into ``gpuNodeCountGrid`` of the first entry
            to submit.  An index at or past the end submits nothing.

    Returns:
        None.  Returns once every remaining grid entry has been submitted.
    """
    # A loop (rather than one recursive call per poll) keeps the stack flat,
    # so an arbitrarily long wait in the queue cannot exhaust the recursion
    # limit.
    while nextGPUNodeCount < len(gpuNodeCountGrid):
        # universal_newlines=True makes check_output return text on both
        # Python 2 and Python 3, so the substring test below is always
        # str-in-str.
        queueListing = subprocess.check_output(
            ['squeue', '-u', 'alexeys'], universal_newlines=True)
        if 'alexeys' in queueListing:
            # A job of this user is still listed: wait 500 seconds, re-poll.
            sleep(500)
            continue

        # Queue is clear: create the config for the next grid entry.
        gpuCount = gpuNodeCountGrid[nextGPUNodeCount]
        nextConfigName = createOneConfig(configBaseName, gpuCount)
        print("Submitting next PBS job {} to run on {} GPUs".format(
            configBaseName, gpuCount))
        print("sbatch " + nextConfigName)
        Popen("sbatch " + nextConfigName, shell=True).wait()

        # Move on to the next grid entry; the short pause precedes the next
        # squeue poll.
        nextGPUNodeCount += 1
        sleep(10)
2323
2424
2525def createOneConfig (configBaseName , GPUcount ):
@@ -43,8 +43,6 @@ def createOneConfig(configBaseName, GPUcount):
4343 return configFullName
4444
if __name__ == '__main__':
    # GPU counts to run, in submission order; one job is submitted per entry.
    gpuNodeCountGrid = [1, 3, 6, 12, 24, 32, 48]
    configBaseName = "FRNN_TigerGPU"
    # Start from the first grid entry (index 0).
    checkAndSchedule(configBaseName, gpuNodeCountGrid, 0)
0 commit comments