#!/usr/bin/env python
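"""Chain-submit PBS jobs for an FRNN scaling sweep on Titan.

Polls ``qstat`` until the user's queue is empty, then writes a batch script
for the next GPU count in the sweep and submits it with ``qsub``.
"""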
from __future__ import print_function

import subprocess
from time import sleep

def checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount):
    """Submit one PBS job per GPU count, waiting for the queue to drain in between."""
    while nextGPUcount <= maxGPUcount:
        # Any qstat output for this user means a job is still queued or running.
        job_is_running = subprocess.check_output(['qstat', '-u', 'alexeys'])
        if len(job_is_running) > 0:
            # Poll the queue again in 500 seconds.
            sleep(500)
        else:
            # The queue is empty: write the next batch script and submit it.
            nextConfigName = createOneConfig(configBaseName, nextGPUcount)
            print("Submitting next PBS job {} to run on {} GPUs".format(nextConfigName, nextGPUcount))
            print("qsub " + nextConfigName)
            subprocess.check_call(['qsub', nextConfigName])
            # Advance to the next GPU count in the sweep.
            nextGPUcount += GPUstep
            sleep(10)


def createOneConfig(configBaseName, GPUcount):
    """Write a PBS batch script requesting GPUcount Titan nodes (one GPU per node)."""
    configFullName = configBaseName + str(GPUcount) + ".cmd"
    # FIXME: the walltime depends strongly on the GPU count:
    # roughly 1900 s/epoch at 50 GPUs versus 2350 s/epoch at 4.
    script = """#!/bin/bash
#PBS -A FUS117
#PBS -l walltime=1:30:00
#PBS -l nodes={GPUcount}
##PBS -l procs=1
##PBS -l gres=atlas1%atlas2


export HOME=/lustre/atlas/proj-shared/fus117
cd $HOME/PPPL/plasma-python/examples


source $MODULESHOME/init/bash
module switch PrgEnv-pgi PrgEnv-gnu


module load cudatoolkit
export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH


# This block sets up cuDNN
export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH
export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH
export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64
export INCLUDE=$INCLUDE:$HOME/cuda/include
export CPATH=$CPATH:$HOME/cuda/include
export FFLAGS=$FFLAGS:$HOME/cuda/include
export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64
export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include
export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include
export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include
export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include


# Put Anaconda on the PATH and activate the PPPL environment
export PATH=$HOME/anaconda2/bin:$PATH
export LD_LIBRARY_PATH=$HOME/anaconda2/lib:$LD_LIBRARY_PATH
source activate PPPL


PYTHON=`which python`
echo $PYTHON


export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
export MPICH_RDMA_ENABLED_CUDA=1


# Clear old checkpoints, then launch one MPI rank per node
rm $HOME/tigress/alexeys/model_checkpoints/*
aprun -n{GPUcount} -N1 $PYTHON mpi_learn.py
""".format(GPUcount=GPUcount)
    with open(configFullName, "w") as f:
        f.write(script)

    return configFullName
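

# For illustration: with the default configBaseName "FRNN_Titan" and
# GPUcount=50, createOneConfig writes FRNN_Titan50.cmd, whose PBS header is:
#
#   #!/bin/bash
#   #PBS -A FUS117
#   #PBS -l walltime=1:30:00
#   #PBS -l nodes=50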


if __name__ == '__main__':
    # Sweep the GPU count from 50 up to the 101-GPU cap in steps of 50,
    # i.e. submit one job at 50 GPUs and one at 100.
    nextGPUcount = 50
    GPUstep = 50
    maxGPUcount = 101
    configBaseName = "FRNN_Titan"
    checkAndSchedule(configBaseName, nextGPUcount, GPUstep, maxGPUcount)
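
# Usage sketch, assuming this file is saved as schedule_titan_jobs.py (a
# hypothetical name, not from the repo) on a login node with qstat and qsub
# on PATH:
#
#   nohup python schedule_titan_jobs.py > schedule.log 2>&1 &
#
# The process then blocks in checkAndSchedule until the sweep completes.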