Skip to content

Commit 8cf0432

Browse files
Merge pull request #10 from PPPLDeepLearning/titan_setup
Titan setup
2 parents 9d5d5c4 + 598c8ba commit 8cf0432

File tree

3 files changed

+198
-0
lines changed

3 files changed

+198
-0
lines changed

docs/Titan.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Set home at PROJWORK in the .bashrc:
2+
```bash
3+
export HOME=/lustre/atlas/proj-shared/fus117/
4+
```
5+
6+
#cd ~
7+
8+
9+
#Set up CUDA:
10+
```bash
11+
wget http://developer.download.nvidia.com/compute/redist/cudnn/v5.1/cudnn-7.5-linux-x64-v5.1.tgz
12+
tar -xvf cudnn-7.5-linux-x64-v5.1.tgz
13+
```
14+
15+
Add following to the submission script:
16+
17+
```bash
18+
export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH
19+
export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH
20+
export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64
21+
export INCLUDE=$INCLUDE:$HOME/cuda/include
22+
export CPATH=$CPATH:$HOME/cuda/include
23+
export FFLAGS=$FFLAGS:$HOME/cuda/include
24+
export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64
25+
export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include
26+
export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include
27+
export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include
28+
export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include
29+
```
30+
31+
32+
Add LIBRARY_PATH in addition to cudatoolkit:
33+
```bash
34+
module load cudatoolkit
35+
export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH
36+
```
37+
38+
# Download and install Anaconda
39+
wget https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh
40+
sh Anaconda2-4.3.1-Linux-x86_64.sh
41+
42+
43+
do not add PATH to .bashrc - it messes up modules for some reason
44+
45+
46+
# Clone the PPPL repo
47+
48+
add ssh keys to github to ~/.ssh
49+
ssh-add ~/.ssh/olcf_github_rsa
50+
51+
git clone git@github.com:PPPLDeepLearning/plasma-python.git
52+
cd PPPL/plasma-python
53+
54+
Create PPPL env:
55+
conda create --name PPPL --file requirements.txt
56+
57+
#Install mpi4py
58+
59+
module switch PrgEnv-pgi PrgEnv-gnu
60+
export MPICC=cc
61+
python setup.py install
62+
63+
64+
doing custom installs with pip --user is OK
65+
66+
67+
#Make sure to update paths in the conf.yaml
68+
69+
70+
#The mass batch job submission is performed with this script:
71+
https://github.com/PPPLDeepLearning/plasma-python/blob/titan_setup/examples/prepare_pbs_configs_titan.py

examples/pbs.cmd

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/bin/bash
2+
#PBS -A FUS117
3+
#PBS -l walltime=0:05:00
4+
#PBS -l nodes=2
5+
##PBS -l procs=1
6+
##PBS -l gres=atlas1%atlas2
7+
8+
9+
#Cannot see home folder, will just hang until wall limit
10+
export HOME=/ccs/proj/fus117/
11+
cd $HOME/PPPL/plasma-python/examples
12+
13+
source $MODULESHOME/init/bash
14+
module switch PrgEnv-pgi PrgEnv-gnu
15+
16+
module load cudatoolkit
17+
export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH
18+
19+
#This block is CuDNN module
20+
export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH
21+
export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH
22+
export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64
23+
export INCLUDE=$INCLUDE:$HOME/cuda/include
24+
export CPATH=$CPATH:$HOME/cuda/include
25+
export FFLAGS=$FFLAGS:$HOME/cuda/include
26+
export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64
27+
export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include
28+
export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include
29+
export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include
30+
export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include
31+
32+
#This sets new home and Anaconda module
33+
export PATH=$HOME/anaconda2/bin:$PATH
34+
export LD_LIBRARY_PATH=$HOME/anaconda2/lib:$LD_LIBRARY_PATH
35+
source activate PPPL
36+
37+
PYTHON=`which python`
38+
echo $PYTHON
39+
40+
#pygpu backend
41+
#export CPATH=$CPATH:~/.local/include
42+
#export LIBRARY_PATH=$LIBRARY_PATH:~/.local/lib
43+
#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/.local/lib
44+
45+
export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
46+
export MPICH_RDMA_ENABLED_CUDA=1
47+
48+
rm $HOME/tigress/alexeys/model_checkpoints/*
49+
aprun -n2 -N1 $PYTHON mpi_learn.py
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#!/usr/bin/env python
2+
import subprocess
3+
from subprocess import Popen
4+
from time import sleep
5+
6+
def checkAndSchedule(configBaseName,nextGPUcount,GPUstep,maxGPUcount):
7+
if nextGPUcount > maxGPUcount: return
8+
job_is_running = subprocess.check_output(['qstat','-u','alexeys'])
9+
if len(job_is_running) > 0:
10+
#sleep 500 seconds
11+
sleep(500)
12+
checkAndSchedule(configBaseName,nextGPUcount,GPUstep,maxGPUcount)
13+
else:
14+
#create a config
15+
nextConfigName = createOneConfig(configBaseName,nextGPUcount)
16+
print "Submitting next PBS job {} to run on {} GPUs".format(configBaseName,nextGPUcount)
17+
print "qsub "+nextConfigName
18+
Popen("qsub "+nextConfigName,shell=True).wait()
19+
#update parameters
20+
nextGPUcount += GPUstep
21+
sleep(10)
22+
checkAndSchedule(configBaseName,nextGPUcount,GPUstep,maxGPUcount)
23+
24+
25+
def createOneConfig(configBaseName, GPUcount):
26+
configFullName = configBaseName+str(GPUcount)+".cmd"
27+
with open(configFullName,"w") as f:
28+
f.write('#!/bin/bash\n')
29+
f.write('#PBS -A FUS117\n')
30+
f.write('#PBS -l walltime=1:30:00\n') #FIXME this depends a lot on the number of GPUs 1900s/1epoch at 50, 2350s/1epoch at 4
31+
f.write('#PBS -l nodes='+str(GPUcount)+'\n')
32+
f.write('##PBS -l procs=1\n')
33+
f.write('##PBS -l gres=atlas1%atlas2\n')
34+
f.write('\n\n')
35+
f.write('export HOME=/lustre/atlas/proj-shared/fus117\n')
36+
f.write('cd $HOME/PPPL/plasma-python/examples\n')
37+
f.write('\n\n')
38+
f.write('source $MODULESHOME/init/bash\n')
39+
f.write('module switch PrgEnv-pgi PrgEnv-gnu\n')
40+
f.write('\n\n')
41+
f.write('module load cudatoolkit\n')
42+
f.write('export LIBRARY_PATH=/opt/nvidia/cudatoolkit7.5/7.5.18-1.0502.10743.2.1/lib64:$LIBRARY_PATH\n')
43+
f.write('\n\n')
44+
f.write('#This block is CuDNN module\n')
45+
f.write('export LD_LIBRARY_PATH=$HOME/cuda/lib64:$LD_LIBRARY_PATH\n')
46+
f.write('export LIBRARY_PATH=$HOME/cuda/lib64:$LIBRARY_PATH\n')
47+
f.write('export LDFLAGS=$LDFLAGS:$HOME/cuda/lib64\n')
48+
f.write('export INCLUDE=$INCLUDE:$HOME/cuda/include\n')
49+
f.write('export CPATH=$CPATH:$HOME/cuda/include\n')
50+
f.write('export FFLAGS=$FFLAGS:$HOME/cuda/include\n')
51+
f.write('export LOCAL_LDFLAGS=$LOCAL_LDFLAGS:$HOME/cuda/lib64\n')
52+
f.write('export LOCAL_INCLUDE=$LOCAL_INCLUDE:$HOME/cuda/include\n')
53+
f.write('export LOCAL_CFLAGS=$LOCAL_CFLAGS:$HOME/cuda/include\n')
54+
f.write('export LOCAL_FFLAGS=$LOCAL_FFLAGS:$HOME/cuda/include\n')
55+
f.write('export LOCAL_CXXFLAGS=$LOCAL_CXXFLAGS:$HOME/cuda/include\n')
56+
f.write('\n\n')
57+
f.write('#This sets new home and Anaconda module\n')
58+
f.write('export PATH=$HOME/anaconda2/bin:$PATH\n')
59+
f.write('export LD_LIBRARY_PATH=$HOME/anaconda2/lib:$LD_LIBRARY_PATH\n')
60+
f.write('source activate PPPL\n')
61+
f.write('\n\n')
62+
f.write('PYTHON=`which python`\n')
63+
f.write('echo $PYTHON\n')
64+
f.write('\n\n')
65+
f.write('export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH\n')
66+
f.write('export MPICH_RDMA_ENABLED_CUDA=1\n')
67+
f.write('\n\n')
68+
f.write('rm $HOME/tigress/alexeys/model_checkpoints/*\n')
69+
f.write('aprun -n'+str(GPUcount)+' -N1 $PYTHON mpi_learn.py\n')
70+
71+
return configFullName
72+
73+
if __name__=='__main__':
74+
nextGPUcount = 50
75+
GPUstep = 50
76+
maxGPUcount = 101
77+
configBaseName = "FRNN_Titan"
78+
checkAndSchedule(configBaseName,nextGPUcount,GPUstep,maxGPUcount)

0 commit comments

Comments
 (0)