
Commit 9d2f49c

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into acc_image_proc
2 parents: ae06deb + f93af82

File tree

153 files changed: +10706 additions, -4544 deletions


.dockerignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+.gitignore

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -8,3 +8,4 @@ build/
 .cproject
 .pydevproject
 Makefile
+.test_env/

.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ before_install:
     fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
+  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
 script:
   - paddle/scripts/travis/main.sh
 notifications:

CMakeLists.txt

Lines changed: 17 additions & 30 deletions
@@ -1,17 +1,24 @@
 cmake_minimum_required(VERSION 2.8)
 
 project(paddle CXX C)
-set(PADDLE_MAJOR_VERSION 0)
-set(PADDLE_MINOR_VERSION 9)
-set(PADDLE_PATCH_VERSION 0)
-set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
 include(package)
 find_package(SWIG 2.0)
 find_package(CUDA QUIET)
 find_package(Protobuf REQUIRED)
+
+# Check protobuf library version.
+execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
+                OUTPUT_VARIABLE PROTOBUF_VERSION)
+string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
+
+set(PROTOBUF_3 OFF)
+if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
+  set(PROTOBUF_3 ON)
+endif()
+
 find_package(PythonLibs 2.7 REQUIRED)
 find_package(PythonInterp 2.7 REQUIRED)
 find_package(ZLIB REQUIRED)
@@ -45,7 +52,7 @@ option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
 option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
 
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
     "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
     FORCE)
 endif()
@@ -64,31 +71,11 @@ include(check_packages)
 include(swig)
 include(coveralls)
 
-# add PaddlePaddle version
-if(DEFINED ENV{PADDLE_VERSION})
-    add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
-else()
-    if(EXISTS ${PROJ_ROOT}/.svn/)
-        find_package(Subversion REQUIRED)
-        if(SUBVERSION_FOUND)
-            Subversion_WC_INFO(${PROJ_ROOT} Project)
-            add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
-        endif()
-    elseif(EXISTS ${PROJ_ROOT}/.git/)
-        find_package(Git REQUIRED)
-        execute_process(
-            COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
-            WORKING_DIRECTORY ${PROJ_ROOT}
-            OUTPUT_VARIABLE GIT_SHA1
-            RESULT_VARIABLE GIT_RESULT
-            ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if(NOT ${GIT_RESULT})
-            add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
-        else()
-            message(WARNING "Cannot add paddle version from git tag")
-        endif()
-    endif()
-endif()
+# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(Git REQUIRED)
+# version.cmake will get the current PADDLE_VERSION
+include(version)
+add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
 
 
 if(NOT WITH_GPU)

benchmark/.gitignore

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+paddle/image/logs
+paddle/image/*.pyc
+paddle/image/train.list
+paddle/rnn/logs
+paddle/rnn/*.pyc
+paddle/rnn/imdb.pkl
+caffe/image/logs
+tensorflow/image/logs
+tensorflow/rnn/logs

benchmark/README.md

Lines changed: 168 additions & 0 deletions
@@ -0,0 +1,168 @@

# Benchmark

Machine:

- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @ 2.10GHz
- GPU: Tesla K40m
- cuDNN: v5.1
- System: Docker 1.12.1; all platforms are tested inside Docker containers.

Platforms:

- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
- TensorFlow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
- Caffe: kaixhin/cuda-caffe

Several convolutional neural networks and recurrent neural networks are used in the tests.

## Image

### Benchmark Model

AlexNet, GoogleNet and a small network used in Caffe:

- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet), but with a group size of one.

- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet), but with loss1 and loss2 removed for the benchmark.

- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10\_quick\_train\_test.prototxt)

### Single-GPU

- AlexNet: input - 3 * 227 * 227, Time: ms/batch

| BatchSize    | 64  | 128 | 256  | 512  |
|--------------|-----|-----|------|------|
| PaddlePaddle | 195 | 334 | 602  | 1629 |
| TensorFlow   | 223 | 364 | 645  | 1235 |
| Caffe        | 324 | 627 | 1232 | 2513 |

**Notation**

All platforms use cuDNN v5.1. Caffe is slower in this experiment because the workspace limit of its cuDNN convolution interface is 8 * 1024 * 1024, which is smaller than the limits used by PaddlePaddle and TensorFlow. Note that Caffe would be faster if the workspace limit were increased.

- GoogleNet: input - 3 * 224 * 224, Time: ms/batch

| BatchSize    | 64  | 128  | 256           |
|--------------|-----|------|---------------|
| PaddlePaddle | 613 | 1149 | 2348          |
| TensorFlow   | 644 | 1176 | 2219          |
| Caffe        | 694 | 1364 | out of memory |

- SmallNet: input - 3 * 32 * 32, Time: ms/batch

| BatchSize    | 64     | 128     | 256     | 512    |
|--------------|--------|---------|---------|--------|
| PaddlePaddle | 10.463 | 18.184  | 33.113  | 63.039 |
| TensorFlow   | 9      | 15      | 28      | 59     |
| Caffe        | 9.373  | 16.6606 | 31.4797 | 59.719 |

**Notation**

All the single-GPU Caffe experiments use `caffe time` to measure elapsed time, which does not include the parameter-update time, whereas the PaddlePaddle and TensorFlow measurements do include it. Compared with the total time, this part is small on a single machine, so we ignore it.

TensorFlow implements its own convolution-algorithm search instead of using the algorithm-search interface in cuDNN.

### Multi-GPU: 4 GPUs

- AlexNet, ms / batch

| total-BatchSize | 128 * 4 | 256 * 4 |
|-----------------|---------|---------|
| PaddlePaddle    | 347     | 622     |
| TensorFlow      | 377     | 675     |
| Caffe           | 1229    | 2435    |

For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by

```
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
= (334 * 4) / 347
= 3.85
```
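
The same calculation, written as a small Python helper (illustrative only; this function is not part of the benchmark scripts):

```
def speedup_ratio(single_gpu_ms_per_batch, multi_gpu_ms_per_batch, num_gpus=4):
    """Speedup of a multi-GPU run over a single-GPU run at the same per-GPU batch size."""
    # The single-GPU time is scaled by num_gpus because the multi-GPU run
    # processes num_gpus times as many samples per batch.
    return single_gpu_ms_per_batch * num_gpus / multi_gpu_ms_per_batch

# AlexNet numbers from the tables above: 334 ms at batch size 128 on 1 GPU,
# 347 ms at total batch size 128 * 4 on 4 GPUs.
print(round(speedup_ratio(334, 347), 2))  # 3.85
```
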
<img src="figs/alexnet-4gpu.png" width="420">

- GoogleNet, ms / batch

| total-BatchSize | 128 * 4 | 256 * 4       |
|-----------------|---------|---------------|
| PaddlePaddle    | 1178    | 2367          |
| TensorFlow      | 1210    | 2292          |
| Caffe           | 2007    | out of memory |

<img src="figs/googlenet-4gpu.png" width="420">

## RNN

We use an LSTM network for text classification to benchmark RNN performance.

### Dataset

- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
- Sequence length is 100. PaddlePaddle supports training on variable-length sequences, but TensorFlow requires padding, so we also pad the sequences to length 100 in PaddlePaddle for a fair comparison (see the padding sketch after this list).
- Dictionary size = 30000
- The peephole connection is used in `lstmemory` by default in PaddlePaddle; it is also enabled in the TensorFlow configuration.
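
A minimal sketch of the fixed-length padding described above (illustrative only; it assumes word-ID sequences and a padding ID of 0, and is not taken from the benchmark scripts under `paddle/rnn` or `tensorflow/rnn`):

```
import numpy as np

def pad_sequences(sequences, maxlen=100, pad_id=0):
    """Truncate or left-pad each word-ID sequence to exactly `maxlen` tokens."""
    padded = np.full((len(sequences), maxlen), pad_id, dtype=np.int64)
    for i, seq in enumerate(sequences):
        seq = seq[:maxlen]                   # truncate long reviews
        padded[i, maxlen - len(seq):] = seq  # left-pad short reviews
    return padded

# Example: two IMDB-style word-ID sequences padded to length 100.
batch = pad_sequences([[4, 27, 512], [8, 9]], maxlen=100)
print(batch.shape)  # (2, 100)
```
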
### Single-GPU

#### LSTM in Text Classification

We test a `2 lstm layer + fc` network with different hidden sizes and batch sizes.
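
For reference, a rough sketch of the `2 lstm layer + fc` topology in the style of PaddlePaddle's v0.9-era Python config API. The layer names and signatures below are assumptions for illustration only; the actual benchmark configs are the scripts under `paddle/rnn`.

```
# Illustrative sketch only: assumes the v0.9-era trainer_config_helpers API.
from paddle.trainer_config_helpers import *

settings(batch_size=128, learning_rate=1e-3)

dict_size = 30000   # dictionary size used in this benchmark
hidden = 256        # one of the tested hidden sizes: 256 / 512 / 1280

word = data_layer(name="word", size=dict_size)   # padded word-ID sequence
label = data_layer(name="label", size=2)         # positive / negative

emb = embedding_layer(input=word, size=128)
lstm1 = simple_lstm(input=emb, size=hidden)      # lstm layer 1
lstm2 = simple_lstm(input=lstm1, size=hidden)    # lstm layer 2
feature = last_seq(input=lstm2)                  # take the last time step
prob = fc_layer(input=feature, size=2, act=SoftmaxActivation())  # fc classifier

outputs(classification_cost(input=prob, label=label))
```
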
- Batch size = 64, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 83  | 184 | 641  |
| TensorFlow   | 175 | 280 | 818  |

- Batch size = 128, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 110 | 261 | 1007 |
| TensorFlow   | 181 | 361 | 1237 |

- Batch size = 256, ms / batch

| hidden_size  | 256 | 512 | 1280 |
|--------------|-----|-----|------|
| PaddlePaddle | 170 | 414 | 1655 |
| TensorFlow   | 238 | 536 | 1905 |

<img src="figs/rnn_lstm_cls.png" width="600">

#### Seq2Seq

The benchmark of the sequence-to-sequence network will be added later.

### Multi-GPU: 4 GPUs

#### LSTM in Text Classification

- hidden_size = 256, ms / batch

| batch_size   | 256 | 512 |
|--------------|-----|-----|
| PaddlePaddle | 90  | 118 |
| TensorFlow   | 226 | 118 |

- hidden_size = 512, ms / batch

| batch_size   | 256 | 512 |
|--------------|-----|-----|
| PaddlePaddle | 189 | 268 |
| TensorFlow   | 297 | 383 |

<img src="figs/rnn_lstm_4gpus.png" width="420">

#### Seq2Seq

The benchmark of the sequence-to-sequence network will be added later.
