Skip to content

Commit 021b3a4

Browse files
authored
Merge pull request #1 from PaddlePaddle/develop
merge PaddlePaddle/Paddle
2 parents 4f1bf30 + 17f7125 commit 021b3a4

File tree

284 files changed

+10214
-5198
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

284 files changed

+10214
-5198
lines changed

.dockerignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
.gitignore

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ build/
88
.cproject
99
.pydevproject
1010
Makefile
11+
.test_env/

CMakeLists.txt

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,24 @@
11
cmake_minimum_required(VERSION 2.8)
22

33
project(paddle CXX C)
4-
set(PADDLE_MAJOR_VERSION 0)
5-
set(PADDLE_MINOR_VERSION 9)
6-
set(PADDLE_PATCH_VERSION 0a0)
7-
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
84

95
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
106
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
117
include(package)
128
find_package(SWIG 2.0)
139
find_package(CUDA QUIET)
1410
find_package(Protobuf REQUIRED)
11+
12+
# Check protobuf library version.
13+
execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
14+
OUTPUT_VARIABLE PROTOBUF_VERSION)
15+
string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
16+
17+
set(PROTOBUF_3 OFF)
18+
if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
19+
set(PROTOBUF_3 ON)
20+
endif()
21+
1522
find_package(PythonLibs 2.7 REQUIRED)
1623
find_package(PythonInterp 2.7 REQUIRED)
1724
find_package(ZLIB REQUIRED)
@@ -45,7 +52,7 @@ option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
4552
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
4653

4754
if(NOT CMAKE_BUILD_TYPE)
48-
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
55+
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
4956
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
5057
FORCE)
5158
endif()
@@ -64,31 +71,11 @@ include(check_packages)
6471
include(swig)
6572
include(coveralls)
6673

67-
# add PaddlePaddle version
68-
if(DEFINED ENV{PADDLE_VERSION})
69-
add_definitions(-DPADDLE_VERSION=\"$ENV{PADDLE_VERSION}\")
70-
else()
71-
if(EXISTS ${PROJ_ROOT}/.svn/)
72-
find_package(Subversion REQUIRED)
73-
if(SUBVERSION_FOUND)
74-
Subversion_WC_INFO(${PROJ_ROOT} Project)
75-
add_definitions(-DPADDLE_VERSION=${Project_WC_REVISION})
76-
endif()
77-
elseif(EXISTS ${PROJ_ROOT}/.git/)
78-
find_package(Git REQUIRED)
79-
execute_process(
80-
COMMAND ${GIT_EXECUTABLE} log -1 --format=%H
81-
WORKING_DIRECTORY ${PROJ_ROOT}
82-
OUTPUT_VARIABLE GIT_SHA1
83-
RESULT_VARIABLE GIT_RESULT
84-
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
85-
if(NOT ${GIT_RESULT})
86-
add_definitions(-DPADDLE_VERSION=\"${GIT_SHA1}\")
87-
else()
88-
message(WARNING "Cannot add paddle version from git tag")
89-
endif()
90-
endif()
91-
endif()
74+
# Set PaddlePaddle version to Git tag name or Git commit ID.
75+
find_package(Git REQUIRED)
76+
# version.cmake will get the current PADDLE_VERSION
77+
include(version)
78+
add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
9279

9380

9481
if(NOT WITH_GPU)

RELEASE.md

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Release v0.9.0
2+
3+
## New Features:
4+
5+
* New Layers
6+
* bilinear interpolation layer.
7+
* spatial pyramid-pool layer.
8+
* de-convolution layer.
9+
* maxout layer.
10+
* Support rectangle padding, stride, window and input for Pooling Operation.
11+
* Add --job=time in trainer, which can be used to print time info without compiler option -WITH_TIMER=ON.
12+
* Expose cost_weight/nce_layer in `trainer_config_helpers`
13+
* Add FAQ, concepts, h-rnn docs.
14+
* Add Bidi-LSTM and DB-LSTM to quick start demo @alvations
15+
* Add usage track scripts.
16+
17+
## Improvements
18+
19+
* Add Travis-CI for Mac OS X. Enable swig unittest in Travis-CI. Skip Travis-CI when only docs are changed.
20+
* Add code coverage tools.
21+
* Refine convolution layer to speedup and reduce GPU memory.
22+
* Speed up PyDataProvider2
23+
* Add ubuntu deb package build scripts.
24+
* Make Paddle use git-flow branching model.
25+
* PServer support no parameter blocks.
26+
27+
## Bug Fixes
28+
29+
* add zlib link to py_paddle
30+
* add input sparse data check for sparse layer at runtime
31+
* Bug fix for sparse matrix multiplication
32+
* Fix floating-point overflow problem of tanh
33+
* Fix some nvcc compile options
34+
* Fix a bug in yield dictionary in DataProvider
35+
* Fix SRL hang when exit.
36+
37+
# Release v0.8.0beta.1
38+
New features:
39+
40+
* Mac OSX is supported by source code. #138
41+
* Both GPU and CPU versions of PaddlePaddle are supported.
42+
43+
* Support CUDA 8.0
44+
45+
* Enhance `PyDataProvider2`
46+
* Add dictionary yield format. `PyDataProvider2` can yield a dictionary with key is data_layer's name, value is features.
47+
* Add `min_pool_size` to control memory pool in provider.
48+
49+
* Add `deb` install package & docker image for no_avx machines.
50+
* Especially for cloud computing and virtual machines
51+
52+
* Automatically disable `avx` instructions in cmake when machine's CPU don't support `avx` instructions.
53+
54+
* Add Parallel NN api in trainer_config_helpers.
55+
56+
* Add `travis ci` for Github
57+
58+
Bug fixes:
59+
60+
* Several bugs in trainer_config_helpers. Also complete the unittest for trainer_config_helpers
61+
* Check if PaddlePaddle is installed when unittest.
62+
* Fix bugs in GTX series GPU
63+
* Fix bug in MultinomialSampler
64+
65+
Also more documentation was written since last release.
66+
67+
# Release v0.8.0beta.0
68+
69+
PaddlePaddle v0.8.0beta.0 release. The install package is not stable yet and it's a pre-release version.

benchmark/.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
paddle/image/logs
2+
paddle/image/*.pyc
3+
paddle/image/train.list
4+
paddle/rnn/logs
5+
paddle/rnn/*.pyc
6+
paddle/rnn/imdb.pkl
7+
caffe/image/logs
8+
tensorflow/image/logs
9+
tensorflow/rnn/logs

benchmark/README.md

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
# Benchmark
2+
3+
Machine:
4+
5+
- CPU: 12-core Intel(R) Xeon(R) CPU E5-2620 v2 @2.10GHz
6+
- GPU: Tesla K40m
7+
- cuDNN: v5.1
8+
- system: Docker 1.12.1, all platforms are tested in docker environment.
9+
10+
Platforms:
11+
12+
- PaddlePaddle: paddledev/paddle:gpu-devel-v0.9.0a0
13+
- Tensorflow: gcr.io/tensorflow/tensorflow:0.11.0rc0-gpu
14+
- Caffe: kaixhin/cuda-caffe
15+
16+
Several convolutional neural networks and recurrent neural networks are used to test.
17+
18+
## Image
19+
20+
### Benchmark Model
21+
22+
AlexNet, GoogleNet and a small network used in Caffe.
23+
24+
- [AlexNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet): but the group size is one.
25+
26+
- [GoogleNet](https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet): but remove loss1 and loss2 when testing benchmark.
27+
28+
- [SmallNet](https://github.com/BVLC/caffe/blob/master/examples/cifar10/cifar10_quick_train_test.prototxt)
29+
30+
31+
### Single-GPU
32+
33+
- AlexNet: input - 3 * 227 * 227, Time: ms/batch
34+
35+
| BatchSize | 64 | 128 | 256 | 512 |
36+
|--------------|-----| -----| ------| -----|
37+
| PaddlePaddle | 195 | 334 | 602 | 1629 |
38+
| TensorFlow | 223 | 364 | 645 | 1235 |
39+
| Caffe | 324 | 627 | 1232 | 2513 |
40+
41+
**Notation**
42+
43+
All platforms use cuDNN-v5.1. We see that Caffe is slower in this experiment because its workspace limit for the cuDNN convolution interface is 8 * 1024 * 1024 bytes, which is smaller than the limits used by PaddlePaddle and TensorFlow. Note that Caffe would be faster if its workspace limit were increased.
44+
45+
- GoogleNet: input - 3 * 224 * 224, Time: ms/batch
46+
47+
48+
| BatchSize | 64 | 128 | 256 |
49+
|--------------|-------| -------| --------|
50+
| PaddlePaddle | 613 | 1149 | 2348 |
51+
| TensorFlow | 644 | 1176 | 2219 |
52+
| Caffe | 694 | 1364 | out of memory |
53+
54+
- SmallNet: input - 3 * 32 * 32, Time ms/batch
55+
56+
| BatchSize | 64 | 128 | 256 | 512 |
57+
|--------------|--------| -------- | --------|---------|
58+
| PaddlePaddle | 10.463 | 18.184 | 33.113 | 63.039 |
59+
| TensorFlow | 9 | 15 | 28 | 59 |
60+
| Caffe | 9.373 | 16.6606 | 31.4797 | 59.719 |
61+
62+
**Notation**
63+
64+
All the single-GPU experiments in Caffe use `caffe time` to calculate elapsed time, which does not include parameter updating time. However, both the PaddlePaddle and TensorFlow experiments include the parameter updating time. Since this part is relatively small compared with the total time on a single machine, we can ignore it.
65+
66+
In Tensorflow, they implement algorithm searching method instead of using the algorithm searching interface in cuDNN.
67+
68+
### Multi-GPU: 4 GPUs
69+
70+
- AlexNet, ms / batch
71+
72+
| total-BatchSize | 128 * 4 | 256 * 4 |
73+
|------------------|----------| -----------|
74+
| PaddlePaddle | 347 | 622 |
75+
| TensorFlow | 377 | 675 |
76+
| Caffe | 1229 | 2435 |
77+
78+
For example, if `total-BatchSize = 128 * 4`, the speedup ratio is calculated by
79+
80+
```
81+
time_at_1gpu_batch_128 * 4 / time_at_4gpu_total_batch_512
82+
= (334 * 4)/347
83+
= 3.85
84+
```
85+
86+
<img src="figs/alexnet-4gpu.png" width="420">
87+
88+
89+
- GoogleNet, ms / batch
90+
91+
| total-BatchSize | 128 * 4 | 256 * 4 |
92+
|-------------------|--------------| ----------- |
93+
| PaddlePaddle | 1178 | 2367 |
94+
| TensorFlow | 1210 | 2292 |
95+
| Caffe | 2007 | out of memory |
96+
97+
<img src="figs/googlenet-4gpu.png" width="420">
98+
99+
100+
## RNN
101+
We use an LSTM network for text classification to run the benchmark.
102+
103+
### Dataset
104+
- [IMDB](http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl)
105+
- Sequence length is 100. In fact, PaddlePaddle supports training with variable-length sequence, but TensorFlow needs to pad. Thus, we also pad sequence length to 100 in PaddlePaddle in order to compare.
106+
- Dictionary size=30000
107+
- Peephole connection is used in `lstmemory` by default in PaddlePaddle. It is also configured in TensorFlow.
108+
109+
### Single-GPU
110+
111+
#### LSTM in Text Classification
112+
113+
Testing `2 lstm layer + fc` network with different hidden size and batch size.
114+
115+
- Batch size = 64, ms / batch
116+
117+
| hidden_size | 256 | 512 | 1280 |
118+
|--------------|-------| -------| --------|
119+
| PaddlePaddle | 83 | 184 | 641 |
120+
| TensorFlow | 175 | 280 | 818 |
121+
122+
- Batch size = 128, ms / batch
123+
124+
| hidden_size | 256 | 512 | 1280 |
125+
|--------------|------- | -------| --------|
126+
| PaddlePaddle | 110 | 261 | 1007 |
127+
| TensorFlow | 181 | 361 | 1237 |
128+
129+
130+
- Batch size = 256, ms / batch
131+
132+
| hidden_size | 256 | 512 | 1280 |
133+
|--------------|-------| -------| --------|
134+
| PaddlePaddle | 170 | 414 | 1655 |
135+
| TensorFlow | 238 | 536 | 1905 |
136+
137+
<img src="figs/rnn_lstm_cls.png" width="600">
138+
139+
#### Seq2Seq
140+
141+
The benchmark of sequence-to-sequence network will be added later.
142+
143+
144+
### Multi GPU: 4 GPUs
145+
146+
#### LSTM in Text Classification
147+
148+
- hidden_size = 256, ms / batch
149+
150+
| batch_size | 256 | 512 |
151+
|--------------| -------| --------|
152+
| PaddlePaddle | 90 | 118 |
153+
| TensorFlow | 226 | 118 |
154+
155+
156+
- hidden_size = 512, ms / batch
157+
158+
| batch_size | 256 | 512 |
159+
|--------------| -------| --------|
160+
| PaddlePaddle | 189 | 268 |
161+
| TensorFlow | 297 | 383 |
162+
163+
164+
<img src="figs/rnn_lstm_4gpus.png" width="420">
165+
166+
#### Seq2Seq
167+
168+
The benchmark of sequence-to-sequence network will be added later.

0 commit comments

Comments
 (0)