PaddlePaddle
diff --git a/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 5 additions & 5 deletions b/‎CMakeLists.txt‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎cmake/util.cmake‎
Lines changed: 5 additions & 0 deletions b/‎cmake/util.cmake‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎paddle/cuda/CMakeLists.txt‎
Lines changed: 20 additions & 11 deletions b/‎paddle/cuda/CMakeLists.txt‎
Lines changed: 20 additions & 11 deletions
diff --git a/‎paddle/cuda/include/hl_dso_loader.h‎
Lines changed: 8 additions & 4 deletions b/‎paddle/cuda/include/hl_dso_loader.h‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎paddle/cuda/include/hl_gpu.h‎
Lines changed: 1 addition & 0 deletions b/‎paddle/cuda/include/hl_gpu.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎paddle/cuda/include/hl_sequence.h‎
Lines changed: 33 additions & 0 deletions b/‎paddle/cuda/include/hl_sequence.h‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎paddle/cuda/include/hl_warpctc_wrap.h‎
Lines changed: 93 additions & 0 deletions b/‎paddle/cuda/include/hl_warpctc_wrap.h‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎paddle/cuda/include/stub/hl_sequence_stub.h‎
Lines changed: 9 additions & 0 deletions b/‎paddle/cuda/include/stub/hl_sequence_stub.h‎
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,3 @@
+[submodule "warp-ctc"]
+	path = warp-ctc
+	url = https://github.com/baidu-research/warp-ctc.git
@@ -2,6 +2,7 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
+        files: (?!.*warp-ctc)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
@@ -13,6 +14,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
+        files: (?!.*warp-ctc)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
 
@@ -77,10 +77,10 @@ find_package(Git REQUIRED)
 include(version)
 add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
 
-
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
+
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
     if(${CUDA_VERSION_MAJOR} GREATER 6)
@@ -102,15 +102,15 @@ else()
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)
 
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
 if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
     set(ACCURACY double)
 
@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME)
             target_link_libraries(${TARGET_NAME} rt)
         endif()
     endif()
+
+    if(NOT WITH_DSO)
+        target_link_libraries(${TARGET_NAME}
+            ${WARPCTC_LIBRARY})
+    endif()
 endfunction()
 
 # link_paddle_test
 
@@ -15,20 +15,28 @@ else()
 endif()
 
 set(CUDA_CXX_WITH_GPU_SOURCES
+    src/hl_cudart_wrap.cc
     src/hl_cuda_cublas.cc
     src/hl_cuda_cudnn.cc
     src/hl_cuda_device.cc)
 
-set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES}
-                            PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+if(WITH_GPU)
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc
+        ${CUDA_CXX_WITH_GPU_SOURCES})
+
+    set_source_files_properties(${CUDA_CXX_SOURCES}
+                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+else()
+    set(CUDA_CXX_SOURCES
+        src/hl_dso_loader.cc
+        src/hl_warpctc_wrap.cc)
+endif()
 
 set_source_files_properties(${AVX_SOURCES}
                             PROPERTIES COMPILE_FLAGS "-mavx")
 
-set(CUDA_DSO_SOURCES
-    src/hl_dso_loader.cc
-    src/hl_cudart_wrap.cc)
-
 set(CUDA_CU_SOURCES
     src/hl_perturbation_util.cu
     src/hl_cuda_aggregate.cu
@@ -44,6 +52,7 @@ set(CUDA_CU_SOURCES
 set(CUDA_HEADERS
     include/hl_time.h
     include/hl_dso_loader.h
+    include/hl_warpctc_wrap.h
     include/hl_sequence.h
     include/hl_cuda_cublas.h
     include/hl_batch_transpose.h
@@ -75,14 +84,14 @@ if(WITH_GPU)
     cuda_add_library(paddle_cuda
         ${CUDA_SOURCES}
         ${CUDA_CU_SOURCES}
-        ${CUDA_DSO_SOURCES}
-        ${CUDA_CXX_WITH_GPU_SOURCES})
+        ${CUDA_CXX_SOURCES})
 else()
-    add_library(paddle_cuda ${CUDA_SOURCES})
+    add_library(paddle_cuda
+                ${CUDA_SOURCES}
+                ${CUDA_CXX_SOURCES})
 endif()
 
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
-                       ${CUDA_DSO_SOURCES}
-                       ${CUDA_CXX_WITH_GPU_SOURCES})
+                       ${CUDA_CXX_SOURCES})
@@ -18,10 +18,6 @@ limitations under the License. */
 #include <dlfcn.h>
 #include <string>
 #include <memory>
-#include <cuda_runtime.h>
-#include <cublas_v2.h>
-#include <curand.h>
-#include <cudnn.h>
 #include "hl_base.h"
 
 /**
@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle);
  */
 void GetCurandDsoHandle(void** dso_handle);
 
+/**
+ * @brief    load the DSO of warp-ctc
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetWarpCTCDsoHandle(void** dso_handle);
+
 #endif  // HL_DSO_LOADER_H_
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "hl_sparse.h"
 #include "hl_lstm.h"
 #include "hl_sequence.h"
+#include "hl_warpctc_wrap.h"
 
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_cuda_stub.h"
 
@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch,
                                   int batchCount,
                                   bool seq2batch);
 
+/**
+ * @brief   Memory copy from sequence to batch,
+ *          while padding all sequences to the same length.
+ *
+ * if seq2batch == true
+ *
+ *    copy from sequence to batch:
+ *        batch[i] = sequence[sequenceStartPositions[i]]
+ *
+ * if seq2batch == false
+ *
+ *    copy from batch to sequence:
+ *        sequence[sequenceStartPositions[i]] = batch[i]
+ *
+ * @param[in,out]   batch                   batch matrix.
+ * @param[in,out]   sequence                sequence matrix.
+ * @param[in]       sequenceStartPositions  index vector.
+ * @param[in]       sequenceWidth           width of sequence.
+ * @param[in]       maxSequenceLength       maximum length of sequences.
+ * @param[in]       numSequences            number of sequences.
+ * @param[in]       normByTimes             if true, divide by sequence length.
+ * @param[in]       seq2batch               copy direction.
+ *
+ */
+extern void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch);
+
 /**
  * @brief  dst = Op(src), src is sequence.
  *
 
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_WARPCTC_WRAP_H_
+#define HL_WARPCTC_WRAP_H_
+
+#include "hl_base.h"
+#include "warp-ctc/include/ctc.h"
+
+typedef ctcStatus_t hl_warpctc_status_t;
+typedef ctcOptions hl_warpctc_options_t;
+
+/**
+ * @brief Init ctc options.
+ *
+ * @param[in]   blank     blank label used in ctc loss function.
+ * @param[in]   useGpu    whether use gpu.
+ * @param[out]  options   handle to store cpu or gpu information.
+ *
+ */
+extern void hl_warpctc_init(const size_t blank,
+                            bool useGpu,
+                            hl_warpctc_options_t* options);
+
+/**
+ * @brief Compute the connectionist temporal classification loss,
+ *        and optionally compute the gradient with respect to the inputs.
+ *
+ * if batchGrad == nullptr
+ *
+ *    only compute the ctc loss.
+ *
+ * if batchGrad != nullptr
+ *
+ *    compute both ctc loss and gradient.
+ *
+ * @param[in]   batchInput      batch matrix of input probabilities,
+ *                              in maxSequenceLength x numSequences x numClasses
+ *                              (row-major) format.
+ * @param[out]  batchGrad       batch matrix of gradient.
+ * @param[in]   cpuLabels       labels always in CPU memory.
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequences.
+ * @param[out]  cpuCosts        cost of each sequence in CPU memory.
+ * @param[out]  workspace       workspace to store some temporary results.
+ * @param[in]   options         handle to store cpu or gpu information.
+ *
+ */
+extern void hl_warpctc_compute_loss(const real* batchInput,
+                                    real* batchGrad,
+                                    const int* cpuLabels,
+                                    const int* cpuLabelLengths,
+                                    const int* cpuInputLengths,
+                                    const size_t numClasses,
+                                    const size_t numSequences,
+                                    real* cpuCosts,
+                                    void* workspace,
+                                    hl_warpctc_options_t* options);
+
+/**
+ * @brief Compute the required workspace size.
+ *        warp-ctc performs no memory allocation internally.
+ *
+ * @param[in]   cpuLabelLengths length of all labels in CPU memory.
+ * @param[in]   cpuInputLengths length of all sequences in CPU memory.
+ * @param[in]   numClasses      number of possible output symbols.
+ * @param[in]   numSequences    number of sequences.
+ * @param[in]   options         handle to store cpu or gpu information.
+ * @param[out]  bytes           pointer to a scalar where the memory
+ *                              requirement in bytes will be placed.
+ *
+ */
+extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                          const int* cpuInputLengths,
+                                          const size_t numClasses,
+                                          const size_t numSequences,
+                                          hl_warpctc_options_t* options,
+                                          size_t* bytes);
+
+#endif  // HL_WARPCTC_WRAP_H_
@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch,
                                   int batchCount,
                                   bool seq2batch) {}
 
+inline void hl_sequence2batch_copy_padding(real* batch,
+                                           real* sequence,
+                                           const int* sequenceStartPositions,
+                                           const size_t sequenceWidth,
+                                           const size_t maxSequenceLength,
+                                           const size_t numSequences,
+                                           bool normByTimes,
+                                           bool seq2batch) {}
+
 inline void hl_sequence_avg_forward(real* dst,
                                     real* src,
                                     const int* starts,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+[submodule "warp-ctc"]`
	`2`	`+ path = warp-ctc`
	`3`	`+ url = https://github.com/baidu-research/warp-ctc.git`