diff --git a/.gitignore b/.gitignore index 3b9fdd98d..b3f0e3b62 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ examples/protonect/include/libfreenect2/config.h # generated resource file -examples/protonect/src/resources.inc +examples/protonect/src/resources.inc.h examples/protonect/build # Dependency folders diff --git a/examples/protonect/CMakeLists.txt b/examples/protonect/CMakeLists.txt index d90277e06..90286a03e 100644 --- a/examples/protonect/CMakeLists.txt +++ b/examples/protonect/CMakeLists.txt @@ -8,6 +8,7 @@ SET(DEPENDS_DIR "${MY_DIR}/../../depends" CACHE STRING "dependency directory") OPTION(ENABLE_CXX11 "Enable C++11 support" OFF) OPTION(ENABLE_OPENCL "Enable OpenCL support" ON) +OPTION(ENABLE_CUDA "Enable CUDA support" ON) OPTION(ENABLE_OPENGL "Enable OpenGL support" ON) IF(ENABLE_CXX11) @@ -40,7 +41,7 @@ SET(LIBRARY_OUTPUT_PATH ${MY_DIR}/lib) # dependencies FIND_PACKAGE(PkgConfig) # try find PKGConfig as it will be used if found FIND_PACKAGE(LibUSB REQUIRED) -FIND_PACKAGE(OpenCV REQUIRED) +FIND_PACKAGE(OpenCV REQUIRED core highgui) FIND_PACKAGE(TurboJPEG REQUIRED) #does not provide a package-config file # Add includes @@ -54,7 +55,7 @@ INCLUDE_DIRECTORIES( LINK_DIRECTORIES(${LibUSB_LIBRARY_DIRS}) -SET(RESOURCES_INC_FILE "${MY_DIR}/src/resources.inc") +SET(RESOURCES_INC_FILE "${MY_DIR}/src/resources.inc.h") SET(SOURCES include/libfreenect2/protocol/command.h @@ -165,30 +166,96 @@ IF(ENABLE_OPENCL) ENDIF(OPENCL_FOUND) ENDIF(ENABLE_OPENCL) +IF(ENABLE_CUDA) + FIND_PACKAGE(CUDA) + IF(CUDA_FOUND) + SET(LIBFREENECT2_WITH_CUDA_SUPPORT 1) + + CUDA_INCLUDE_DIRECTORIES( + "${MY_DIR}/include/" + "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc" + "$ENV{NVCUDASAMPLES_ROOT}/common/inc" + ) + SET(CUDA_FLAGS -use_fast_math) + IF(NOT MSVC) + SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC") + ENDIF() + CUDA_COMPILE(CUDA_SHARED_OBJECTS + src/cuda_depth_packet_processor.cu SHARED + OPTIONS ${CUDA_FLAGS} + ) + LIST(APPEND OTHER_SHARED_OBJECTS ${CUDA_SHARED_OBJECTS}) + CUDA_COMPILE(CUDA_STATIC_OBJECTS + src/cuda_depth_packet_processor.cu STATIC + OPTIONS ${CUDA_FLAGS} + ) + LIST(APPEND OTHER_STATIC_OBJECTS ${CUDA_STATIC_OBJECTS}) + + INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS}) + + LIST(APPEND SOURCES + src/cuda_depth_packet_processor.cpp + ${CUDA_OBJECTS} + ) + + LIST(APPEND LIBRARIES + ${CUDA_LIBRARIES} + ) + ENDIF(CUDA_FOUND) +ENDIF(ENABLE_CUDA) + SET(CMAKE_INSTALL_RPATH ${LibUSB_LIBDIR}) CONFIGURE_FILE("${MY_DIR}/include/libfreenect2/config.h.in" "${MY_DIR}/include/libfreenect2/config.h" @ONLY) GENERATE_RESOURCES(${RESOURCES_INC_FILE} ${MY_DIR} ${RESOURCES}) ADD_DEFINITIONS(-DRESOURCES_INC) -ADD_LIBRARY(freenect2 SHARED ${SOURCES}) +ADD_LIBRARY(freenect2 OBJECT ${SOURCES}) +set_target_properties(freenect2 PROPERTIES POSITION_INDEPENDENT_CODE 1) +ADD_LIBRARY(freenect2shared SHARED $ ${OTHER_SHARED_OBJECTS}) +ADD_LIBRARY(freenect2static STATIC $ ${OTHER_STATIC_OBJECTS}) +set_target_properties(freenect2shared PROPERTIES OUTPUT_NAME freenect2) +set_target_properties(freenect2static PROPERTIES OUTPUT_NAME freenect2) MESSAGE("Linking with these libraries: ${LIBRARIES}") -TARGET_LINK_LIBRARIES(freenect2 ${LIBRARIES}) +TARGET_LINK_LIBRARIES(freenect2shared ${LIBRARIES}) ADD_EXECUTABLE(Protonect Protonect.cpp ) TARGET_LINK_LIBRARIES(Protonect - freenect2 + freenect2shared ) CONFIGURE_FILE(freenect2.cmake.in "${PROJECT_BINARY_DIR}/freenect2Config.cmake" @ONLY) -INSTALL(TARGETS freenect2 DESTINATION lib) +INSTALL(TARGETS Protonect DESTINATION bin) +INSTALL(TARGETS freenect2shared DESTINATION lib) +INSTALL(TARGETS freenect2static DESTINATION lib) INSTALL(DIRECTORY "${MY_DIR}/include/" DESTINATION include PATTERN "*.in" EXCLUDE) IF(LIBFREENECT2_THREADING_TINYTHREAD) INSTALL(FILES "${MY_DIR}/src/tinythread/tinythread.h" DESTINATION include/${PROJECT_NAME}/tinythread/) ENDIF(LIBFREENECT2_THREADING_TINYTHREAD) INSTALL(FILES "${PROJECT_BINARY_DIR}/freenect2Config.cmake" DESTINATION lib/cmake/freenect2/) +IF(WIN32) + INSTALL(TARGETS freenect2shared RUNTIME DESTINATION bin) + + FOREACH(LIB ${OpenCV_LIBS}) + INSTALL(FILES $ DESTINATION bin) + ENDFOREACH() + + get_filename_component(LibUSB_LIBDIR ${LibUSB_LIBRARIES} DIRECTORY) + FIND_FILE(LibUSB_DLL libusb-1.0.dll PATHS ${LibUSB_LIBDIR} NO_DEFAULT_PATH) + INSTALL(FILES ${LibUSB_DLL} DESTINATION bin) + + get_filename_component(TurboJPEG_LIBDIR ${TurboJPEG_LIBRARIES} DIRECTORY) + FIND_FILE(TurboJPEG_DLL turbojpeg.dll PATHS "${TurboJPEG_LIBDIR}/../bin" NO_DEFAULT_PATH) + INSTALL(FILES ${TurboJPEG_DLL} DESTINATION bin) + + IF(GLFW3_FOUND) + get_filename_component(GLFW3_LIBDIR ${GLFW3_LIBRARIES} DIRECTORY) + FIND_FILE(GLFW3_DLL glfw3.dll PATHS ${GLFW3_LIBDIR} NO_DEFAULT_PATH) + INSTALL(FILES ${GLFW3_DLL} DESTINATION bin) + ENDIF() +ENDIF() diff --git a/examples/protonect/Protonect.cpp b/examples/protonect/Protonect.cpp index 138c189de..3e6bed6f8 100644 --- a/examples/protonect/Protonect.cpp +++ b/examples/protonect/Protonect.cpp @@ -92,6 +92,15 @@ int main(int argc, char *argv[]) pipeline = new libfreenect2::OpenCLPacketPipeline(); #else std::cout << "OpenCL pipeline is not supported!" << std::endl; +#endif + } + else if(arg == "cuda") + { +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT + if(!pipeline) + pipeline = new libfreenect2::CudaPacketPipeline(); +#else + std::cout << "CUDA pipeline is not supported!" << std::endl; #endif } else if(arg.find_first_not_of("0123456789") == std::string::npos) //check if parameter could be a serial number diff --git a/examples/protonect/include/libfreenect2/async_packet_processor.h b/examples/protonect/include/libfreenect2/async_packet_processor.h index 807dbead7..2293406ea 100644 --- a/examples/protonect/include/libfreenect2/async_packet_processor.h +++ b/examples/protonect/include/libfreenect2/async_packet_processor.h @@ -76,6 +76,12 @@ class AsyncPacketProcessor : public PacketProcessor } packet_condition_.notify_one(); } + + virtual unsigned char *getPacketBuffer(size_t size) + { + return processor_->getPacketBuffer(size); + } + private: PacketProcessorPtr processor_; bool current_packet_available_; diff --git a/examples/protonect/include/libfreenect2/config.h.in b/examples/protonect/include/libfreenect2/config.h.in index f3abf7d43..abb2da43b 100644 --- a/examples/protonect/include/libfreenect2/config.h.in +++ b/examples/protonect/include/libfreenect2/config.h.in @@ -43,6 +43,8 @@ #cmakedefine LIBFREENECT2_WITH_OPENCL_SUPPORT +#cmakedefine LIBFREENECT2_WITH_CUDA_SUPPORT + #cmakedefine LIBFREENECT2_THREADING_STDLIB #cmakedefine LIBFREENECT2_THREADING_TINYTHREAD diff --git a/examples/protonect/include/libfreenect2/depth_packet_processor.h b/examples/protonect/include/libfreenect2/depth_packet_processor.h index a6a83d1d7..15ea9d74d 100644 --- a/examples/protonect/include/libfreenect2/depth_packet_processor.h +++ b/examples/protonect/include/libfreenect2/depth_packet_processor.h @@ -196,5 +196,58 @@ class LIBFREENECT2_API OpenCLDepthPacketProcessor : public DepthPacketProcessor OpenCLDepthPacketProcessorImpl *impl_; }; #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT + +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT +#ifdef _MSC_VER +struct __declspec(align(16)) Float4 +#else +struct __attribute__((aligned(16))) Float4 +#endif +{ + float x, y, z, w; +}; + +class CudaDepthPacketProcessorImpl; + +class LIBFREENECT2_API CudaDepthPacketProcessor : public DepthPacketProcessor +{ +public: + CudaDepthPacketProcessor(const int deviceId = -1); + virtual ~CudaDepthPacketProcessor(); + virtual unsigned char *getPacketBuffer(size_t size); + virtual void setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config); + + virtual void loadP0TablesFromCommandResponse(unsigned char* buffer, size_t buffer_length); + + /** + * GUESS: the x and z table follow some polynomial, until we know the exact polynom formula and its coefficients + * just load them from a memory dump - although they probably vary per camera + */ + void loadXTableFromFile(const char* filename); + + void loadZTableFromFile(const char* filename); + + void load11To16LutFromFile(const char* filename); + + virtual void process(const DepthPacket &packet); +private: + CudaDepthPacketProcessorImpl *impl_; +}; + +class CudaDepthPacketProcessorKernelImpl; + +class CudaDepthPacketProcessorKernel +{ +public: + CudaDepthPacketProcessorKernel(); + virtual ~CudaDepthPacketProcessorKernel(); + void initDevice(const int deviceId, size_t image_size_, size_t block); + void generateOptions(const DepthPacketProcessor::Parameters ¶ms, const DepthPacketProcessor::Config &config); + void loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table); + void run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config); +private: + CudaDepthPacketProcessorKernelImpl *impl_; +}; +#endif // LIBFREENECT2_WITH_CUDA_SUPPORT } /* namespace libfreenect2 */ #endif /* DEPTH_PACKET_PROCESSOR_H_ */ diff --git a/examples/protonect/include/libfreenect2/double_buffer.h b/examples/protonect/include/libfreenect2/double_buffer.h index 49eb1f155..452cadb25 100644 --- a/examples/protonect/include/libfreenect2/double_buffer.h +++ b/examples/protonect/include/libfreenect2/double_buffer.h @@ -49,6 +49,8 @@ class LIBFREENECT2_API DoubleBuffer void allocate(size_t buffer_size); + void setbuffer(unsigned char *buf, size_t size); + void swap(); Buffer& front(); @@ -59,6 +61,7 @@ class LIBFREENECT2_API DoubleBuffer unsigned char front_buffer_index_; unsigned char* buffer_data_; + bool external_buffer_; }; } /* namespace libfreenect2 */ diff --git a/examples/protonect/include/libfreenect2/frame_listener.hpp b/examples/protonect/include/libfreenect2/frame_listener.hpp index 1ef02354e..091e5781a 100644 --- a/examples/protonect/include/libfreenect2/frame_listener.hpp +++ b/examples/protonect/include/libfreenect2/frame_listener.hpp @@ -48,15 +48,18 @@ struct LIBFREENECT2_API Frame size_t width, height, bytes_per_pixel; unsigned char* data; - Frame(size_t width, size_t height, size_t bytes_per_pixel) : + Frame(size_t width, size_t height, size_t bytes_per_pixel, bool alloc = true) : width(width), height(height), - bytes_per_pixel(bytes_per_pixel) + bytes_per_pixel(bytes_per_pixel), + data(NULL) { + if (!alloc) + return; data = new unsigned char[width * height * bytes_per_pixel]; } - ~Frame() + virtual ~Frame() { delete[] data; } diff --git a/examples/protonect/include/libfreenect2/packet_pipeline.h b/examples/protonect/include/libfreenect2/packet_pipeline.h index b1bf467c2..15ee92305 100644 --- a/examples/protonect/include/libfreenect2/packet_pipeline.h +++ b/examples/protonect/include/libfreenect2/packet_pipeline.h @@ -107,6 +107,17 @@ class LIBFREENECT2_API OpenCLPacketPipeline : public BasePacketPipeline }; #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT +class LIBFREENECT2_API CudaPacketPipeline : public BasePacketPipeline +{ +protected: + const int deviceId; + virtual DepthPacketProcessor *createDepthPacketProcessor(); +public: + CudaPacketPipeline(const int deviceId = -1); + virtual ~CudaPacketPipeline(); +}; +#endif // LIBFREENECT2_WITH_CUDA_SUPPORT } /* namespace libfreenect2 */ #endif /* PACKET_PIPELINE_H_ */ diff --git a/examples/protonect/include/libfreenect2/packet_processor.h b/examples/protonect/include/libfreenect2/packet_processor.h index ea21b9d8a..bd70a8cfb 100644 --- a/examples/protonect/include/libfreenect2/packet_processor.h +++ b/examples/protonect/include/libfreenect2/packet_processor.h @@ -38,6 +38,7 @@ class PacketProcessor virtual bool ready() { return true; } virtual void process(const PacketT &packet) = 0; + virtual unsigned char *getPacketBuffer(size_t size) { return NULL; } }; template diff --git a/examples/protonect/include/libfreenect2/resource.h b/examples/protonect/include/libfreenect2/resource.h index a8dc38262..f6398c0dd 100644 --- a/examples/protonect/include/libfreenect2/resource.h +++ b/examples/protonect/include/libfreenect2/resource.h @@ -33,6 +33,7 @@ namespace libfreenect2 { bool loadResource(const std::string &name, unsigned char const**data, size_t *length); +bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n); } /* namespace libfreenect2 */ #endif /* RESOURCE_H_ */ diff --git a/examples/protonect/src/cuda_depth_packet_processor.cpp b/examples/protonect/src/cuda_depth_packet_processor.cpp new file mode 100644 index 000000000..89e8a8867 --- /dev/null +++ b/examples/protonect/src/cuda_depth_packet_processor.cpp @@ -0,0 +1,303 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#if defined(WIN32) +#define _USE_MATH_DEFINES +#include +#endif + +#define cudaSafeCall(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); } while(0) + +#define OUT_NAME(FUNCTION) "[CudaDepthPacketProcessor::" FUNCTION "] " +namespace libfreenect2 +{ + +struct PinnedFrame: Frame +{ + PinnedFrame(size_t width, size_t height, size_t bytes_per_pixel): + Frame(width, height, bytes_per_pixel, false) + { + cudaSafeCall(cudaHostAlloc(&data, width*height*bytes_per_pixel, cudaHostAllocPortable)); + } + + ~PinnedFrame() + { + cudaFreeHost(data); + data = NULL; + } +}; + +class CudaDepthPacketProcessorImpl +{ +public: + short lut11to16[2048]; + float x_table[512 * 424]; + float z_table[512 * 424]; + Float4 p0_table[512 * 424]; + libfreenect2::DepthPacketProcessor::Config config; + DepthPacketProcessor::Parameters params; + + CudaDepthPacketProcessorKernel kernel; + + double timing_acc; + double timing_acc_n; + + double timing_current_start; + + Frame *ir_frame, *depth_frame; + + size_t image_size; + + size_t packet_buffer_size; + unsigned char *packet_buffer; + + bool deviceInitialized; + bool programInitialized; + + CudaDepthPacketProcessorImpl(const int deviceId = -1) : deviceInitialized(false), programInitialized(false) + { + timing_acc = 0.0; + timing_acc_n = 0.0; + timing_current_start = 0.0; + image_size = 512 * 424; + + deviceInitialized = initDevice(deviceId); + newIrFrame(); + newDepthFrame(); + packet_buffer_size = 0; + packet_buffer = NULL; + } + + bool initDevice(const int deviceId) + { + size_t block_size = 128; + try + { + kernel.initDevice(deviceId, image_size, block_size); + } + catch (const std::runtime_error &err) + { + std::cerr << OUT_NAME("initDevice") << err.what() << std::endl; + return false; + } + return true; + } + + bool initProgram() + { + if(!deviceInitialized) + { + return false; + } + + try + { + kernel.generateOptions(params, config); + kernel.loadTables(lut11to16, p0_table, x_table, z_table); + } + catch (const std::runtime_error &err) + { + std::cerr << OUT_NAME("initProgram") << err.what() << std::endl; + throw err; + } + programInitialized = true; + return true; + } + + void run(const DepthPacket &packet) + { + kernel.run(packet, ir_frame, depth_frame, config); + } + + void startTiming() + { + timing_current_start = cv::getTickCount(); + } + + void stopTiming() + { + timing_acc += (cv::getTickCount() - timing_current_start) / cv::getTickFrequency(); + timing_acc_n += 1.0; + + if(timing_acc_n >= 100.0) + { + double avg = (timing_acc / timing_acc_n); + std::cout << "[CudaDepthPacketProcessor] avg. time: " << (avg * 1000) << "ms -> ~" << (1.0 / avg) << "Hz" << std::endl; + timing_acc = 0.0; + timing_acc_n = 0.0; + } + } + + void newIrFrame() + { + ir_frame = new PinnedFrame(512, 424, 4); + } + + void newDepthFrame() + { + depth_frame = new PinnedFrame(512, 424, 4); + } + + void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) + { + for(int r = 0; r < 424; ++r) + { + Float4 *it = &p0_table[r * 512]; + const uint16_t *it0 = &p0table->p0table0[r * 512]; + const uint16_t *it1 = &p0table->p0table1[r * 512]; + const uint16_t *it2 = &p0table->p0table2[r * 512]; + for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2) + { + it->x = -((float) * it0) * 0.000031 * M_PI; + it->y = -((float) * it1) * 0.000031 * M_PI; + it->z = -((float) * it2) * 0.000031 * M_PI; + it->w = 0.0f; + } + } + } +}; + +CudaDepthPacketProcessor::CudaDepthPacketProcessor(const int deviceId) : + impl_(new CudaDepthPacketProcessorImpl(deviceId)) +{ +} + +CudaDepthPacketProcessor::~CudaDepthPacketProcessor() +{ + delete impl_; +} + +unsigned char *CudaDepthPacketProcessor::getPacketBuffer(size_t size) +{ + if (impl_->packet_buffer != NULL) + { + if (size == impl_->packet_buffer_size) + return impl_->packet_buffer; + cudaSafeCall(cudaFreeHost(impl_->packet_buffer)); + impl_->packet_buffer = NULL; + impl_->packet_buffer_size = 0; + } + + cudaSafeCall(cudaHostAlloc(&impl_->packet_buffer, size, cudaHostAllocWriteCombined | cudaHostAllocPortable)); + impl_->packet_buffer_size = size; + return impl_->packet_buffer; +} + +void CudaDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config) +{ + DepthPacketProcessor::setConfiguration(config); + impl_->config = config; + impl_->programInitialized = false; +} + +void CudaDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char *buffer, size_t buffer_length) +{ + libfreenect2::protocol::P0TablesResponse *p0table = (libfreenect2::protocol::P0TablesResponse *)buffer; + + if(buffer_length < sizeof(libfreenect2::protocol::P0TablesResponse)) + { + std::cerr << OUT_NAME("loadP0TablesFromCommandResponse") "P0Table response too short!" << std::endl; + return; + } + + impl_->fill_trig_table(p0table); +} + +void CudaDepthPacketProcessor::loadXTableFromFile(const char *filename) +{ + if(!loadBufferFromResources(filename, (unsigned char *)impl_->x_table, impl_->image_size * sizeof(float))) + { + std::cerr << OUT_NAME("loadXTableFromFile") "could not load x table from: " << filename << std::endl; + } +} + +void CudaDepthPacketProcessor::loadZTableFromFile(const char *filename) +{ + if(!loadBufferFromResources(filename, (unsigned char *)impl_->z_table, impl_->image_size * sizeof(float))) + { + std::cerr << OUT_NAME("loadZTableFromFile") "could not load z table from: " << filename << std::endl; + } +} + +void CudaDepthPacketProcessor::load11To16LutFromFile(const char *filename) +{ + if(!loadBufferFromResources(filename, (unsigned char *)impl_->lut11to16, 2048 * sizeof(short))) + { + std::cerr << OUT_NAME("load11To16LutFromFile") "could not load lut table from: " << filename << std::endl; + } +} + +void CudaDepthPacketProcessor::process(const DepthPacket &packet) +{ + bool has_listener = this->listener_ != 0; + + if(!impl_->programInitialized && !impl_->initProgram()) + { + std::cerr << OUT_NAME("process") "could not initialize CudaDepthPacketProcessor" << std::endl; + return; + } + + impl_->startTiming(); + + impl_->ir_frame->timestamp = packet.timestamp; + impl_->depth_frame->timestamp = packet.timestamp; + impl_->ir_frame->sequence = packet.sequence; + impl_->depth_frame->sequence = packet.sequence; + + impl_->run(packet); + + impl_->stopTiming(); + + if(has_listener) + { + if(this->listener_->onNewFrame(Frame::Ir, impl_->ir_frame)) + { + impl_->newIrFrame(); + } + + if(this->listener_->onNewFrame(Frame::Depth, impl_->depth_frame)) + { + impl_->newDepthFrame(); + } + } +} + +} /* namespace libfreenect2 */ + diff --git a/examples/protonect/src/cuda_depth_packet_processor.cu b/examples/protonect/src/cuda_depth_packet_processor.cu new file mode 100644 index 000000000..99b8cede7 --- /dev/null +++ b/examples/protonect/src/cuda_depth_packet_processor.cu @@ -0,0 +1,752 @@ +/* + * This file is part of the OpenKinect Project. http://www.openkinect.org + * + * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file + * for details. + * + * This code is licensed to you under the terms of the Apache License, version + * 2.0, or, at your option, the terms of the GNU General Public License, + * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses, + * or the following URLs: + * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.gnu.org/licenses/gpl-2.0.txt + * + * If you redistribute this file in source form, modified or unmodified, you + * may: + * 1) Leave this header intact and distribute it under the same terms, + * accompanying it with the APACHE20 and GPL20 files, or + * 2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or + * 3) Delete the GPL v2 clause and accompany it with the APACHE20 file + * In all cases you must keep the copyright notice intact and include a copy + * of the CONTRIB file. + * + * Binary distributions must follow the binary distribution requirements of + * either License. + */ + +#include +#include +#include +#include + +#include +#include +#define cudaSafeCall(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); } while(0) + +__constant__ static unsigned int BFI_BITMASK; +__constant__ static float AB_MULTIPLIER; +__constant__ static float AB_MULTIPLIER_PER_FRQ0; +__constant__ static float AB_MULTIPLIER_PER_FRQ1; +__constant__ static float AB_MULTIPLIER_PER_FRQ2; +__constant__ static float AB_OUTPUT_MULTIPLIER; +; +__constant__ static float PHASE_IN_RAD0; +__constant__ static float PHASE_IN_RAD1; +__constant__ static float PHASE_IN_RAD2; +; +__constant__ static float JOINT_BILATERAL_AB_THRESHOLD; +__constant__ static float JOINT_BILATERAL_MAX_EDGE; +__constant__ static float JOINT_BILATERAL_EXP; +__constant__ static float JOINT_BILATERAL_THRESHOLD; +; +__constant__ static float GAUSSIAN_KERNEL_0; +__constant__ static float GAUSSIAN_KERNEL_1; +__constant__ static float GAUSSIAN_KERNEL_2; +__constant__ static float GAUSSIAN_KERNEL_3; +__constant__ static float GAUSSIAN_KERNEL_4; +__constant__ static float GAUSSIAN_KERNEL_5; +__constant__ static float GAUSSIAN_KERNEL_6; +__constant__ static float GAUSSIAN_KERNEL_7; +__constant__ static float GAUSSIAN_KERNEL_8; +; +__constant__ static float PHASE_OFFSET; +__constant__ static float UNAMBIGIOUS_DIST; +__constant__ static float INDIVIDUAL_AB_THRESHOLD; +__constant__ static float AB_THRESHOLD; +__constant__ static float AB_CONFIDENCE_SLOPE; +__constant__ static float AB_CONFIDENCE_OFFSET; +__constant__ static float MIN_DEALIAS_CONFIDENCE; +__constant__ static float MAX_DEALIAS_CONFIDENCE; +; +__constant__ static float EDGE_AB_AVG_MIN_VALUE; +__constant__ static float EDGE_AB_STD_DEV_THRESHOLD; +__constant__ static float EDGE_CLOSE_DELTA_THRESHOLD; +__constant__ static float EDGE_FAR_DELTA_THRESHOLD; +__constant__ static float EDGE_MAX_DELTA_THRESHOLD; +__constant__ static float EDGE_AVG_DELTA_THRESHOLD; +__constant__ static float MAX_EDGE_COUNT; +; +__constant__ static float MIN_DEPTH; +__constant__ static float MAX_DEPTH; + +#define sqrt(x) sqrtf(x) +#define sincos(x, a, b) sincosf(x, a, b) +#define atan2(a, b) atan2f(a, b) +#define log(x) logf(x) +#define exp(x) expf(x) +#define max(x, y) fmaxf(x, y) +#define min(x, y) fminf(x, y) +#define M_PI_F CUDART_PI_F + +typedef unsigned char uchar; + +inline __device__ uint get_global_id(uint i) +{ + if (i == 0) + return blockIdx.x*blockDim.x + threadIdx.x; + // NOT IMPLEMENTED for i > 0 + return 0; +} + +static inline __device__ int3 isnan(float3 v) +{ + return make_int3(isnan(v.x) ? -1 : 0, isnan(v.y) ? -1 : 0, isnan(v.z) ? -1 : 0); +} +static inline __device__ float3 sqrtf(float3 v) +{ + return make_float3(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z)); +} +static inline __device__ void sincosf(float3 v, float3 *a, float3 *b) +{ + sincosf(v.x, &a->x, &b->x); + sincosf(v.y, &a->y, &b->y); + sincosf(v.z, &a->z, &b->z); +} +static inline __device__ float3 atan2f(float3 a, float3 b) +{ + return make_float3(atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z)); +} +static inline __device__ float3 expf(float3 v) +{ + return make_float3(expf(v.x), expf(v.y), expf(v.z)); +} +static inline __device__ float3 select(float3 a, float3 b, int3 c) +{ + return make_float3(c.x < 0 ? b.x : a.x, c.y < 0 ? b.y : a.y, c.z < 0 ? b.z : a.z); +} +static inline __device__ int3 isless(float3 a, float3 b) +{ + return make_int3(a.x < b.x ? -1 : 0, a.y < b.y ? -1 : 0, a.z < b.z ? -1 : 0); +} +static inline __device__ int3 isequal(float3 a, float3 b) +{ + return make_int3(a.x == b.x ? -1 : 0, a.y == b.y ? -1 : 0, a.z == b.z ? -1 : 0); +} +static inline __device__ int any(int3 v) +{ + return (v.x | v.y | v.z) < 0; +} +static inline __device__ int all(int3 v) +{ + return (v.x & v.y & v.z) < 0; +} + +/******************************************************************************* + * Process pixel stage 1 + ******************************************************************************/ + +static __device__ +float decodePixelMeasurement(const ushort* __restrict__ data, const short* __restrict__ lut11to16, const uint sub, const uint x, const uint y) +{ + uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352; + uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff; + + uint col_idx = idx >> 4; + uint upper_bytes = idx & 15; + uint lower_bytes = 16 - upper_bytes; + + uint data_idx0 = row_idx + col_idx; + uint data_idx1 = row_idx + col_idx + 1; + + return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047]; +} + +static __device__ +float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid) +{ + float3 p0vec = make_float3(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2); + float3 p0sin, p0cos; + sincos(p0vec, &p0sin, &p0cos); + + *invalid = *invalid && any(isequal(v, make_float3(32767.0f))); + + return make_float2(dot(v, p0cos), -dot(v, p0sin)) * ab_multiplier_per_frq; +} + +static __global__ +void processPixelStage1(const short* __restrict__ lut11to16, const float* __restrict__ z_table, const float4* __restrict__ p0_table, const ushort* __restrict__ data, + float4 *a_out, float4 *b_out, float4 *n_out, float *ir_out) +{ + const uint i = get_global_id(0); + + const uint x = i % 512; + const uint y = i / 512; + + const uint y_in = (423 - y); + + const float zmultiplier = z_table[i]; + int valid = (int)(0.0f < zmultiplier); + int saturatedX = valid; + int saturatedY = valid; + int saturatedZ = valid; + int3 invalid_pixel = make_int3((int)(!valid)); + const float3 p0 = make_float3(p0_table[i]); + + const float3 v0 = make_float3(decodePixelMeasurement(data, lut11to16, 0, x, y_in), + decodePixelMeasurement(data, lut11to16, 1, x, y_in), + decodePixelMeasurement(data, lut11to16, 2, x, y_in)); + const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX); + + const float3 v1 = make_float3(decodePixelMeasurement(data, lut11to16, 3, x, y_in), + decodePixelMeasurement(data, lut11to16, 4, x, y_in), + decodePixelMeasurement(data, lut11to16, 5, x, y_in)); + const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY); + + const float3 v2 = make_float3(decodePixelMeasurement(data, lut11to16, 6, x, y_in), + decodePixelMeasurement(data, lut11to16, 7, x, y_in), + decodePixelMeasurement(data, lut11to16, 8, x, y_in)); + const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ); + + float3 a = select(make_float3(ab0.x, ab1.x, ab2.x), make_float3(0.0f), invalid_pixel); + float3 b = select(make_float3(ab0.y, ab1.y, ab2.y), make_float3(0.0f), invalid_pixel); + float3 n = sqrt(a * a + b * b); + + int3 saturated = make_int3(saturatedX, saturatedY, saturatedZ); + a = select(a, make_float3(0.0f), saturated); + b = select(b, make_float3(0.0f), saturated); + + a_out[i] = make_float4(a); + b_out[i] = make_float4(b); + n_out[i] = make_float4(n); + ir_out[i] = min(dot(select(n, make_float3(65535.0f), saturated), make_float3(0.333333333f * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f); +} + +/******************************************************************************* + * Filter pixel stage 1 + ******************************************************************************/ +static __global__ +void filterPixelStage1(const float4* __restrict__ a, const float4* __restrict__ b, const float4* __restrict__ n, + float4 *a_out, float4 *b_out, uchar *max_edge_test) +{ + const uint i = get_global_id(0); + + const uint x = i % 512; + const uint y = i / 512; + + const float3 self_a = make_float3(a[i]); + const float3 self_b = make_float3(b[i]); + + const float gaussian[9] = {GAUSSIAN_KERNEL_0, GAUSSIAN_KERNEL_1, GAUSSIAN_KERNEL_2, GAUSSIAN_KERNEL_3, GAUSSIAN_KERNEL_4, GAUSSIAN_KERNEL_5, GAUSSIAN_KERNEL_6, GAUSSIAN_KERNEL_7, GAUSSIAN_KERNEL_8}; + + if(x < 1 || y < 1 || x > 510 || y > 422) + { + a_out[i] = make_float4(self_a); + b_out[i] = make_float4(self_b); + max_edge_test[i] = 1; + } + else + { + float3 threshold = make_float3(sqrt(JOINT_BILATERAL_THRESHOLD)); + float3 joint_bilateral_exp = make_float3(JOINT_BILATERAL_EXP); + + const float3 self_norm = make_float3(n[i]); + const float3 self_normalized_a = self_a / self_norm; + const float3 self_normalized_b = self_b / self_norm; + + float3 weight_acc = make_float3(0.0f); + float3 weighted_a_acc = make_float3(0.0f); + float3 weighted_b_acc = make_float3(0.0f); + float3 dist_acc = make_float3(0.0f); + + const int3 c0 = isless(self_norm, threshold); + + threshold = select(threshold, make_float3(0.0f), c0); + joint_bilateral_exp = select(joint_bilateral_exp, make_float3(0.0f), c0); + + for(int yi = -1, j = 0; yi < 2; ++yi) + { + uint i_other = (y + yi) * 512 + x - 1; + + for(int xi = -1; xi < 2; ++xi, ++j, ++i_other) + { + const float3 other_a = make_float3(a[i_other]); + const float3 other_b = make_float3(b[i_other]); + const float3 other_norm = make_float3(n[i_other]); + const float3 other_normalized_a = other_a / other_norm; + const float3 other_normalized_b = other_b / other_norm; + + const int3 c1 = isless(other_norm, threshold); + + const float3 dist = 0.5f * (1.0f - (self_normalized_a * other_normalized_a + self_normalized_b * other_normalized_b)); + const float3 weight = select(gaussian[j] * exp(-1.442695f * joint_bilateral_exp * dist), make_float3(0.0f), c1); + + weighted_a_acc += weight * other_a; + weighted_b_acc += weight * other_b; + weight_acc += weight; + dist_acc += select(dist, make_float3(0.0f), c1); + } + } + + const int3 c2 = isless(make_float3(0.0f), weight_acc); + a_out[i] = make_float4(select(make_float3(0.0f), weighted_a_acc / weight_acc, c2)); + b_out[i] = make_float4(select(make_float3(0.0f), weighted_b_acc / weight_acc, c2)); + + max_edge_test[i] = all(isless(dist_acc, make_float3(JOINT_BILATERAL_MAX_EDGE))); + } +} + +/******************************************************************************* + * Process pixel stage 2 + ******************************************************************************/ +static __global__ +void processPixelStage2(const float4* __restrict__ a_in, const float4* __restrict__ b_in, const float* __restrict__ x_table, const float* __restrict__ z_table, + float *depth, float *ir_sums) +{ + const uint i = get_global_id(0); + float3 a = make_float3(a_in[i]); + float3 b = make_float3(b_in[i]); + + float3 phase = atan2(b, a); + phase = select(phase, phase + 2.0f * M_PI_F, isless(phase, make_float3(0.0f))); + phase = select(phase, make_float3(0.0f), isnan(phase)); + float3 ir = sqrt(a * a + b * b) * AB_MULTIPLIER; + + float ir_sum = ir.x + ir.y + ir.z; + float ir_min = min(ir.x, min(ir.y, ir.z)); + float ir_max = max(ir.x, max(ir.y, ir.z)); + + float phase_final = 0; + + if(ir_min >= INDIVIDUAL_AB_THRESHOLD && ir_sum >= AB_THRESHOLD) + { + float3 t = phase / (2.0f * M_PI_F) * make_float3(3.0f, 15.0f, 2.0f); + + float t0 = t.x; + float t1 = t.y; + float t2 = t.z; + + float t5 = (floor((t1 - t0) * 0.333333f + 0.5f) * 3.0f + t0); + float t3 = (-t2 + t5); + float t4 = t3 * 2.0f; + + bool c1 = t4 >= -t4; // true if t4 positive + + float f1 = c1 ? 2.0f : -2.0f; + float f2 = c1 ? 0.5f : -0.5f; + t3 *= f2; + t3 = (t3 - floor(t3)) * f1; + + bool c2 = 0.5f < fabs(t3) && fabs(t3) < 1.5f; + + float t6 = c2 ? t5 + 15.0f : t5; + float t7 = c2 ? t1 + 15.0f : t1; + + float t8 = (floor((-t2 + t6) * 0.5f + 0.5f) * 2.0f + t2) * 0.5f; + + t6 *= 0.333333f; // = / 3 + t7 *= 0.066667f; // = / 15 + + float t9 = (t8 + t6 + t7); // transformed phase measurements (they are transformed and divided by the values the original values were multiplied with) + float t10 = t9 * 0.333333f; // some avg + + t6 *= 2.0f * M_PI_F; + t7 *= 2.0f * M_PI_F; + t8 *= 2.0f * M_PI_F; + + // some cross product + float t8_new = t7 * 0.826977f - t8 * 0.110264f; + float t6_new = t8 * 0.551318f - t6 * 0.826977f; + float t7_new = t6 * 0.110264f - t7 * 0.551318f; + + t8 = t8_new; + t6 = t6_new; + t7 = t7_new; + + float norm = t8 * t8 + t6 * t6 + t7 * t7; + float mask = t9 >= 0.0f ? 1.0f : 0.0f; + t10 *= mask; + + bool slope_positive = 0 < AB_CONFIDENCE_SLOPE; + + float ir_x = slope_positive ? ir_min : ir_max; + + ir_x = log(ir_x); + ir_x = (ir_x * AB_CONFIDENCE_SLOPE * 0.301030f + AB_CONFIDENCE_OFFSET) * 3.321928f; + ir_x = exp(ir_x); + ir_x = clamp(ir_x, MIN_DEALIAS_CONFIDENCE, MAX_DEALIAS_CONFIDENCE); + ir_x *= ir_x; + + float mask2 = ir_x >= norm ? 1.0f : 0.0f; + + float t11 = t10 * mask2; + + float mask3 = MAX_DEALIAS_CONFIDENCE * MAX_DEALIAS_CONFIDENCE >= norm ? 1.0f : 0.0f; + t10 *= mask3; + phase_final = true/*(modeMask & 2) != 0*/ ? t11 : t10; + } + + float zmultiplier = z_table[i]; + float xmultiplier = x_table[i]; + + phase_final = 0.0f < phase_final ? phase_final + PHASE_OFFSET : phase_final; + + float depth_linear = zmultiplier * phase_final; + float max_depth = phase_final * UNAMBIGIOUS_DIST * 2.0; + + bool cond1 = /*(modeMask & 32) != 0*/ true && 0.0f < depth_linear && 0.0f < max_depth; + + xmultiplier = (xmultiplier * 90.0) / (max_depth * max_depth * 8192.0); + + float depth_fit = depth_linear / (-depth_linear * xmultiplier + 1); + depth_fit = depth_fit < 0.0f ? 0.0f : depth_fit; + + float d = cond1 ? depth_fit : depth_linear; // r1.y -> later r2.z + depth[i] = d; + ir_sums[i] = ir_sum; +} + +/******************************************************************************* + * Filter pixel stage 2 + ******************************************************************************/ +static __global__ +void filterPixelStage2(const float* __restrict__ depth, const float* __restrict__ ir_sums, const uchar* __restrict__ max_edge_test, float *filtered) +{ + const uint i = get_global_id(0); + + const uint x = i % 512; + const uint y = i / 512; + + const float raw_depth = depth[i]; + const float ir_sum = ir_sums[i]; + const uchar edge_test = max_edge_test[i]; + + if(raw_depth >= MIN_DEPTH && raw_depth <= MAX_DEPTH) + { + if(x < 1 || y < 1 || x > 510 || y > 422) + { + filtered[i] = raw_depth; + } + else + { + float ir_sum_acc = ir_sum; + float squared_ir_sum_acc = ir_sum * ir_sum; + float min_depth = raw_depth; + float max_depth = raw_depth; + + for(int yi = -1; yi < 2; ++yi) + { + uint i_other = (y + yi) * 512 + x - 1; + + for(int xi = -1; xi < 2; ++xi, ++i_other) + { + if(i_other == i) + { + continue; + } + + const float raw_depth_other = depth[i_other]; + const float ir_sum_other = ir_sums[i_other]; + + ir_sum_acc += ir_sum_other; + squared_ir_sum_acc += ir_sum_other * ir_sum_other; + + if(0.0f < raw_depth_other) + { + min_depth = min(min_depth, raw_depth_other); + max_depth = max(max_depth, raw_depth_other); + } + } + } + + float tmp0 = sqrt(squared_ir_sum_acc * 9.0f - ir_sum_acc * ir_sum_acc) / 9.0f; + float edge_avg = max(ir_sum_acc / 9.0f, EDGE_AB_AVG_MIN_VALUE); + tmp0 /= edge_avg; + + float abs_min_diff = fabs(raw_depth - min_depth); + float abs_max_diff = fabs(raw_depth - max_depth); + + float avg_diff = (abs_min_diff + abs_max_diff) * 0.5f; + float max_abs_diff = max(abs_min_diff, abs_max_diff); + + bool cond0 = + 0.0f < raw_depth && + tmp0 >= EDGE_AB_STD_DEV_THRESHOLD && + EDGE_CLOSE_DELTA_THRESHOLD < abs_min_diff && + EDGE_FAR_DELTA_THRESHOLD < abs_max_diff && + EDGE_MAX_DELTA_THRESHOLD < max_abs_diff && + EDGE_AVG_DELTA_THRESHOLD < avg_diff; + + if(!cond0) + { + if(edge_test != 0) + { + //float tmp1 = 1500.0f > raw_depth ? 30.0f : 0.02f * raw_depth; + float edge_count = 0.0f; + + filtered[i] = edge_count > MAX_EDGE_COUNT ? 0.0f : raw_depth; + } + else + { + filtered[i] = 0.0f; + } + } + else + { + filtered[i] = 0.0f; + } + } + } + else + { + filtered[i] = 0.0f; + } +} + +#define OUT_NAME(FUNCTION) "[CudaDepthPacketProcessorKernel::" FUNCTION "] " +namespace libfreenect2 +{ +class CudaDepthPacketProcessorKernelImpl +{ +public: + short *buf_lut11to16; + float4 *buf_p0_table; + float *buf_x_table; + float *buf_z_table; + unsigned short *buf_packet; + + float4 *buf_a; + float4 *buf_b; + float4 *buf_n; + float *buf_ir; + float4 *buf_a_filtered; + float4 *buf_b_filtered; + unsigned char *buf_edge_test; + float *buf_depth; + float *buf_ir_sum; + float *buf_filtered; + + size_t image_size; + size_t grid_size; + size_t block_size; + + void initDevice(const int deviceId, size_t image_size_, size_t block) + { + int deviceCount = 0; + + cudaSafeCall(cudaGetDeviceCount(&deviceCount)); + + int devId = -1; + for (int i = 0; i < deviceCount; i++) + { + if (deviceId != -1 && i != deviceId) + continue; + + cudaDeviceProp prop; + cudaSafeCall(cudaGetDeviceProperties(&prop, i)); + std::cout << OUT_NAME("initDevice") "device " << i << ": " << prop.name << " @ " << (prop.clockRate / 1000) << "MHz Memory " << (prop.totalGlobalMem >> 20) << "MB"; + + if (prop.computeMode == cudaComputeModeProhibited) + { + std::cout << " Compute Mode Prohibited" << std::endl; + continue; + } + + if (prop.major < 1) + { + std::cout << " does not support CUDA" << std::endl; + continue; + } + + std::cout << std::endl; + devId = i; + break; + } + + if (devId == -1) + { + throw std::runtime_error("No suitable CUDA devices found."); + } + + cudaSafeCall(cudaSetDevice(devId)); + std::cout << OUT_NAME("initDevice") "selected device " << devId << std::endl; + + image_size = image_size_; + grid_size = image_size_/block; + block_size = block; + } + + void loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table) + { + //Read only + size_t buf_lut11to16_size = 2048 * sizeof(short); + size_t buf_p0_table_size = image_size * sizeof(float4); + size_t buf_x_table_size = image_size * sizeof(float); + size_t buf_z_table_size = image_size * sizeof(float); + size_t buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(short); + + cudaSafeCall(cudaMalloc(&buf_lut11to16, buf_lut11to16_size)); + cudaSafeCall(cudaMalloc(&buf_p0_table, buf_p0_table_size)); + cudaSafeCall(cudaMalloc(&buf_x_table, buf_x_table_size)); + cudaSafeCall(cudaMalloc(&buf_z_table, buf_z_table_size)); + cudaSafeCall(cudaMalloc(&buf_packet, buf_packet_size)); + + cudaMemcpyAsync(buf_lut11to16, lut11to16, buf_lut11to16_size, cudaMemcpyHostToDevice); + cudaMemcpyAsync(buf_p0_table, p0_table, buf_p0_table_size, cudaMemcpyHostToDevice); + cudaMemcpyAsync(buf_x_table, x_table, buf_x_table_size, cudaMemcpyHostToDevice); + cudaMemcpyAsync(buf_z_table, z_table, buf_z_table_size, cudaMemcpyHostToDevice); + + //Read-Write + size_t buf_a_size = image_size * sizeof(float4); + size_t buf_b_size = image_size * sizeof(float4); + size_t buf_n_size = image_size * sizeof(float4); + size_t buf_ir_size = image_size * sizeof(float); + size_t buf_a_filtered_size = image_size * sizeof(float4); + size_t buf_b_filtered_size = image_size * sizeof(float4); + size_t buf_edge_test_size = image_size * sizeof(char); + size_t buf_depth_size = image_size * sizeof(float); + size_t buf_ir_sum_size = image_size * sizeof(float); + size_t buf_filtered_size = image_size * sizeof(float); + + cudaSafeCall(cudaMalloc(&buf_a, buf_a_size)); + cudaSafeCall(cudaMalloc(&buf_b, buf_b_size)); + cudaSafeCall(cudaMalloc(&buf_n, buf_n_size)); + cudaSafeCall(cudaMalloc(&buf_ir, buf_ir_size)); + cudaSafeCall(cudaMalloc(&buf_a_filtered, buf_a_filtered_size)); + cudaSafeCall(cudaMalloc(&buf_b_filtered, buf_b_filtered_size)); + cudaSafeCall(cudaMalloc(&buf_edge_test, buf_edge_test_size)); + cudaSafeCall(cudaMalloc(&buf_depth, buf_depth_size)); + cudaSafeCall(cudaMalloc(&buf_ir_sum, buf_ir_sum_size)); + cudaSafeCall(cudaMalloc(&buf_filtered, buf_filtered_size)); + + cudaDeviceSynchronize(); + + cudaSafeCall(cudaGetLastError()); + } + + void generateOptions(const DepthPacketProcessor::Parameters ¶ms, const DepthPacketProcessor::Config &config) + { + unsigned int tmpi; + float tmpf; + + #define COPY(upper, lower) cudaMemcpyToSymbolAsync(upper, ¶ms.lower, sizeof(params.lower)); + tmpi = 0x180; + cudaMemcpyToSymbolAsync(BFI_BITMASK, &tmpi, sizeof(int)); + + COPY(AB_MULTIPLIER, ab_multiplier) + COPY(AB_MULTIPLIER_PER_FRQ0, ab_multiplier_per_frq[0]) + COPY(AB_MULTIPLIER_PER_FRQ1, ab_multiplier_per_frq[1]) + COPY(AB_MULTIPLIER_PER_FRQ2, ab_multiplier_per_frq[2]) + COPY(AB_OUTPUT_MULTIPLIER, ab_output_multiplier) + + COPY(PHASE_IN_RAD0, phase_in_rad[0]) + COPY(PHASE_IN_RAD1, phase_in_rad[1]) + COPY(PHASE_IN_RAD2, phase_in_rad[2]) + + COPY(JOINT_BILATERAL_AB_THRESHOLD, joint_bilateral_ab_threshold) + COPY(JOINT_BILATERAL_MAX_EDGE, joint_bilateral_max_edge) + COPY(JOINT_BILATERAL_EXP, joint_bilateral_exp) + tmpf = (params.joint_bilateral_ab_threshold * params.joint_bilateral_ab_threshold) / (params.ab_multiplier * params.ab_multiplier); + cudaMemcpyToSymbolAsync(JOINT_BILATERAL_THRESHOLD, &tmpf, sizeof(tmpf)); + + COPY(GAUSSIAN_KERNEL_0, gaussian_kernel[0]) + COPY(GAUSSIAN_KERNEL_1, gaussian_kernel[1]) + COPY(GAUSSIAN_KERNEL_2, gaussian_kernel[2]) + COPY(GAUSSIAN_KERNEL_3, gaussian_kernel[3]) + COPY(GAUSSIAN_KERNEL_4, gaussian_kernel[4]) + COPY(GAUSSIAN_KERNEL_5, gaussian_kernel[5]) + COPY(GAUSSIAN_KERNEL_6, gaussian_kernel[6]) + COPY(GAUSSIAN_KERNEL_7, gaussian_kernel[7]) + COPY(GAUSSIAN_KERNEL_8, gaussian_kernel[8]) + + COPY(PHASE_OFFSET, phase_offset) + COPY(UNAMBIGIOUS_DIST, unambigious_dist) + COPY(INDIVIDUAL_AB_THRESHOLD, individual_ab_threshold) + COPY(AB_THRESHOLD, ab_threshold) + COPY(AB_CONFIDENCE_SLOPE, ab_confidence_slope) + COPY(AB_CONFIDENCE_OFFSET, ab_confidence_offset) + COPY(MIN_DEALIAS_CONFIDENCE, min_dealias_confidence) + COPY(MAX_DEALIAS_CONFIDENCE, max_dealias_confidence) + + COPY(EDGE_AB_AVG_MIN_VALUE, edge_ab_avg_min_value) + COPY(EDGE_AB_STD_DEV_THRESHOLD, edge_ab_std_dev_threshold) + COPY(EDGE_CLOSE_DELTA_THRESHOLD, edge_close_delta_threshold) + COPY(EDGE_FAR_DELTA_THRESHOLD, edge_far_delta_threshold) + COPY(EDGE_MAX_DELTA_THRESHOLD, edge_max_delta_threshold) + COPY(EDGE_AVG_DELTA_THRESHOLD, edge_avg_delta_threshold) + COPY(MAX_EDGE_COUNT, max_edge_count) + + tmpf = config.MinDepth * 1000.0f; + cudaMemcpyToSymbolAsync(MIN_DEPTH, &tmpf, sizeof(tmpf)); + + tmpf = config.MaxDepth * 1000.0f; + cudaMemcpyToSymbolAsync(MAX_DEPTH, &tmpf, sizeof(tmpf)); + + cudaDeviceSynchronize(); + + cudaSafeCall(cudaGetLastError()); + } + + void run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config) + { + size_t ir_frame_size = ir_frame->width * ir_frame->height * ir_frame->bytes_per_pixel; + size_t depth_frame_size = depth_frame->width * depth_frame->height * depth_frame->bytes_per_pixel; + + cudaMemcpyAsync(buf_packet, packet.buffer, packet.buffer_length, cudaMemcpyHostToDevice); + + processPixelStage1<<>>(buf_lut11to16, buf_z_table, buf_p0_table, buf_packet, buf_a, buf_b, buf_n, buf_ir); + + cudaMemcpyAsync(ir_frame->data, buf_ir, ir_frame_size, cudaMemcpyDeviceToHost); + + if (config.EnableBilateralFilter) + { + filterPixelStage1<<>>(buf_a, buf_b, buf_n, buf_a_filtered, buf_b_filtered, buf_edge_test); + } + + processPixelStage2<<>>( + config.EnableBilateralFilter ? buf_a_filtered : buf_a, + config.EnableBilateralFilter ? buf_b_filtered : buf_b, + buf_x_table, buf_z_table, buf_depth, buf_ir_sum); + + if (config.EnableEdgeAwareFilter) + { + filterPixelStage2<<>>(buf_depth, buf_ir_sum, buf_edge_test, buf_filtered); + } + + cudaMemcpyAsync(depth_frame->data, config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, depth_frame_size, cudaMemcpyDeviceToHost); + + cudaDeviceSynchronize(); + + cudaSafeCall(cudaGetLastError()); + } +}; + +CudaDepthPacketProcessorKernel::CudaDepthPacketProcessorKernel(): + impl_(new CudaDepthPacketProcessorKernelImpl()) +{ +} + +CudaDepthPacketProcessorKernel::~CudaDepthPacketProcessorKernel() +{ + delete impl_; +} + +void CudaDepthPacketProcessorKernel::initDevice(const int deviceId, size_t image_size_, size_t block) +{ + impl_->initDevice(deviceId, image_size_, block); +} + +void CudaDepthPacketProcessorKernel::loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table) +{ + impl_->loadTables(lut11to16, p0_table, x_table, z_table); +} + +void CudaDepthPacketProcessorKernel::generateOptions(const DepthPacketProcessor::Parameters ¶ms, const DepthPacketProcessor::Config &config) +{ + impl_->generateOptions(params, config); +} + +void CudaDepthPacketProcessorKernel::run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config) +{ + impl_->run(packet, ir_frame, depth_frame, config); +} +} diff --git a/examples/protonect/src/depth_packet_stream_parser.cpp b/examples/protonect/src/depth_packet_stream_parser.cpp index 3a03c44ea..f37d61e27 100644 --- a/examples/protonect/src/depth_packet_stream_parser.cpp +++ b/examples/protonect/src/depth_packet_stream_parser.cpp @@ -54,6 +54,12 @@ DepthPacketStreamParser::~DepthPacketStreamParser() void DepthPacketStreamParser::setPacketProcessor(libfreenect2::BaseDepthPacketProcessor *processor) { processor_ = (processor != 0) ? processor : noopProcessor(); + size_t single_image = 512*424*11/8; + unsigned char *processor_buf = processor_->getPacketBuffer(single_image * 10 * 2); + if (processor_buf) + { + buffer_.setbuffer(processor_buf, single_image * 10 * 2); + } } void DepthPacketStreamParser::onDataReceived(unsigned char* buffer, size_t in_length) diff --git a/examples/protonect/src/double_buffer.cpp b/examples/protonect/src/double_buffer.cpp index c1948b49a..3e4a253d9 100644 --- a/examples/protonect/src/double_buffer.cpp +++ b/examples/protonect/src/double_buffer.cpp @@ -37,7 +37,7 @@ DoubleBuffer::DoubleBuffer() : DoubleBuffer::~DoubleBuffer() { - if(buffer_data_ != 0) + if(buffer_data_ != 0 && !external_buffer_) { buffer_[0].data = 0; buffer_[1].data = 0; @@ -49,6 +49,7 @@ void DoubleBuffer::allocate(size_t buffer_size) { size_t total_buffer_size = 2 * buffer_size; buffer_data_ = new unsigned char[total_buffer_size]; + external_buffer_ = false; buffer_[0].capacity = buffer_size; buffer_[0].length = 0; @@ -59,6 +60,22 @@ void DoubleBuffer::allocate(size_t buffer_size) buffer_[1].data = buffer_data_ + buffer_size; } +void DoubleBuffer::setbuffer(unsigned char *buf, size_t size) +{ + if (!external_buffer_) + delete[] buffer_data_; + external_buffer_ = true; + buffer_data_ = buf; + + buffer_[0].capacity = size / 2; + buffer_[0].length = size / 2; + buffer_[0].data = buf; + + buffer_[1].capacity = size / 2; + buffer_[1].length = size / 2; + buffer_[1].data = buf + size / 2; +} + void DoubleBuffer::swap() { front_buffer_index_ = (front_buffer_index_ + 1) & 1; diff --git a/examples/protonect/src/libfreenect2.cpp b/examples/protonect/src/libfreenect2.cpp index 9524fd6ca..a758924b2 100644 --- a/examples/protonect/src/libfreenect2.cpp +++ b/examples/protonect/src/libfreenect2.cpp @@ -623,6 +623,9 @@ void Freenect2DeviceImpl::close() PacketPipeline *createDefaultPacketPipeline() { +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT + return new CudaPacketPipeline(); +#endif #ifdef LIBFREENECT2_WITH_OPENGL_SUPPORT return new OpenGLPacketPipeline(); #else diff --git a/examples/protonect/src/opencl_depth_packet_processor.cpp b/examples/protonect/src/opencl_depth_packet_processor.cpp index 62bd850d3..cc13587ab 100644 --- a/examples/protonect/src/opencl_depth_packet_processor.cpp +++ b/examples/protonect/src/opencl_depth_packet_processor.cpp @@ -57,27 +57,6 @@ namespace libfreenect2 { -bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n) -{ - size_t length = 0; - const unsigned char *data = NULL; - - if(!loadResource(filename, &data, &length)) - { - std::cerr << OUT_NAME("loadBufferFromResources") "failed to load resource: " << filename << std::endl; - return false; - } - - if(length != n) - { - std::cerr << OUT_NAME("loadBufferFromResources") "wrong size of resource: " << filename << std::endl; - return false; - } - - memcpy(buffer, data, length); - return true; -} - std::string loadCLSource(const std::string &filename) { const unsigned char *data; diff --git a/examples/protonect/src/packet_pipeline.cpp b/examples/protonect/src/packet_pipeline.cpp index 1c66f66b6..8de1d0932 100644 --- a/examples/protonect/src/packet_pipeline.cpp +++ b/examples/protonect/src/packet_pipeline.cpp @@ -136,4 +136,24 @@ DepthPacketProcessor *OpenCLPacketPipeline::createDepthPacketProcessor() } #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT +#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT + +CudaPacketPipeline::CudaPacketPipeline(const int deviceId) : deviceId(deviceId) +{ + initialize(); +} + +CudaPacketPipeline::~CudaPacketPipeline() { } + +DepthPacketProcessor *CudaPacketPipeline::createDepthPacketProcessor() +{ + CudaDepthPacketProcessor *depth_processor = new CudaDepthPacketProcessor(deviceId); + depth_processor->load11To16LutFromFile("11to16.bin"); + depth_processor->loadXTableFromFile("xTable.bin"); + depth_processor->loadZTableFromFile("zTable.bin"); + + return depth_processor; +} +#endif // LIBFREENECT2_WITH_CUDA_SUPPORT + } /* namespace libfreenect2 */ diff --git a/examples/protonect/src/resource.cpp b/examples/protonect/src/resource.cpp index f3253de40..1c3d24caf 100644 --- a/examples/protonect/src/resource.cpp +++ b/examples/protonect/src/resource.cpp @@ -25,6 +25,9 @@ */ #include +#include +#include +#include namespace libfreenect2 { @@ -37,7 +40,7 @@ struct ResourceDescriptor }; #ifdef RESOURCES_INC -#include "resources.inc" +#include "resources.inc.h" #else ResourceDescriptor resource_descriptors[] = {}; #endif @@ -59,4 +62,25 @@ bool loadResource(const std::string &name, unsigned char const**data, size_t *le return result; } +bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n) +{ + size_t length = 0; + const unsigned char *data = NULL; + + if(!loadResource(filename, &data, &length)) + { + std::cerr << "loadBufferFromResources: failed to load resource: " << filename << std::endl; + return false; + } + + if(length != n) + { + std::cerr << "loadBufferFromResources: wrong size of resource: " << filename << std::endl; + return false; + } + + memcpy(buffer, data, length); + return true; +} + } /* namespace libfreenect2 */