diff --git a/.gitignore b/.gitignore
index 3b9fdd98d..b3f0e3b62 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 examples/protonect/include/libfreenect2/config.h
 
 # generated resource file
-examples/protonect/src/resources.inc
+examples/protonect/src/resources.inc.h
 examples/protonect/build
 
 # Dependency folders
diff --git a/examples/protonect/CMakeLists.txt b/examples/protonect/CMakeLists.txt
index d90277e06..90286a03e 100644
--- a/examples/protonect/CMakeLists.txt
+++ b/examples/protonect/CMakeLists.txt
@@ -8,6 +8,7 @@ SET(DEPENDS_DIR "${MY_DIR}/../../depends" CACHE STRING "dependency directory")
 
 OPTION(ENABLE_CXX11 "Enable C++11 support" OFF)
 OPTION(ENABLE_OPENCL "Enable OpenCL support" ON)
+OPTION(ENABLE_CUDA "Enable CUDA support" ON)
 OPTION(ENABLE_OPENGL "Enable OpenGL support" ON)
 
 IF(ENABLE_CXX11)
@@ -40,7 +41,7 @@ SET(LIBRARY_OUTPUT_PATH ${MY_DIR}/lib)
 # dependencies
 FIND_PACKAGE(PkgConfig)    # try find PKGConfig as it will be used if found
 FIND_PACKAGE(LibUSB REQUIRED)
-FIND_PACKAGE(OpenCV REQUIRED)
+FIND_PACKAGE(OpenCV REQUIRED core highgui)
 FIND_PACKAGE(TurboJPEG REQUIRED) #does not provide a package-config file
 
 # Add includes
@@ -54,7 +55,7 @@ INCLUDE_DIRECTORIES(
 
 LINK_DIRECTORIES(${LibUSB_LIBRARY_DIRS})
 
-SET(RESOURCES_INC_FILE "${MY_DIR}/src/resources.inc")
+SET(RESOURCES_INC_FILE "${MY_DIR}/src/resources.inc.h")
 
 SET(SOURCES
   include/libfreenect2/protocol/command.h
@@ -165,30 +166,96 @@ IF(ENABLE_OPENCL)
   ENDIF(OPENCL_FOUND)
 ENDIF(ENABLE_OPENCL)
 
+IF(ENABLE_CUDA)
+  FIND_PACKAGE(CUDA)
+  IF(CUDA_FOUND)
+    SET(LIBFREENECT2_WITH_CUDA_SUPPORT 1)
+
+    CUDA_INCLUDE_DIRECTORIES(
+      "${MY_DIR}/include/"
+      "${CUDA_TOOLKIT_ROOT_DIR}/samples/common/inc"
+      "$ENV{NVCUDASAMPLES_ROOT}/common/inc"
+    )
+    SET(CUDA_FLAGS -use_fast_math)
+    IF(NOT MSVC)
+      SET(CUDA_FLAGS "${CUDA_FLAGS} -Xcompiler -fPIC")
+    ENDIF()
+    CUDA_COMPILE(CUDA_SHARED_OBJECTS
+     src/cuda_depth_packet_processor.cu SHARED
+     OPTIONS ${CUDA_FLAGS}
+    )
+    LIST(APPEND OTHER_SHARED_OBJECTS ${CUDA_SHARED_OBJECTS})
+    CUDA_COMPILE(CUDA_STATIC_OBJECTS
+     src/cuda_depth_packet_processor.cu STATIC
+     OPTIONS ${CUDA_FLAGS}
+    )
+    LIST(APPEND OTHER_STATIC_OBJECTS ${CUDA_STATIC_OBJECTS})
+
+    INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})
+
+    LIST(APPEND SOURCES
+      src/cuda_depth_packet_processor.cpp
+      ${CUDA_OBJECTS}
+    )
+
+    LIST(APPEND LIBRARIES
+      ${CUDA_LIBRARIES}
+    )
+  ENDIF(CUDA_FOUND)
+ENDIF(ENABLE_CUDA)
+
 SET(CMAKE_INSTALL_RPATH ${LibUSB_LIBDIR})
 
 CONFIGURE_FILE("${MY_DIR}/include/libfreenect2/config.h.in" "${MY_DIR}/include/libfreenect2/config.h" @ONLY)
 GENERATE_RESOURCES(${RESOURCES_INC_FILE} ${MY_DIR} ${RESOURCES})
 
 ADD_DEFINITIONS(-DRESOURCES_INC)
-ADD_LIBRARY(freenect2 SHARED ${SOURCES})
+ADD_LIBRARY(freenect2 OBJECT ${SOURCES})
+set_target_properties(freenect2 PROPERTIES POSITION_INDEPENDENT_CODE 1)
+ADD_LIBRARY(freenect2shared SHARED $<TARGET_OBJECTS:freenect2> ${OTHER_SHARED_OBJECTS})
+ADD_LIBRARY(freenect2static STATIC $<TARGET_OBJECTS:freenect2> ${OTHER_STATIC_OBJECTS})
+set_target_properties(freenect2shared PROPERTIES OUTPUT_NAME freenect2)
+set_target_properties(freenect2static PROPERTIES OUTPUT_NAME freenect2)
 MESSAGE("Linking with these libraries: ${LIBRARIES}")
-TARGET_LINK_LIBRARIES(freenect2 ${LIBRARIES})
+TARGET_LINK_LIBRARIES(freenect2shared ${LIBRARIES})
   
 ADD_EXECUTABLE(Protonect
   Protonect.cpp
 )
   
 TARGET_LINK_LIBRARIES(Protonect
-  freenect2
+  freenect2shared
 )
 
 CONFIGURE_FILE(freenect2.cmake.in "${PROJECT_BINARY_DIR}/freenect2Config.cmake" @ONLY)
 
-INSTALL(TARGETS freenect2 DESTINATION lib)
+INSTALL(TARGETS Protonect DESTINATION bin)
+INSTALL(TARGETS freenect2shared DESTINATION lib)
+INSTALL(TARGETS freenect2static DESTINATION lib)
 INSTALL(DIRECTORY "${MY_DIR}/include/" DESTINATION include PATTERN "*.in" EXCLUDE)
 IF(LIBFREENECT2_THREADING_TINYTHREAD)
   INSTALL(FILES  "${MY_DIR}/src/tinythread/tinythread.h" DESTINATION include/${PROJECT_NAME}/tinythread/)
 ENDIF(LIBFREENECT2_THREADING_TINYTHREAD)
 INSTALL(FILES "${PROJECT_BINARY_DIR}/freenect2Config.cmake" DESTINATION lib/cmake/freenect2/)
   
+IF(WIN32)
+  INSTALL(TARGETS freenect2shared RUNTIME DESTINATION bin)
+
+  FOREACH(LIB ${OpenCV_LIBS})
+    INSTALL(FILES $<TARGET_PROPERTY:${LIB},IMPORTED_LOCATION_RELEASE> DESTINATION bin)
+  ENDFOREACH()
+
+  get_filename_component(LibUSB_LIBDIR ${LibUSB_LIBRARIES} DIRECTORY)
+  FIND_FILE(LibUSB_DLL libusb-1.0.dll PATHS ${LibUSB_LIBDIR} NO_DEFAULT_PATH)
+  INSTALL(FILES ${LibUSB_DLL} DESTINATION bin)
+
+  get_filename_component(TurboJPEG_LIBDIR ${TurboJPEG_LIBRARIES} DIRECTORY)
+  FIND_FILE(TurboJPEG_DLL turbojpeg.dll PATHS "${TurboJPEG_LIBDIR}/../bin" NO_DEFAULT_PATH)
+  INSTALL(FILES ${TurboJPEG_DLL} DESTINATION bin)
+
+  IF(GLFW3_FOUND)
+    get_filename_component(GLFW3_LIBDIR ${GLFW3_LIBRARIES} DIRECTORY)
+    FIND_FILE(GLFW3_DLL glfw3.dll PATHS ${GLFW3_LIBDIR} NO_DEFAULT_PATH)
+    INSTALL(FILES ${GLFW3_DLL} DESTINATION bin)
+  ENDIF()
+ENDIF()
diff --git a/examples/protonect/Protonect.cpp b/examples/protonect/Protonect.cpp
index 138c189de..3e6bed6f8 100644
--- a/examples/protonect/Protonect.cpp
+++ b/examples/protonect/Protonect.cpp
@@ -92,6 +92,15 @@ int main(int argc, char *argv[])
         pipeline = new libfreenect2::OpenCLPacketPipeline();
 #else
       std::cout << "OpenCL pipeline is not supported!" << std::endl;
+#endif
+    }
+    else if(arg == "cuda")
+    {
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+      if(!pipeline)
+        pipeline = new libfreenect2::CudaPacketPipeline();
+#else
+      std::cout << "CUDA pipeline is not supported!" << std::endl;
 #endif
     }
     else if(arg.find_first_not_of("0123456789") == std::string::npos) //check if parameter could be a serial number
diff --git a/examples/protonect/include/libfreenect2/async_packet_processor.h b/examples/protonect/include/libfreenect2/async_packet_processor.h
index 807dbead7..2293406ea 100644
--- a/examples/protonect/include/libfreenect2/async_packet_processor.h
+++ b/examples/protonect/include/libfreenect2/async_packet_processor.h
@@ -76,6 +76,12 @@ class AsyncPacketProcessor : public PacketProcessor<PacketT>
     }
     packet_condition_.notify_one();
   }
+
+  virtual unsigned char *getPacketBuffer(size_t size)
+  {
+    return processor_->getPacketBuffer(size);
+  }
+
 private:
   PacketProcessorPtr processor_;
   bool current_packet_available_;
diff --git a/examples/protonect/include/libfreenect2/config.h.in b/examples/protonect/include/libfreenect2/config.h.in
index f3abf7d43..abb2da43b 100644
--- a/examples/protonect/include/libfreenect2/config.h.in
+++ b/examples/protonect/include/libfreenect2/config.h.in
@@ -43,6 +43,8 @@
 
 #cmakedefine LIBFREENECT2_WITH_OPENCL_SUPPORT
 
+#cmakedefine LIBFREENECT2_WITH_CUDA_SUPPORT
+
 #cmakedefine LIBFREENECT2_THREADING_STDLIB
 
 #cmakedefine LIBFREENECT2_THREADING_TINYTHREAD
diff --git a/examples/protonect/include/libfreenect2/depth_packet_processor.h b/examples/protonect/include/libfreenect2/depth_packet_processor.h
index a6a83d1d7..15ea9d74d 100644
--- a/examples/protonect/include/libfreenect2/depth_packet_processor.h
+++ b/examples/protonect/include/libfreenect2/depth_packet_processor.h
@@ -196,5 +196,58 @@ class LIBFREENECT2_API OpenCLDepthPacketProcessor : public DepthPacketProcessor
   OpenCLDepthPacketProcessorImpl *impl_;
 };
 #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT
+
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+#ifdef _MSC_VER
+struct __declspec(align(16)) Float4
+#else
+struct __attribute__((aligned(16))) Float4
+#endif
+{
+    float x, y, z, w;
+};
+
+class CudaDepthPacketProcessorImpl;
+
+class LIBFREENECT2_API CudaDepthPacketProcessor : public DepthPacketProcessor
+{
+public:
+  CudaDepthPacketProcessor(const int deviceId = -1);
+  virtual ~CudaDepthPacketProcessor();
+  virtual unsigned char *getPacketBuffer(size_t size);
+  virtual void setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config);
+
+  virtual void loadP0TablesFromCommandResponse(unsigned char* buffer, size_t buffer_length);
+
+  /**
+   * GUESS: the x and z table follow some polynomial, until we know the exact polynom formula and its coefficients
+   * just load them from a memory dump - although they probably vary per camera
+   */
+  void loadXTableFromFile(const char* filename);
+
+  void loadZTableFromFile(const char* filename);
+
+  void load11To16LutFromFile(const char* filename);
+
+  virtual void process(const DepthPacket &packet);
+private:
+  CudaDepthPacketProcessorImpl *impl_;
+};
+
+class CudaDepthPacketProcessorKernelImpl;
+
+class CudaDepthPacketProcessorKernel
+{
+public:
+  CudaDepthPacketProcessorKernel();
+  virtual ~CudaDepthPacketProcessorKernel();
+  void initDevice(const int deviceId, size_t image_size_, size_t block);
+  void generateOptions(const DepthPacketProcessor::Parameters &params, const DepthPacketProcessor::Config &config);
+  void loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table);
+  void run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config);
+private:
+  CudaDepthPacketProcessorKernelImpl *impl_;
+};
+#endif // LIBFREENECT2_WITH_CUDA_SUPPORT
 } /* namespace libfreenect2 */
 #endif /* DEPTH_PACKET_PROCESSOR_H_ */
diff --git a/examples/protonect/include/libfreenect2/double_buffer.h b/examples/protonect/include/libfreenect2/double_buffer.h
index 49eb1f155..452cadb25 100644
--- a/examples/protonect/include/libfreenect2/double_buffer.h
+++ b/examples/protonect/include/libfreenect2/double_buffer.h
@@ -49,6 +49,8 @@ class LIBFREENECT2_API DoubleBuffer
 
   void allocate(size_t buffer_size);
 
+  void setbuffer(unsigned char *buf, size_t size);
+
   void swap();
 
   Buffer& front();
@@ -59,6 +61,7 @@ class LIBFREENECT2_API DoubleBuffer
   unsigned char front_buffer_index_;
 
   unsigned char* buffer_data_;
+  bool external_buffer_;
 };
 
 } /* namespace libfreenect2 */
diff --git a/examples/protonect/include/libfreenect2/frame_listener.hpp b/examples/protonect/include/libfreenect2/frame_listener.hpp
index 1ef02354e..091e5781a 100644
--- a/examples/protonect/include/libfreenect2/frame_listener.hpp
+++ b/examples/protonect/include/libfreenect2/frame_listener.hpp
@@ -48,15 +48,18 @@ struct LIBFREENECT2_API Frame
   size_t width, height, bytes_per_pixel;
   unsigned char* data;
 
-  Frame(size_t width, size_t height, size_t bytes_per_pixel) :
+  Frame(size_t width, size_t height, size_t bytes_per_pixel, bool alloc = true) :
     width(width),
     height(height),
-    bytes_per_pixel(bytes_per_pixel)
+    bytes_per_pixel(bytes_per_pixel),
+    data(NULL)
   {
+    if (!alloc)
+      return;
     data = new unsigned char[width * height * bytes_per_pixel];
   }
 
-  ~Frame()
+  virtual ~Frame()
   {
     delete[] data;
   }
diff --git a/examples/protonect/include/libfreenect2/packet_pipeline.h b/examples/protonect/include/libfreenect2/packet_pipeline.h
index b1bf467c2..15ee92305 100644
--- a/examples/protonect/include/libfreenect2/packet_pipeline.h
+++ b/examples/protonect/include/libfreenect2/packet_pipeline.h
@@ -107,6 +107,17 @@ class LIBFREENECT2_API OpenCLPacketPipeline : public BasePacketPipeline
 };
 #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT
 
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+class LIBFREENECT2_API CudaPacketPipeline : public BasePacketPipeline
+{
+protected:
+  const int deviceId;
+  virtual DepthPacketProcessor *createDepthPacketProcessor();
+public:
+  CudaPacketPipeline(const int deviceId = -1);
+  virtual ~CudaPacketPipeline();
+};
+#endif // LIBFREENECT2_WITH_CUDA_SUPPORT
 
 } /* namespace libfreenect2 */
 #endif /* PACKET_PIPELINE_H_ */
diff --git a/examples/protonect/include/libfreenect2/packet_processor.h b/examples/protonect/include/libfreenect2/packet_processor.h
index ea21b9d8a..bd70a8cfb 100644
--- a/examples/protonect/include/libfreenect2/packet_processor.h
+++ b/examples/protonect/include/libfreenect2/packet_processor.h
@@ -38,6 +38,7 @@ class PacketProcessor
 
   virtual bool ready() { return true; }
   virtual void process(const PacketT &packet) = 0;
+  virtual unsigned char *getPacketBuffer(size_t size) { return NULL; }
 };
 
 template<typename PacketT>
diff --git a/examples/protonect/include/libfreenect2/resource.h b/examples/protonect/include/libfreenect2/resource.h
index a8dc38262..f6398c0dd 100644
--- a/examples/protonect/include/libfreenect2/resource.h
+++ b/examples/protonect/include/libfreenect2/resource.h
@@ -33,6 +33,7 @@ namespace libfreenect2
 {
 
 bool loadResource(const std::string &name, unsigned char const**data, size_t *length);
+bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n);
 
 } /* namespace libfreenect2 */
 #endif /* RESOURCE_H_ */
diff --git a/examples/protonect/src/cuda_depth_packet_processor.cpp b/examples/protonect/src/cuda_depth_packet_processor.cpp
new file mode 100644
index 000000000..89e8a8867
--- /dev/null
+++ b/examples/protonect/src/cuda_depth_packet_processor.cpp
@@ -0,0 +1,303 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ *   1) Leave this header intact and distribute it under the same terms,
+ *      accompanying it with the APACHE20 and GPL20 files, or
+ *   2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ *   3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+#include <libfreenect2/depth_packet_processor.h>
+#include <libfreenect2/resource.h>
+#include <libfreenect2/protocol/response.h>
+
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include <stdexcept>
+#include <cstring>
+#include <cmath>
+#include <cuda_runtime.h>
+
+#if defined(WIN32)
+#define _USE_MATH_DEFINES
+#include <math.h>
+#endif
+
+#define cudaSafeCall(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); } while(0)
+
+#define OUT_NAME(FUNCTION) "[CudaDepthPacketProcessor::" FUNCTION "] "
+namespace libfreenect2
+{
+
+struct PinnedFrame: Frame
+{
+  PinnedFrame(size_t width, size_t height, size_t bytes_per_pixel):
+    Frame(width, height, bytes_per_pixel, false)
+  {
+    cudaSafeCall(cudaHostAlloc(&data, width*height*bytes_per_pixel, cudaHostAllocPortable));
+  }
+
+  ~PinnedFrame()
+  {
+    cudaFreeHost(data);
+    data = NULL;
+  }
+};
+
+class CudaDepthPacketProcessorImpl
+{
+public:
+  short lut11to16[2048];
+  float x_table[512 * 424];
+  float z_table[512 * 424];
+  Float4 p0_table[512 * 424];
+  libfreenect2::DepthPacketProcessor::Config config;
+  DepthPacketProcessor::Parameters params;
+
+  CudaDepthPacketProcessorKernel kernel;
+
+  double timing_acc;
+  double timing_acc_n;
+
+  double timing_current_start;
+
+  Frame *ir_frame, *depth_frame;
+
+  size_t image_size;
+
+  size_t packet_buffer_size;
+  unsigned char *packet_buffer;
+
+  bool deviceInitialized;
+  bool programInitialized;
+
+  CudaDepthPacketProcessorImpl(const int deviceId = -1) : deviceInitialized(false), programInitialized(false)
+  {
+    timing_acc = 0.0;
+    timing_acc_n = 0.0;
+    timing_current_start = 0.0;
+    image_size = 512 * 424;
+
+    deviceInitialized = initDevice(deviceId);
+    newIrFrame();
+    newDepthFrame();
+    packet_buffer_size = 0;
+    packet_buffer = NULL;
+  }
+
+  bool initDevice(const int deviceId)
+  {
+    size_t block_size = 128;
+    try
+    {
+      kernel.initDevice(deviceId, image_size, block_size);
+    }
+    catch (const std::runtime_error &err)
+    {
+      std::cerr << OUT_NAME("initDevice") << err.what() << std::endl;
+      return false;
+    }
+    return true;
+  }
+
+  bool initProgram()
+  {
+    if(!deviceInitialized)
+    {
+      return false;
+    }
+
+    try
+    {
+      kernel.generateOptions(params, config);
+      kernel.loadTables(lut11to16, p0_table, x_table, z_table);
+    }
+    catch (const std::runtime_error &err)
+    {
+      std::cerr << OUT_NAME("initProgram") << err.what() << std::endl;
+      throw err;
+    }
+    programInitialized = true;
+    return true;
+  }
+
+  void run(const DepthPacket &packet)
+  {
+    kernel.run(packet, ir_frame, depth_frame, config);
+  }
+
+  void startTiming()
+  {
+    timing_current_start = cv::getTickCount();
+  }
+
+  void stopTiming()
+  {
+    timing_acc += (cv::getTickCount() - timing_current_start) / cv::getTickFrequency();
+    timing_acc_n += 1.0;
+
+    if(timing_acc_n >= 100.0)
+    {
+      double avg = (timing_acc / timing_acc_n);
+      std::cout << "[CudaDepthPacketProcessor] avg. time: " << (avg * 1000) << "ms -> ~" << (1.0 / avg) << "Hz" << std::endl;
+      timing_acc = 0.0;
+      timing_acc_n = 0.0;
+    }
+  }
+
+  void newIrFrame()
+  {
+    ir_frame = new PinnedFrame(512, 424, 4);
+  }
+
+  void newDepthFrame()
+  {
+    depth_frame = new PinnedFrame(512, 424, 4);
+  }
+
+  void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table)
+  {
+    for(int r = 0; r < 424; ++r)
+    {
+      Float4 *it = &p0_table[r * 512];
+      const uint16_t *it0 = &p0table->p0table0[r * 512];
+      const uint16_t *it1 = &p0table->p0table1[r * 512];
+      const uint16_t *it2 = &p0table->p0table2[r * 512];
+      for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2)
+      {
+        it->x = -((float) * it0) * 0.000031 * M_PI;
+        it->y = -((float) * it1) * 0.000031 * M_PI;
+        it->z = -((float) * it2) * 0.000031 * M_PI;
+        it->w = 0.0f;
+      }
+    }
+  }
+};
+
+CudaDepthPacketProcessor::CudaDepthPacketProcessor(const int deviceId) :
+  impl_(new CudaDepthPacketProcessorImpl(deviceId))
+{
+}
+
+CudaDepthPacketProcessor::~CudaDepthPacketProcessor()
+{
+  delete impl_;
+}
+
+unsigned char *CudaDepthPacketProcessor::getPacketBuffer(size_t size)
+{
+  if (impl_->packet_buffer != NULL)
+  {
+    if (size == impl_->packet_buffer_size)
+      return impl_->packet_buffer;
+    cudaSafeCall(cudaFreeHost(impl_->packet_buffer));
+    impl_->packet_buffer = NULL;
+    impl_->packet_buffer_size = 0;
+  }
+
+  cudaSafeCall(cudaHostAlloc(&impl_->packet_buffer, size, cudaHostAllocWriteCombined | cudaHostAllocPortable));
+  impl_->packet_buffer_size = size;
+  return impl_->packet_buffer;
+}
+
+void CudaDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacketProcessor::Config &config)
+{
+  DepthPacketProcessor::setConfiguration(config);
+  impl_->config = config;
+  impl_->programInitialized = false;
+}
+
+void CudaDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char *buffer, size_t buffer_length)
+{
+  libfreenect2::protocol::P0TablesResponse *p0table = (libfreenect2::protocol::P0TablesResponse *)buffer;
+
+  if(buffer_length < sizeof(libfreenect2::protocol::P0TablesResponse))
+  {
+    std::cerr << OUT_NAME("loadP0TablesFromCommandResponse") "P0Table response too short!" << std::endl;
+    return;
+  }
+
+  impl_->fill_trig_table(p0table);
+}
+
+void CudaDepthPacketProcessor::loadXTableFromFile(const char *filename)
+{
+  if(!loadBufferFromResources(filename, (unsigned char *)impl_->x_table, impl_->image_size * sizeof(float)))
+  {
+    std::cerr << OUT_NAME("loadXTableFromFile") "could not load x table from: " << filename << std::endl;
+  }
+}
+
+void CudaDepthPacketProcessor::loadZTableFromFile(const char *filename)
+{
+  if(!loadBufferFromResources(filename, (unsigned char *)impl_->z_table, impl_->image_size * sizeof(float)))
+  {
+    std::cerr << OUT_NAME("loadZTableFromFile") "could not load z table from: " << filename << std::endl;
+  }
+}
+
+void CudaDepthPacketProcessor::load11To16LutFromFile(const char *filename)
+{
+  if(!loadBufferFromResources(filename, (unsigned char *)impl_->lut11to16, 2048 * sizeof(short)))
+  {
+    std::cerr << OUT_NAME("load11To16LutFromFile") "could not load lut table from: " << filename << std::endl;
+  }
+}
+
+void CudaDepthPacketProcessor::process(const DepthPacket &packet)
+{
+  bool has_listener = this->listener_ != 0;
+
+  if(!impl_->programInitialized && !impl_->initProgram())
+  {
+    std::cerr << OUT_NAME("process") "could not initialize CudaDepthPacketProcessor" << std::endl;
+    return;
+  }
+
+  impl_->startTiming();
+
+  impl_->ir_frame->timestamp = packet.timestamp;
+  impl_->depth_frame->timestamp = packet.timestamp;
+  impl_->ir_frame->sequence = packet.sequence;
+  impl_->depth_frame->sequence = packet.sequence;
+
+  impl_->run(packet);
+
+  impl_->stopTiming();
+
+  if(has_listener)
+  {
+    if(this->listener_->onNewFrame(Frame::Ir, impl_->ir_frame))
+    {
+      impl_->newIrFrame();
+    }
+
+    if(this->listener_->onNewFrame(Frame::Depth, impl_->depth_frame))
+    {
+      impl_->newDepthFrame();
+    }
+  }
+}
+
+} /* namespace libfreenect2 */
+
diff --git a/examples/protonect/src/cuda_depth_packet_processor.cu b/examples/protonect/src/cuda_depth_packet_processor.cu
new file mode 100644
index 000000000..99b8cede7
--- /dev/null
+++ b/examples/protonect/src/cuda_depth_packet_processor.cu
@@ -0,0 +1,752 @@
+/*
+ * This file is part of the OpenKinect Project. http://www.openkinect.org
+ *
+ * Copyright (c) 2014 individual OpenKinect contributors. See the CONTRIB file
+ * for details.
+ *
+ * This code is licensed to you under the terms of the Apache License, version
+ * 2.0, or, at your option, the terms of the GNU General Public License,
+ * version 2.0. See the APACHE20 and GPL2 files for the text of the licenses,
+ * or the following URLs:
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.gnu.org/licenses/gpl-2.0.txt
+ *
+ * If you redistribute this file in source form, modified or unmodified, you
+ * may:
+ *   1) Leave this header intact and distribute it under the same terms,
+ *      accompanying it with the APACHE20 and GPL20 files, or
+ *   2) Delete the Apache 2.0 clause and accompany it with the GPL2 file, or
+ *   3) Delete the GPL v2 clause and accompany it with the APACHE20 file
+ * In all cases you must keep the copyright notice intact and include a copy
+ * of the CONTRIB file.
+ *
+ * Binary distributions must follow the binary distribution requirements of
+ * either License.
+ */
+
+#include <libfreenect2/depth_packet_processor.h>
+#include <libfreenect2/frame_listener.hpp>
+#include <helper_math.h>
+#include <math_constants.h>
+
+#include <iostream>
+#include <stdexcept>
+#define cudaSafeCall(expr) do { cudaError_t err = (expr); if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); } while(0)
+
+__constant__ static unsigned int BFI_BITMASK;
+__constant__ static float AB_MULTIPLIER;
+__constant__ static float AB_MULTIPLIER_PER_FRQ0;
+__constant__ static float AB_MULTIPLIER_PER_FRQ1;
+__constant__ static float AB_MULTIPLIER_PER_FRQ2;
+__constant__ static float AB_OUTPUT_MULTIPLIER;
+;
+__constant__ static float PHASE_IN_RAD0;
+__constant__ static float PHASE_IN_RAD1;
+__constant__ static float PHASE_IN_RAD2;
+;
+__constant__ static float JOINT_BILATERAL_AB_THRESHOLD;
+__constant__ static float JOINT_BILATERAL_MAX_EDGE;
+__constant__ static float JOINT_BILATERAL_EXP;
+__constant__ static float JOINT_BILATERAL_THRESHOLD;
+;
+__constant__ static float GAUSSIAN_KERNEL_0;
+__constant__ static float GAUSSIAN_KERNEL_1;
+__constant__ static float GAUSSIAN_KERNEL_2;
+__constant__ static float GAUSSIAN_KERNEL_3;
+__constant__ static float GAUSSIAN_KERNEL_4;
+__constant__ static float GAUSSIAN_KERNEL_5;
+__constant__ static float GAUSSIAN_KERNEL_6;
+__constant__ static float GAUSSIAN_KERNEL_7;
+__constant__ static float GAUSSIAN_KERNEL_8;
+;
+__constant__ static float PHASE_OFFSET;
+__constant__ static float UNAMBIGIOUS_DIST;
+__constant__ static float INDIVIDUAL_AB_THRESHOLD;
+__constant__ static float AB_THRESHOLD;
+__constant__ static float AB_CONFIDENCE_SLOPE;
+__constant__ static float AB_CONFIDENCE_OFFSET;
+__constant__ static float MIN_DEALIAS_CONFIDENCE;
+__constant__ static float MAX_DEALIAS_CONFIDENCE;
+;
+__constant__ static float EDGE_AB_AVG_MIN_VALUE;
+__constant__ static float EDGE_AB_STD_DEV_THRESHOLD;
+__constant__ static float EDGE_CLOSE_DELTA_THRESHOLD;
+__constant__ static float EDGE_FAR_DELTA_THRESHOLD;
+__constant__ static float EDGE_MAX_DELTA_THRESHOLD;
+__constant__ static float EDGE_AVG_DELTA_THRESHOLD;
+__constant__ static float MAX_EDGE_COUNT;
+;
+__constant__ static float MIN_DEPTH;
+__constant__ static float MAX_DEPTH;
+
+#define sqrt(x) sqrtf(x)
+#define sincos(x, a, b) sincosf(x, a, b)
+#define atan2(a, b) atan2f(a, b)
+#define log(x) logf(x)
+#define exp(x) expf(x)
+#define max(x, y) fmaxf(x, y)
+#define min(x, y) fminf(x, y)
+#define M_PI_F CUDART_PI_F
+
+typedef unsigned char uchar;
+
+inline  __device__ uint get_global_id(uint i)
+{
+  if (i == 0)
+    return blockIdx.x*blockDim.x + threadIdx.x;
+  // NOT IMPLEMENTED for i > 0
+  return 0;
+}
+
+static inline __device__ int3 isnan(float3 v)
+{
+  return make_int3(isnan(v.x) ? -1 : 0, isnan(v.y) ? -1 : 0, isnan(v.z) ? -1 : 0);
+}
+static inline __device__ float3 sqrtf(float3 v)
+{
+  return make_float3(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z));
+}
+static inline __device__ void sincosf(float3 v, float3 *a, float3 *b)
+{
+  sincosf(v.x, &a->x, &b->x);
+  sincosf(v.y, &a->y, &b->y);
+  sincosf(v.z, &a->z, &b->z);
+}
+static inline __device__ float3 atan2f(float3 a, float3 b)
+{
+  return make_float3(atan2f(a.x, b.x), atan2f(a.y, b.y), atan2f(a.z, b.z));
+}
+static inline __device__ float3 expf(float3 v)
+{
+  return make_float3(expf(v.x), expf(v.y), expf(v.z));
+}
+static inline __device__ float3 select(float3 a, float3 b, int3 c)
+{
+  return make_float3(c.x < 0 ? b.x : a.x, c.y < 0 ? b.y : a.y, c.z < 0 ? b.z : a.z);
+}
+static inline __device__ int3 isless(float3 a, float3 b)
+{
+  return make_int3(a.x < b.x ? -1 : 0, a.y < b.y ? -1 : 0, a.z < b.z ? -1 : 0);
+}
+static inline __device__ int3 isequal(float3 a, float3 b)
+{
+  return make_int3(a.x == b.x ? -1 : 0, a.y == b.y ? -1 : 0, a.z == b.z ? -1 : 0);
+}
+static inline __device__ int any(int3 v)
+{
+  return (v.x | v.y | v.z) < 0;
+}
+static inline __device__ int all(int3 v)
+{
+  return (v.x & v.y & v.z) < 0;
+}
+
+/*******************************************************************************
+ * Process pixel stage 1
+ ******************************************************************************/
+
+static __device__
+float decodePixelMeasurement(const ushort* __restrict__ data, const short* __restrict__ lut11to16, const uint sub, const uint x, const uint y)
+{
+  uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352;
+  uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff;
+
+  uint col_idx = idx >> 4;
+  uint upper_bytes = idx & 15;
+  uint lower_bytes = 16 - upper_bytes;
+
+  uint data_idx0 = row_idx + col_idx;
+  uint data_idx1 = row_idx + col_idx + 1;
+
+  return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047];
+}
+
+static __device__
+float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid)
+{
+  float3 p0vec = make_float3(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2);
+  float3 p0sin, p0cos;
+  sincos(p0vec, &p0sin, &p0cos);
+
+  *invalid = *invalid && any(isequal(v, make_float3(32767.0f)));
+
+  return make_float2(dot(v, p0cos), -dot(v, p0sin)) * ab_multiplier_per_frq;
+}
+
+static __global__
+void processPixelStage1(const short* __restrict__ lut11to16, const float* __restrict__ z_table, const float4* __restrict__ p0_table, const ushort* __restrict__ data,
+                               float4 *a_out, float4 *b_out, float4 *n_out, float *ir_out)
+{
+  const uint i = get_global_id(0);
+
+  const uint x = i % 512;
+  const uint y = i / 512;
+
+  const uint y_in = (423 - y);
+
+  const float zmultiplier = z_table[i];
+  int valid = (int)(0.0f < zmultiplier);
+  int saturatedX = valid;
+  int saturatedY = valid;
+  int saturatedZ = valid;
+  int3 invalid_pixel = make_int3((int)(!valid));
+  const float3 p0 = make_float3(p0_table[i]);
+
+  const float3 v0 = make_float3(decodePixelMeasurement(data, lut11to16, 0, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 1, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 2, x, y_in));
+  const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX);
+
+  const float3 v1 = make_float3(decodePixelMeasurement(data, lut11to16, 3, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 4, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 5, x, y_in));
+  const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY);
+
+  const float3 v2 = make_float3(decodePixelMeasurement(data, lut11to16, 6, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 7, x, y_in),
+                             decodePixelMeasurement(data, lut11to16, 8, x, y_in));
+  const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ);
+
+  float3 a = select(make_float3(ab0.x, ab1.x, ab2.x), make_float3(0.0f), invalid_pixel);
+  float3 b = select(make_float3(ab0.y, ab1.y, ab2.y), make_float3(0.0f), invalid_pixel);
+  float3 n = sqrt(a * a + b * b);
+
+  int3 saturated = make_int3(saturatedX, saturatedY, saturatedZ);
+  a = select(a, make_float3(0.0f), saturated);
+  b = select(b, make_float3(0.0f), saturated);
+
+  a_out[i] = make_float4(a);
+  b_out[i] = make_float4(b);
+  n_out[i] = make_float4(n);
+  ir_out[i] = min(dot(select(n, make_float3(65535.0f), saturated), make_float3(0.333333333f  * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f);
+}
+
+/*******************************************************************************
+ * Filter pixel stage 1
+ ******************************************************************************/
+static __global__
+void filterPixelStage1(const float4* __restrict__ a, const float4* __restrict__ b, const float4* __restrict__ n,
+                              float4 *a_out, float4 *b_out, uchar *max_edge_test)
+{
+  const uint i = get_global_id(0);
+
+  const uint x = i % 512;
+  const uint y = i / 512;
+
+  const float3 self_a = make_float3(a[i]);
+  const float3 self_b = make_float3(b[i]);
+
+  const float gaussian[9] = {GAUSSIAN_KERNEL_0, GAUSSIAN_KERNEL_1, GAUSSIAN_KERNEL_2, GAUSSIAN_KERNEL_3, GAUSSIAN_KERNEL_4, GAUSSIAN_KERNEL_5, GAUSSIAN_KERNEL_6, GAUSSIAN_KERNEL_7, GAUSSIAN_KERNEL_8};
+
+  if(x < 1 || y < 1 || x > 510 || y > 422)
+  {
+    a_out[i] = make_float4(self_a);
+    b_out[i] = make_float4(self_b);
+    max_edge_test[i] = 1;
+  }
+  else
+  {
+    float3 threshold = make_float3(sqrt(JOINT_BILATERAL_THRESHOLD));
+    float3 joint_bilateral_exp = make_float3(JOINT_BILATERAL_EXP);
+
+    const float3 self_norm = make_float3(n[i]);
+    const float3 self_normalized_a = self_a / self_norm;
+    const float3 self_normalized_b = self_b / self_norm;
+
+    float3 weight_acc = make_float3(0.0f);
+    float3 weighted_a_acc = make_float3(0.0f);
+    float3 weighted_b_acc = make_float3(0.0f);
+    float3 dist_acc = make_float3(0.0f);
+
+    const int3 c0 = isless(self_norm, threshold);
+
+    threshold = select(threshold, make_float3(0.0f), c0);
+    joint_bilateral_exp = select(joint_bilateral_exp, make_float3(0.0f), c0);
+
+    for(int yi = -1, j = 0; yi < 2; ++yi)
+    {
+      uint i_other = (y + yi) * 512 + x - 1;
+
+      for(int xi = -1; xi < 2; ++xi, ++j, ++i_other)
+      {
+        const float3 other_a = make_float3(a[i_other]);
+        const float3 other_b = make_float3(b[i_other]);
+        const float3 other_norm = make_float3(n[i_other]);
+        const float3 other_normalized_a = other_a / other_norm;
+        const float3 other_normalized_b = other_b / other_norm;
+
+        const int3 c1 = isless(other_norm, threshold);
+
+        const float3 dist = 0.5f * (1.0f - (self_normalized_a * other_normalized_a + self_normalized_b * other_normalized_b));
+        const float3 weight = select(gaussian[j] * exp(-1.442695f * joint_bilateral_exp * dist), make_float3(0.0f), c1);
+
+        weighted_a_acc += weight * other_a;
+        weighted_b_acc += weight * other_b;
+        weight_acc += weight;
+        dist_acc += select(dist, make_float3(0.0f), c1);
+      }
+    }
+
+    const int3 c2 = isless(make_float3(0.0f), weight_acc);
+    a_out[i] = make_float4(select(make_float3(0.0f), weighted_a_acc / weight_acc, c2));
+    b_out[i] = make_float4(select(make_float3(0.0f), weighted_b_acc / weight_acc, c2));
+
+    max_edge_test[i] = all(isless(dist_acc, make_float3(JOINT_BILATERAL_MAX_EDGE)));
+  }
+}
+
+/*******************************************************************************
+ * Process pixel stage 2
+ ******************************************************************************/
+static __global__
+void processPixelStage2(const float4* __restrict__ a_in, const float4* __restrict__ b_in, const float* __restrict__ x_table, const float* __restrict__ z_table,
+                               float *depth, float *ir_sums)
+{
+  const uint i = get_global_id(0);
+  float3 a = make_float3(a_in[i]);
+  float3 b = make_float3(b_in[i]);
+
+  float3 phase = atan2(b, a);
+  phase = select(phase, phase + 2.0f * M_PI_F, isless(phase, make_float3(0.0f)));
+  phase = select(phase, make_float3(0.0f), isnan(phase));
+  float3 ir = sqrt(a * a + b * b) * AB_MULTIPLIER;
+
+  float ir_sum = ir.x + ir.y + ir.z;
+  float ir_min = min(ir.x, min(ir.y, ir.z));
+  float ir_max = max(ir.x, max(ir.y, ir.z));
+
+  float phase_final = 0;
+
+  if(ir_min >= INDIVIDUAL_AB_THRESHOLD && ir_sum >= AB_THRESHOLD)
+  {
+    float3 t = phase / (2.0f * M_PI_F) * make_float3(3.0f, 15.0f, 2.0f);
+
+    float t0 = t.x;
+    float t1 = t.y;
+    float t2 = t.z;
+
+    float t5 = (floor((t1 - t0) * 0.333333f + 0.5f) * 3.0f + t0);
+    float t3 = (-t2 + t5);
+    float t4 = t3 * 2.0f;
+
+    bool c1 = t4 >= -t4; // true if t4 positive
+
+    float f1 = c1 ? 2.0f : -2.0f;
+    float f2 = c1 ? 0.5f : -0.5f;
+    t3 *= f2;
+    t3 = (t3 - floor(t3)) * f1;
+
+    bool c2 = 0.5f < fabs(t3) && fabs(t3) < 1.5f;
+
+    float t6 = c2 ? t5 + 15.0f : t5;
+    float t7 = c2 ? t1 + 15.0f : t1;
+
+    float t8 = (floor((-t2 + t6) * 0.5f + 0.5f) * 2.0f + t2) * 0.5f;
+
+    t6 *= 0.333333f; // = / 3
+    t7 *= 0.066667f; // = / 15
+
+    float t9 = (t8 + t6 + t7); // transformed phase measurements (they are transformed and divided by the values the original values were multiplied with)
+    float t10 = t9 * 0.333333f; // some avg
+
+    t6 *= 2.0f * M_PI_F;
+    t7 *= 2.0f * M_PI_F;
+    t8 *= 2.0f * M_PI_F;
+
+    // some cross product
+    float t8_new = t7 * 0.826977f - t8 * 0.110264f;
+    float t6_new = t8 * 0.551318f - t6 * 0.826977f;
+    float t7_new = t6 * 0.110264f - t7 * 0.551318f;
+
+    t8 = t8_new;
+    t6 = t6_new;
+    t7 = t7_new;
+
+    float norm = t8 * t8 + t6 * t6 + t7 * t7;
+    float mask = t9 >= 0.0f ? 1.0f : 0.0f;
+    t10 *= mask;
+
+    bool slope_positive = 0 < AB_CONFIDENCE_SLOPE;
+
+    float ir_x = slope_positive ? ir_min : ir_max;
+
+    ir_x = log(ir_x);
+    ir_x = (ir_x * AB_CONFIDENCE_SLOPE * 0.301030f + AB_CONFIDENCE_OFFSET) * 3.321928f;
+    ir_x = exp(ir_x);
+    ir_x = clamp(ir_x, MIN_DEALIAS_CONFIDENCE, MAX_DEALIAS_CONFIDENCE);
+    ir_x *= ir_x;
+
+    float mask2 = ir_x >= norm ? 1.0f : 0.0f;
+
+    float t11 = t10 * mask2;
+
+    float mask3 = MAX_DEALIAS_CONFIDENCE * MAX_DEALIAS_CONFIDENCE >= norm ? 1.0f : 0.0f;
+    t10 *= mask3;
+    phase_final = true/*(modeMask & 2) != 0*/ ? t11 : t10;
+  }
+
+  float zmultiplier = z_table[i];
+  float xmultiplier = x_table[i];
+
+  phase_final = 0.0f < phase_final ? phase_final + PHASE_OFFSET : phase_final;
+
+  float depth_linear = zmultiplier * phase_final;
+  float max_depth = phase_final * UNAMBIGIOUS_DIST * 2.0;
+
+  bool cond1 = /*(modeMask & 32) != 0*/ true && 0.0f < depth_linear && 0.0f < max_depth;
+
+  xmultiplier = (xmultiplier * 90.0) / (max_depth * max_depth * 8192.0);
+
+  float depth_fit = depth_linear / (-depth_linear * xmultiplier + 1);
+  depth_fit = depth_fit < 0.0f ? 0.0f : depth_fit;
+
+  float d = cond1 ? depth_fit : depth_linear; // r1.y -> later r2.z
+  depth[i] = d;
+  ir_sums[i] = ir_sum;
+}
+
+/*******************************************************************************
+ * Filter pixel stage 2
+ ******************************************************************************/
+static __global__
+void filterPixelStage2(const float* __restrict__ depth, const float* __restrict__ ir_sums, const uchar* __restrict__ max_edge_test, float *filtered)
+{
+  const uint i = get_global_id(0);
+
+  const uint x = i % 512;
+  const uint y = i / 512;
+
+  const float raw_depth = depth[i];
+  const float ir_sum = ir_sums[i];
+  const uchar edge_test = max_edge_test[i];
+
+  if(raw_depth >= MIN_DEPTH && raw_depth <= MAX_DEPTH)
+  {
+    if(x < 1 || y < 1 || x > 510 || y > 422)
+    {
+      filtered[i] = raw_depth;
+    }
+    else
+    {
+      float ir_sum_acc = ir_sum;
+      float squared_ir_sum_acc = ir_sum * ir_sum;
+      float min_depth = raw_depth;
+      float max_depth = raw_depth;
+
+      for(int yi = -1; yi < 2; ++yi)
+      {
+        uint i_other = (y + yi) * 512 + x - 1;
+
+        for(int xi = -1; xi < 2; ++xi, ++i_other)
+        {
+          if(i_other == i)
+          {
+            continue;
+          }
+
+          const float raw_depth_other = depth[i_other];
+          const float ir_sum_other = ir_sums[i_other];
+
+          ir_sum_acc += ir_sum_other;
+          squared_ir_sum_acc += ir_sum_other * ir_sum_other;
+
+          if(0.0f < raw_depth_other)
+          {
+            min_depth = min(min_depth, raw_depth_other);
+            max_depth = max(max_depth, raw_depth_other);
+          }
+        }
+      }
+
+      float tmp0 = sqrt(squared_ir_sum_acc * 9.0f - ir_sum_acc * ir_sum_acc) / 9.0f;
+      float edge_avg = max(ir_sum_acc / 9.0f, EDGE_AB_AVG_MIN_VALUE);
+      tmp0 /= edge_avg;
+
+      float abs_min_diff = fabs(raw_depth - min_depth);
+      float abs_max_diff = fabs(raw_depth - max_depth);
+
+      float avg_diff = (abs_min_diff + abs_max_diff) * 0.5f;
+      float max_abs_diff = max(abs_min_diff, abs_max_diff);
+
+      bool cond0 =
+          0.0f < raw_depth &&
+          tmp0 >= EDGE_AB_STD_DEV_THRESHOLD &&
+          EDGE_CLOSE_DELTA_THRESHOLD < abs_min_diff &&
+          EDGE_FAR_DELTA_THRESHOLD < abs_max_diff &&
+          EDGE_MAX_DELTA_THRESHOLD < max_abs_diff &&
+          EDGE_AVG_DELTA_THRESHOLD < avg_diff;
+
+      if(!cond0)
+      {
+        if(edge_test != 0)
+        {
+          //float tmp1 = 1500.0f > raw_depth ? 30.0f : 0.02f * raw_depth;
+          float edge_count = 0.0f;
+
+          filtered[i] = edge_count > MAX_EDGE_COUNT ? 0.0f : raw_depth;
+        }
+        else
+        {
+          filtered[i] = 0.0f;
+        }
+      }
+      else
+      {
+        filtered[i] = 0.0f;
+      }
+    }
+  }
+  else
+  {
+    filtered[i] = 0.0f;
+  }
+}
+
+#define OUT_NAME(FUNCTION) "[CudaDepthPacketProcessorKernel::" FUNCTION "] "
+namespace libfreenect2
+{
+class CudaDepthPacketProcessorKernelImpl
+{
+public:
+  short *buf_lut11to16;
+  float4 *buf_p0_table;
+  float *buf_x_table;
+  float *buf_z_table;
+  unsigned short *buf_packet;
+
+  float4 *buf_a;
+  float4 *buf_b;
+  float4 *buf_n;
+  float *buf_ir;
+  float4 *buf_a_filtered;
+  float4 *buf_b_filtered;
+  unsigned char *buf_edge_test;
+  float *buf_depth;
+  float *buf_ir_sum;
+  float *buf_filtered;
+
+  size_t image_size;
+  size_t grid_size;
+  size_t block_size;
+
+  void initDevice(const int deviceId, size_t image_size_, size_t block)
+  {
+    int deviceCount = 0;
+
+    cudaSafeCall(cudaGetDeviceCount(&deviceCount));
+
+    int devId = -1;
+    for (int i = 0; i < deviceCount; i++)
+    {
+      if (deviceId != -1 && i != deviceId)
+        continue;
+
+      cudaDeviceProp prop;
+      cudaSafeCall(cudaGetDeviceProperties(&prop, i));
+      std::cout << OUT_NAME("initDevice") "device " << i << ": " << prop.name << " @ " << (prop.clockRate / 1000) << "MHz Memory " << (prop.totalGlobalMem >> 20) << "MB";
+
+      if (prop.computeMode == cudaComputeModeProhibited)
+      {
+        std::cout << " Compute Mode Prohibited" << std::endl;
+        continue;
+      }
+
+      if (prop.major < 1)
+      {
+        std::cout << " does not support CUDA" << std::endl;
+        continue;
+      }
+
+      std::cout << std::endl;
+      devId = i;
+      break;
+    }
+
+    if (devId == -1)
+    {
+      throw std::runtime_error("No suitable CUDA devices found.");
+    }
+
+    cudaSafeCall(cudaSetDevice(devId));
+    std::cout << OUT_NAME("initDevice") "selected device " << devId << std::endl;
+
+    image_size = image_size_;
+    grid_size = image_size_/block;
+    block_size = block;
+  }
+
+  void loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table)
+  {
+    //Read only
+    size_t buf_lut11to16_size = 2048 * sizeof(short);
+    size_t buf_p0_table_size = image_size * sizeof(float4);
+    size_t buf_x_table_size = image_size * sizeof(float);
+    size_t buf_z_table_size = image_size * sizeof(float);
+    size_t buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(short);
+
+    cudaSafeCall(cudaMalloc(&buf_lut11to16, buf_lut11to16_size));
+    cudaSafeCall(cudaMalloc(&buf_p0_table, buf_p0_table_size));
+    cudaSafeCall(cudaMalloc(&buf_x_table, buf_x_table_size));
+    cudaSafeCall(cudaMalloc(&buf_z_table, buf_z_table_size));
+    cudaSafeCall(cudaMalloc(&buf_packet, buf_packet_size));
+
+    cudaMemcpyAsync(buf_lut11to16, lut11to16, buf_lut11to16_size, cudaMemcpyHostToDevice);
+    cudaMemcpyAsync(buf_p0_table, p0_table, buf_p0_table_size, cudaMemcpyHostToDevice);
+    cudaMemcpyAsync(buf_x_table, x_table, buf_x_table_size, cudaMemcpyHostToDevice);
+    cudaMemcpyAsync(buf_z_table, z_table, buf_z_table_size, cudaMemcpyHostToDevice);
+
+    //Read-Write
+    size_t buf_a_size = image_size * sizeof(float4);
+    size_t buf_b_size = image_size * sizeof(float4);
+    size_t buf_n_size = image_size * sizeof(float4);
+    size_t buf_ir_size = image_size * sizeof(float);
+    size_t buf_a_filtered_size = image_size * sizeof(float4);
+    size_t buf_b_filtered_size = image_size * sizeof(float4);
+    size_t buf_edge_test_size = image_size * sizeof(char);
+    size_t buf_depth_size = image_size * sizeof(float);
+    size_t buf_ir_sum_size = image_size * sizeof(float);
+    size_t buf_filtered_size = image_size * sizeof(float);
+
+    cudaSafeCall(cudaMalloc(&buf_a, buf_a_size));
+    cudaSafeCall(cudaMalloc(&buf_b, buf_b_size));
+    cudaSafeCall(cudaMalloc(&buf_n, buf_n_size));
+    cudaSafeCall(cudaMalloc(&buf_ir, buf_ir_size));
+    cudaSafeCall(cudaMalloc(&buf_a_filtered, buf_a_filtered_size));
+    cudaSafeCall(cudaMalloc(&buf_b_filtered, buf_b_filtered_size));
+    cudaSafeCall(cudaMalloc(&buf_edge_test, buf_edge_test_size));
+    cudaSafeCall(cudaMalloc(&buf_depth, buf_depth_size));
+    cudaSafeCall(cudaMalloc(&buf_ir_sum, buf_ir_sum_size));
+    cudaSafeCall(cudaMalloc(&buf_filtered, buf_filtered_size));
+
+    cudaDeviceSynchronize();
+
+    cudaSafeCall(cudaGetLastError());
+  }
+
+  void generateOptions(const DepthPacketProcessor::Parameters &params, const DepthPacketProcessor::Config &config)
+  {
+    unsigned int tmpi;
+    float tmpf;
+
+    #define COPY(upper, lower) cudaMemcpyToSymbolAsync(upper, &params.lower, sizeof(params.lower));
+    tmpi = 0x180;
+    cudaMemcpyToSymbolAsync(BFI_BITMASK, &tmpi, sizeof(int));
+
+    COPY(AB_MULTIPLIER, ab_multiplier)
+    COPY(AB_MULTIPLIER_PER_FRQ0, ab_multiplier_per_frq[0])
+    COPY(AB_MULTIPLIER_PER_FRQ1, ab_multiplier_per_frq[1])
+    COPY(AB_MULTIPLIER_PER_FRQ2, ab_multiplier_per_frq[2])
+    COPY(AB_OUTPUT_MULTIPLIER, ab_output_multiplier)
+
+    COPY(PHASE_IN_RAD0, phase_in_rad[0])
+    COPY(PHASE_IN_RAD1, phase_in_rad[1])
+    COPY(PHASE_IN_RAD2, phase_in_rad[2])
+
+    COPY(JOINT_BILATERAL_AB_THRESHOLD, joint_bilateral_ab_threshold)
+    COPY(JOINT_BILATERAL_MAX_EDGE, joint_bilateral_max_edge)
+    COPY(JOINT_BILATERAL_EXP, joint_bilateral_exp)
+    tmpf = (params.joint_bilateral_ab_threshold * params.joint_bilateral_ab_threshold) / (params.ab_multiplier * params.ab_multiplier);
+    cudaMemcpyToSymbolAsync(JOINT_BILATERAL_THRESHOLD, &tmpf, sizeof(tmpf));
+
+    COPY(GAUSSIAN_KERNEL_0, gaussian_kernel[0])
+    COPY(GAUSSIAN_KERNEL_1, gaussian_kernel[1])
+    COPY(GAUSSIAN_KERNEL_2, gaussian_kernel[2])
+    COPY(GAUSSIAN_KERNEL_3, gaussian_kernel[3])
+    COPY(GAUSSIAN_KERNEL_4, gaussian_kernel[4])
+    COPY(GAUSSIAN_KERNEL_5, gaussian_kernel[5])
+    COPY(GAUSSIAN_KERNEL_6, gaussian_kernel[6])
+    COPY(GAUSSIAN_KERNEL_7, gaussian_kernel[7])
+    COPY(GAUSSIAN_KERNEL_8, gaussian_kernel[8])
+
+    COPY(PHASE_OFFSET, phase_offset)
+    COPY(UNAMBIGIOUS_DIST, unambigious_dist)
+    COPY(INDIVIDUAL_AB_THRESHOLD, individual_ab_threshold)
+    COPY(AB_THRESHOLD, ab_threshold)
+    COPY(AB_CONFIDENCE_SLOPE, ab_confidence_slope)
+    COPY(AB_CONFIDENCE_OFFSET, ab_confidence_offset)
+    COPY(MIN_DEALIAS_CONFIDENCE, min_dealias_confidence)
+    COPY(MAX_DEALIAS_CONFIDENCE, max_dealias_confidence)
+
+    COPY(EDGE_AB_AVG_MIN_VALUE, edge_ab_avg_min_value)
+    COPY(EDGE_AB_STD_DEV_THRESHOLD, edge_ab_std_dev_threshold)
+    COPY(EDGE_CLOSE_DELTA_THRESHOLD, edge_close_delta_threshold)
+    COPY(EDGE_FAR_DELTA_THRESHOLD, edge_far_delta_threshold)
+    COPY(EDGE_MAX_DELTA_THRESHOLD, edge_max_delta_threshold)
+    COPY(EDGE_AVG_DELTA_THRESHOLD, edge_avg_delta_threshold)
+    COPY(MAX_EDGE_COUNT, max_edge_count)
+
+    tmpf = config.MinDepth * 1000.0f;
+    cudaMemcpyToSymbolAsync(MIN_DEPTH, &tmpf, sizeof(tmpf));
+
+    tmpf = config.MaxDepth * 1000.0f;
+    cudaMemcpyToSymbolAsync(MAX_DEPTH, &tmpf, sizeof(tmpf));
+
+    cudaDeviceSynchronize();
+
+    cudaSafeCall(cudaGetLastError());
+  }
+
+  void run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config)
+  {
+    size_t ir_frame_size = ir_frame->width * ir_frame->height * ir_frame->bytes_per_pixel;
+    size_t depth_frame_size = depth_frame->width * depth_frame->height * depth_frame->bytes_per_pixel;
+
+    cudaMemcpyAsync(buf_packet, packet.buffer, packet.buffer_length, cudaMemcpyHostToDevice);
+
+    processPixelStage1<<<grid_size, block_size>>>(buf_lut11to16, buf_z_table, buf_p0_table, buf_packet, buf_a, buf_b, buf_n, buf_ir);
+
+    cudaMemcpyAsync(ir_frame->data, buf_ir, ir_frame_size, cudaMemcpyDeviceToHost);
+
+    if (config.EnableBilateralFilter)
+    {
+      filterPixelStage1<<<grid_size, block_size>>>(buf_a, buf_b, buf_n, buf_a_filtered, buf_b_filtered, buf_edge_test);
+    }
+
+    processPixelStage2<<<grid_size, block_size>>>(
+      config.EnableBilateralFilter ? buf_a_filtered : buf_a,
+      config.EnableBilateralFilter ? buf_b_filtered : buf_b,
+      buf_x_table, buf_z_table, buf_depth, buf_ir_sum);
+
+    if (config.EnableEdgeAwareFilter)
+    {
+      filterPixelStage2<<<grid_size, block_size>>>(buf_depth, buf_ir_sum, buf_edge_test, buf_filtered);
+    }
+
+    cudaMemcpyAsync(depth_frame->data, config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, depth_frame_size, cudaMemcpyDeviceToHost);
+
+    cudaDeviceSynchronize();
+
+    cudaSafeCall(cudaGetLastError());
+  }
+};
+
+CudaDepthPacketProcessorKernel::CudaDepthPacketProcessorKernel():
+  impl_(new CudaDepthPacketProcessorKernelImpl())
+{
+}
+
+CudaDepthPacketProcessorKernel::~CudaDepthPacketProcessorKernel()
+{
+  delete impl_;
+}
+
+void CudaDepthPacketProcessorKernel::initDevice(const int deviceId, size_t image_size_, size_t block)
+{
+  impl_->initDevice(deviceId, image_size_, block);
+}
+
+void CudaDepthPacketProcessorKernel::loadTables(const short *lut11to16, const Float4 *p0_table, const float *x_table, const float *z_table)
+{
+  impl_->loadTables(lut11to16, p0_table, x_table, z_table);
+}
+
+void CudaDepthPacketProcessorKernel::generateOptions(const DepthPacketProcessor::Parameters &params, const DepthPacketProcessor::Config &config)
+{
+  impl_->generateOptions(params, config);
+}
+
+void CudaDepthPacketProcessorKernel::run(const DepthPacket &packet, Frame *ir_frame, Frame *depth_frame, const DepthPacketProcessor::Config &config)
+{
+  impl_->run(packet, ir_frame, depth_frame, config);
+}
+}
diff --git a/examples/protonect/src/depth_packet_stream_parser.cpp b/examples/protonect/src/depth_packet_stream_parser.cpp
index 3a03c44ea..f37d61e27 100644
--- a/examples/protonect/src/depth_packet_stream_parser.cpp
+++ b/examples/protonect/src/depth_packet_stream_parser.cpp
@@ -54,6 +54,12 @@ DepthPacketStreamParser::~DepthPacketStreamParser()
 void DepthPacketStreamParser::setPacketProcessor(libfreenect2::BaseDepthPacketProcessor *processor)
 {
   processor_ = (processor != 0) ? processor : noopProcessor<DepthPacket>();
+  size_t single_image = 512*424*11/8;
+  unsigned char *processor_buf = processor_->getPacketBuffer(single_image * 10 * 2);
+  if (processor_buf)
+  {
+    buffer_.setbuffer(processor_buf, single_image * 10 * 2);
+  }
 }
 
 void DepthPacketStreamParser::onDataReceived(unsigned char* buffer, size_t in_length)
diff --git a/examples/protonect/src/double_buffer.cpp b/examples/protonect/src/double_buffer.cpp
index c1948b49a..3e4a253d9 100644
--- a/examples/protonect/src/double_buffer.cpp
+++ b/examples/protonect/src/double_buffer.cpp
@@ -37,7 +37,7 @@ DoubleBuffer::DoubleBuffer() :
 
 DoubleBuffer::~DoubleBuffer()
 {
-  if(buffer_data_ != 0)
+  if(buffer_data_ != 0 && !external_buffer_)
   {
     buffer_[0].data = 0;
     buffer_[1].data = 0;
@@ -49,6 +49,7 @@ void DoubleBuffer::allocate(size_t buffer_size)
 {
   size_t total_buffer_size = 2 * buffer_size;
   buffer_data_ = new unsigned char[total_buffer_size];
+  external_buffer_ = false;
 
   buffer_[0].capacity = buffer_size;
   buffer_[0].length = 0;
@@ -59,6 +60,22 @@ void DoubleBuffer::allocate(size_t buffer_size)
   buffer_[1].data = buffer_data_ + buffer_size;
 }
 
+void DoubleBuffer::setbuffer(unsigned char *buf, size_t size)
+{
+  if (!external_buffer_)
+    delete[] buffer_data_;
+  external_buffer_ = true;
+  buffer_data_ = buf;
+
+  buffer_[0].capacity = size / 2;
+  buffer_[0].length = size / 2;
+  buffer_[0].data = buf;
+
+  buffer_[1].capacity = size / 2;
+  buffer_[1].length = size / 2;
+  buffer_[1].data = buf + size / 2;
+}
+
 void DoubleBuffer::swap()
 {
   front_buffer_index_ = (front_buffer_index_ + 1) & 1;
diff --git a/examples/protonect/src/libfreenect2.cpp b/examples/protonect/src/libfreenect2.cpp
index 9524fd6ca..a758924b2 100644
--- a/examples/protonect/src/libfreenect2.cpp
+++ b/examples/protonect/src/libfreenect2.cpp
@@ -623,6 +623,9 @@ void Freenect2DeviceImpl::close()
 
 PacketPipeline *createDefaultPacketPipeline()
 {
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+  return new CudaPacketPipeline();
+#endif
 #ifdef LIBFREENECT2_WITH_OPENGL_SUPPORT
   return new OpenGLPacketPipeline();
 #else
diff --git a/examples/protonect/src/opencl_depth_packet_processor.cpp b/examples/protonect/src/opencl_depth_packet_processor.cpp
index 62bd850d3..cc13587ab 100644
--- a/examples/protonect/src/opencl_depth_packet_processor.cpp
+++ b/examples/protonect/src/opencl_depth_packet_processor.cpp
@@ -57,27 +57,6 @@
 namespace libfreenect2
 {
 
-bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n)
-{
-  size_t length = 0;
-  const unsigned char *data = NULL;
-
-  if(!loadResource(filename, &data, &length))
-  {
-    std::cerr << OUT_NAME("loadBufferFromResources") "failed to load resource: " << filename << std::endl;
-    return false;
-  }
-
-  if(length != n)
-  {
-    std::cerr << OUT_NAME("loadBufferFromResources") "wrong size of resource: " << filename << std::endl;
-    return false;
-  }
-
-  memcpy(buffer, data, length);
-  return true;
-}
-
 std::string loadCLSource(const std::string &filename)
 {
   const unsigned char *data;
diff --git a/examples/protonect/src/packet_pipeline.cpp b/examples/protonect/src/packet_pipeline.cpp
index 1c66f66b6..8de1d0932 100644
--- a/examples/protonect/src/packet_pipeline.cpp
+++ b/examples/protonect/src/packet_pipeline.cpp
@@ -136,4 +136,24 @@ DepthPacketProcessor *OpenCLPacketPipeline::createDepthPacketProcessor()
 }
 #endif // LIBFREENECT2_WITH_OPENCL_SUPPORT
 
+#ifdef LIBFREENECT2_WITH_CUDA_SUPPORT
+
+CudaPacketPipeline::CudaPacketPipeline(const int deviceId) : deviceId(deviceId)
+{
+  initialize();
+}
+
+CudaPacketPipeline::~CudaPacketPipeline() { }
+
+DepthPacketProcessor *CudaPacketPipeline::createDepthPacketProcessor()
+{
+  CudaDepthPacketProcessor *depth_processor = new CudaDepthPacketProcessor(deviceId);
+  depth_processor->load11To16LutFromFile("11to16.bin");
+  depth_processor->loadXTableFromFile("xTable.bin");
+  depth_processor->loadZTableFromFile("zTable.bin");
+
+  return depth_processor;
+}
+#endif // LIBFREENECT2_WITH_CUDA_SUPPORT
+
 } /* namespace libfreenect2 */
diff --git a/examples/protonect/src/resource.cpp b/examples/protonect/src/resource.cpp
index f3253de40..1c3d24caf 100644
--- a/examples/protonect/src/resource.cpp
+++ b/examples/protonect/src/resource.cpp
@@ -25,6 +25,9 @@
  */
 
 #include <libfreenect2/resource.h>
+#include <string>
+#include <cstring>
+#include <iostream>
 
 namespace libfreenect2
 {
@@ -37,7 +40,7 @@ struct ResourceDescriptor
 };
 
 #ifdef RESOURCES_INC
-#include "resources.inc"
+#include "resources.inc.h"
 #else
 ResourceDescriptor resource_descriptors[] = {};
 #endif
@@ -59,4 +62,25 @@ bool loadResource(const std::string &name, unsigned char const**data, size_t *le
   return result;
 }
 
+bool loadBufferFromResources(const std::string &filename, unsigned char *buffer, const size_t n)
+{
+  size_t length = 0;
+  const unsigned char *data = NULL;
+
+  if(!loadResource(filename, &data, &length))
+  {
+    std::cerr << "loadBufferFromResources: failed to load resource: " << filename << std::endl;
+    return false;
+  }
+
+  if(length != n)
+  {
+    std::cerr << "loadBufferFromResources: wrong size of resource: " << filename << std::endl;
+    return false;
+  }
+
+  memcpy(buffer, data, length);
+  return true;
+}
+
 } /* namespace libfreenect2 */