diff --git a/include/internal/libfreenect2/depth_packet_processor.h b/include/internal/libfreenect2/depth_packet_processor.h index 952ca4315..5072de059 100644 --- a/include/internal/libfreenect2/depth_packet_processor.h +++ b/include/internal/libfreenect2/depth_packet_processor.h @@ -175,8 +175,12 @@ class OpenCLDepthPacketProcessor : public DepthPacketProcessor virtual void loadXZTables(const float *xtable, const float *ztable); virtual void loadLookupTable(const short *lut); + virtual bool good(); virtual const char *name() { return "OpenCL"; } + virtual void process(const DepthPacket &packet); +protected: + virtual Allocator *getAllocator(); private: OpenCLDepthPacketProcessorImpl *impl_; }; diff --git a/src/opencl_depth_packet_processor.cl b/src/opencl_depth_packet_processor.cl index 2e5d15436..2b53f07e8 100644 --- a/src/opencl_depth_packet_processor.cl +++ b/src/opencl_depth_packet_processor.cl @@ -24,13 +24,16 @@ * either License. */ +#define PHASE (float3)(PHASE_IN_RAD0, PHASE_IN_RAD1, PHASE_IN_RAD2) +#define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2) + /******************************************************************************* * Process pixel stage 1 ******************************************************************************/ float decodePixelMeasurement(global const ushort *data, global const short *lut11to16, const uint sub, const uint x, const uint y) { - uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352; + uint row_idx = (424 * sub + y) * 352; uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff; uint col_idx = idx >> 4; @@ -43,17 +46,6 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1 return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047]; } -float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid) -{ - float3 p0vec = (float3)(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2); - float3 p0cos = cos(p0vec); - float3 p0sin = sin(-p0vec); - - *invalid = *invalid && any(isequal(v, (float3)(32767.0f))); - - return (float2)(dot(v, p0cos), dot(v, p0sin)) * ab_multiplier_per_frq; -} - void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out) { @@ -62,41 +54,47 @@ void kernel processPixelStage1(global const short *lut11to16, global const float const uint x = i % 512; const uint y = i / 512; - const uint y_in = (423 - y); + const uint y_tmp = (423 - y); + const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp); - const float zmultiplier = z_table[i]; - int valid = (int)(0.0f < zmultiplier); - int saturatedX = valid; - int saturatedY = valid; - int saturatedZ = valid; - int3 invalid_pixel = (int3)((int)(!valid)); + const int3 invalid = (int)(0.0f >= z_table[i]); const float3 p0 = p0_table[i]; + float3 p0x_sin, p0y_sin, p0z_sin; + float3 p0x_cos, p0y_cos, p0z_cos; + + p0x_sin = -sincos(PHASE + p0.x, &p0x_cos); + p0y_sin = -sincos(PHASE + p0.y, &p0y_cos); + p0z_sin = -sincos(PHASE + p0.z, &p0z_cos); + + int3 invalid_pixel = (int3)(invalid); const float3 v0 = (float3)(decodePixelMeasurement(data, lut11to16, 0, x, y_in), decodePixelMeasurement(data, lut11to16, 1, x, y_in), decodePixelMeasurement(data, lut11to16, 2, x, y_in)); - const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX); - const float3 v1 = (float3)(decodePixelMeasurement(data, lut11to16, 3, x, y_in), decodePixelMeasurement(data, lut11to16, 4, x, y_in), decodePixelMeasurement(data, lut11to16, 5, x, y_in)); - const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY); - const float3 v2 = (float3)(decodePixelMeasurement(data, lut11to16, 6, x, y_in), decodePixelMeasurement(data, lut11to16, 7, x, y_in), decodePixelMeasurement(data, lut11to16, 8, x, y_in)); - const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ); - float3 a = select((float3)(ab0.x, ab1.x, ab2.x), (float3)(0.0f), invalid_pixel); - float3 b = select((float3)(ab0.y, ab1.y, ab2.y), (float3)(0.0f), invalid_pixel); + float3 a = (float3)(dot(v0, p0x_cos), + dot(v1, p0y_cos), + dot(v2, p0z_cos)) * AB_MULTIPLIER_PER_FRQ; + float3 b = (float3)(dot(v0, p0x_sin), + dot(v1, p0y_sin), + dot(v2, p0z_sin)) * AB_MULTIPLIER_PER_FRQ; + + a = select(a, (float3)(0.0f), invalid_pixel); + b = select(b, (float3)(0.0f), invalid_pixel); float3 n = sqrt(a * a + b * b); - int3 saturated = (int3)(saturatedX, saturatedY, saturatedZ); - a = select(a, (float3)(0.0f), saturated); - b = select(b, (float3)(0.0f), saturated); + int3 saturated = (int3)(any(isequal(v0, (float3)(32767.0f))), + any(isequal(v1, (float3)(32767.0f))), + any(isequal(v2, (float3)(32767.0f)))); - a_out[i] = a; - b_out[i] = b; + a_out[i] = select(a, (float3)(0.0f), saturated); + b_out[i] = select(b, (float3)(0.0f), saturated); n_out[i] = n; ir_out[i] = min(dot(select(n, (float3)(65535.0f), saturated), (float3)(0.333333333f * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f); } diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index 429f20e9f..7e93cd1e5 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -55,6 +55,10 @@ #include +#define CHECK_CL_PARAM(expr) do { cl_int err = CL_SUCCESS; (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; return false; } } while(0) +#define CHECK_CL_RETURN(expr) do { cl_int err = (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; return false; } } while(0) +#define CHECK_CL_ON_FAIL(expr, on_fail) do { cl_int err = (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; on_fail; return false; } } while(0) + namespace libfreenect2 { @@ -72,17 +76,102 @@ std::string loadCLSource(const std::string &filename) return std::string(reinterpret_cast(data), length); } +class OpenCLDepthPacketProcessorImpl; + +class OpenCLBuffer: public Buffer +{ +public: + cl::Buffer buffer; +}; + +class OpenCLAllocator: public Allocator +{ +private: + cl::Context &context; + cl::CommandQueue &queue; + const bool isInputBuffer; + + bool allocate_opencl(OpenCLBuffer *b, size_t size) + { + if(isInputBuffer) + { + CHECK_CL_PARAM(b->buffer = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err)); + CHECK_CL_PARAM(b->data = (unsigned char*)queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_WRITE, 0, size, NULL, NULL, &err)); + } + else + { + CHECK_CL_PARAM(b->buffer = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err)); + CHECK_CL_PARAM(b->data = (unsigned char*)queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_READ, 0, size, NULL, NULL, &err)); + } + + b->length = 0; + b->capacity = size; + return true; + } + + bool release_opencl(OpenCLBuffer *b) + { + cl::Event event; + CHECK_CL_RETURN(queue.enqueueUnmapMemObject(b->buffer, b->data, NULL, &event)); + CHECK_CL_RETURN(event.wait()); + return true; + } + +public: + OpenCLAllocator(cl::Context &context, cl::CommandQueue &queue, bool isInputBuffer) : context(context), queue(queue), isInputBuffer(isInputBuffer) + { + } + + virtual Buffer *allocate(size_t size) + { + OpenCLBuffer *b = new OpenCLBuffer(); + if(!allocate_opencl(b, size)) + b->data = NULL; + return b; + } + + virtual void free(Buffer *b) + { + if(b == NULL) + return; + release_opencl(static_cast(b)); + delete b; + } +}; + +class OpenCLFrame: public Frame +{ +private: + OpenCLBuffer *buffer; + +public: + OpenCLFrame(OpenCLBuffer *buffer) + : Frame(512, 424, 4, (unsigned char*)-1) + , buffer(buffer) + { + data = buffer->data; + } + + virtual ~OpenCLFrame() + { + buffer->allocator->free(buffer); + data = NULL; + } +}; + class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { public: - cl_short lut11to16[2048]; - cl_float x_table[512 * 424]; - cl_float z_table[512 * 424]; - cl_float3 p0_table[512 * 424]; + static const size_t IMAGE_SIZE = 512*424; + static const size_t LUT_SIZE = 2048; + libfreenect2::DepthPacketProcessor::Config config; DepthPacketProcessor::Parameters params; Frame *ir_frame, *depth_frame; + Allocator *input_buffer_allocator; + Allocator *ir_buffer_allocator; + Allocator *depth_buffer_allocator; cl::Context context; cl::Device device; @@ -95,8 +184,6 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl::Kernel kernel_processPixelStage2; cl::Kernel kernel_filterPixelStage2; - size_t image_size; - // Read only buffers size_t buf_lut11to16_size; size_t buf_p0_table_size; @@ -138,7 +225,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool programInitialized; std::string sourceCode; - OpenCLDepthPacketProcessorImpl(const int deviceId = -1) +#ifdef LIBFREENECT2_WITH_PROFILING_CL + std::vector timings; + int count; +#endif + + OpenCLDepthPacketProcessorImpl(const int deviceId = -1) : deviceInitialized(false) , programBuilt(false) , programInitialized(false) @@ -148,12 +240,14 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging setenv("OCL_STRICT_CONFORMANCE", "0", 0); #endif - newIrFrame(); - newDepthFrame(); + deviceInitialized = initDevice(deviceId); - image_size = 512 * 424; + input_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, true)); + ir_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, false)); + depth_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, false)); - deviceInitialized = initDevice(deviceId); + newIrFrame(); + newDepthFrame(); const int CL_ICDL_VERSION = 2; typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*); @@ -185,6 +279,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { delete ir_frame; delete depth_frame; + delete input_buffer_allocator; + delete ir_buffer_allocator; + delete depth_buffer_allocator; } void generateOptions(std::string &options) const @@ -237,6 +334,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging oss << " -D MIN_DEPTH=" << config.MinDepth * 1000.0f << "f"; oss << " -D MAX_DEPTH=" << config.MaxDepth * 1000.0f << "f"; + + oss << " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math"; options = oss.str(); } @@ -319,8 +418,6 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return selected; } -#define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) - bool initDevice(const int deviceId) { if(!readProgram(sourceCode)) @@ -328,35 +425,81 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; } - cl_int err = CL_SUCCESS; + std::vector platforms; + CHECK_CL_RETURN(cl::Platform::get(&platforms)); + + if(platforms.empty()) { - std::vector platforms; - err = cl::Platform::get(&platforms); - CHECK_CL_ERROR(err, "cl::Platform::get"); + LOG_ERROR << "no opencl platforms found."; + return false; + } - if(platforms.empty()) - { - LOG_ERROR << "no opencl platforms found."; - return false; - } + std::vector devices; + getDevices(platforms, devices); + listDevice(devices); + if(!selectDevice(devices, deviceId)) + { + LOG_ERROR << "could not find any suitable device"; + return false; + } + LOG_INFO << "selected device: " << deviceString(device); - std::vector devices; - getDevices(platforms, devices); - listDevice(devices); - if(!selectDevice(devices, deviceId)) - { - LOG_ERROR << "could not find any suitable device"; - return false; - } - LOG_INFO << "selected device: " << deviceString(device); + CHECK_CL_PARAM(context = cl::Context(device, NULL, NULL, NULL, &err)); - context = cl::Context(device, NULL, NULL, NULL, &err); - CHECK_CL_ERROR(err, "cl::Context"); - } + if(!initBuffers()) + return false; return buildProgram(sourceCode); } + bool initBuffers() + { +#ifdef LIBFREENECT2_WITH_PROFILING_CL + count = 0; + CHECK_CL_PARAM(queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); +#else + CHECK_CL_PARAM(queue = cl::CommandQueue(context, device, 0, &err)); +#endif + + //Read only + buf_lut11to16_size = LUT_SIZE * sizeof(cl_short); + buf_p0_table_size = IMAGE_SIZE * sizeof(cl_float3); + buf_x_table_size = IMAGE_SIZE * sizeof(cl_float); + buf_z_table_size = IMAGE_SIZE * sizeof(cl_float); + buf_packet_size = ((IMAGE_SIZE * 11) / 16) * 10 * sizeof(cl_ushort); + + CHECK_CL_PARAM(buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err)); + CHECK_CL_PARAM(buf_p0_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_z_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_z_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_packet = cl::Buffer(context, CL_MEM_READ_ONLY, buf_packet_size, NULL, &err)); + + //Read-Write + buf_a_size = IMAGE_SIZE * sizeof(cl_float3); + buf_b_size = IMAGE_SIZE * sizeof(cl_float3); + buf_n_size = IMAGE_SIZE * sizeof(cl_float3); + buf_ir_size = IMAGE_SIZE * sizeof(cl_float); + buf_a_filtered_size = IMAGE_SIZE * sizeof(cl_float3); + buf_b_filtered_size = IMAGE_SIZE * sizeof(cl_float3); + buf_edge_test_size = IMAGE_SIZE * sizeof(cl_uchar); + buf_depth_size = IMAGE_SIZE * sizeof(cl_float); + buf_ir_sum_size = IMAGE_SIZE * sizeof(cl_float); + buf_filtered_size = IMAGE_SIZE * sizeof(cl_float); + + CHECK_CL_PARAM(buf_a = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_size, NULL, &err)); + CHECK_CL_PARAM(buf_b = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_size, NULL, &err)); + CHECK_CL_PARAM(buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err)); + CHECK_CL_PARAM(buf_ir = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_ir_size, NULL, &err)); + CHECK_CL_PARAM(buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err)); + CHECK_CL_PARAM(buf_b_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_filtered_size, NULL, &err)); + CHECK_CL_PARAM(buf_edge_test = cl::Buffer(context, CL_MEM_READ_WRITE, buf_edge_test_size, NULL, &err)); + CHECK_CL_PARAM(buf_depth = cl::Buffer(context, CL_MEM_READ_WRITE, buf_depth_size, NULL, &err)); + CHECK_CL_PARAM(buf_ir_sum = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_sum_size, NULL, &err)); + CHECK_CL_PARAM(buf_filtered = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_filtered_size, NULL, &err)); + + return true; + } + bool initProgram() { if(!deviceInitialized) @@ -368,141 +511,37 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging if (!buildProgram(sourceCode)) return false; - cl_int err = CL_SUCCESS; - { - queue = cl::CommandQueue(context, device, 0, &err); - CHECK_CL_ERROR(err, "cl::CommandQueue"); - - //Read only - buf_lut11to16_size = 2048 * sizeof(cl_short); - buf_p0_table_size = image_size * sizeof(cl_float3); - buf_x_table_size = image_size * sizeof(cl_float); - buf_z_table_size = image_size * sizeof(cl_float); - buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(cl_ushort); - - buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_z_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_z_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_packet = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_packet_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - //Read-Write - buf_a_size = image_size * sizeof(cl_float3); - buf_b_size = image_size * sizeof(cl_float3); - buf_n_size = image_size * sizeof(cl_float3); - buf_ir_size = image_size * sizeof(cl_float); - buf_a_filtered_size = image_size * sizeof(cl_float3); - buf_b_filtered_size = image_size * sizeof(cl_float3); - buf_edge_test_size = image_size * sizeof(cl_uchar); - buf_depth_size = image_size * sizeof(cl_float); - buf_ir_sum_size = image_size * sizeof(cl_float); - buf_filtered_size = image_size * sizeof(cl_float); - - buf_a = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_n = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_n_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_a_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_edge_test = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_edge_test_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_depth = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_depth_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir_sum = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_sum_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage1.setArg(0, buf_lut11to16); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(1, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(2, buf_p0_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(3, buf_packet); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(4, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(5, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(6, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(7, buf_ir); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage1.setArg(0, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(1, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(2, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(3, buf_a_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(4, buf_b_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(5, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - - kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(2, buf_x_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(3, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(4, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(5, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage2.setArg(0, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(1, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(2, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(3, buf_filtered); - CHECK_CL_ERROR(err, "setArg"); - - cl::Event event0, event1, event2, event3; - err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut11to16, NULL, &event0); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event1); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event2); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event3); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event2.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event3.wait(); - CHECK_CL_ERROR(err, "wait"); - } + CHECK_CL_PARAM(kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(0, buf_lut11to16)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(1, buf_z_table)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(2, buf_p0_table)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(3, buf_packet)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(4, buf_a)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(5, buf_b)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(6, buf_n)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(7, buf_ir)); + + CHECK_CL_PARAM(kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(0, buf_a)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(1, buf_b)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(2, buf_n)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(3, buf_a_filtered)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(4, buf_b_filtered)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(5, buf_edge_test)); + + CHECK_CL_PARAM(kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(2, buf_x_table)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(3, buf_z_table)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(4, buf_depth)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(5, buf_ir_sum)); + + CHECK_CL_PARAM(kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(0, buf_depth)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(1, buf_ir_sum)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(2, buf_edge_test)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(3, buf_filtered)); programInitialized = true; return true; @@ -510,49 +549,67 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool run(const DepthPacket &packet) { - cl_int err; + std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); + cl::Event eventReadIr, eventReadDepth; + + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0])); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventWrite, &eventPPS1[0])); + CHECK_CL_RETURN(queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &eventReadIr)); + + if(config.EnableBilateralFilter) + { + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventPPS1, &eventFPS1[0])); + } + else { - std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); - cl::Event event0, event1; + eventFPS1[0] = eventPPS1[0]; + } - err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventFPS1, &eventPPS2[0])); - err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &event0); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); + if(config.EnableEdgeAwareFilter) + { + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventPPS2, &eventFPS2[0])); + } + else + { + eventFPS2[0] = eventPPS2[0]; + } - if(config.EnableBilateralFilter) - { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS1, &eventFPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - } - else - { - eventFPS1[0] = eventPPS1[0]; - } + CHECK_CL_RETURN(queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &eventReadDepth)); + CHECK_CL_RETURN(eventReadIr.wait()); + CHECK_CL_RETURN(eventReadDepth.wait()); - err = queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventFPS1, &eventPPS2[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); +#ifdef LIBFREENECT2_WITH_PROFILING_CL + if(count == 0) + { + timings.clear(); + timings.resize(7, 0.0); + } - if(config.EnableEdgeAwareFilter) - { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - } - else - { - eventFPS2[0] = eventPPS2[0]; - } + timings[0] += eventWrite[0].getProfilingInfo() - eventWrite[0].getProfilingInfo(); + timings[1] += eventPPS1[0].getProfilingInfo() - eventPPS1[0].getProfilingInfo(); + timings[2] += eventFPS1[0].getProfilingInfo() - eventFPS1[0].getProfilingInfo(); + timings[3] += eventPPS2[0].getProfilingInfo() - eventPPS2[0].getProfilingInfo(); + timings[4] += eventFPS2[0].getProfilingInfo() - eventFPS2[0].getProfilingInfo(); + timings[5] += eventReadIr.getProfilingInfo() - eventReadIr.getProfilingInfo(); + timings[6] += eventReadDepth.getProfilingInfo() - eventReadDepth.getProfilingInfo(); - err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &event1); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); - CHECK_CL_ERROR(err, "wait"); + if(++count == 100) + { + double sum = timings[0] + timings[1] + timings[2] + timings[3] + timings[4] + timings[5] + timings[6]; + LOG_INFO << "writing package: " << timings[0] / 100000000.0 << " ms."; + LOG_INFO << "stage 1: " << timings[1] / 100000000.0 << " ms."; + LOG_INFO << "filter 1: " << timings[2] / 100000000.0 << " ms."; + LOG_INFO << "stage 2: " << timings[3] / 100000000.0 << " ms."; + LOG_INFO << "filter 2: " << timings[4] / 100000000.0 << " ms."; + LOG_INFO << "reading ir: " << timings[5] / 100000000.0 << " ms."; + LOG_INFO << "reading depth: " << timings[6] / 100000000.0 << " ms."; + LOG_INFO << "overall: " << sum / 100000000.0 << " ms."; + count = 0; } +#endif + return true; } @@ -562,30 +619,21 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return !source.empty(); } - bool buildProgram(const std::string& sources) + bool buildProgram(const std::string &sources) { - cl_int err; - { - LOG_INFO << "building OpenCL program..."; + LOG_INFO << "building OpenCL program..."; - std::string options; - generateOptions(options); + std::string options; + generateOptions(options); - cl::Program::Sources source(1, std::make_pair(sources.c_str(), sources.length())); - program = cl::Program(context, source, &err); - CHECK_CL_ERROR(err, "cl::Program"); + cl::Program::Sources source(1, std::make_pair(sources.c_str(), sources.length())); + CHECK_CL_PARAM(program = cl::Program(context, source, &err)); - err = program.build(options.c_str()); - if (err != CL_SUCCESS) - { - LOG_ERROR << "failed to build program: " << err; - LOG_ERROR << "Build Status: " << program.getBuildInfo(device); - LOG_ERROR << "Build Options:\t" << program.getBuildInfo(device); - LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device); - programBuilt = false; - return false; - } - } + CHECK_CL_ON_FAIL(program.build(options.c_str()), + LOG_ERROR << "failed to build program: " << err; + LOG_ERROR << "Build Status: " << program.getBuildInfo(device); + LOG_ERROR << "Build Options:\t" << program.getBuildInfo(device); + LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device)); LOG_INFO << "OpenCL program built successfully"; programBuilt = true; @@ -594,16 +642,24 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging void newIrFrame() { - ir_frame = new Frame(512, 424, 4); + ir_frame = new OpenCLFrame(static_cast(ir_buffer_allocator->allocate(IMAGE_SIZE * sizeof(cl_float)))); } void newDepthFrame() { - depth_frame = new Frame(512, 424, 4); + depth_frame = new OpenCLFrame(static_cast(depth_buffer_allocator->allocate(IMAGE_SIZE * sizeof(cl_float)))); } - void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) + bool fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return false; + } + + cl_float3 *p0_table = new cl_float3[IMAGE_SIZE]; + for(int r = 0; r < 424; ++r) { cl_float3 *it = &p0_table[r * 512]; @@ -612,12 +668,48 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging const uint16_t *it2 = &p0table->p0table2[r * 512]; for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2) { - it->s[0] = -((float) * it0) * 0.000031 * M_PI; - it->s[1] = -((float) * it1) * 0.000031 * M_PI; - it->s[2] = -((float) * it2) * 0.000031 * M_PI; + it->s[0] = -((float)*it0) * 0.000031 * M_PI; + it->s[1] = -((float)*it1) * 0.000031 * M_PI; + it->s[2] = -((float)*it2) * 0.000031 * M_PI; it->s[3] = 0.0f; } } + + cl::Event event; + CHECK_CL_ON_FAIL(queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event), delete[] p0_table); + CHECK_CL_ON_FAIL(event.wait(), delete[] p0_table); + delete[] p0_table; + return true; + } + + bool fill_xz_tables(const float *xtable, const float *ztable) + { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return false; + } + + cl::Event event0, event1; + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, xtable, NULL, &event0)); + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, ztable, NULL, &event1)); + CHECK_CL_RETURN(event0.wait()); + CHECK_CL_RETURN(event1.wait()); + return true; + } + + bool fill_lut(const short *lut) + { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return false; + } + + cl::Event event; + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut, NULL, &event)); + CHECK_CL_RETURN(event.wait()); + return true; } }; @@ -635,7 +727,7 @@ void OpenCLDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacke { DepthPacketProcessor::setConfiguration(config); - if ( impl_->config.MaxDepth != config.MaxDepth + if ( impl_->config.MaxDepth != config.MaxDepth || impl_->config.MinDepth != config.MinDepth) { // OpenCL program needs to be rebuilt, then reinitialized @@ -669,13 +761,17 @@ void OpenCLDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char * void OpenCLDepthPacketProcessor::loadXZTables(const float *xtable, const float *ztable) { - std::copy(xtable, xtable + TABLE_SIZE, impl_->x_table); - std::copy(ztable, ztable + TABLE_SIZE, impl_->z_table); + impl_->fill_xz_tables(xtable, ztable); } void OpenCLDepthPacketProcessor::loadLookupTable(const short *lut) { - std::copy(lut, lut + LUT_SIZE, impl_->lut11to16); + impl_->fill_lut(lut); +} + +bool OpenCLDepthPacketProcessor::good() +{ + return impl_->deviceInitialized; } void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) @@ -713,5 +809,9 @@ void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) } } +Allocator *OpenCLDepthPacketProcessor::getAllocator() +{ + return impl_->input_buffer_allocator; +} } /* namespace libfreenect2 */