From 6a7f651ce8bdff904196dce9cb0745c02636ae32 Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Wed, 17 Feb 2016 14:22:53 +0100 Subject: [PATCH 1/7] Instead of computing the sine and cosine for the p0 table and the phases on the GPU, they are now precomputed once on the CPU. Details: Replaced sin(a+b) by sin(a)*cos(b)+cos(a)*sin(b), where sin(a),cos(b),cos(a),sin(b) are stored in a LUT. Simplyfied processPixelStage1 code and removed processMeasurementTriple. Moved one if from decodePixelMeasurement to processPixelStage1. Removed the first part of `valid && any(...)` because valid has been checked before. --- src/opencl_depth_packet_processor.cl | 62 ++++++++++++------------- src/opencl_depth_packet_processor.cpp | 65 ++++++++++++++++++--------- 2 files changed, 71 insertions(+), 56 deletions(-) diff --git a/src/opencl_depth_packet_processor.cl b/src/opencl_depth_packet_processor.cl index 2e5d15436..75b14b74f 100644 --- a/src/opencl_depth_packet_processor.cl +++ b/src/opencl_depth_packet_processor.cl @@ -24,13 +24,17 @@ * either License. */ +#define PHASE_SIN (float3)(PHASE_IN_RAD0_SIN, PHASE_IN_RAD1_SIN, PHASE_IN_RAD2_SIN) +#define PHASE_COS (float3)(PHASE_IN_RAD0_COS, PHASE_IN_RAD1_COS, PHASE_IN_RAD2_COS) +#define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2) + /******************************************************************************* * Process pixel stage 1 ******************************************************************************/ float decodePixelMeasurement(global const ushort *data, global const short *lut11to16, const uint sub, const uint x, const uint y) { - uint row_idx = (424 * sub + (y < 212 ? y + 212 : 423 - y)) * 352; + uint row_idx = (424 * sub + y) * 352; uint idx = (((x >> 2) + ((x << 7) & BFI_BITMASK)) * 11) & (uint)0xffffffff; uint col_idx = idx >> 4; @@ -43,60 +47,50 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1 return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047]; } -float2 processMeasurementTriple(const float ab_multiplier_per_frq, const float p0, const float3 v, int *invalid) -{ - float3 p0vec = (float3)(p0 + PHASE_IN_RAD0, p0 + PHASE_IN_RAD1, p0 + PHASE_IN_RAD2); - float3 p0cos = cos(p0vec); - float3 p0sin = sin(-p0vec); - - *invalid = *invalid && any(isequal(v, (float3)(32767.0f))); - - return (float2)(dot(v, p0cos), dot(v, p0sin)) * ab_multiplier_per_frq; -} - -void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data, - global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out) +void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_sin_table, global const float3 *p0_cos_table, + global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out) { const uint i = get_global_id(0); const uint x = i % 512; const uint y = i / 512; - const uint y_in = (423 - y); + const uint y_tmp = (423 - y); + const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp); + + const int3 invalid = (int)(0.0f >= z_table[i]); + const float3 p0_sin = p0_sin_table[i]; + const float3 p0_cos = p0_cos_table[i]; - const float zmultiplier = z_table[i]; - int valid = (int)(0.0f < zmultiplier); - int saturatedX = valid; - int saturatedY = valid; - int saturatedZ = valid; - int3 invalid_pixel = (int3)((int)(!valid)); - const float3 p0 = p0_table[i]; + int3 invalid_pixel = (int3)(invalid); const float3 v0 = (float3)(decodePixelMeasurement(data, lut11to16, 0, x, y_in), decodePixelMeasurement(data, lut11to16, 1, x, y_in), decodePixelMeasurement(data, lut11to16, 2, x, y_in)); - const float2 ab0 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ0, p0.x, v0, &saturatedX); - const float3 v1 = (float3)(decodePixelMeasurement(data, lut11to16, 3, x, y_in), decodePixelMeasurement(data, lut11to16, 4, x, y_in), decodePixelMeasurement(data, lut11to16, 5, x, y_in)); - const float2 ab1 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ1, p0.y, v1, &saturatedY); - const float3 v2 = (float3)(decodePixelMeasurement(data, lut11to16, 6, x, y_in), decodePixelMeasurement(data, lut11to16, 7, x, y_in), decodePixelMeasurement(data, lut11to16, 8, x, y_in)); - const float2 ab2 = processMeasurementTriple(AB_MULTIPLIER_PER_FRQ2, p0.z, v2, &saturatedZ); - float3 a = select((float3)(ab0.x, ab1.x, ab2.x), (float3)(0.0f), invalid_pixel); - float3 b = select((float3)(ab0.y, ab1.y, ab2.y), (float3)(0.0f), invalid_pixel); + float3 a = (float3)(dot(v0, PHASE_COS * p0_cos.x - PHASE_SIN * p0_sin.x), + dot(v1, PHASE_COS * p0_cos.y - PHASE_SIN * p0_sin.y), + dot(v2, PHASE_COS * p0_cos.z - PHASE_SIN * p0_sin.z)) * AB_MULTIPLIER_PER_FRQ; + float3 b = (float3)(dot(v0, PHASE_COS * p0_sin.x + PHASE_SIN * p0_cos.x), + dot(v1, PHASE_COS * p0_sin.y + PHASE_SIN * p0_cos.y), + dot(v2, PHASE_COS * p0_sin.z + PHASE_SIN * p0_cos.z)) * AB_MULTIPLIER_PER_FRQ; + + a = select(a, (float3)(0.0f), invalid_pixel); + b = select(b, (float3)(0.0f), invalid_pixel); float3 n = sqrt(a * a + b * b); - int3 saturated = (int3)(saturatedX, saturatedY, saturatedZ); - a = select(a, (float3)(0.0f), saturated); - b = select(b, (float3)(0.0f), saturated); + int3 saturated = (int3)(any(isequal(v0, (float3)(32767.0f))), + any(isequal(v1, (float3)(32767.0f))), + any(isequal(v2, (float3)(32767.0f)))); - a_out[i] = a; - b_out[i] = b; + a_out[i] = select(a, (float3)(0.0f), saturated); + b_out[i] = select(b, (float3)(0.0f), saturated); n_out[i] = n; ir_out[i] = min(dot(select(n, (float3)(65535.0f), saturated), (float3)(0.333333333f * AB_MULTIPLIER * AB_OUTPUT_MULTIPLIER)), 65535.0f); } diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index 429f20e9f..f280f1559 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -78,7 +78,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl_short lut11to16[2048]; cl_float x_table[512 * 424]; cl_float z_table[512 * 424]; - cl_float3 p0_table[512 * 424]; + cl_float3 p0_sin_table[512 * 424]; + cl_float3 p0_cos_table[512 * 424]; libfreenect2::DepthPacketProcessor::Config config; DepthPacketProcessor::Parameters params; @@ -105,7 +106,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging size_t buf_packet_size; cl::Buffer buf_lut11to16; - cl::Buffer buf_p0_table; + cl::Buffer buf_p0_sin_table; + cl::Buffer buf_p0_cos_table; cl::Buffer buf_x_table; cl::Buffer buf_z_table; cl::Buffer buf_packet; @@ -200,9 +202,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq[2] << "f"; oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << "f"; - oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad[0] << "f"; - oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad[1] << "f"; - oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad[2] << "f"; + oss << " -D PHASE_IN_RAD0_SIN=" << std::sin(-params.phase_in_rad[0]) << "f"; + oss << " -D PHASE_IN_RAD0_COS=" << std::cos(params.phase_in_rad[0]) << "f"; + oss << " -D PHASE_IN_RAD1_SIN=" << std::sin(-params.phase_in_rad[1]) << "f"; + oss << " -D PHASE_IN_RAD1_COS=" << std::cos(params.phase_in_rad[1]) << "f"; + oss << " -D PHASE_IN_RAD2_SIN=" << std::sin(-params.phase_in_rad[2]) << "f"; + oss << " -D PHASE_IN_RAD2_COS=" << std::cos(params.phase_in_rad[2]) << "f"; oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << "f"; oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << "f"; @@ -382,7 +387,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); @@ -430,17 +437,19 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging CHECK_CL_ERROR(err, "setArg"); err = kernel_processPixelStage1.setArg(1, buf_z_table); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(2, buf_p0_table); + err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(3, buf_packet); + err = kernel_processPixelStage1.setArg(4, buf_packet); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(4, buf_a); + err = kernel_processPixelStage1.setArg(5, buf_a); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(5, buf_b); + err = kernel_processPixelStage1.setArg(6, buf_b); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(6, buf_n); + err = kernel_processPixelStage1.setArg(7, buf_n); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(7, buf_ir); + err = kernel_processPixelStage1.setArg(8, buf_ir); CHECK_CL_ERROR(err, "setArg"); kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); @@ -484,14 +493,16 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging err = kernel_filterPixelStage2.setArg(3, buf_filtered); CHECK_CL_ERROR(err, "setArg"); - cl::Event event0, event1, event2, event3; + cl::Event event0, event1, event2, event3, event4; err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut11to16, NULL, &event0); CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event1); + err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event1); CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event2); + err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event2); CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event3); + err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event3); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event4); CHECK_CL_ERROR(err, "enqueueWriteBuffer"); err = event0.wait(); @@ -502,6 +513,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging CHECK_CL_ERROR(err, "wait"); err = event3.wait(); CHECK_CL_ERROR(err, "wait"); + err = event4.wait(); + CHECK_CL_ERROR(err, "wait"); } programInitialized = true; @@ -606,16 +619,24 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { for(int r = 0; r < 424; ++r) { - cl_float3 *it = &p0_table[r * 512]; + cl_float3 *itS = &p0_sin_table[r * 512]; + cl_float3 *itC = &p0_cos_table[r * 512]; const uint16_t *it0 = &p0table->p0table0[r * 512]; const uint16_t *it1 = &p0table->p0table1[r * 512]; const uint16_t *it2 = &p0table->p0table2[r * 512]; - for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2) + for(int c = 0; c < 512; ++c, ++itS, ++itC, ++it0, ++it1, ++it2) { - it->s[0] = -((float) * it0) * 0.000031 * M_PI; - it->s[1] = -((float) * it1) * 0.000031 * M_PI; - it->s[2] = -((float) * it2) * 0.000031 * M_PI; - it->s[3] = 0.0f; + const float x = ((float)*it0) * 0.000031 * M_PI; + const float y = ((float)*it1) * 0.000031 * M_PI; + const float z = ((float)*it2) * 0.000031 * M_PI; + itS->s[0] = std::sin(x); + itS->s[1] = std::sin(y); + itS->s[2] = std::sin(z); + itS->s[3] = 0.0f; + itC->s[0] = std::cos(-x); + itC->s[1] = std::cos(-y); + itC->s[2] = std::cos(-z); + itC->s[3] = 0.0f; } } } From 6a1e977d2f0ccab90f4481da690457d4ece3d2de Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Wed, 17 Feb 2016 15:44:37 +0100 Subject: [PATCH 2/7] removed arrays for tables and allocated OpenCL buffers on initialization. loadXZTables, loadLookupTable and loadP0TablesFromCommandResponse will now directly write to the OpenCL buffers. --- src/opencl_depth_packet_processor.cpp | 494 ++++++++++++++------------ 1 file changed, 265 insertions(+), 229 deletions(-) diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index f280f1559..97bd450a6 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -75,11 +75,6 @@ std::string loadCLSource(const std::string &filename) class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { public: - cl_short lut11to16[2048]; - cl_float x_table[512 * 424]; - cl_float z_table[512 * 424]; - cl_float3 p0_sin_table[512 * 424]; - cl_float3 p0_cos_table[512 * 424]; libfreenect2::DepthPacketProcessor::Config config; DepthPacketProcessor::Parameters params; @@ -96,7 +91,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl::Kernel kernel_processPixelStage2; cl::Kernel kernel_filterPixelStage2; - size_t image_size; + const size_t image_size; + const size_t lut_size; // Read only buffers size_t buf_lut11to16_size; @@ -140,8 +136,10 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool programInitialized; std::string sourceCode; - OpenCLDepthPacketProcessorImpl(const int deviceId = -1) - : deviceInitialized(false) + OpenCLDepthPacketProcessorImpl(const int deviceId = -1) + : image_size(512 * 424) + , lut_size(2048) + , deviceInitialized(false) , programBuilt(false) , programInitialized(false) { @@ -153,8 +151,6 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging newIrFrame(); newDepthFrame(); - image_size = 512 * 424; - deviceInitialized = initDevice(deviceId); const int CL_ICDL_VERSION = 2; @@ -325,6 +321,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging } #define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) +#define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err bool initDevice(const int deviceId) { @@ -334,34 +331,97 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging } cl_int err = CL_SUCCESS; + std::vector platforms; + err = cl::Platform::get(&platforms); + CHECK_CL_ERROR(err, "cl::Platform::get"); + + if(platforms.empty()) { - std::vector platforms; - err = cl::Platform::get(&platforms); - CHECK_CL_ERROR(err, "cl::Platform::get"); + LOG_ERROR << "no opencl platforms found."; + return false; + } - if(platforms.empty()) - { - LOG_ERROR << "no opencl platforms found."; - return false; - } + std::vector devices; + getDevices(platforms, devices); + listDevice(devices); + if(!selectDevice(devices, deviceId)) + { + LOG_ERROR << "could not find any suitable device"; + return false; + } + LOG_INFO << "selected device: " << deviceString(device); - std::vector devices; - getDevices(platforms, devices); - listDevice(devices); - if(!selectDevice(devices, deviceId)) - { - LOG_ERROR << "could not find any suitable device"; - return false; - } - LOG_INFO << "selected device: " << deviceString(device); + context = cl::Context(device, NULL, NULL, NULL, &err); + CHECK_CL_ERROR(err, "cl::Context"); - context = cl::Context(device, NULL, NULL, NULL, &err); - CHECK_CL_ERROR(err, "cl::Context"); - } + if(!initBuffers()) + return false; return buildProgram(sourceCode); } + bool initBuffers() + { + cl_int err = CL_SUCCESS; + queue = cl::CommandQueue(context, device, 0, &err); + CHECK_CL_ERROR(err, "cl::CommandQueue"); + + //Read only + buf_lut11to16_size = lut_size * sizeof(cl_short); + buf_p0_table_size = image_size * sizeof(cl_float3); + buf_x_table_size = image_size * sizeof(cl_float); + buf_z_table_size = image_size * sizeof(cl_float); + buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(cl_ushort); + + buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_z_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_z_table_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_packet = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_packet_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + + //Read-Write + buf_a_size = image_size * sizeof(cl_float3); + buf_b_size = image_size * sizeof(cl_float3); + buf_n_size = image_size * sizeof(cl_float3); + buf_ir_size = image_size * sizeof(cl_float); + buf_a_filtered_size = image_size * sizeof(cl_float3); + buf_b_filtered_size = image_size * sizeof(cl_float3); + buf_edge_test_size = image_size * sizeof(cl_uchar); + buf_depth_size = image_size * sizeof(cl_float); + buf_ir_sum_size = image_size * sizeof(cl_float); + buf_filtered_size = image_size * sizeof(cl_float); + + buf_a = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_b = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_n = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_n_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_ir = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_a_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_filtered_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_b_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_filtered_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_edge_test = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_edge_test_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_depth = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_depth_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_ir_sum = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_sum_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + buf_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_filtered_size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + + return true; + } + bool initProgram() { if(!deviceInitialized) @@ -374,148 +434,67 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; cl_int err = CL_SUCCESS; - { - queue = cl::CommandQueue(context, device, 0, &err); - CHECK_CL_ERROR(err, "cl::CommandQueue"); - - //Read only - buf_lut11to16_size = 2048 * sizeof(cl_short); - buf_p0_table_size = image_size * sizeof(cl_float3); - buf_x_table_size = image_size * sizeof(cl_float); - buf_z_table_size = image_size * sizeof(cl_float); - buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(cl_ushort); - - buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_z_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_z_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_packet = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_packet_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - //Read-Write - buf_a_size = image_size * sizeof(cl_float3); - buf_b_size = image_size * sizeof(cl_float3); - buf_n_size = image_size * sizeof(cl_float3); - buf_ir_size = image_size * sizeof(cl_float); - buf_a_filtered_size = image_size * sizeof(cl_float3); - buf_b_filtered_size = image_size * sizeof(cl_float3); - buf_edge_test_size = image_size * sizeof(cl_uchar); - buf_depth_size = image_size * sizeof(cl_float); - buf_ir_sum_size = image_size * sizeof(cl_float); - buf_filtered_size = image_size * sizeof(cl_float); - - buf_a = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_n = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_n_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_a_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_edge_test = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_edge_test_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_depth = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_depth_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir_sum = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_sum_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage1.setArg(0, buf_lut11to16); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(1, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(4, buf_packet); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(5, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(6, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(7, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(8, buf_ir); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage1.setArg(0, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(1, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(2, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(3, buf_a_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(4, buf_b_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(5, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - - kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(2, buf_x_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(3, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(4, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(5, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage2.setArg(0, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(1, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(2, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(3, buf_filtered); - CHECK_CL_ERROR(err, "setArg"); - - cl::Event event0, event1, event2, event3, event4; - err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut11to16, NULL, &event0); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event1); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event2); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, x_table, NULL, &event3); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, z_table, NULL, &event4); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event2.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event3.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event4.wait(); - CHECK_CL_ERROR(err, "wait"); - } + kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err); + CHECK_CL_ERROR(err, "cl::Kernel"); + err = kernel_processPixelStage1.setArg(0, buf_lut11to16); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(1, buf_z_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(4, buf_packet); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(5, buf_a); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(6, buf_b); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(7, buf_n); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage1.setArg(8, buf_ir); + CHECK_CL_ERROR(err, "setArg"); + + kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); + CHECK_CL_ERROR(err, "cl::Kernel"); + err = kernel_filterPixelStage1.setArg(0, buf_a); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage1.setArg(1, buf_b); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage1.setArg(2, buf_n); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage1.setArg(3, buf_a_filtered); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage1.setArg(4, buf_b_filtered); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage1.setArg(5, buf_edge_test); + CHECK_CL_ERROR(err, "setArg"); + + kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err); + CHECK_CL_ERROR(err, "cl::Kernel"); + err = kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage2.setArg(2, buf_x_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage2.setArg(3, buf_z_table); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage2.setArg(4, buf_depth); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_processPixelStage2.setArg(5, buf_ir_sum); + CHECK_CL_ERROR(err, "setArg"); + + kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err); + CHECK_CL_ERROR(err, "cl::Kernel"); + err = kernel_filterPixelStage2.setArg(0, buf_depth); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage2.setArg(1, buf_ir_sum); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage2.setArg(2, buf_edge_test); + CHECK_CL_ERROR(err, "setArg"); + err = kernel_filterPixelStage2.setArg(3, buf_filtered); + CHECK_CL_ERROR(err, "setArg"); programInitialized = true; return true; @@ -524,48 +503,47 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool run(const DepthPacket &packet) { cl_int err; - { - std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); - cl::Event event0, event1; - - err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); + cl::Event event0, event1; - err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &event0); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); + err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - if(config.EnableBilateralFilter) - { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS1, &eventFPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - } - else - { - eventFPS1[0] = eventPPS1[0]; - } + err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); + CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); + err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &event0); + CHECK_CL_ERROR(err, "enqueueReadBuffer"); - err = queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventFPS1, &eventPPS2[0]); + if(config.EnableBilateralFilter) + { + err = queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS1, &eventFPS1[0]); CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); + } + else + { + eventFPS1[0] = eventPPS1[0]; + } - if(config.EnableEdgeAwareFilter) - { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - } - else - { - eventFPS2[0] = eventPPS2[0]; - } + err = queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventFPS1, &eventPPS2[0]); + CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &event1); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); - CHECK_CL_ERROR(err, "wait"); + if(config.EnableEdgeAwareFilter) + { + err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + } + else + { + eventFPS2[0] = eventPPS2[0]; } + + err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &event1); + CHECK_CL_ERROR(err, "enqueueReadBuffer"); + err = event0.wait(); + CHECK_CL_ERROR(err, "wait"); + err = event1.wait(); + CHECK_CL_ERROR(err, "wait"); + return true; } @@ -575,29 +553,26 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return !source.empty(); } - bool buildProgram(const std::string& sources) + bool buildProgram(const std::string &sources) { cl_int err; - { - LOG_INFO << "building OpenCL program..."; + LOG_INFO << "building OpenCL program..."; - std::string options; - generateOptions(options); + std::string options; + generateOptions(options); - cl::Program::Sources source(1, std::make_pair(sources.c_str(), sources.length())); - program = cl::Program(context, source, &err); - CHECK_CL_ERROR(err, "cl::Program"); + cl::Program::Sources source(1, std::make_pair(sources.c_str(), sources.length())); + program = cl::Program(context, source, &err); + CHECK_CL_ERROR(err, "cl::Program"); - err = program.build(options.c_str()); - if (err != CL_SUCCESS) - { - LOG_ERROR << "failed to build program: " << err; - LOG_ERROR << "Build Status: " << program.getBuildInfo(device); - LOG_ERROR << "Build Options:\t" << program.getBuildInfo(device); - LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device); - programBuilt = false; - return false; - } + err = program.build(options.c_str()); + if(err != CL_SUCCESS) + { + LOG_ERROR << "failed to build program: " << err; + LOG_ERROR << "Build Status: " << program.getBuildInfo(device); + LOG_ERROR << "Build Options:\t" << program.getBuildInfo(device); + LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device); + return false; } LOG_INFO << "OpenCL program built successfully"; @@ -617,6 +592,15 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return; + } + + cl_float3 *p0_sin_table = new cl_float3[image_size]; + cl_float3 *p0_cos_table = new cl_float3[image_size]; + for(int r = 0; r < 424; ++r) { cl_float3 *itS = &p0_sin_table[r * 512]; @@ -639,6 +623,59 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging itC->s[3] = 0.0f; } } + + cl_int err = CL_SUCCESS; + cl::Event event0, event1; + err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event0); + LOG_CL_ERROR(err, "enqueueWriteBuffer"); + err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event1); + LOG_CL_ERROR(err, "enqueueWriteBuffer"); + + err = event0.wait(); + LOG_CL_ERROR(err, "wait"); + err = event1.wait(); + LOG_CL_ERROR(err, "wait"); + + delete[] p0_sin_table; + delete[] p0_cos_table; + } + + void fill_xz_tables(const float *xtable, const float *ztable) + { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return; + } + + cl_int err = CL_SUCCESS; + cl::Event event0, event1; + err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, xtable, NULL, &event0); + LOG_CL_ERROR(err, "enqueueWriteBuffer"); + err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, ztable, NULL, &event1); + LOG_CL_ERROR(err, "enqueueWriteBuffer"); + + err = event0.wait(); + LOG_CL_ERROR(err, "wait"); + err = event1.wait(); + LOG_CL_ERROR(err, "wait"); + } + + void fill_lut(const short *lut) + { + if(!deviceInitialized) + { + LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; + return; + } + + cl_int err = CL_SUCCESS; + cl::Event event0; + err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut, NULL, &event0); + LOG_CL_ERROR(err, "enqueueWriteBuffer"); + + err = event0.wait(); + LOG_CL_ERROR(err, "wait"); } }; @@ -656,7 +693,7 @@ void OpenCLDepthPacketProcessor::setConfiguration(const libfreenect2::DepthPacke { DepthPacketProcessor::setConfiguration(config); - if ( impl_->config.MaxDepth != config.MaxDepth + if ( impl_->config.MaxDepth != config.MaxDepth || impl_->config.MinDepth != config.MinDepth) { // OpenCL program needs to be rebuilt, then reinitialized @@ -690,13 +727,12 @@ void OpenCLDepthPacketProcessor::loadP0TablesFromCommandResponse(unsigned char * void OpenCLDepthPacketProcessor::loadXZTables(const float *xtable, const float *ztable) { - std::copy(xtable, xtable + TABLE_SIZE, impl_->x_table); - std::copy(ztable, ztable + TABLE_SIZE, impl_->z_table); + impl_->fill_xz_tables(xtable, ztable); } void OpenCLDepthPacketProcessor::loadLookupTable(const short *lut) { - std::copy(lut, lut + LUT_SIZE, impl_->lut11to16); + impl_->fill_lut(lut); } void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) From 8575372eac3dfd97e63e8131fa9eabc3d7a68f07 Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Thu, 18 Feb 2016 13:17:41 +0100 Subject: [PATCH 3/7] Implemented pinned memory buffers and frames. --- .../libfreenect2/depth_packet_processor.h | 2 + src/opencl_depth_packet_processor.cpp | 150 +++++++++++++++--- 2 files changed, 129 insertions(+), 23 deletions(-) diff --git a/include/internal/libfreenect2/depth_packet_processor.h b/include/internal/libfreenect2/depth_packet_processor.h index 952ca4315..26f975639 100644 --- a/include/internal/libfreenect2/depth_packet_processor.h +++ b/include/internal/libfreenect2/depth_packet_processor.h @@ -177,6 +177,8 @@ class OpenCLDepthPacketProcessor : public DepthPacketProcessor virtual const char *name() { return "OpenCL"; } virtual void process(const DepthPacket &packet); +protected: + virtual Allocator *getAllocator(); private: OpenCLDepthPacketProcessorImpl *impl_; }; diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index 97bd450a6..0aff80827 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -55,6 +55,9 @@ #include +#define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) +#define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err + namespace libfreenect2 { @@ -72,6 +75,67 @@ std::string loadCLSource(const std::string &filename) return std::string(reinterpret_cast(data), length); } +class OpenCLDepthPacketProcessorImpl; + +class OpenCLBuffer: public Buffer +{ +public: + cl::Buffer buffer; +}; + +class OpenCLAllocator: public Allocator +{ +private: + OpenCLDepthPacketProcessorImpl *impl_; + cl::Buffer buffer; + + bool allocate_opencl(OpenCLBuffer *b, size_t size); + +public: + OpenCLAllocator(OpenCLDepthPacketProcessorImpl *impl_) : impl_(impl_) + { + } + + virtual Buffer *allocate(size_t size) + { + OpenCLBuffer *b = new OpenCLBuffer(); + if (!allocate_opencl(b, size)) { + delete b; + b = NULL; + } + return b; + } + + virtual void free(Buffer *b) + { + if (b == NULL || b->data == NULL) + return; + delete b; + } +}; + +class OpenCLFrame: public Frame +{ + bool allocate_opencl(size_t size, OpenCLDepthPacketProcessorImpl *impl_); + +public: + cl::Buffer frameBuffer; + + OpenCLFrame(size_t width, size_t height, size_t bytes_per_pixel, OpenCLDepthPacketProcessorImpl *impl_): + Frame(width, height, bytes_per_pixel, (unsigned char*)-1) + { + data = NULL; + + size_t size = width*height*bytes_per_pixel; + allocate_opencl(size, impl_); + } + + virtual ~OpenCLFrame() + { + data = NULL; + } +}; + class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { public: @@ -79,6 +143,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging DepthPacketProcessor::Parameters params; Frame *ir_frame, *depth_frame; + Allocator *allocator; cl::Context context; cl::Device device; @@ -148,10 +213,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging setenv("OCL_STRICT_CONFORMANCE", "0", 0); #endif + deviceInitialized = initDevice(deviceId); + newIrFrame(); newDepthFrame(); - deviceInitialized = initDevice(deviceId); + allocator = new PoolAllocator(new OpenCLAllocator(this)); const int CL_ICDL_VERSION = 2; typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*); @@ -181,6 +248,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging ~OpenCLDepthPacketProcessorImpl() { + delete allocator; delete ir_frame; delete depth_frame; } @@ -238,6 +306,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging oss << " -D MIN_DEPTH=" << config.MinDepth * 1000.0f << "f"; oss << " -D MAX_DEPTH=" << config.MaxDepth * 1000.0f << "f"; + + oss << " -cl-mad-enable -cl-no-signed-zeros -cl-fast-relaxed-math"; options = oss.str(); } @@ -320,9 +390,6 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return selected; } -#define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) -#define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err - bool initDevice(const int deviceId) { if(!readProgram(sourceCode)) @@ -373,17 +440,17 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging buf_z_table_size = image_size * sizeof(cl_float); buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(cl_ushort); - buf_lut11to16 = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_lut11to16_size, NULL, &err); + buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_sin_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + buf_p0_sin_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_cos_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_p0_table_size, NULL, &err); + buf_p0_cos_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_x_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_x_table_size, NULL, &err); + buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_z_table = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_z_table_size, NULL, &err); + buf_z_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_z_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_packet = cl::Buffer(context, CL_READ_ONLY_CACHE, buf_packet_size, NULL, &err); + buf_packet = cl::Buffer(context, CL_MEM_READ_ONLY, buf_packet_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); //Read-Write @@ -398,25 +465,25 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging buf_ir_sum_size = image_size * sizeof(cl_float); buf_filtered_size = image_size * sizeof(cl_float); - buf_a = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_size, NULL, &err); + buf_a = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_size, NULL, &err); + buf_b = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_n = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_n_size, NULL, &err); + buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_size, NULL, &err); + buf_ir = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_a_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_a_filtered_size, NULL, &err); + buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_b_filtered_size, NULL, &err); + buf_b_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_filtered_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_edge_test = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_edge_test_size, NULL, &err); + buf_edge_test = cl::Buffer(context, CL_MEM_READ_WRITE, buf_edge_test_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_depth = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_depth_size, NULL, &err); + buf_depth = cl::Buffer(context, CL_MEM_READ_WRITE, buf_depth_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir_sum = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_ir_sum_size, NULL, &err); + buf_ir_sum = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_sum_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_filtered = cl::Buffer(context, CL_READ_WRITE_CACHE, buf_filtered_size, NULL, &err); + buf_filtered = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_filtered_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); return true; @@ -507,7 +574,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl::Event event0, event1; err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_ERROR(err, "enqueueMapBuffer"); err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); @@ -582,12 +649,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging void newIrFrame() { - ir_frame = new Frame(512, 424, 4); + ir_frame = new OpenCLFrame(512, 424, 4, this); } void newDepthFrame() { - depth_frame = new Frame(512, 424, 4); + depth_frame = new OpenCLFrame(512, 424, 4, this); } void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) @@ -679,6 +746,39 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging } }; +bool OpenCLFrame::allocate_opencl(size_t size, OpenCLDepthPacketProcessorImpl *impl_) +{ + if(!impl_->deviceInitialized) + return false; + + cl_int err = CL_SUCCESS; + + frameBuffer = cl::Buffer(impl_->context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + + data = (unsigned char*)impl_->queue.enqueueMapBuffer(frameBuffer, CL_TRUE, CL_MAP_READ, 0, size, NULL, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + return true; +} + +bool OpenCLAllocator::allocate_opencl(OpenCLBuffer *b, size_t size) +{ + if(!impl_->deviceInitialized) + return false; + + cl_int err = CL_SUCCESS; + + b->buffer = cl::Buffer(impl_->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err); + CHECK_CL_ERROR(err, "cl::Buffer"); + + b->data = (unsigned char*)impl_->queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_WRITE, 0, size, NULL, NULL, &err); + CHECK_CL_ERROR(err, "enqueueMapBuffer"); + + b->length = 0; + b->capacity = size; + return true; +} + OpenCLDepthPacketProcessor::OpenCLDepthPacketProcessor(const int deviceId) : impl_(new OpenCLDepthPacketProcessorImpl(deviceId)) { @@ -770,5 +870,9 @@ void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) } } +Allocator *OpenCLDepthPacketProcessor::getAllocator() +{ + return impl_->allocator; +} } /* namespace libfreenect2 */ From dd477d50b4b13fa98b53a2a93304bcb8a1b28acd Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Thu, 18 Feb 2016 14:43:44 +0100 Subject: [PATCH 4/7] Added (optional) profiling of OpenCL kernels. Reverted back to calculating sine and cosine on the GPU. --- src/opencl_depth_packet_processor.cl | 28 +++--- src/opencl_depth_packet_processor.cpp | 122 +++++++++++++++----------- 2 files changed, 87 insertions(+), 63 deletions(-) diff --git a/src/opencl_depth_packet_processor.cl b/src/opencl_depth_packet_processor.cl index 75b14b74f..2b53f07e8 100644 --- a/src/opencl_depth_packet_processor.cl +++ b/src/opencl_depth_packet_processor.cl @@ -24,8 +24,7 @@ * either License. */ -#define PHASE_SIN (float3)(PHASE_IN_RAD0_SIN, PHASE_IN_RAD1_SIN, PHASE_IN_RAD2_SIN) -#define PHASE_COS (float3)(PHASE_IN_RAD0_COS, PHASE_IN_RAD1_COS, PHASE_IN_RAD2_COS) +#define PHASE (float3)(PHASE_IN_RAD0, PHASE_IN_RAD1, PHASE_IN_RAD2) #define AB_MULTIPLIER_PER_FRQ (float3)(AB_MULTIPLIER_PER_FRQ0, AB_MULTIPLIER_PER_FRQ1, AB_MULTIPLIER_PER_FRQ2) /******************************************************************************* @@ -47,8 +46,8 @@ float decodePixelMeasurement(global const ushort *data, global const short *lut1 return (float)lut11to16[(x < 1 || 510 < x || col_idx > 352) ? 0 : ((data[data_idx0] >> upper_bytes) | (data[data_idx1] << lower_bytes)) & 2047]; } -void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_sin_table, global const float3 *p0_cos_table, - global const ushort *data, global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out) +void kernel processPixelStage1(global const short *lut11to16, global const float *z_table, global const float3 *p0_table, global const ushort *data, + global float3 *a_out, global float3 *b_out, global float3 *n_out, global float *ir_out) { const uint i = get_global_id(0); @@ -59,8 +58,13 @@ void kernel processPixelStage1(global const short *lut11to16, global const float const uint y_in = (y_tmp < 212 ? y_tmp + 212 : 423 - y_tmp); const int3 invalid = (int)(0.0f >= z_table[i]); - const float3 p0_sin = p0_sin_table[i]; - const float3 p0_cos = p0_cos_table[i]; + const float3 p0 = p0_table[i]; + float3 p0x_sin, p0y_sin, p0z_sin; + float3 p0x_cos, p0y_cos, p0z_cos; + + p0x_sin = -sincos(PHASE + p0.x, &p0x_cos); + p0y_sin = -sincos(PHASE + p0.y, &p0y_cos); + p0z_sin = -sincos(PHASE + p0.z, &p0z_cos); int3 invalid_pixel = (int3)(invalid); @@ -74,12 +78,12 @@ void kernel processPixelStage1(global const short *lut11to16, global const float decodePixelMeasurement(data, lut11to16, 7, x, y_in), decodePixelMeasurement(data, lut11to16, 8, x, y_in)); - float3 a = (float3)(dot(v0, PHASE_COS * p0_cos.x - PHASE_SIN * p0_sin.x), - dot(v1, PHASE_COS * p0_cos.y - PHASE_SIN * p0_sin.y), - dot(v2, PHASE_COS * p0_cos.z - PHASE_SIN * p0_sin.z)) * AB_MULTIPLIER_PER_FRQ; - float3 b = (float3)(dot(v0, PHASE_COS * p0_sin.x + PHASE_SIN * p0_cos.x), - dot(v1, PHASE_COS * p0_sin.y + PHASE_SIN * p0_cos.y), - dot(v2, PHASE_COS * p0_sin.z + PHASE_SIN * p0_cos.z)) * AB_MULTIPLIER_PER_FRQ; + float3 a = (float3)(dot(v0, p0x_cos), + dot(v1, p0y_cos), + dot(v2, p0z_cos)) * AB_MULTIPLIER_PER_FRQ; + float3 b = (float3)(dot(v0, p0x_sin), + dot(v1, p0y_sin), + dot(v2, p0z_sin)) * AB_MULTIPLIER_PER_FRQ; a = select(a, (float3)(0.0f), invalid_pixel); b = select(b, (float3)(0.0f), invalid_pixel); diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index 0aff80827..1aa4755ab 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -58,6 +58,8 @@ #define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) #define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err +#define WITH_PROFILING 0 + namespace libfreenect2 { @@ -167,8 +169,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging size_t buf_packet_size; cl::Buffer buf_lut11to16; - cl::Buffer buf_p0_sin_table; - cl::Buffer buf_p0_cos_table; + cl::Buffer buf_p0_table; cl::Buffer buf_x_table; cl::Buffer buf_z_table; cl::Buffer buf_packet; @@ -201,6 +202,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool programInitialized; std::string sourceCode; +#if WITH_PROFILING + std::vector timings; + int count; +#endif + OpenCLDepthPacketProcessorImpl(const int deviceId = -1) : image_size(512 * 424) , lut_size(2048) @@ -266,12 +272,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging oss << " -D AB_MULTIPLIER_PER_FRQ2=" << params.ab_multiplier_per_frq[2] << "f"; oss << " -D AB_OUTPUT_MULTIPLIER=" << params.ab_output_multiplier << "f"; - oss << " -D PHASE_IN_RAD0_SIN=" << std::sin(-params.phase_in_rad[0]) << "f"; - oss << " -D PHASE_IN_RAD0_COS=" << std::cos(params.phase_in_rad[0]) << "f"; - oss << " -D PHASE_IN_RAD1_SIN=" << std::sin(-params.phase_in_rad[1]) << "f"; - oss << " -D PHASE_IN_RAD1_COS=" << std::cos(params.phase_in_rad[1]) << "f"; - oss << " -D PHASE_IN_RAD2_SIN=" << std::sin(-params.phase_in_rad[2]) << "f"; - oss << " -D PHASE_IN_RAD2_COS=" << std::cos(params.phase_in_rad[2]) << "f"; + oss << " -D PHASE_IN_RAD0=" << params.phase_in_rad[0] << "f"; + oss << " -D PHASE_IN_RAD1=" << params.phase_in_rad[1] << "f"; + oss << " -D PHASE_IN_RAD2=" << params.phase_in_rad[2] << "f"; oss << " -D JOINT_BILATERAL_AB_THRESHOLD=" << params.joint_bilateral_ab_threshold << "f"; oss << " -D JOINT_BILATERAL_MAX_EDGE=" << params.joint_bilateral_max_edge << "f"; @@ -430,7 +433,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool initBuffers() { cl_int err = CL_SUCCESS; +#if WITH_PROFILING + count = 0; + queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); +#else queue = cl::CommandQueue(context, device, 0, &err); +#endif CHECK_CL_ERROR(err, "cl::CommandQueue"); //Read only @@ -442,9 +450,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_sin_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_cos_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); + buf_p0_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); @@ -471,7 +477,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging CHECK_CL_ERROR(err, "cl::Buffer"); buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_size, NULL, &err); + buf_ir = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_ir_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err); CHECK_CL_ERROR(err, "cl::Buffer"); @@ -507,19 +513,17 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging CHECK_CL_ERROR(err, "setArg"); err = kernel_processPixelStage1.setArg(1, buf_z_table); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(2, buf_p0_sin_table); + err = kernel_processPixelStage1.setArg(2, buf_p0_table); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(3, buf_p0_cos_table); + err = kernel_processPixelStage1.setArg(3, buf_packet); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(4, buf_packet); + err = kernel_processPixelStage1.setArg(4, buf_a); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(5, buf_a); + err = kernel_processPixelStage1.setArg(5, buf_b); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(6, buf_b); + err = kernel_processPixelStage1.setArg(6, buf_n); CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(7, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(8, buf_ir); + err = kernel_processPixelStage1.setArg(7, buf_ir); CHECK_CL_ERROR(err, "setArg"); kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); @@ -571,14 +575,14 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { cl_int err; std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); - cl::Event event0, event1; + cl::Event eventReadIr, eventReadDepth; err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); - CHECK_CL_ERROR(err, "enqueueMapBuffer"); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &event0); + err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &eventReadIr); CHECK_CL_ERROR(err, "enqueueReadBuffer"); if(config.EnableBilateralFilter) @@ -597,20 +601,50 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging if(config.EnableEdgeAwareFilter) { err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); } else { eventFPS2[0] = eventPPS2[0]; } - err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &event1); + err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &eventReadDepth); CHECK_CL_ERROR(err, "enqueueReadBuffer"); - err = event0.wait(); + err = eventReadIr.wait(); CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); + err = eventReadDepth.wait(); CHECK_CL_ERROR(err, "wait"); +#if WITH_PROFILING + if(count == 0) + { + timings.clear(); + timings.resize(7, 0.0); + } + + timings[0] += eventWrite[0].getProfilingInfo() - eventWrite[0].getProfilingInfo(); + timings[1] += eventPPS1[0].getProfilingInfo() - eventPPS1[0].getProfilingInfo(); + timings[2] += eventFPS1[0].getProfilingInfo() - eventFPS1[0].getProfilingInfo(); + timings[3] += eventPPS2[0].getProfilingInfo() - eventPPS2[0].getProfilingInfo(); + timings[4] += eventFPS2[0].getProfilingInfo() - eventFPS2[0].getProfilingInfo(); + timings[5] += eventReadIr.getProfilingInfo() - eventReadIr.getProfilingInfo(); + timings[6] += eventReadDepth.getProfilingInfo() - eventReadDepth.getProfilingInfo(); + + if(++count == 100) + { + double sum = timings[0] + timings[1] + timings[2] + timings[3] + timings[4] + timings[5] + timings[6]; + LOG_INFO << "writing package: " << timings[0] / 100000000.0 << " ms."; + LOG_INFO << "stage 1: " << timings[1] / 100000000.0 << " ms."; + LOG_INFO << "filter 1: " << timings[2] / 100000000.0 << " ms."; + LOG_INFO << "stage 2: " << timings[3] / 100000000.0 << " ms."; + LOG_INFO << "filter 2: " << timings[4] / 100000000.0 << " ms."; + LOG_INFO << "reading ir: " << timings[5] / 100000000.0 << " ms."; + LOG_INFO << "reading depth: " << timings[6] / 100000000.0 << " ms."; + LOG_INFO << "overall: " << sum / 100000000.0 << " ms."; + count = 0; + } +#endif + return true; } @@ -665,46 +699,32 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return; } - cl_float3 *p0_sin_table = new cl_float3[image_size]; - cl_float3 *p0_cos_table = new cl_float3[image_size]; + cl_float3 *p0_table = new cl_float3[image_size]; for(int r = 0; r < 424; ++r) { - cl_float3 *itS = &p0_sin_table[r * 512]; - cl_float3 *itC = &p0_cos_table[r * 512]; + cl_float3 *it = &p0_table[r * 512]; const uint16_t *it0 = &p0table->p0table0[r * 512]; const uint16_t *it1 = &p0table->p0table1[r * 512]; const uint16_t *it2 = &p0table->p0table2[r * 512]; - for(int c = 0; c < 512; ++c, ++itS, ++itC, ++it0, ++it1, ++it2) + for(int c = 0; c < 512; ++c, ++it, ++it0, ++it1, ++it2) { - const float x = ((float)*it0) * 0.000031 * M_PI; - const float y = ((float)*it1) * 0.000031 * M_PI; - const float z = ((float)*it2) * 0.000031 * M_PI; - itS->s[0] = std::sin(x); - itS->s[1] = std::sin(y); - itS->s[2] = std::sin(z); - itS->s[3] = 0.0f; - itC->s[0] = std::cos(-x); - itC->s[1] = std::cos(-y); - itC->s[2] = std::cos(-z); - itC->s[3] = 0.0f; + it->s[0] = -((float)*it0) * 0.000031 * M_PI; + it->s[1] = -((float)*it1) * 0.000031 * M_PI; + it->s[2] = -((float)*it2) * 0.000031 * M_PI; + it->s[3] = 0.0f; } } cl_int err = CL_SUCCESS; - cl::Event event0, event1; - err = queue.enqueueWriteBuffer(buf_p0_sin_table, CL_FALSE, 0, buf_p0_table_size, p0_sin_table, NULL, &event0); - LOG_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_p0_cos_table, CL_FALSE, 0, buf_p0_table_size, p0_cos_table, NULL, &event1); + cl::Event event0; + err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event0); LOG_CL_ERROR(err, "enqueueWriteBuffer"); err = event0.wait(); LOG_CL_ERROR(err, "wait"); - err = event1.wait(); - LOG_CL_ERROR(err, "wait"); - delete[] p0_sin_table; - delete[] p0_cos_table; + delete[] p0_table; } void fill_xz_tables(const float *xtable, const float *ztable) From 37d088cb71464aa73c29a436f6477eaf475b5be3 Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Thu, 18 Feb 2016 14:52:01 +0100 Subject: [PATCH 5/7] Changed filling methods to return a bool on success, making macro LOG_CL_ERROR obsolete. --- src/opencl_depth_packet_processor.cpp | 42 +++++++++++++++++---------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index 1aa4755ab..ad21d806e 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -56,7 +56,6 @@ #include #define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) -#define LOG_CL_ERROR(err, str) if (err != CL_SUCCESS) LOG_ERROR << str << " failed: " << err #define WITH_PROFILING 0 @@ -691,12 +690,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging depth_frame = new OpenCLFrame(512, 424, 4, this); } - void fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) + bool fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) { if(!deviceInitialized) { LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; - return; + return false; } cl_float3 *p0_table = new cl_float3[image_size]; @@ -719,50 +718,63 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl_int err = CL_SUCCESS; cl::Event event0; err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event0); - LOG_CL_ERROR(err, "enqueueWriteBuffer"); + if(err != CL_SUCCESS) + { + LOG_ERROR << "enqueueWriteBuffer failed: " << err; + delete[] p0_table; + return false; + } err = event0.wait(); - LOG_CL_ERROR(err, "wait"); + if(err != CL_SUCCESS) + { + LOG_ERROR << "wait failed: " << err; + delete[] p0_table; + return false; + } delete[] p0_table; + return true; } - void fill_xz_tables(const float *xtable, const float *ztable) + bool fill_xz_tables(const float *xtable, const float *ztable) { if(!deviceInitialized) { LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; - return; + return false; } cl_int err = CL_SUCCESS; cl::Event event0, event1; err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, xtable, NULL, &event0); - LOG_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, ztable, NULL, &event1); - LOG_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); err = event0.wait(); - LOG_CL_ERROR(err, "wait"); + CHECK_CL_ERROR(err, "wait"); err = event1.wait(); - LOG_CL_ERROR(err, "wait"); + CHECK_CL_ERROR(err, "wait"); + return true; } - void fill_lut(const short *lut) + bool fill_lut(const short *lut) { if(!deviceInitialized) { LOG_ERROR << "OpenCLDepthPacketProcessor is not initialized!"; - return; + return false; } cl_int err = CL_SUCCESS; cl::Event event0; err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut, NULL, &event0); - LOG_CL_ERROR(err, "enqueueWriteBuffer"); + CHECK_CL_ERROR(err, "enqueueWriteBuffer"); err = event0.wait(); - LOG_CL_ERROR(err, "wait"); + CHECK_CL_ERROR(err, "wait"); + return true; } }; From ce31c541a3b7a8e717926ee89e9c9c871bb3fe86 Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Fri, 19 Feb 2016 11:42:32 +0100 Subject: [PATCH 6/7] opencl: recommended changes Usage of LIBFREENECT2_WITH_PROFILING. Changed CHECK_CL macros. OpenCLAllocator can now be used for input and output buffers. OpenCLFrame now uses OpenCLBuffer from allocator. IMAGE_SIZE and LUT_SIZE as static const. Added Allocators for input and output buffers. Moved allocate_opencl to top. Added good method. --- .../libfreenect2/depth_packet_processor.h | 2 + src/opencl_depth_packet_processor.cpp | 409 +++++++----------- 2 files changed, 160 insertions(+), 251 deletions(-) diff --git a/include/internal/libfreenect2/depth_packet_processor.h b/include/internal/libfreenect2/depth_packet_processor.h index 26f975639..5072de059 100644 --- a/include/internal/libfreenect2/depth_packet_processor.h +++ b/include/internal/libfreenect2/depth_packet_processor.h @@ -175,7 +175,9 @@ class OpenCLDepthPacketProcessor : public DepthPacketProcessor virtual void loadXZTables(const float *xtable, const float *ztable); virtual void loadLookupTable(const short *lut); + virtual bool good(); virtual const char *name() { return "OpenCL"; } + virtual void process(const DepthPacket &packet); protected: virtual Allocator *getAllocator(); diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index ad21d806e..edb22dbcd 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -55,9 +55,9 @@ #include -#define CHECK_CL_ERROR(err, str) do {if (err != CL_SUCCESS) {LOG_ERROR << str << " failed: " << err; return false; } } while(0) - -#define WITH_PROFILING 0 +#define CHECK_CL_PARAM(expr) do { cl_int err = CL_SUCCESS; (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; return false; } } while(0) +#define CHECK_CL_RETURN(expr) do { cl_int err = (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; return false; } } while(0) +#define CHECK_CL_ON_FAIL(expr, on_fail) do { cl_int err = (expr); if (err != CL_SUCCESS) { LOG_ERROR << #expr ": " << err; on_fail; return false; } } while(0) namespace libfreenect2 { @@ -87,52 +87,74 @@ class OpenCLBuffer: public Buffer class OpenCLAllocator: public Allocator { private: - OpenCLDepthPacketProcessorImpl *impl_; - cl::Buffer buffer; + cl::Context &context; + cl::CommandQueue &queue; + const bool isInputBuffer; + + bool allocate_opencl(OpenCLBuffer *b, size_t size) + { + if(isInputBuffer) + { + CHECK_CL_PARAM(b->buffer = cl::Buffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err)); + CHECK_CL_PARAM(b->data = (unsigned char*)queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_WRITE, 0, size, NULL, NULL, &err)); + } + else + { + CHECK_CL_PARAM(b->buffer = cl::Buffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err)); + CHECK_CL_PARAM(b->data = (unsigned char*)queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_READ, 0, size, NULL, NULL, &err)); + } - bool allocate_opencl(OpenCLBuffer *b, size_t size); + b->length = 0; + b->capacity = size; + return true; + } + + bool release_opencl(OpenCLBuffer *b) + { + cl::Event event; + CHECK_CL_RETURN(queue.enqueueUnmapMemObject(b->buffer, b->data, NULL, &event)); + CHECK_CL_RETURN(event.wait()); + return true; + } public: - OpenCLAllocator(OpenCLDepthPacketProcessorImpl *impl_) : impl_(impl_) + OpenCLAllocator(cl::Context &context, cl::CommandQueue &queue, bool isInputBuffer) : context(context), queue(queue), isInputBuffer(isInputBuffer) { } virtual Buffer *allocate(size_t size) { OpenCLBuffer *b = new OpenCLBuffer(); - if (!allocate_opencl(b, size)) { - delete b; - b = NULL; - } + if(!allocate_opencl(b, size)) + b->data = NULL; return b; } virtual void free(Buffer *b) { - if (b == NULL || b->data == NULL) + if(b == NULL) return; + release_opencl(static_cast(b)); delete b; } }; class OpenCLFrame: public Frame { - bool allocate_opencl(size_t size, OpenCLDepthPacketProcessorImpl *impl_); +private: + OpenCLBuffer *buffer; public: - cl::Buffer frameBuffer; - - OpenCLFrame(size_t width, size_t height, size_t bytes_per_pixel, OpenCLDepthPacketProcessorImpl *impl_): - Frame(width, height, bytes_per_pixel, (unsigned char*)-1) + OpenCLFrame(OpenCLBuffer *buffer) + : Frame(512, 424, 4, (unsigned char*)-1) + , buffer(buffer) { - data = NULL; - - size_t size = width*height*bytes_per_pixel; - allocate_opencl(size, impl_); + data = buffer->data; } virtual ~OpenCLFrame() { + buffer->allocator->free(buffer); data = NULL; } }; @@ -140,11 +162,16 @@ class OpenCLFrame: public Frame class OpenCLDepthPacketProcessorImpl: public WithPerfLogging { public: + static const size_t IMAGE_SIZE = 512*424; + static const size_t LUT_SIZE = 2048; + libfreenect2::DepthPacketProcessor::Config config; DepthPacketProcessor::Parameters params; Frame *ir_frame, *depth_frame; - Allocator *allocator; + Allocator *input_buffer_allocator; + Allocator *ir_buffer_allocator; + Allocator *depth_buffer_allocator; cl::Context context; cl::Device device; @@ -157,9 +184,6 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging cl::Kernel kernel_processPixelStage2; cl::Kernel kernel_filterPixelStage2; - const size_t image_size; - const size_t lut_size; - // Read only buffers size_t buf_lut11to16_size; size_t buf_p0_table_size; @@ -201,15 +225,13 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool programInitialized; std::string sourceCode; -#if WITH_PROFILING +#if LIBFREENECT2_WITH_PROFILING std::vector timings; int count; #endif OpenCLDepthPacketProcessorImpl(const int deviceId = -1) - : image_size(512 * 424) - , lut_size(2048) - , deviceInitialized(false) + : deviceInitialized(false) , programBuilt(false) , programInitialized(false) { @@ -220,11 +242,13 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging deviceInitialized = initDevice(deviceId); + input_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, true)); + ir_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, false)); + depth_buffer_allocator = new PoolAllocator(new OpenCLAllocator(context, queue, false)); + newIrFrame(); newDepthFrame(); - allocator = new PoolAllocator(new OpenCLAllocator(this)); - const int CL_ICDL_VERSION = 2; typedef cl_int (*icdloader_func)(int, size_t, void*, size_t*); #ifdef _MSC_VER @@ -253,9 +277,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging ~OpenCLDepthPacketProcessorImpl() { - delete allocator; delete ir_frame; delete depth_frame; + delete input_buffer_allocator; + delete ir_buffer_allocator; + delete depth_buffer_allocator; } void generateOptions(std::string &options) const @@ -399,10 +425,8 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; } - cl_int err = CL_SUCCESS; std::vector platforms; - err = cl::Platform::get(&platforms); - CHECK_CL_ERROR(err, "cl::Platform::get"); + CHECK_CL_RETURN(cl::Platform::get(&platforms)); if(platforms.empty()) { @@ -420,8 +444,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging } LOG_INFO << "selected device: " << deviceString(device); - context = cl::Context(device, NULL, NULL, NULL, &err); - CHECK_CL_ERROR(err, "cl::Context"); + CHECK_CL_PARAM(context = cl::Context(device, NULL, NULL, NULL, &err)); if(!initBuffers()) return false; @@ -431,65 +454,48 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool initBuffers() { - cl_int err = CL_SUCCESS; -#if WITH_PROFILING +#if LIBFREENECT2_WITH_PROFILING count = 0; - queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); + CHECK_CL_PARAM(queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); #else - queue = cl::CommandQueue(context, device, 0, &err); + CHECK_CL_PARAM(queue = cl::CommandQueue(context, device, 0, &err)); #endif - CHECK_CL_ERROR(err, "cl::CommandQueue"); //Read only - buf_lut11to16_size = lut_size * sizeof(cl_short); - buf_p0_table_size = image_size * sizeof(cl_float3); - buf_x_table_size = image_size * sizeof(cl_float); - buf_z_table_size = image_size * sizeof(cl_float); - buf_packet_size = ((image_size * 11) / 16) * 10 * sizeof(cl_ushort); - - buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_p0_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_z_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_z_table_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_packet = cl::Buffer(context, CL_MEM_READ_ONLY, buf_packet_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); + buf_lut11to16_size = LUT_SIZE * sizeof(cl_short); + buf_p0_table_size = IMAGE_SIZE * sizeof(cl_float3); + buf_x_table_size = IMAGE_SIZE * sizeof(cl_float); + buf_z_table_size = IMAGE_SIZE * sizeof(cl_float); + buf_packet_size = ((IMAGE_SIZE * 11) / 16) * 10 * sizeof(cl_ushort); + + CHECK_CL_PARAM(buf_lut11to16 = cl::Buffer(context, CL_MEM_READ_ONLY, buf_lut11to16_size, NULL, &err)); + CHECK_CL_PARAM(buf_p0_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_p0_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_x_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_x_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_z_table = cl::Buffer(context, CL_MEM_READ_ONLY, buf_z_table_size, NULL, &err)); + CHECK_CL_PARAM(buf_packet = cl::Buffer(context, CL_MEM_READ_ONLY, buf_packet_size, NULL, &err)); //Read-Write - buf_a_size = image_size * sizeof(cl_float3); - buf_b_size = image_size * sizeof(cl_float3); - buf_n_size = image_size * sizeof(cl_float3); - buf_ir_size = image_size * sizeof(cl_float); - buf_a_filtered_size = image_size * sizeof(cl_float3); - buf_b_filtered_size = image_size * sizeof(cl_float3); - buf_edge_test_size = image_size * sizeof(cl_uchar); - buf_depth_size = image_size * sizeof(cl_float); - buf_ir_sum_size = image_size * sizeof(cl_float); - buf_filtered_size = image_size * sizeof(cl_float); - - buf_a = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_ir_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_b_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_edge_test = cl::Buffer(context, CL_MEM_READ_WRITE, buf_edge_test_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_depth = cl::Buffer(context, CL_MEM_READ_WRITE, buf_depth_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_ir_sum = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_sum_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - buf_filtered = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_filtered_size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); + buf_a_size = IMAGE_SIZE * sizeof(cl_float3); + buf_b_size = IMAGE_SIZE * sizeof(cl_float3); + buf_n_size = IMAGE_SIZE * sizeof(cl_float3); + buf_ir_size = IMAGE_SIZE * sizeof(cl_float); + buf_a_filtered_size = IMAGE_SIZE * sizeof(cl_float3); + buf_b_filtered_size = IMAGE_SIZE * sizeof(cl_float3); + buf_edge_test_size = IMAGE_SIZE * sizeof(cl_uchar); + buf_depth_size = IMAGE_SIZE * sizeof(cl_float); + buf_ir_sum_size = IMAGE_SIZE * sizeof(cl_float); + buf_filtered_size = IMAGE_SIZE * sizeof(cl_float); + + CHECK_CL_PARAM(buf_a = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_size, NULL, &err)); + CHECK_CL_PARAM(buf_b = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_size, NULL, &err)); + CHECK_CL_PARAM(buf_n = cl::Buffer(context, CL_MEM_READ_WRITE, buf_n_size, NULL, &err)); + CHECK_CL_PARAM(buf_ir = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_ir_size, NULL, &err)); + CHECK_CL_PARAM(buf_a_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_a_filtered_size, NULL, &err)); + CHECK_CL_PARAM(buf_b_filtered = cl::Buffer(context, CL_MEM_READ_WRITE, buf_b_filtered_size, NULL, &err)); + CHECK_CL_PARAM(buf_edge_test = cl::Buffer(context, CL_MEM_READ_WRITE, buf_edge_test_size, NULL, &err)); + CHECK_CL_PARAM(buf_depth = cl::Buffer(context, CL_MEM_READ_WRITE, buf_depth_size, NULL, &err)); + CHECK_CL_PARAM(buf_ir_sum = cl::Buffer(context, CL_MEM_READ_WRITE, buf_ir_sum_size, NULL, &err)); + CHECK_CL_PARAM(buf_filtered = cl::Buffer(context, CL_MEM_WRITE_ONLY, buf_filtered_size, NULL, &err)); return true; } @@ -505,66 +511,37 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging if (!buildProgram(sourceCode)) return false; - cl_int err = CL_SUCCESS; - kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage1.setArg(0, buf_lut11to16); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(1, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(2, buf_p0_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(3, buf_packet); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(4, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(5, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(6, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage1.setArg(7, buf_ir); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage1.setArg(0, buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(1, buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(2, buf_n); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(3, buf_a_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(4, buf_b_filtered); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage1.setArg(5, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - - kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(2, buf_x_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(3, buf_z_table); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(4, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_processPixelStage2.setArg(5, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - - kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err); - CHECK_CL_ERROR(err, "cl::Kernel"); - err = kernel_filterPixelStage2.setArg(0, buf_depth); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(1, buf_ir_sum); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(2, buf_edge_test); - CHECK_CL_ERROR(err, "setArg"); - err = kernel_filterPixelStage2.setArg(3, buf_filtered); - CHECK_CL_ERROR(err, "setArg"); + CHECK_CL_PARAM(kernel_processPixelStage1 = cl::Kernel(program, "processPixelStage1", &err)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(0, buf_lut11to16)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(1, buf_z_table)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(2, buf_p0_table)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(3, buf_packet)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(4, buf_a)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(5, buf_b)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(6, buf_n)); + CHECK_CL_RETURN(kernel_processPixelStage1.setArg(7, buf_ir)); + + CHECK_CL_PARAM(kernel_filterPixelStage1 = cl::Kernel(program, "filterPixelStage1", &err)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(0, buf_a)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(1, buf_b)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(2, buf_n)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(3, buf_a_filtered)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(4, buf_b_filtered)); + CHECK_CL_RETURN(kernel_filterPixelStage1.setArg(5, buf_edge_test)); + + CHECK_CL_PARAM(kernel_processPixelStage2 = cl::Kernel(program, "processPixelStage2", &err)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(0, config.EnableBilateralFilter ? buf_a_filtered : buf_a)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(1, config.EnableBilateralFilter ? buf_b_filtered : buf_b)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(2, buf_x_table)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(3, buf_z_table)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(4, buf_depth)); + CHECK_CL_RETURN(kernel_processPixelStage2.setArg(5, buf_ir_sum)); + + CHECK_CL_PARAM(kernel_filterPixelStage2 = cl::Kernel(program, "filterPixelStage2", &err)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(0, buf_depth)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(1, buf_ir_sum)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(2, buf_edge_test)); + CHECK_CL_RETURN(kernel_filterPixelStage2.setArg(3, buf_filtered)); programInitialized = true; return true; @@ -572,49 +549,38 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool run(const DepthPacket &packet) { - cl_int err; std::vector eventWrite(1), eventPPS1(1), eventFPS1(1), eventPPS2(1), eventFPS2(1); cl::Event eventReadIr, eventReadDepth; - err = queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0]); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - - err = queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventWrite, &eventPPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); - err = queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &eventReadIr); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_packet, CL_FALSE, 0, buf_packet_size, packet.buffer, NULL, &eventWrite[0])); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_processPixelStage1, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventWrite, &eventPPS1[0])); + CHECK_CL_RETURN(queue.enqueueReadBuffer(buf_ir, CL_FALSE, 0, buf_ir_size, ir_frame->data, &eventPPS1, &eventReadIr)); if(config.EnableBilateralFilter) { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS1, &eventFPS1[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_filterPixelStage1, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventPPS1, &eventFPS1[0])); } else { eventFPS1[0] = eventPPS1[0]; } - err = queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventFPS1, &eventPPS2[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_processPixelStage2, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventFPS1, &eventPPS2[0])); if(config.EnableEdgeAwareFilter) { - err = queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(image_size), cl::NullRange, &eventPPS2, &eventFPS2[0]); - CHECK_CL_ERROR(err, "enqueueNDRangeKernel"); + CHECK_CL_RETURN(queue.enqueueNDRangeKernel(kernel_filterPixelStage2, cl::NullRange, cl::NDRange(IMAGE_SIZE), cl::NullRange, &eventPPS2, &eventFPS2[0])); } else { eventFPS2[0] = eventPPS2[0]; } - err = queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &eventReadDepth); - CHECK_CL_ERROR(err, "enqueueReadBuffer"); - err = eventReadIr.wait(); - CHECK_CL_ERROR(err, "wait"); - err = eventReadDepth.wait(); - CHECK_CL_ERROR(err, "wait"); + CHECK_CL_RETURN(queue.enqueueReadBuffer(config.EnableEdgeAwareFilter ? buf_filtered : buf_depth, CL_FALSE, 0, buf_depth_size, depth_frame->data, &eventFPS2, &eventReadDepth)); + CHECK_CL_RETURN(eventReadIr.wait()); + CHECK_CL_RETURN(eventReadDepth.wait()); -#if WITH_PROFILING +#if LIBFREENECT2_WITH_PROFILING if(count == 0) { timings.clear(); @@ -655,25 +621,19 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool buildProgram(const std::string &sources) { - cl_int err; LOG_INFO << "building OpenCL program..."; std::string options; generateOptions(options); cl::Program::Sources source(1, std::make_pair(sources.c_str(), sources.length())); - program = cl::Program(context, source, &err); - CHECK_CL_ERROR(err, "cl::Program"); + CHECK_CL_PARAM(program = cl::Program(context, source, &err)); - err = program.build(options.c_str()); - if(err != CL_SUCCESS) - { + CHECK_CL_ON_FAIL(program.build(options.c_str()), LOG_ERROR << "failed to build program: " << err; LOG_ERROR << "Build Status: " << program.getBuildInfo(device); LOG_ERROR << "Build Options:\t" << program.getBuildInfo(device); - LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device); - return false; - } + LOG_ERROR << "Build Log:\t " << program.getBuildInfo(device)); LOG_INFO << "OpenCL program built successfully"; programBuilt = true; @@ -682,12 +642,12 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging void newIrFrame() { - ir_frame = new OpenCLFrame(512, 424, 4, this); + ir_frame = new OpenCLFrame(static_cast(ir_buffer_allocator->allocate(IMAGE_SIZE * sizeof(cl_float)))); } void newDepthFrame() { - depth_frame = new OpenCLFrame(512, 424, 4, this); + depth_frame = new OpenCLFrame(static_cast(depth_buffer_allocator->allocate(IMAGE_SIZE * sizeof(cl_float)))); } bool fill_trig_table(const libfreenect2::protocol::P0TablesResponse *p0table) @@ -698,7 +658,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; } - cl_float3 *p0_table = new cl_float3[image_size]; + cl_float3 *p0_table = new cl_float3[IMAGE_SIZE]; for(int r = 0; r < 424; ++r) { @@ -715,24 +675,9 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging } } - cl_int err = CL_SUCCESS; - cl::Event event0; - err = queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event0); - if(err != CL_SUCCESS) - { - LOG_ERROR << "enqueueWriteBuffer failed: " << err; - delete[] p0_table; - return false; - } - - err = event0.wait(); - if(err != CL_SUCCESS) - { - LOG_ERROR << "wait failed: " << err; - delete[] p0_table; - return false; - } - + cl::Event event; + CHECK_CL_ON_FAIL(queue.enqueueWriteBuffer(buf_p0_table, CL_FALSE, 0, buf_p0_table_size, p0_table, NULL, &event), delete[] p0_table); + CHECK_CL_ON_FAIL(event.wait(), delete[] p0_table); delete[] p0_table; return true; } @@ -745,17 +690,11 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; } - cl_int err = CL_SUCCESS; cl::Event event0, event1; - err = queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, xtable, NULL, &event0); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - err = queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, ztable, NULL, &event1); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); - err = event1.wait(); - CHECK_CL_ERROR(err, "wait"); + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_x_table, CL_FALSE, 0, buf_x_table_size, xtable, NULL, &event0)); + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_z_table, CL_FALSE, 0, buf_z_table_size, ztable, NULL, &event1)); + CHECK_CL_RETURN(event0.wait()); + CHECK_CL_RETURN(event1.wait()); return true; } @@ -767,50 +706,13 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging return false; } - cl_int err = CL_SUCCESS; - cl::Event event0; - err = queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut, NULL, &event0); - CHECK_CL_ERROR(err, "enqueueWriteBuffer"); - - err = event0.wait(); - CHECK_CL_ERROR(err, "wait"); + cl::Event event; + CHECK_CL_RETURN(queue.enqueueWriteBuffer(buf_lut11to16, CL_FALSE, 0, buf_lut11to16_size, lut, NULL, &event)); + CHECK_CL_RETURN(event.wait()); return true; } }; -bool OpenCLFrame::allocate_opencl(size_t size, OpenCLDepthPacketProcessorImpl *impl_) -{ - if(!impl_->deviceInitialized) - return false; - - cl_int err = CL_SUCCESS; - - frameBuffer = cl::Buffer(impl_->context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - data = (unsigned char*)impl_->queue.enqueueMapBuffer(frameBuffer, CL_TRUE, CL_MAP_READ, 0, size, NULL, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - return true; -} - -bool OpenCLAllocator::allocate_opencl(OpenCLBuffer *b, size_t size) -{ - if(!impl_->deviceInitialized) - return false; - - cl_int err = CL_SUCCESS; - - b->buffer = cl::Buffer(impl_->context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size, NULL, &err); - CHECK_CL_ERROR(err, "cl::Buffer"); - - b->data = (unsigned char*)impl_->queue.enqueueMapBuffer(b->buffer, CL_TRUE, CL_MAP_WRITE, 0, size, NULL, NULL, &err); - CHECK_CL_ERROR(err, "enqueueMapBuffer"); - - b->length = 0; - b->capacity = size; - return true; -} - OpenCLDepthPacketProcessor::OpenCLDepthPacketProcessor(const int deviceId) : impl_(new OpenCLDepthPacketProcessorImpl(deviceId)) { @@ -867,6 +769,11 @@ void OpenCLDepthPacketProcessor::loadLookupTable(const short *lut) impl_->fill_lut(lut); } +bool OpenCLDepthPacketProcessor::good() +{ + return impl_->deviceInitialized; +} + void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) { bool has_listener = this->listener_ != 0; @@ -904,7 +811,7 @@ void OpenCLDepthPacketProcessor::process(const DepthPacket &packet) Allocator *OpenCLDepthPacketProcessor::getAllocator() { - return impl_->allocator; + return impl_->input_buffer_allocator; } } /* namespace libfreenect2 */ From 2dfafb7588593c4c1a5983dc30669c4aedc17e43 Mon Sep 17 00:00:00 2001 From: Thiemo Wiedemeyer Date: Fri, 19 Feb 2016 13:37:54 +0100 Subject: [PATCH 7/7] opencl: different profiling definition for OpenCL Enabling profiling in OpenCL effects the performance, so for profiling libfreenect2s processors, it should be disabled and only used when testing improvements of the OpenCL code itself. --- src/opencl_depth_packet_processor.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/opencl_depth_packet_processor.cpp b/src/opencl_depth_packet_processor.cpp index edb22dbcd..7e93cd1e5 100644 --- a/src/opencl_depth_packet_processor.cpp +++ b/src/opencl_depth_packet_processor.cpp @@ -225,7 +225,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool programInitialized; std::string sourceCode; -#if LIBFREENECT2_WITH_PROFILING +#ifdef LIBFREENECT2_WITH_PROFILING_CL std::vector timings; int count; #endif @@ -454,7 +454,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging bool initBuffers() { -#if LIBFREENECT2_WITH_PROFILING +#ifdef LIBFREENECT2_WITH_PROFILING_CL count = 0; CHECK_CL_PARAM(queue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); #else @@ -580,7 +580,7 @@ class OpenCLDepthPacketProcessorImpl: public WithPerfLogging CHECK_CL_RETURN(eventReadIr.wait()); CHECK_CL_RETURN(eventReadDepth.wait()); -#if LIBFREENECT2_WITH_PROFILING +#ifdef LIBFREENECT2_WITH_PROFILING_CL if(count == 0) { timings.clear();