diff --git a/examples/vx_tiling_ext.c b/examples/vx_tiling_ext.c index aa3adc6..a367da4 100644 --- a/examples/vx_tiling_ext.c +++ b/examples/vx_tiling_ext.c @@ -220,7 +220,8 @@ static vx_status VX_CALLBACK vxAlphaOutputValidator(vx_node node, vx_uint32 inde return status; } - +//Move this struct into "include/VX/vx_khr_tiling.h" +#if 0 /*! [publish_support] */ typedef struct _vx_tiling_kernel_t { /*! kernel name */ @@ -246,6 +247,7 @@ typedef struct _vx_tiling_kernel_t { /*! border information. */ vx_border_t border; } vx_tiling_kernel_t; +#endif static vx_tiling_kernel_t tiling_kernels[] = { {"org.khronos.openvx.tiling_gaussian_3x3", @@ -255,6 +257,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { 2, {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxFilterInputValidator, vxFilterOutputValidator, {1, 1}, @@ -269,6 +272,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxAlphaInputValidator, vxAlphaOutputValidator, {1, 1}, @@ -282,6 +286,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { 2, {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxFilterInputValidator, vxFilterOutputValidator, {1, 1}, @@ -296,6 +301,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxAddInputValidator, vxAddOutputValidator, {1, 1}, @@ -319,6 +325,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxPublishKernels(vx_context context) tiling_kernels[k].flexible_function, tiling_kernels[k].fast_function, tiling_kernels[k].num_params, + tiling_kernels[k].validate, tiling_kernels[k].input_validator, tiling_kernels[k].output_validator); if (kernel) diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt index 75d1f72..99ee945 100644 --- a/kernels/CMakeLists.txt +++ b/kernels/CMakeLists.txt @@ -19,4 +19,7 @@ add_subdirectory( c_model ) add_subdirectory( debug ) add_subdirectory( extras ) +if (OPENVX_USE_TILING) + add_subdirectory( tiling ) +endif (OPENVX_USE_TILING) diff --git a/kernels/opencl/vx_and.cl b/kernels/opencl/vx_and.cl new file mode 100644 index 0000000..6a08e00 --- /dev/null +++ b/kernels/opencl/vx_and.cl @@ -0,0 +1,10 @@ + +__kernel void vx_and(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] & b[y * bsy + x * bsx]; +} diff --git a/kernels/opencl/vx_box3x3.cl b/kernels/opencl/vx_box3x3.cl new file mode 100644 index 0000000..3ee0d3d --- /dev/null +++ b/kernels/opencl/vx_box3x3.cl @@ -0,0 +1,87 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define BOX3x3 sum += (uint)src[x_top * ssx + y_top * ssy]; \ + sum += (uint)src[x * ssx + y_top * ssy]; \ + sum += (uint)src[x_bot * ssx + y_top * ssy]; \ + 
sum += (uint)src[x_top * ssx + y * ssy]; \ + sum += (uint)src[x * ssx + y * ssy]; \ + sum += (uint)src[x_bot * ssx + y * ssy]; \ + sum += (uint)src[x_top * ssx + y_bot * ssy]; \ + sum += (uint)src[x * ssx + y_bot * ssy]; \ + sum += (uint)src[x_bot * ssx + y_bot * ssy]; \ + sum = sum / 9; \ + dst[x * dsx + y * dsy] = (uchar)sum; \ + + +__kernel void vx_box3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + + sum = pixel[0] + pixel[1] + pixel[2] + pixel[3] + pixel[4] + pixel[5] + pixel[6] + pixel[7] + pixel[8]; + + sum = sum / 9; + dst[x * dsx + y * dsy] = (uchar)sum; + } + else + { + BOX3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + BOX3x3; + } +} diff --git a/kernels/opencl/vx_convolve.cl b/kernels/opencl/vx_convolve.cl new file mode 100644 index 0000000..84fec9d --- /dev/null +++ b/kernels/opencl/vx_convolve.cl @@ -0,0 +1,93 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define C_MAX_CONVOLUTION_DIM (15) +#define UINT8_MAX 255 + +#define Convolve \ + uchar slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + uint center_x = x, center_y = y; \ + int width = high_x, height = high_y; \ + int ky, kx; \ + uint dest_index = 0; \ + \ + if( bordermode == VX_BORDER_REPLICATE || bordermode == VX_BORDER_UNDEFINED ) \ + { \ + for (ky = -(int)conv_radius_y; ky <= (int)conv_radius_y; ++ky) \ + { \ + int yy = (int)(center_y + ky); \ + yy = yy < 0 ? 0 : yy >= height ? height - 1 : yy; \ + \ + for (kx = -(int)conv_radius_x; kx <= (int)conv_radius_x; ++kx, ++dest_index) \ + { \ + int xx = (int)(center_x + kx); \ + xx = xx < 0 ? 0 : xx >= width ? 
width - 1 : xx; \ + slice[dest_index] = src[xx * ssx + yy * ssy]; \ + } \ + } \ + } \ + else if( bordermode == VX_BORDER_CONSTANT ) \ + { \ + for (ky = -(int)conv_radius_y; ky <= (int)conv_radius_y; ++ky) \ + { \ + int yy = (int)(center_y + ky); \ + int ccase_y = yy < 0 || yy >= height; \ + \ + for (kx = -(int)conv_radius_x; kx <= (int)conv_radius_x; ++kx, ++dest_index) \ + { \ + int xx = (int)(center_x + kx); \ + int ccase = ccase_y || xx < 0 || xx >= width; \ + if( !ccase ) \ + slice[dest_index] = src[xx * ssx + yy * ssy]; \ + else \ + slice[dest_index] = (uchar)const_vaule; \ + } \ + } \ + } \ + \ + for (int i = 0; i < (int)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + \ + value = sum / (int)scale; \ + \ + if (value < 0) dst[x * dsx + y * dsy] = 0; \ + else if (value > UINT8_MAX) dst[x * dsx + y * dsy] = UINT8_MAX; \ + else dst[x * dsx + y * dsy] = value; + +__kernel void vx_Convolve(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + uint conv_width, uint conv_height, + uint scale, __global short *conv_mat, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + int low_x = 0, low_y = 0; + int high_x = get_global_size(0); + int high_y = get_global_size(1); + int sum = 0; + int value = 0; + + int conv_radius_x, conv_radius_y; + conv_radius_x = (int)conv_width / 2; + conv_radius_y = (int)conv_height / 2; + + if (bordermode == VX_BORDER_UNDEFINED) + { + low_x = conv_radius_x; + high_x = ((high_x >= (uint)conv_radius_x) ? high_x - conv_radius_x : 0); + low_y = conv_radius_y; + high_y = ((high_y >= (uint)conv_radius_y) ? high_y - conv_radius_y : 0); + } + + Convolve; + +} diff --git a/kernels/opencl/vx_dilate3x3.cl b/kernels/opencl/vx_dilate3x3.cl new file mode 100644 index 0000000..076bc7f --- /dev/null +++ b/kernels/opencl/vx_dilate3x3.cl @@ -0,0 +1,93 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar max_op(uchar a, uchar b) +{ + return a > b ? 
a : b; +} + +#define DILATE3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + max_value = pixels[0]; \ + for (i = 1; i < 9; i++) \ + max_value = max_op(max_value, pixels[i]); \ + dst[x * dsx + y * dsy] = max_value; \ + +__kernel void vx_dilate3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx, i; + uint dest_index = 0; + uchar pixels[9], max_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + max_value = pixels[0]; + for (i = 1; i < 9; i++) + max_value = max_op(max_value, pixels[i]); + + dst[x * dsx + y * dsy] = max_value; + } + else + { + DILATE3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + DILATE3x3; + } +} diff --git a/kernels/opencl/vx_erode3x3.cl b/kernels/opencl/vx_erode3x3.cl new file mode 100644 index 0000000..a694be9 --- /dev/null +++ b/kernels/opencl/vx_erode3x3.cl @@ -0,0 +1,93 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? 
a : b; +} + +#define ERODE3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + min_value = pixels[0]; \ + for (i = 1; i < 9; i++) \ + min_value = min_op(min_value, pixels[i]); \ + dst[x * dsx + y * dsy] = min_value; \ + +__kernel void vx_erode3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx, i; + uint dest_index = 0; + uchar pixels[9], min_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + min_value = pixels[0]; + for (i = 1; i < 9; i++) + min_value = min_op(min_value, pixels[i]); + + dst[x * dsx + y * dsy] = min_value; + } + else + { + ERODE3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? 
high_x - 1 : x + 1; + } + + ERODE3x3; + } +} diff --git a/kernels/opencl/vx_gaussian3x3.cl b/kernels/opencl/vx_gaussian3x3.cl new file mode 100644 index 0000000..112adf6 --- /dev/null +++ b/kernels/opencl/vx_gaussian3x3.cl @@ -0,0 +1,86 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define GAUSSIAN3x3 sum += (uint)src[x_top * ssx + y_top * ssy]; \ + sum += 2*(uint)src[x * ssx + y_top * ssy]; \ + sum += (uint)src[x_bot * ssx + y_top * ssy]; \ + sum += 2*(uint)src[x_top * ssx + y * ssy]; \ + sum += 4*(uint)src[x * ssx + y * ssy]; \ + sum += 2*(uint)src[x_bot * ssx + y * ssy]; \ + sum += (uint)src[x_top * ssx + y_bot * ssy]; \ + sum += 2*(uint)src[x * ssx + y_bot * ssy]; \ + sum += (uint)src[x_bot * ssx + y_bot * ssy]; \ + sum = sum / 16; \ + dst[x * dsx + y * dsy] = (uchar)sum; \ + +__kernel void vx_gaussian3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + + sum = pixel[0] + 2*pixel[1] + pixel[2] + 2*pixel[3] + 4*pixel[4] + 2*pixel[5] + pixel[6] + 2*pixel[7] + pixel[8]; + + sum = sum / 16; + dst[x * dsx + y * dsy] = (uchar)sum; + } + else + { + GAUSSIAN3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + GAUSSIAN3x3; + } +} diff --git a/kernels/opencl/vx_median3x3.cl b/kernels/opencl/vx_median3x3.cl new file mode 100644 index 0000000..2ae7a61 --- /dev/null +++ b/kernels/opencl/vx_median3x3.cl @@ -0,0 +1,123 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? a : b; +} + +uchar max_op(uchar a, uchar b) +{ + return a > b ? 
a : b; +} + +void sort_mid(uchar *a, uchar *b) +{ + const uchar min = min_op(*a, *b); + const uchar max = max_op(*a, *b); + + *a = min; + *b = max; +} + +#define SORT sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[4], &pixels[2]); \ + sort_mid(&pixels[6], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[2]); \ + +#define MEDIAN3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + SORT; \ + dst[x * dsx + y * dsy] = pixels[4]; \ + +__kernel void vx_median3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + uchar pixels[9]; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + SORT; + + dst[x * dsx + y * dsy] = pixels[4]; + } + else + { + MEDIAN3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? 
high_x - 1 : x + 1; + } + + MEDIAN3x3; + } +} diff --git a/kernels/opencl/vx_nonlinearfilter.cl b/kernels/opencl/vx_nonlinearfilter.cl new file mode 100644 index 0000000..a9d1958 --- /dev/null +++ b/kernels/opencl/vx_nonlinearfilter.cl @@ -0,0 +1,531 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_NONLINEAR 0x16 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define VX_NONLINEAR_FILTER_MEDIAN VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x0 +#define VX_NONLINEAR_FILTER_MIN VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x1 +#define VX_NONLINEAR_FILTER_MAX VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? a : b; +} + +uchar max_op(uchar a, uchar b) +{ + return a > b ? a : b; +} + +void sort_mid(uchar *a, uchar *b) +{ + const uchar min = min_op(*a, *b); + const uchar max = max_op(*a, *b); + + *a = min; + *b = max; +} + +#define SORT_MID_CROSS_3x3 sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[0], &pixels[2]); \ + sort_mid(&pixels[1], &pixels[3]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[0], &pixels[4]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[4]); \ + +#define SORT_MID_3x3 sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[4], &pixels[2]); \ + sort_mid(&pixels[6], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[2]); \ + + +#define SORT_MID_DISK_5x5 sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[8], &pixels[9]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[12], &pixels[13]); \ + sort_mid(&pixels[14], &pixels[15]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[18], &pixels[19]); \ + sort_mid(&pixels[0], &pixels[2]); \ + sort_mid(&pixels[1], &pixels[3]); \ + sort_mid(&pixels[4], &pixels[6]); \ + sort_mid(&pixels[5], &pixels[7]); \ + sort_mid(&pixels[8], &pixels[10]); \ + sort_mid(&pixels[9], &pixels[11]); \ + sort_mid(&pixels[12], &pixels[14]); \ + sort_mid(&pixels[13], &pixels[15]); \ + sort_mid(&pixels[16], &pixels[18]); \ + sort_mid(&pixels[17], &pixels[19]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[5], &pixels[6]); \ + sort_mid(&pixels[0], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[7]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[8], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[15]); \ + sort_mid(&pixels[17], &pixels[18]); \ + sort_mid(&pixels[16], &pixels[20]); \ + sort_mid(&pixels[1], &pixels[5]); \ + sort_mid(&pixels[2], 
&pixels[6]); \ + sort_mid(&pixels[9], &pixels[13]); \ + sort_mid(&pixels[10], &pixels[14]); \ + sort_mid(&pixels[0], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[15]); \ + sort_mid(&pixels[17], &pixels[20]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[9], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[14]); \ + sort_mid(&pixels[18], &pixels[20]); \ + sort_mid(&pixels[0], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[5]); \ + sort_mid(&pixels[10], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[13]); \ + sort_mid(&pixels[1], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[14]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[11], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[8]); \ + sort_mid(&pixels[2], &pixels[10]); \ + sort_mid(&pixels[5], &pixels[13]); \ + sort_mid(&pixels[7], &pixels[14]); \ + sort_mid(&pixels[3], &pixels[11]); \ + sort_mid(&pixels[2], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[12]); \ + sort_mid(&pixels[7], &pixels[13]); \ + sort_mid(&pixels[1], &pixels[17]); \ + sort_mid(&pixels[3], &pixels[10]); \ + sort_mid(&pixels[5], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[18]); \ + sort_mid(&pixels[3], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[12]); \ + sort_mid(&pixels[2], &pixels[16]); \ + sort_mid(&pixels[3], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[12]); \ + sort_mid(&pixels[5], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[10]); \ + sort_mid(&pixels[4], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[11]); \ + sort_mid(&pixels[3], &pixels[19]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[10]); \ + sort_mid(&pixels[3], &pixels[18]); \ + sort_mid(&pixels[4], &pixels[20]); \ + sort_mid(&pixels[6], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[9]); \ + sort_mid(&pixels[3], &pixels[17]); \ + sort_mid(&pixels[5], &pixels[20]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[3], &pixels[16]); \ + sort_mid(&pixels[6], &pixels[20]); \ + sort_mid(&pixels[5], &pixels[17]); \ + sort_mid(&pixels[7], &pixels[20]); \ + sort_mid(&pixels[4], &pixels[16]); \ + sort_mid(&pixels[6], &pixels[18]); \ + sort_mid(&pixels[5], &pixels[16]); \ + sort_mid(&pixels[7], &pixels[19]); \ + sort_mid(&pixels[7], &pixels[18]); \ + sort_mid(&pixels[6], &pixels[16]); \ + sort_mid(&pixels[7], &pixels[17]); \ + sort_mid(&pixels[10], &pixels[18]); \ + sort_mid(&pixels[7], &pixels[16]); \ + sort_mid(&pixels[9], &pixels[17]); \ + sort_mid(&pixels[8], &pixels[16]); \ + sort_mid(&pixels[9], &pixels[16]); \ + sort_mid(&pixels[10], &pixels[16]); \ + + + + #define SORT_MID_BOX_5x5 sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[6], &pixels[9]); \ + sort_mid(&pixels[8], &pixels[11]); \ + sort_mid(&pixels[8], &pixels[9]); \ + 
sort_mid(&pixels[7], &pixels[10]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[0], &pixels[6]); \ + sort_mid(&pixels[4], &pixels[10]); \ + sort_mid(&pixels[4], &pixels[6]); \ + sort_mid(&pixels[2], &pixels[8]); \ + sort_mid(&pixels[2], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[8]); \ + sort_mid(&pixels[1], &pixels[7]); \ + sort_mid(&pixels[5], &pixels[11]); \ + sort_mid(&pixels[5], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[9]); \ + sort_mid(&pixels[3], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[9]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[5], &pixels[6]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[12], &pixels[13]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[12], &pixels[15]); \ + sort_mid(&pixels[14], &pixels[17]); \ + sort_mid(&pixels[14], &pixels[15]); \ + sort_mid(&pixels[13], &pixels[16]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[18], &pixels[19]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[21], &pixels[23]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[22], &pixels[23]); \ + sort_mid(&pixels[18], &pixels[21]); \ + sort_mid(&pixels[20], &pixels[23]); \ + sort_mid(&pixels[20], &pixels[21]); \ + sort_mid(&pixels[19], &pixels[22]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[12], &pixels[18]); \ + sort_mid(&pixels[16], &pixels[22]); \ + sort_mid(&pixels[16], &pixels[18]); \ + sort_mid(&pixels[14], &pixels[20]); \ + sort_mid(&pixels[20], &pixels[24]); \ + sort_mid(&pixels[14], &pixels[16]); \ + sort_mid(&pixels[18], &pixels[20]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[13], &pixels[19]); \ + sort_mid(&pixels[17], &pixels[23]); \ + sort_mid(&pixels[17], &pixels[19]); \ + sort_mid(&pixels[15], &pixels[21]); \ + sort_mid(&pixels[15], &pixels[17]); \ + sort_mid(&pixels[19], &pixels[21]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[17], &pixels[18]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[0], &pixels[12]); \ + sort_mid(&pixels[8], &pixels[20]); \ + sort_mid(&pixels[8], &pixels[12]); \ + sort_mid(&pixels[4], &pixels[16]); \ + sort_mid(&pixels[16], &pixels[24]); \ + sort_mid(&pixels[12], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[14]); \ + sort_mid(&pixels[10], &pixels[22]); \ + sort_mid(&pixels[10], &pixels[14]); \ + sort_mid(&pixels[6], &pixels[18]); \ + sort_mid(&pixels[6], &pixels[10]); \ + sort_mid(&pixels[10], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[13]); \ + sort_mid(&pixels[9], &pixels[21]); \ + sort_mid(&pixels[9], &pixels[13]); \ + sort_mid(&pixels[5], &pixels[17]); \ + sort_mid(&pixels[13], &pixels[17]); \ + sort_mid(&pixels[3], &pixels[15]); \ + sort_mid(&pixels[11], &pixels[23]); \ + sort_mid(&pixels[11], &pixels[15]); \ + sort_mid(&pixels[7], &pixels[19]); \ + sort_mid(&pixels[7], &pixels[11]); \ + 
sort_mid(&pixels[11], &pixels[13]); \ + sort_mid(&pixels[11], &pixels[12]); \ + + +#define FILTER_VALUE_3x3 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_3x3; \ + \ + dst[x * dsx + y * dsy] = pixels[4]; \ + \ + break; \ + } \ + } \ + + +#define FILTER_CROSS_3x3 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_CROSS_3x3; \ + \ + dst[x * dsx + y * dsy] = pixels[2]; \ + \ + break; \ + } \ + } \ + + +#define FILTER_DISK_5x5 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_DISK_5x5; \ + \ + dst[x * dsx + y * dsy] = pixels[10]; \ + \ + break; \ + } \ + } \ + +#define FILTER_BOX_5x5 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_BOX_5x5; \ + \ + dst[x * dsx + y * dsy] = pixels[12]; \ + \ + break; \ + } \ + } \ + + +__kernel void vx_nonlinearfilter(uint function, int ssx, int ssy, __global uchar *src, + __global uchar *mask, uint left, uint top, uint right, uint bottom, + int mat_rows, int count_mask, int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + + int ky, kx, i; + uint dest_index = 0; + uint mask_index = 0; + uchar pixels[25], min_value, max_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + for (ky = -(int)top; ky <= (int)bottom; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -(int)left; kx <= (int)right; ++kx, ++mask_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (mask[mask_index]) + { + if (!ccase) + pixels[dest_index++] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index++] = const_vaule; + } + } + } + + switch 
(mat_rows) + { + case 3 : //mask = 3x3 + { + if (count_mask == 5) + { + FILTER_CROSS_3x3; + } + else //count_mask = 9 + { + FILTER_VALUE_3x3; + } + break; + } + case 5 : //mask = 5x5 + { + if (count_mask == 9) + { + FILTER_VALUE_3x3; + } + else if (count_mask == 21) + { + FILTER_DISK_5x5; + } + else //count_mask = 25 + { + FILTER_BOX_5x5; + } + break; + } + } + } + else + { + for (ky = -(int)top; ky <= (int)bottom; ++ky) + { + int yy = y + ky; + yy = yy < 0 ? 0 : yy >= high_y ? high_y - 1 : yy; + + for (kx = -(int)left; kx <= (int)right; ++kx, ++mask_index) + { + int xx = x + kx; + xx = xx < 0 ? 0 : xx >= high_x ? high_x - 1 : xx; + if (mask[mask_index]) + pixels[dest_index++] = src[xx * ssx + yy * ssy]; + } + } + + switch (mat_rows) + { + case 3 : //mask = 3x3 + { + if (count_mask == 5) + { + FILTER_CROSS_3x3; + } + else //count_mask = 9 + { + FILTER_VALUE_3x3; + } + break; + } + case 5 : //mask = 5x5 + { + if (count_mask == 9) + { + FILTER_VALUE_3x3; + } + else if (count_mask == 21) + { + FILTER_DISK_5x5; + } + else //count_mask = 25 + { + FILTER_BOX_5x5; + } + break; + } + } + } +} \ No newline at end of file diff --git a/kernels/opencl/vx_not.cl b/kernels/opencl/vx_not.cl new file mode 100644 index 0000000..106c21c --- /dev/null +++ b/kernels/opencl/vx_not.cl @@ -0,0 +1,9 @@ + +__kernel void vx_not(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + b[y * bsy + x * bsx] = ~a[y * asy + x * asx]; +} diff --git a/kernels/opencl/vx_orr.cl b/kernels/opencl/vx_orr.cl new file mode 100644 index 0000000..6b7195f --- /dev/null +++ b/kernels/opencl/vx_orr.cl @@ -0,0 +1,10 @@ + +__kernel void vx_orr(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] | b[y * bsy + x * bsx]; +} diff --git a/kernels/opencl/vx_phase.cl b/kernels/opencl/vx_phase.cl new file mode 100644 index 0000000..b624c49 --- /dev/null +++ b/kernels/opencl/vx_phase.cl @@ -0,0 +1,56 @@ + +#define DBL_EPSILON 2.2204460492503131e-016 + +#define M_PI 3.1415926535897932384626433832795 + +#define ABS(x) ((x) > 0 ? (x) : -(x)) + +#define FLOOR(x) (x > 0 ? 
(int)(x) : (int)(x - 0.99)) + +__kernel void vx_phase(int ssx0, int ssy0, __global short *src0, + int ssx1, int ssy1, __global short *src1, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + float scale = 256.0f / 360.0f; + + float P1 = ((float)( 0.9997878412794807 * (180.0 / M_PI) * scale)), + P3 = ((float)(-0.3258083974640975 * (180.0 / M_PI) * scale)), + P5 = ((float)( 0.1555786518463281 * (180.0 / M_PI) * scale)), + P7 = ((float)(-0.04432655554792128 * (180.0 / M_PI) * scale)), + A_90 = ((float)(90.f * scale)), + A_180 = ((float)(180.f * scale)), + A_360 = ((float)(360.f * scale)); + + /* -M_PI to M_PI */ + float val_x; + float val_y; + + val_x = (float)(src0[x * ssx0 / 2 + y * ssy0 / 2]); + val_y = (float)(src1[x * ssx1 / 2 + y * ssy1 / 2]); + + float arct; + + float ax = ABS(val_x), ay = ABS(val_y); + float c, c2; + if (ax >= ay) + { + c = ay / (ax + (float)DBL_EPSILON); + c2 = c * c; + arct = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; + } + else + { + c = ax / (ay + (float)DBL_EPSILON); + c2 = c * c; + arct = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; + } + if (val_x < 0) + arct = A_180 - arct; + if (val_y < 0) + arct = A_360 - arct; + + dst[x * dsx + y * dsy] = (uchar)(int)floor(arct + 0.5f); +} diff --git a/kernels/opencl/vx_sobel3x3.cl b/kernels/opencl/vx_sobel3x3.cl new file mode 100644 index 0000000..31fe8c6 --- /dev/null +++ b/kernels/opencl/vx_sobel3x3.cl @@ -0,0 +1,114 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define SOBEL3x3_gx sx -= (uint)src[x_top * ssx + y_top * ssy]; \ + sx -= 2 * (uint)src[x_top * ssx + y * ssy]; \ + sx -= (uint)src[x_top * ssx + y_bot * ssy]; \ + sx += (uint)src[x_bot * ssx + y_top * ssy]; \ + sx += 2 * (uint)src[x_bot * ssx + y * ssy]; \ + sx += (uint)src[x_bot * ssx + y_bot * ssy]; \ + gx[x * dsx1 + y * dsy1] = (short)sx; \ + +#define SOBEL3x3_gy sy -= (uint)src[x_top * ssx + y_top * ssy]; \ + sy -= 2 * (uint)src[x * ssx + y_top * ssy]; \ + sy -= (uint)src[x_bot * ssx + y_top * ssy]; \ + sy += (uint)src[x_top * ssx + y_bot * ssy]; \ + sy += 2 * (uint)src[x * ssx + y_bot * ssy]; \ + sy += (uint)src[x_bot * ssx + y_bot * ssy]; \ + gy[x * dsx2 + y * dsy2] = (short)sy; \ + + +__kernel void vx_sobel3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx1, int dsy1, __global short *gx, + int dsx2, int dsy2, __global short *gy) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + int sx = 0, sy = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + 
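+            // Descriptive note: at this point pixel[0..8] holds the 3x3 neighbourhood in
+            // row-major order (pixel[0] = top-left, pixel[4] = centre, pixel[8] = bottom-right),
+            // with out-of-image samples replaced by const_vaule. The sums below are the
+            // standard Sobel responses: gx weights the right column (1,2,1) against the
+            // left column, gy weights the bottom row against the top row.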
+ if (gx) + { + sx = pixel[8] + 2*pixel[5] - pixel[6] - pixel[0] - 2*pixel[3] + pixel[2]; + + gx[x * dsx1 + y * dsy1] = (short)sx; + } + if (gy) + { + sy = pixel[6] + 2*pixel[7] + pixel[8] - pixel[0] - 2*pixel[1] - pixel[2]; + + gy[x * dsx2 + y * dsy2] = (short)sy; + } + } + else + { + if (gx) + { + SOBEL3x3_gx; + } + if (gy) + { + SOBEL3x3_gy; + } + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + if (gx) + { + SOBEL3x3_gx; + } + if (gy) + { + SOBEL3x3_gy; + } + } +} diff --git a/kernels/opencl/vx_warp_affine.cl b/kernels/opencl/vx_warp_affine.cl new file mode 100644 index 0000000..e0009be --- /dev/null +++ b/kernels/opencl/vx_warp_affine.cl @@ -0,0 +1,273 @@ + +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_INTERPOLATION 0x04 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#define VX_INTERPOLATION_NEAREST_NEIGHBOR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0 +#define VX_INTERPOLATION_BILINEAR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1 + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, \ + uint name##_stride_x, \ + uint name##_step_x, \ + uint name##_stride_y, \ + uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 
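+* It is normally invoked through the CONVERT_TO_IMAGE_STRUCT / CONVERT_TO_IMAGE_STRUCT_NO_STEP
+* macros above; the pointer is advanced by offset_first_element_in_bytes plus
+* get_global_id(0) * step_x + get_global_id(1) * step_y, so that offset(&img, x, y)
+* afterwards addresses pixels relative to this work item.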
+* +* @param[in] ptr Pointer to the starting postion of the buffer +* @param[in] offset_first_element_in_bytes The offset of the first element in the source image +* @param[in] stride_x Stride of the image in X dimension (in bytes) +* @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) +* @param[in] stride_y Stride of the image in Y dimension (in bytes) +* @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) +* +* @return An image object +*/ +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Get the pointer position of a Image +* +* @param[in] img Pointer to the starting position of the buffer +* @param[in] x Relative X position +* @param[in] y Relative Y position +*/ +inline __global uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Clamps the given coordinates to the borders according to the border size. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size of the image + * + */ +inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +{ + const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); + const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); + + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Clamps the given coordinates to the borders. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * + */ +inline const float8 clamp_to_border(float8 coords, const float width, const float height) +{ + return clamp_to_border_with_size(coords, width, height, 0); +} + +/** Reads four texels from the input image. The coords vector is used to determine which texels to be read. + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of coordinates to be read from the image. + */ +inline const VEC_DATA_TYPE(uchar, 4) read_texels4(const Image *in, const int8 coords) +{ + return (VEC_DATA_TYPE(uchar, 4))(*((__global uchar *)offset(in, coords.s0, coords.s1)), + *((__global uchar *)offset(in, coords.s2, coords.s3)), + *((__global uchar *)offset(in, coords.s4, coords.s5)), + *((__global uchar *)offset(in, coords.s6, coords.s7))); +} + +/** Returns the current thread coordinates. */ +inline const float2 get_current_coords() +{ + return (float2)(get_global_id(0) * 4, get_global_id(1)); +} + +/** Transforms 4 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * + * @param[in] coord 2D coordinate to transform. 
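+ *                  Only the coordinate of the first pixel is given; the X positions
+ *                  x, x+1, x+2, x+3 of the four processed pixels are derived internally.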
+ * @param[in] mtx affine matrix + * + * @return a int8 containing 4 2D transformed values. + */ +inline const float8 apply_affine_transform(const float2 coord, const float8 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // transform [x,x+1,x+2,x+3] + const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4))); + // transform [y,y+1,y+2,y+3] + const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5))); + + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + + +/** Given a texel coordinates this function will return the following array of coordinates: + * [ P, right neighbour, below neighbour, below right neighbour ] + * + * @note No checks to see if the coordinates are out of the image are done here. + * + * @param[in] coord Input coordinates + * + * @return vector of 8 floats with the coordinates, even positions are x and odd y. + */ +inline const float8 get_neighbour_coords(const float2 coord) +{ + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); +} + +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +{ + // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
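+    // Each of the four sample positions is floored and expanded to its 2x2 neighbourhood
+    // (tl, tr, bl, br); the result is the usual bilinear blend
+    //   value = tl*(1-ax)*(1-ay) + tr*ax*(1-ay) + bl*(1-ax)*ay + br*ax*ay
+    // where (ax, ay) = coords - floor(coords) is the fractional part of the coordinate.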
+ + // Sets the 4x4 coordinates for each of the four input texels + const float8 fc = floor(coords); + const float16 c1 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + + // Loads the values from the input image + const float16 t = (float16)( + /* tl, tr, bl, br */ + * ((__global uchar *)offset(in, c1.s0, c1.s1)), *((__global uchar *)offset(in, c1.s2, c1.s3)), + *((__global uchar *)offset(in, c1.s4, c1.s5)), *((__global uchar *)offset(in, c1.s6, c1.s7)), + *((__global uchar *)offset(in, c1.s8, c1.s9)), *((__global uchar *)offset(in, c1.sa, c1.sb)), + *((__global uchar *)offset(in, c1.sc, c1.sd)), *((__global uchar *)offset(in, c1.se, c1.sf)), + *((__global uchar *)offset(in, c2.s0, c2.s1)), *((__global uchar *)offset(in, c2.s2, c2.s3)), + *((__global uchar *)offset(in, c2.s4, c2.s5)), *((__global uchar *)offset(in, c2.s6, c2.s7)), + *((__global uchar *)offset(in, c2.s8, c2.s9)), *((__global uchar *)offset(in, c2.sa, c2.sb)), + *((__global uchar *)offset(in, c2.sc, c2.sd)), *((__global uchar *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = (float4)( + ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + return CONVERT(fr, VEC_DATA_TYPE(uchar, 4)); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +{ + return bilinear_interpolate_with_border(in, coords, width, height, 1); +} + +/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. + * + * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * output(x,y) = input(x0,y0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_affine( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height, + __global float matrix[9], + const uchar constValue, + const int type) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + float8 mat = (float8)(matrix[0], matrix[1], matrix[2], matrix[3], matrix[4], matrix[5], 0.0, 0.0); + float8 coords = apply_affine_transform(get_current_coords(), mat); + + if (type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(coords, width, height))), 0, out.ptr); + else if (type == VX_INTERPOLATION_BILINEAR) + vstore4(bilinear_interpolate(&in, coords, width, height), 0, out.ptr); + + if (coords.even.s0 < 0 || coords.odd.s0 < 0 || coords.even.s0 >= width || coords.odd.s0 >= height) + { + out.ptr[0] = constValue; + } + if (coords.even.s1 < 0 || coords.odd.s1 < 0 || coords.even.s1 >= width || coords.odd.s1 >= height) + { + out.ptr[1] = constValue; + } + if (coords.even.s2 < 0 || coords.odd.s2 < 0 || coords.even.s2 >= width || coords.odd.s2 >= height) + { + out.ptr[2] = constValue; + } + if (coords.even.s3 < 0 || coords.odd.s3 < 0 || coords.even.s3 >= width || coords.odd.s3 >= height) + { + out.ptr[3] = constValue; + } +} diff --git a/kernels/opencl/vx_warp_perspective.cl b/kernels/opencl/vx_warp_perspective.cl new file mode 100644 index 0000000..405dfb6 --- /dev/null +++ b/kernels/opencl/vx_warp_perspective.cl @@ -0,0 +1,277 @@ +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_INTERPOLATION 0x04 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#define VX_INTERPOLATION_NEAREST_NEIGHBOR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0 +#define VX_INTERPOLATION_BILINEAR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1 + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, \ + uint name##_stride_x, \ + uint name##_step_x, \ + uint name##_stride_y, \ + uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, 
name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. +* +* @param[in] ptr Pointer to the starting postion of the buffer +* @param[in] offset_first_element_in_bytes The offset of the first element in the source image +* @param[in] stride_x Stride of the image in X dimension (in bytes) +* @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) +* @param[in] stride_y Stride of the image in Y dimension (in bytes) +* @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) +* +* @return An image object +*/ +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Get the pointer position of a Image +* +* @param[in] img Pointer to the starting position of the buffer +* @param[in] x Relative X position +* @param[in] y Relative Y position +*/ +inline __global uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Clamps the given coordinates to the borders according to the border size. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size of the image + * + */ +inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +{ + const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); + const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); + + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Clamps the given coordinates to the borders. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * + */ +inline const float8 clamp_to_border(float8 coords, const float width, const float height) +{ + return clamp_to_border_with_size(coords, width, height, 1); +} + +/** Reads four texels from the input image. The coords vector is used to determine which texels to be read. + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of coordinates to be read from the image. 
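+ *                   Even elements are X coordinates and odd elements are Y coordinates,
+ *                   i.e. (coords.s0, coords.s1) is the first texel, (coords.s2, coords.s3)
+ *                   the second, and so on.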
+ */ +inline const VEC_DATA_TYPE(uchar, 4) read_texels4(const Image *in, const int8 coords) +{ + return (VEC_DATA_TYPE(uchar, 4))(*((__global uchar *)offset(in, coords.s0, coords.s1)), + *((__global uchar *)offset(in, coords.s2, coords.s3)), + *((__global uchar *)offset(in, coords.s4, coords.s5)), + *((__global uchar *)offset(in, coords.s6, coords.s7))); +} + +/** Returns the current thread coordinates. */ +inline const float2 get_current_coords() +{ + return (float2)(get_global_id(0) * 4, get_global_id(1)); +} + +/** Transforms four 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + * + * (x0/z0,y0/z0) + * + * @param[in] coord 2D coordinate to transform. + * @param[in] mtx perspective matrix + * + * @return a vector float8 containing four 2D transformed values. + */ +inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // transform [z,z+1,z+2,z+3] + const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8))); + // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation + // transform [x,x+1,x+2,x+3] + const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z; + // transform [y,y+1,y+2,y+3] + const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z; + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + + +/** Given a texel coordinates this function will return the following array of coordinates: + * [ P, right neighbour, below neighbour, below right neighbour ] + * + * @note No checks to see if the coordinates are out of the image are done here. + * + * @param[in] coord Input coordinates + * + * @return vector of 8 floats with the coordinates, even positions are x and odd y. + */ +inline const float8 get_neighbour_coords(const float2 coord) +{ + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); +} + +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. 
+ * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +{ + // Sets the 4x4 coordinates for each of the four input texels + const float8 fc = floor(coords); + const float16 c1 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + + // Loads the values from the input image + const float16 t = (float16)( + /* tl, tr, bl, br */ + * ((__global uchar *)offset(in, c1.s0, c1.s1)), *((__global uchar *)offset(in, c1.s2, c1.s3)), + *((__global uchar *)offset(in, c1.s4, c1.s5)), *((__global uchar *)offset(in, c1.s6, c1.s7)), + *((__global uchar *)offset(in, c1.s8, c1.s9)), *((__global uchar *)offset(in, c1.sa, c1.sb)), + *((__global uchar *)offset(in, c1.sc, c1.sd)), *((__global uchar *)offset(in, c1.se, c1.sf)), + *((__global uchar *)offset(in, c2.s0, c2.s1)), *((__global uchar *)offset(in, c2.s2, c2.s3)), + *((__global uchar *)offset(in, c2.s4, c2.s5)), *((__global uchar *)offset(in, c2.s6, c2.s7)), + *((__global uchar *)offset(in, c2.s8, c2.s9)), *((__global uchar *)offset(in, c2.sa, c2.sb)), + *((__global uchar *)offset(in, c2.sc, c2.sd)), *((__global uchar *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = (float4)( + ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + return CONVERT(fr, VEC_DATA_TYPE(uchar, 4)); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +{ + return bilinear_interpolate_with_border(in, coords, width, height, 1); +} + +/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. 
+ * + * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + * + * output(x,y) = input(x0/z0,y0/z0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_perspective( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height, + __global float matrix[9], + const uchar constValue, + const int type) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + float16 mat = (float16)(matrix[0], matrix[1], matrix[2], matrix[3], matrix[4], matrix[5], matrix[6], matrix[7], matrix[8], 0, 0, 0, (float4)0); + float8 coords = apply_perspective_transform(get_current_coords(), mat); + + if (type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(coords, width, height))), 0, out.ptr); + else if (type == VX_INTERPOLATION_BILINEAR) + vstore4(bilinear_interpolate(&in, coords, width, height), 0, out.ptr); + + if (coords.even.s0 < 0 || coords.odd.s0 < 0 || coords.even.s0 >= width || coords.odd.s0 >= height) + { + out.ptr[0] = constValue; + } + if (coords.even.s1 < 0 || coords.odd.s1 < 0 || coords.even.s1 >= width || coords.odd.s1 >= height) + { + out.ptr[1] = constValue; + } + if (coords.even.s2 < 0 || coords.odd.s2 < 0 || coords.even.s2 >= width || coords.odd.s2 >= height) + { + out.ptr[2] = constValue; + } + if (coords.even.s3 < 0 || coords.odd.s3 < 0 || coords.even.s3 >= width || coords.odd.s3 >= height) + { + out.ptr[3] = constValue; + } +} diff --git a/kernels/opencl/vx_xor.cl b/kernels/opencl/vx_xor.cl new file mode 100644 index 0000000..81ad9a7 --- /dev/null +++ b/kernels/opencl/vx_xor.cl @@ -0,0 +1,10 @@ + +__kernel void vx_xor(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] ^ b[y * bsy + x * bsx]; +} 
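Editorial note, not part of the patch: the element-wise kernels such as vx_xor above all take the same argument pattern -- x stride and y stride in bytes followed by the plane pointer, repeated for both inputs and the output -- and run one work-item per pixel. The sketch below shows one way a host could wire that up, assuming a densely packed U8 plane (stride_x = 1 byte, stride_y = row pitch); the helper name enqueue_vx_xor and the surrounding setup (context, program build, buffer creation, real error handling) are hypothetical and omitted here.

#include <CL/cl.h>

/* Hypothetical helper: launches vx_xor over a width x height U8 plane.
 * Arguments follow the kernel signature
 *   vx_xor(asx, asy, a, bsx, bsy, b, csx, csy, c)
 * with all three planes assumed densely packed. */
static cl_int enqueue_vx_xor(cl_command_queue queue, cl_kernel kernel,
                             cl_mem a, cl_mem b, cl_mem c,
                             size_t width, size_t height, cl_int row_pitch)
{
    cl_int sx = 1;          /* byte stride between neighbouring pixels */
    cl_int sy = row_pitch;  /* byte stride between rows                */
    cl_int err = CL_SUCCESS;

    err |= clSetKernelArg(kernel, 0, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &a);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 4, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &b);
    err |= clSetKernelArg(kernel, 6, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 7, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &c);

    /* One work-item per pixel: get_global_id(0) is x, get_global_id(1) is y. */
    size_t global[2] = { width, height };
    err |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL,
                                  0, NULL, NULL);
    return err;  /* crude accumulation: nonzero means at least one call failed */
}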
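Editorial note, not part of the patch: the coordinate mapping that apply_perspective_transform vectorises in vx_warp_perspective.cl can be hard to read back out of the float16 lane arithmetic. A minimal scalar restatement is sketched below, assuming the nine coefficients arrive in the order the kernel reads them (mtx.s0..s8, i.e. column-major M, so m[0..2] is the first column); perspective_map is a hypothetical helper name, not something defined in this patch.

/* Scalar form of the mapping used by apply_perspective_transform above. */
static void perspective_map(const float m[9], float x, float y,
                            float *src_x, float *src_y)
{
    /* x0 = M[1][1]*x + M[1][2]*y + M[1][3], and likewise for y0 and z0 */
    const float x0 = m[0] * x + m[3] * y + m[6];
    const float y0 = m[1] * x + m[4] * y + m[7];
    const float z0 = m[2] * x + m[5] * y + m[8];
    /* output(x, y) samples input(x0/z0, y0/z0); the kernel divides x and y
     * by z separately rather than multiplying by 1/z, to match the VX
     * reference implementation. */
    *src_x = x0 / z0;
    *src_y = y0 / z0;
}

As a quick sanity check, with the identity matrix (m[0] = m[4] = m[8] = 1, all other entries 0) this reduces to src_x = x, src_y = y.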
diff --git a/kernels/tiling/CMakeLists.txt b/kernels/tiling/CMakeLists.txt new file mode 100644 index 0000000..66af1b7 --- /dev/null +++ b/kernels/tiling/CMakeLists.txt @@ -0,0 +1,40 @@ +# + +# Copyright (c) 2011-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# set target name +set( TARGET_NAME openvx-tiling_chaining-lib ) + +include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/utils + ${CMAKE_SOURCE_DIR}/debug ) + +FIND_SOURCES() + +# add a target named ${TARGET_NAME} +add_library (${TARGET_NAME} ${SOURCE_FILES}) + +target_link_libraries( ${TARGET_NAME} openvx ) + +install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION bin + LIBRARY DESTINATION bin ) + +set_target_properties( ${TARGET_NAME} PROPERTIES FOLDER ${KERNELS_FOLDER} ) diff --git a/kernels/tiling/tiling.h b/kernels/tiling/tiling.h new file mode 100644 index 0000000..3537538 --- /dev/null +++ b/kernels/tiling/tiling.h @@ -0,0 +1,123 @@ +/* + +* Copyright (c) 2011-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +void box3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void box3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Phase_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Phase_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void And_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void And_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Or_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Or_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Xor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Xor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Not_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Not_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Threshold_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Threshold_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ConvertColor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ConvertColor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Multiply_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Multiply_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void NonLinearFilter_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void NonLinearFilter_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Magnitude_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Magnitude_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Erode3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Erode3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Dilate3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Dilate3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Median3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Median3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Sobel3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Sobel3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Max_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Max_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Min_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Min_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Gaussian3x3_image_tiling_fast(void * 
parameters[], void * tile_memory, vx_size tile_memory_size); +void Gaussian3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Addition_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Addition_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Subtraction_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Subtraction_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ConvertDepth_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ConvertDepth_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WarpAffine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WarpAffine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WarpPerspective_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WarpPerspective_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WeightedAverage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WeightedAverage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void AbsDiff_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void AbsDiff_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void IntegralImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void IntegralImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Convolve_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Convolve_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void HogFeatures_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void HogFeatures_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Fast9Corners_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Fast9Corners_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void LBP_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void LBP_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ScaleImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ScaleImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void TableLookup_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void TableLookup_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ChannelCombine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ChannelCombine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void NonMaxSuppression_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void NonMaxSuppression_image_tiling_flexible(void * parameters[], void * 
tile_memory, vx_size tile_memory_size); + +void HogCells_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void HogCells_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); diff --git a/kernels/tiling/tiling_absdiff.c b/kernels/tiling/tiling_absdiff.c new file mode 100644 index 0000000..6a28534 --- /dev/null +++ b/kernels/tiling/tiling_absdiff.c @@ -0,0 +1,193 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +void AbsDiff_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (in_1->image.format) { + case VX_DF_IMAGE_U8: + { + for (y = low_height; y < height; y++) { + const vx_uint8* src1R = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y; + const vx_uint8* src2R = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y; + vx_uint8* dstR = (vx_uint8 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y;; + for (x = 0; x < out->tile_block.width; x+=16) { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vDiff = vabdq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vDiff); + src2R += 16* in_1->addr[0].stride_x; + src1R += 16* in_2->addr[0].stride_x; + dstR += 16* out->addr[0].stride_x; + } + } + } + break; + + case VX_DF_IMAGE_S16: + { + uint16x8_t vMaxs16 = vdupq_n_u16(0x7FFF); + for (y = low_height; y < height; y++) { + const vx_int16* src1R = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y /2;// + x * in_1->addr[0].stride_x / 2; + const vx_int16* src2R = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y /2;// + x * in_2->addr[0].stride_x / 2; + vx_int16* dstR = (vx_int16 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y /2;// + x * in_1->addr[0].stride_x / 2; + if (out->image.format == VX_DF_IMAGE_S16) { + for (x = 0; x < out->tile_block.width; x+=8) { + int16x8_t vSrc1R = vld1q_s16(src1R); + int16x8_t vSrc2R = vld1q_s16(src2R); + uint16x8_t vDiff = (uint16x8_t)vabdq_s16(vSrc1R, vSrc2R); + vDiff = vminq_u16(vDiff, vMaxs16); + vst1q_s16(dstR, (int16x8_t)vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + }else if (out->image.format == VX_DF_IMAGE_U16) { + for (x = 0; x < out->tile_block.width; x+=8) { + int16x8_t vSrc1R = vld1q_s16(src1R); + int16x8_t vSrc2R = vld1q_s16(src2R); + uint16x8_t vDiff = vabdq_u16((uint16x8_t)vSrc1R, (uint16x8_t)vSrc2R); + vst1q_u16((vx_uint16 *)dstR, vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + } + } + 
} + break; + + case VX_DF_IMAGE_U16: + { + for (y = low_height; y < height; y++) { + const vx_uint16* src1R = (vx_uint16 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y / 2; + const vx_uint16* src2R = (vx_uint16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y / 2; + vx_uint16* dstR = (vx_uint16 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y / 2; + for (x = 0; x < out->tile_block.width; x+=8) { + uint16x8_t vSrc1R = vld1q_u16(src1R); + uint16x8_t vSrc2R = vld1q_u16(src2R); + uint16x8_t vDiff = vabdq_u16(vSrc1R, vSrc2R); + vst1q_u16(dstR, vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + } + } + break; + + default: + break; + } +} + +#define ABSDIFF_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + switch (in_1->image.format)\ + {\ + case VX_DF_IMAGE_U8:\ + {\ + for (y = low_y; y < high_y; y++) {\ + vx_uint8* src1R = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y;\ + vx_uint8* src2R = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->addr[0].stride_y;\ + vx_uint8* dstR = (vx_uint8 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y;\ + for (x = low_x; x < high_x; x++) \ + {\ + vx_int16 tmp = (*src1R) - (*src2R);\ + *dstR = (vx_uint8)(tmp < 0 ? (-tmp) : tmp); \ + src1R++;\ + src2R++;\ + dstR++;\ + }\ + }\ + }\ + break;\ + default:\ + for (y = low_y; y < high_y; y++)\ + {\ + for (x = low_x; x < high_x; x++)\ + {\ + if (in_1->image.format == VX_DF_IMAGE_S16)\ + {\ + vx_int16 *src[2] = \ + {\ + (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y /2 + x * in_1->addr[0].stride_x / 2,\ + (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->addr[0].stride_y /2 + x * in_2->addr[0].stride_x / 2,\ + };\ + if (out->image.format == VX_DF_IMAGE_S16)\ + {\ + vx_int16 *dst = (vx_int16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y / 2 + x * out->addr[0].stride_x / 2;\ + vx_uint32 val;\ + if (*src[0] > *src[1])\ + val = *src[0] - *src[1];\ + else\ + val = *src[1] - *src[0];\ + *dst = (vx_int16)((val > 32767) ? 
32767 : val);\ + }\ + else if (out->image.format == VX_DF_IMAGE_U16) {\ + vx_uint16 *dst = (vx_uint16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y / 2+ x * out->addr[0].stride_x /2;\ + if (*src[0] > *src[1])\ + *dst = *src[0] - *src[1];\ + else\ + *dst = *src[1] - *src[0];\ + }\ + }\ + else if (in_1->image.format == VX_DF_IMAGE_U16)\ + {\ + vx_uint16 *src[2] = \ + {\ + (vx_uint16 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y / 2 + x * in_1->addr[0].stride_x / 2,\ + (vx_uint16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y / 2 + x * in_2->addr[0].stride_x / 2,\ + };\ + vx_uint16 *dst = (vx_uint16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y + x * out->addr[0].stride_x;\ + if (*src[0] > *src[1])\ + *dst = *src[0] - *src[1];\ + else\ + *dst = *src[1] - *src[0];\ + }\ + }\ + }\ + break;\ + }\ + + +void AbsDiff_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + ABSDIFF_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ABSDIFF_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + ABSDIFF_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + diff --git a/kernels/tiling/tiling_addsub.c b/kernels/tiling/tiling_addsub.c new file mode 100644 index 0000000..4d48c52 --- /dev/null +++ b/kernels/tiling/tiling_addsub.c @@ -0,0 +1,439 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +void Addition_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + } + int32x4_t unscaled_unconverted_result1 = vaddq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vaddq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 int_typed_result; + if(i == 0) + int_typed_result = tmp0; + else if(i == 1) + int_typed_result = tmp1; + else if(i == 2) + int_typed_result = tmp2; + else if(i == 3) + int_typed_result = tmp3; + else if(i == 4) + int_typed_result = tmp4; + else if(i == 5) + int_typed_result = tmp5; + 
else if(i == 6) + int_typed_result = tmp6; + else if(i == 7) + int_typed_result = tmp7; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +#define ADD_SUB_FLEXIBLE(low_y, low_x, high_y, high_x, opmode, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 src0 = in_1->image.format == VX_DF_IMAGE_U8 ? *src0p : *src0p_16; \ + vx_int32 src1 = in_2->image.format == VX_DF_IMAGE_U8 ? *src1p : *src1p_16; \ + src0p++; \ + src1p++; \ + src0p_16++; \ + src1p_16++; \ + vx_int32 int_typed_result; \ + if(opmode == 0) \ + { \ + int_typed_result = src0 + src1; \ + } \ + else \ + { \ + int_typed_result = src0 - src1; \ + } \ + vx_int32 final_result_value; \ + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + if (int_typed_result > UINT8_MAX) \ + final_result_value = UINT8_MAX; \ + else if (int_typed_result < 0) \ + final_result_value = 0; \ + else \ + final_result_value = int_typed_result; \ + } \ + else \ + { \ + if (int_typed_result > INT16_MAX) \ + final_result_value = INT16_MAX; \ + else if (int_typed_result < INT16_MIN) \ + final_result_value = INT16_MIN; \ + else \ + final_result_value = int_typed_result; \ + } \ + } \ + else \ + { \ + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? 
\ + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; \ + } \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + *dstp = (vx_uint8)final_result_value; \ + dstp++; \ + } \ + else \ + { \ + *dstp_16 = (vx_int16)final_result_value; \ + dstp_16++; \ + } \ + } \ + } \ + +void Addition_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 op_mode = 0; + if (ty == 0 && tx == 0) + { + ADD_SUB_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ADD_SUB_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + ADD_SUB_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, 0, 0, 0) + } +} + +void Subtraction_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + 
} + int32x4_t unscaled_unconverted_result1 = vsubq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vsubq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 int_typed_result; + if(i == 0) + int_typed_result = tmp0; + else if(i == 1) + int_typed_result = tmp1; + else if(i == 2) + int_typed_result = tmp2; + else if(i == 3) + int_typed_result = tmp3; + else if(i == 4) + int_typed_result = tmp4; + else if(i == 5) + int_typed_result = tmp5; + else if(i == 6) + int_typed_result = tmp6; + else if(i == 7) + int_typed_result = tmp7; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +void Subtraction_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 op_mode = 1; + if (ty == 0 && tx == 0) + { + ADD_SUB_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ADD_SUB_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + ADD_SUB_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_bitwise.c b/kernels/tiling/tiling_bitwise.c new file mode 100644 index 0000000..5c85792 --- /dev/null +++ b/kernels/tiling/tiling_bitwise.c @@ -0,0 +1,377 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +void And_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vAnd = vandq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vAnd); + + src2R += 16; + src1R += 16; + dstR += 16; + } + } + +} +void And_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Or_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vOr = vorrq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vOr); + + src2R += 16; + src1R += 16; + dstR += 
16; + } + } + +} +void Or_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Xor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vXor = veorq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vXor); + + src2R += 16; + src1R += 16; + dstR += 16; + } + } + +} +void Xor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* 
src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Not_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrcR = vld1q_u8(srcR); + uint8x16_t vNot = vmvnq_u8(vSrcR); + vst1q_u8(dstR, vNot); + + srcR += 16; + dstR += 16; + } + } + +} +void Not_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src = in->base[0]; + dst = out->base[0]; + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + } +} diff --git a/kernels/tiling/tiling_channel.c b/kernels/tiling/tiling_channel.c new file mode 100644 index 0000000..ff43294 --- /dev/null +++ b/kernels/tiling/tiling_channel.c @@ -0,0 +1,386 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void ChannelCombine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, p; + vx_tile_t *in[4]; + in[0] = (vx_tile_t *)parameters[0]; + in[1] = (vx_tile_t *)parameters[1]; + in[2] = (vx_tile_t *)parameters[2]; + in[3] = (vx_tile_t *)parameters[3]; + vx_tile_t *out = (vx_tile_t *)parameters[4]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + void *base_src_ptrs[4] = { NULL }; + void *base_dst_ptr[4] = { NULL }; + + base_src_ptrs[0] = in[0]->base[0]; + base_src_ptrs[1] = in[1]->base[0]; + base_src_ptrs[2] = in[2]->base[0]; + base_src_ptrs[3] = in[3]->base[0]; + + base_dst_ptr[0] = out->base[0]; + base_dst_ptr[1] = out->base[1]; + base_dst_ptr[2] = out->base[2]; + base_dst_ptr[3] = out->base[3]; + + vx_df_image format; + + format = out->image.format; + + vx_uint8 *planes[4]; + + if (format == VX_DF_IMAGE_RGB) + { + vx_uint8 *ptr0, *ptr1, *ptr2, *pout; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + ptr0 = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + ptr1 = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; + ptr2 = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; + pout = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16x3_t pixels = {{vld1q_u8(ptr0 + x * in[0]->addr->stride_x), + vld1q_u8(ptr1 + x * in[1]->addr->stride_x), + vld1q_u8(ptr2 + x * in[2]->addr->stride_x)}}; + + vst3q_u8(pout + x * out->addr->stride_x, pixels); + } + } + } + else if (format == VX_DF_IMAGE_RGBX) + { + vx_uint8 *ptr0, *ptr1, *ptr2, *ptr3, *pout; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + ptr0 = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + ptr1 = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; + ptr2 = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; + ptr3 = (vx_uint8 *)base_src_ptrs[3] + y * in[3]->addr->stride_y; + pout = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16x4_t pixels = {{vld1q_u8(ptr0 + x * in[0]->addr->stride_x), + vld1q_u8(ptr1 + x * in[1]->addr->stride_x), + vld1q_u8(ptr2 + x * in[2]->addr->stride_x), + vld1q_u8(ptr3 + x * in[3]->addr->stride_x)}}; + + vst4q_u8(pout + x * out->addr->stride_x, pixels); + } + } + } + else if ((format == VX_DF_IMAGE_YUV4) || (format == VX_DF_IMAGE_IYUV)) + { + vx_uint8 *ptr_in, *ptr_out; + vx_uint32 wCnt = ((high_x >> 1) >> 3) << 3; + for (p = 0; p < 3; p++) + { + if (1 == out->addr[p].step_y) + { + for (y = low_y; y < high_y; y += out->addr[p].step_y) + { + ptr_in = (vx_uint8 *)base_src_ptrs[p] + y * in[p]->addr->stride_y; + ptr_out = (vx_uint8 *)base_dst_ptr[p] + y * out->addr[p].stride_y; + + for (x = low_x; x < high_x; x += 16) + { + uint8x16_t pixels = vld1q_u8(ptr_in + x * in[p]->addr->stride_x); + vst1q_u8(ptr_out + x * out->addr[p].stride_x, pixels); + } + } + } + else + { + for (y = low_y; y < high_y; y += out->addr[p].step_y) + { + ptr_in = (vx_uint8 *)base_src_ptrs[p] + ((y * in[p]->addr->step_y / out->addr[p].step_y) * + in[p]->addr->scale_y / VX_SCALE_UNITY) * in[p]->addr->stride_y; + ptr_out = (vx_uint8 *)base_dst_ptr[p] + (y * out->addr[p].scale_y / VX_SCALE_UNITY) * out->addr[p].stride_y; + + for (x = low_x; x < wCnt; x += 8) + { + uint8x8_t pixels = vld1_u8(ptr_in + x * in[p]->addr->stride_x); + 
vst1_u8(ptr_out + x * out->addr[p].stride_x, pixels); + } + } + } + } + } + else if ((format == VX_DF_IMAGE_NV12) || (format == VX_DF_IMAGE_NV21)) + { + int vidx = (format == VX_DF_IMAGE_NV12) ? 1 : 0; + + //plane 0 + { + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr[0].stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16_t pixels = vld1q_u8(ptr_src + x * in[0]->addr->stride_x); + vst1q_u8(ptr_dst + x * out->addr[0].stride_x, pixels); + } + } + } + + // plane 1 + { + vx_uint32 wCnt = ((high_x >> 1) >> 3) << 3; + for (y = low_y; y < high_y; y += out->addr[1].step_y) + { + vx_uint8 *ptr_src0 = (vx_uint8 *)base_src_ptrs[1] + in[1]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr[1].step_y) * in[1]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src1 = (vx_uint8 *)base_src_ptrs[2] + in[2]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr[1].step_y) * in[2]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[1] + out->addr[1].stride_y * (y *out->addr[1].scale_y / VX_SCALE_UNITY); + for (x = low_x; x < wCnt; x += 8) + { + uint8x8x2_t pixels; + pixels.val[1-vidx] = vld1_u8(ptr_src0 + x * in[1]->addr->stride_x); + pixels.val[vidx] = vld1_u8(ptr_src1 + x * in[2]->addr->stride_x); + vst2_u8(ptr_dst + x * out->addr[1].stride_x, pixels); + } + } + } + } + else if ((format == VX_DF_IMAGE_YUYV) || (format == VX_DF_IMAGE_UYVY)) + { + int yidx = (format == VX_DF_IMAGE_UYVY) ? 1 : 0; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + vx_uint8 *ptr_src0 = (vx_uint8 *)base_src_ptrs[0] + in[0]->addr->stride_y * + ((y * in[0]->addr->step_y / out->addr->step_y) * in[0]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src1 = (vx_uint8 *)base_src_ptrs[1] + in[1]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr->step_y) * in[1]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src2 = (vx_uint8 *)base_src_ptrs[2] + in[2]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr->step_y) * in[2]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * y; + for (x = low_x; x < high_x; x += 16) + { + uint8x8x2_t pixels_y = vld2_u8(ptr_src0 + x * in[0]->addr->stride_x); + uint8x8x2_t pixels_uv = {{vld1_u8(ptr_src1 + (x >> 1) * in[1]->addr->stride_x), + vld1_u8(ptr_src2 + (x >> 1) * in[2]->addr->stride_x)}}; + uint8x8x4_t pixels; + pixels.val[0 + yidx] = pixels_y.val[0]; + pixels.val[1 - yidx] = pixels_uv.val[0]; + pixels.val[2 + yidx] = pixels_y.val[1]; + pixels.val[3 - yidx] = pixels_uv.val[1]; + + vst4_u8(ptr_dst + x * out->addr[0].stride_x, pixels); + } + } + } +} + +#define RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr->step_y) \ + { \ + planes[0] = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; \ + planes[1] = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; \ + planes[2] = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x += out->addr->step_x) \ + { \ + dst[0] = planes[0][0]; \ + dst[1] = planes[1][0]; \ + dst[2] = planes[2][0]; \ + if (format == VX_DF_IMAGE_RGBX) \ + { \ + planes[3] = (vx_uint8 *)base_src_ptrs[3] + y * in[3]->addr->stride_y + x * in[3]->addr->stride_x; \ + dst[3] = planes[3][0]; \ + } \ + planes[0] += out->addr->step_x * in[0]->addr->stride_x; \ + 
planes[1] += out->addr->step_x * in[1]->addr->stride_x; \ + planes[2] += out->addr->step_x * in[2]->addr->stride_x; \ + dst += out->addr->step_x * out->addr->stride_x; \ + } \ + } + + +#define YUV4(low_y, high_y, low_x) \ + for (p = 0; p < 3; p++) \ + { \ + for (y = low_y; y < high_y; y += out->addr[p].step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr[p].step_x) \ + { \ + vx_uint32 x1 = x * in[p]->addr->step_x / out->addr[p].step_x; \ + vx_uint32 y1 = y * in[p]->addr->step_y / out->addr[p].step_y; \ + vx_uint8 *src = (vx_uint8 *)base_src_ptrs[p] + y1 * in[p]->addr->stride_y + x1 * in[p]->addr->stride_x; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[p] + out->addr[p].stride_y * (out->addr[p].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[p].stride_x * (out->addr[p].scale_x * x) / VX_SCALE_UNITY; \ + *dst = *src; \ + } \ + } \ + } + + +#define NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr[0].step_y) \ + { \ + vx_uint8 *src = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr[0].stride_y; \ + for (x = low_x; x < high_x; x += out->addr[0].step_x) \ + { \ + *dst = *src; \ + \ + src += out->addr[0].step_x * in[0]->addr->stride_x; \ + dst += out->addr[0].step_x * out->addr[0].stride_x; \ + } \ + } \ + \ + for (y = low_y; y < high_y; y += out->addr[1].step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr[1].step_x) \ + { \ + vx_uint32 x1 = x * in[1]->addr->step_x / out->addr[1].step_x; \ + vx_uint32 y1 = y * in[1]->addr->step_y / out->addr[1].step_y; \ + vx_uint8 *src0 = (vx_uint8 *)base_src_ptrs[1] + y1 * in[1]->addr->stride_y + x1 * in[1]->addr->stride_x; \ + vx_uint8 *src1 = (vx_uint8 *)base_src_ptrs[2] + y1 * in[2]->addr->stride_y + x1 * in[2]->addr->stride_x; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[1] + out->addr[1].stride_y * (out->addr[1].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[1].stride_x * (out->addr[1].scale_x * x) / VX_SCALE_UNITY; \ + dst[1 - vidx] = *src0; \ + dst[vidx] = *src1; \ + } \ + } + + +#define YUYV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr->step_x * 2) \ + { \ + vx_uint32 x1 = x * in[0]->addr->step_x / out->addr->step_x; \ + vx_uint32 y1 = y * in[0]->addr->step_y / out->addr->step_y; \ + vx_uint32 x2 = x * in[1]->addr->step_x / (out->addr->step_x * 2); \ + vx_uint32 y2 = y * in[1]->addr->step_y / out->addr->step_y; \ + vx_uint8 *srcy0 = (vx_uint8 *)base_src_ptrs[0] + y1 * in[0]->addr->stride_y + x1 * in[0]->addr->stride_x; \ + vx_uint8 *srcy1 = (vx_uint8 *)base_src_ptrs[0] + y1 * in[0]->addr->stride_y + \ + (x1 + in[0]->addr->step_x) * in[0]->addr->stride_x; \ + vx_uint8 *srcu = (vx_uint8 *)base_src_ptrs[1] + y2 * in[1]->addr->stride_y + x2 * in[1]->addr->stride_x; \ + vx_uint8 *srcv = (vx_uint8 *)base_src_ptrs[2] + y2 * in[2]->addr->stride_y + x2 * in[2]->addr->stride_x; \ + vx_uint8 *dst0 = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * (out->addr[0].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[0].stride_x * (out->addr[0].scale_x * x) / VX_SCALE_UNITY; \ + vx_uint8 *dst1 = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * (out->addr[0].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[0].stride_x * (out->addr[0].scale_x * (x + out->addr[0].step_x)) / VX_SCALE_UNITY; \ + \ + dst0[yidx] = *srcy0; \ + dst1[yidx] = *srcy1; \ + dst0[1 - yidx] = *srcu; \ + dst1[1 - yidx] = *srcv; \ + } \ + } + + +void ChannelCombine_image_tiling_flexible(void * parameters[], 
void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, p; + vx_tile_t *in[4]; + in[0] = (vx_tile_t *)parameters[0]; + in[1] = (vx_tile_t *)parameters[1]; + in[2] = (vx_tile_t *)parameters[2]; + in[3] = (vx_tile_t *)parameters[3]; + vx_tile_t *out = (vx_tile_t *)parameters[4]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + void *base_src_ptrs[4] = { NULL }; + void *base_dst_ptr[4] = { NULL }; + + base_src_ptrs[0] = in[0]->base[0]; + base_src_ptrs[1] = in[1]->base[0]; + base_src_ptrs[2] = in[2]->base[0]; + base_src_ptrs[3] = in[3]->base[0]; + + base_dst_ptr[0] = out->base[0]; + base_dst_ptr[1] = out->base[1]; + base_dst_ptr[2] = out->base[2]; + base_dst_ptr[3] = out->base[3]; + + vx_df_image format; + + format = out->image.format; + + vx_uint8 *planes[4]; + + if ((format == VX_DF_IMAGE_RGB) || (format == VX_DF_IMAGE_RGBX)) + { + if (low_y == 0 && low_x == 0) + { + RGB(low_y, high_y, low_x) + } + else + { + RGB(0, low_y, low_x) + RGB(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_YUV4) || (format == VX_DF_IMAGE_IYUV)) + { + if (low_y == 0 && low_x == 0) + { + YUV4(low_y, high_y, low_x) + } + else + { + YUV4(0, low_y, low_x) + YUV4(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_NV12) || (format == VX_DF_IMAGE_NV21)) + { + int vidx = (format == VX_DF_IMAGE_NV12) ? 1 : 0; + if (low_y == 0 && low_x == 0) + { + NV12(low_y, high_y, low_x) + } + else + { + NV12(0, low_y, low_x) + NV12(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_YUYV) || (format == VX_DF_IMAGE_UYVY)) + { + int yidx = (format == VX_DF_IMAGE_UYVY) ? 1 : 0; + + if (low_y == 0 && low_x == 0) + { + YUYV(low_y, high_y, low_x) + } + else + { + YUYV(0, low_y, low_x) + YUYV(low_y, high_y, 0) + } + } +} diff --git a/kernels/tiling/tiling_convertcolor.c b/kernels/tiling/tiling_convertcolor.c new file mode 100644 index 0000000..a82b53e --- /dev/null +++ b/kernels/tiling/tiling_convertcolor.c @@ -0,0 +1,2088 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +static vx_uint8 usat8(vx_int32 a) +{ + if (a > 255) + a = 255; + if (a < 0) + a = 0; + return (vx_uint8)a; +} + +static void yuv2rgb_bt601(vx_uint8 y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 *r, vx_uint8 *g, vx_uint8 *b) +{ + /* + R'= Y' + 0.000*U' + 1.403*V' + G'= Y' - 0.344*U' - 0.714*V' + B'= Y' + 1.773*U' + 0.000*V' + */ + vx_float64 f_y = (vx_float64)y; + vx_float64 f_u = (vx_float64)cb - 128; + vx_float64 f_v = (vx_float64)cr - 128; + vx_float64 f_r = f_y + 0.000f*f_u + 1.403f*f_v; + vx_float64 f_g = f_y - 0.344f*f_u - 0.714f*f_v; + vx_float64 f_b = f_y + 1.773f*f_u + 0.000f*f_v; + vx_int32 i_r = (vx_int32)f_r; + vx_int32 i_g = (vx_int32)f_g; + vx_int32 i_b = (vx_int32)f_b; + *r = usat8(i_r); + *g = usat8(i_g); + *b = usat8(i_b); +} + +static void yuv2rgb_bt709(vx_uint8 y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 *r, vx_uint8 *g, vx_uint8 *b) +{ + /* + R'= Y' + 0.0000*U + 1.5748*V + G'= Y' - 0.1873*U - 0.4681*V + B'= Y' + 1.8556*U + 0.0000*V + */ + vx_float64 f_y = (vx_float64)y; + vx_float64 f_u = (vx_float64)cb - 128; + vx_float64 f_v = (vx_float64)cr - 128; + vx_float64 f_r = f_y + 0.0000f*f_u + 1.5748f*f_v; + vx_float64 f_g = f_y - 0.1873f*f_u - 0.4681f*f_v; + vx_float64 f_b = f_y + 1.8556f*f_u + 0.0000f*f_v; + vx_int32 i_r = (vx_int32)f_r; + vx_int32 i_g = (vx_int32)f_g; + vx_int32 i_b = (vx_int32)f_b; + *r = usat8(i_r); + *g = usat8(i_g); + *b = usat8(i_b); +} + + +static void rgb2yuv_bt709(vx_uint8 r, vx_uint8 g, vx_uint8 b, + vx_uint8 *y, vx_uint8 *cb, vx_uint8 *cr) +{ + /* + Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' + U'=-0.1146*R' - 0.3854*G' + 0.5000*B' + V'= 0.5000*R' - 0.4542*G' - 0.0458*B' + */ + vx_float64 f_r = (vx_float64)r; + vx_float64 f_g = (vx_float64)g; + vx_float64 f_b = (vx_float64)b; + vx_float64 f_y = 0 + 0.2126f*f_r + 0.7152f*f_g + 0.0722f*f_b; + vx_float64 f_u = 0 - 0.1146f*f_r - 0.3854f*f_g + 0.5000f*f_b; + vx_float64 f_v = 0 + 0.5000f*f_r - 0.4542f*f_g - 0.0458f*f_b; + vx_int32 i_y = (vx_int32)f_y; + vx_int32 i_u = (vx_int32)f_u + 128; + vx_int32 i_v = (vx_int32)f_v + 128; + *y = usat8(i_y); + *cb = usat8(i_u); + *cr = usat8(i_v); +} + +static void yuv2yuv_601to709(vx_uint8 y0, vx_uint8 cb0, vx_uint8 cr0, + vx_uint8 *y1, vx_uint8 *cb1, vx_uint8 *cr1) +{ + /* + Y' = 1.0090*Y - 0.11826430*Cb - 0.2000311*Cr + Cb'= 0.0000*Y + 1.01911200*Cb + 0.1146035*Cr + Cr'= 0.0001*Y + 0.07534570*Cb + 1.0290932*Cr + */ + vx_float64 f_y0 = (vx_float64)y0; + vx_float64 f_cb0 = (vx_float64)cb0; + vx_float64 f_cr0 = (vx_float64)cr0; + vx_float64 f_y1 = 1.0090*f_y0 - 0.11826430*f_cb0 - 0.2000311*f_cr0; + vx_float64 f_cb1 = 0.0000*f_y0 + 1.01911200*f_cb0 + 0.1146035*f_cr0; + vx_float64 f_cr1 = 0.0001*f_y0 + 0.07534570*f_cb0 + 1.0290932*f_cr0; + vx_int32 i_y = (vx_int32)f_y1; + vx_int32 i_cb = (vx_int32)f_cb1; + vx_int32 i_cr = (vx_int32)f_cr1; + *y1 = usat8(i_y); + *cb1 = usat8(i_cb); + *cr1 = usat8(i_cr); +} + +static void rgb2yuv_bt709_neon(vx_float32 *arrfr, vx_float32 *arrfg, vx_float32 *arrfb, + vx_uint8 **y, vx_uint8 *cb, vx_uint8 *cr) +{ + /* + Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' + U'=-0.1146*R' - 0.3854*G' + 0.5000*B' + V'= 0.5000*R' - 0.4542*G' - 0.0458*B' + */ + + float32x4_t fr32x4 = vld1q_f32(arrfr); + float32x4_t fg32x4 = vld1q_f32(arrfg); + float32x4_t fb32x4 = vld1q_f32(arrfb); + + float32x4_t fy32x4 = vdupq_n_f32(0.0f); + fy32x4 = vmlaq_n_f32(fy32x4, fr32x4, 0.2126f); + fy32x4 = vmlaq_n_f32(fy32x4, fg32x4, 0.7152f); + fy32x4 = vmlaq_n_f32(fy32x4, fb32x4, 0.0722f); + + float32x4_t fu32x4 = vdupq_n_f32(0.0f); + fu32x4 = vmlaq_n_f32(fu32x4, 
fr32x4, -0.1146f); + fu32x4 = vmlaq_n_f32(fu32x4, fg32x4, -0.3854f); + fu32x4 = vmlaq_n_f32(fu32x4, fb32x4, 0.5000f); + + float32x4_t fv32x4 = vdupq_n_f32(0.0f); + fv32x4 = vmlaq_n_f32(fv32x4, fr32x4, 0.5000f); + fv32x4 = vmlaq_n_f32(fv32x4, fg32x4, -0.4542f); + fv32x4 = vmlaq_n_f32(fv32x4, fb32x4, -0.0458f); + + int32x4_t iy32x4 = vcvtq_s32_f32(fy32x4); + + int32x4_t icoeff32x4 = vdupq_n_s32(128); + int32x4_t iu32x4 = vcvtq_s32_f32(fu32x4); + iu32x4 = vaddq_s32(iu32x4, icoeff32x4); + + int32x4_t iv32x4 = vcvtq_s32_f32(fv32x4); + iv32x4 = vaddq_s32(iv32x4, icoeff32x4); + + int16x4_t vqmovn_s32 (int32x4_t __a); + uint16x4_t vreinterpret_u16_s16 (int16x4_t __a); + uint8x8_t vqmovn_u16 (uint16x8_t __a); + + y[0][0] = usat8(vgetq_lane_s32(iy32x4, 0)); + y[1][0] = usat8(vgetq_lane_s32(iy32x4, 1)); + y[2][0] = usat8(vgetq_lane_s32(iy32x4, 2)); + y[3][0] = usat8(vgetq_lane_s32(iy32x4, 3)); + + + cb[0] = usat8(vgetq_lane_s32(iu32x4, 0)); + cb[1] = usat8(vgetq_lane_s32(iu32x4, 1)); + cb[2] = usat8(vgetq_lane_s32(iu32x4, 2)); + cb[3] = usat8(vgetq_lane_s32(iu32x4, 3)); + + cr[0] = usat8(vgetq_lane_s32(iv32x4, 0)); + cr[1] = usat8(vgetq_lane_s32(iv32x4, 1)); + cr[2] = usat8(vgetq_lane_s32(iv32x4, 2)); + cr[3] = usat8(vgetq_lane_s32(iv32x4, 3)); +} + +static void yuv2rgb_bt601_neon(vx_uint8 **y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 **r, vx_uint8 **g, vx_uint8 **b) +{ + /* + R'= Y' + 0.000*U' + 1.403*V' + G'= Y' - 0.344*U' - 0.714*V' + B'= Y' + 1.773*U' + 0.000*V' + */ + vx_float32 fy[4] = { (vx_float32)y[0][0], (vx_float32)y[1][0], (vx_float32)y[2][0], (vx_float32)y[3][0] }; + vx_float32 fu[4] = { (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128 }; + vx_float32 fv[4] = { (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128 }; + + float32x4_t fy32x4 = vld1q_f32(fy); + float32x4_t fu32x4 = vld1q_f32(fu); + float32x4_t fv32x4 = vld1q_f32(fv); + + float32x4_t fr32x4 = vdupq_n_f32(0.0f); + fr32x4 = vaddq_f32(fr32x4, fy32x4); + fr32x4 = vmlaq_n_f32(fr32x4, fu32x4, 0.000f); + fr32x4 = vmlaq_n_f32(fr32x4, fv32x4, 1.403f); + + float32x4_t fg32x4 = vdupq_n_f32(0.0f); + fg32x4 = vaddq_f32(fg32x4, fy32x4); + fg32x4 = vmlaq_n_f32(fg32x4, fu32x4, -0.344f); + fg32x4 = vmlaq_n_f32(fg32x4, fv32x4, -0.714f); + + float32x4_t fb32x4 = vdupq_n_f32(0.0f); + fb32x4 = vaddq_f32(fb32x4, fy32x4); + fb32x4 = vmlaq_n_f32(fb32x4, fu32x4, 1.773f); + fb32x4 = vmlaq_n_f32(fb32x4, fv32x4, 0.000f); + + int32x4_t ir32x4 = vcvtq_s32_f32(fr32x4); + int32x4_t ig32x4 = vcvtq_s32_f32(fg32x4); + int32x4_t ib32x4 = vcvtq_s32_f32(fb32x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, ir32x4); + vst1q_s32(arr32+4, ig32x4); + vst1q_s32(arr32+8, ib32x4); + + for (vx_uint8 i = 0; i < 4; i++) + { + r[i][0] = usat8(arr32[i]); + g[i][1] = usat8(arr32[4 + i]); + b[i][2] = usat8(arr32[8 + i]); + } +} + +static void yuv2rgb_bt709_neon(vx_uint8 **y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 **r, vx_uint8 **g, vx_uint8 **b) +{ + /* + R'= Y' + 0.0000*U + 1.5748*V + G'= Y' - 0.1873*U - 0.4681*V + B'= Y' + 1.8556*U + 0.0000*V + */ + vx_float32 fy[4] = { (vx_float32)y[0][0], (vx_float32)y[1][0], (vx_float32)y[2][0], (vx_float32)y[3][0] }; + vx_float32 fu[4] = { (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128 }; + vx_float32 fv[4] = { (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128 }; + + float32x4_t fy32x4 = vld1q_f32(fy); + float32x4_t fu32x4 = vld1q_f32(fu); + float32x4_t fv32x4 = vld1q_f32(fv); + + float32x4_t fr32x4 
= vdupq_n_f32(0.0f); + fr32x4 = vaddq_f32(fr32x4, fy32x4); + fr32x4 = vmlaq_n_f32(fr32x4, fu32x4, 0.000f); + fr32x4 = vmlaq_n_f32(fr32x4, fv32x4, 1.5748f); + + float32x4_t fg32x4 = vdupq_n_f32(0.0f); + fg32x4 = vaddq_f32(fg32x4, fy32x4); + fg32x4 = vmlaq_n_f32(fg32x4, fu32x4, -0.1873f); + fg32x4 = vmlaq_n_f32(fg32x4, fv32x4, -0.4681f); + + float32x4_t fb32x4 = vdupq_n_f32(0.0f); + fb32x4 = vaddq_f32(fb32x4, fy32x4); + fb32x4 = vmlaq_n_f32(fb32x4, fu32x4, 1.8556f); + fb32x4 = vmlaq_n_f32(fb32x4, fv32x4, 0.000f); + + int32x4_t ir32x4 = vcvtq_s32_f32(fr32x4); + int32x4_t ig32x4 = vcvtq_s32_f32(fg32x4); + int32x4_t ib32x4 = vcvtq_s32_f32(fb32x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, ir32x4); + vst1q_s32(arr32 + 4, ig32x4); + vst1q_s32(arr32+8, ib32x4); + + for (vx_uint8 i = 0; i < 4; i++) + { + r[i][0] = usat8(arr32[i]); + g[i][1] = usat8(arr32[4 + i]); + b[i][2] = usat8(arr32[8 + i]); + } +} + +static void yuv2yuv_601to709_neon(vx_uint8 *y0, vx_uint8 *cb0, vx_uint8 *cr0, + vx_uint8 *y1, vx_uint8 *cb1, vx_uint8 *cr1) +{ + /* + Y' = 1.0090*Y - 0.11826430*Cb - 0.2000311*Cr + Cb'= 0.0000*Y + 1.01911200*Cb + 0.1146035*Cr + Cr'= 0.0001*Y + 0.07534570*Cb + 1.0290932*Cr + */ + vx_float32 fy0[4] = { (vx_float32)y0[0], (vx_float32)y0[1], (vx_float32)y0[2], (vx_float32)y0[3] }; + vx_float32 fcb0[4] = { (vx_float32)cb0[0], (vx_float32)cb0[1], (vx_float32)cb0[2], (vx_float32)cb0[3] }; + vx_float32 fcr0[4] = { (vx_float32)cr0[0], (vx_float32)cr0[1], (vx_float32)cr0[2], (vx_float32)cr0[3] };; + + float32x4_t fy032x4 = vld1q_f32(fy0); + float32x4_t fcb032x4 = vld1q_f32(fcb0); + float32x4_t fcr032x4 = vld1q_f32(fcr0); + + float32x4_t fy132x4 = vdupq_n_f32(0.0f); + fy132x4 = vmlaq_n_f32(fy132x4, fy032x4, 1.0090); + fy132x4 = vmlaq_n_f32(fy132x4, fcb032x4, -0.11826430); + fy132x4 = vmlaq_n_f32(fy132x4, fcr032x4, -0.2000311); + + float32x4_t fcb132x4 = vdupq_n_f32(0.0f); + fcb132x4 = vmlaq_n_f32(fcb132x4, fy032x4, 0.0000); + fcb132x4 = vmlaq_n_f32(fcb132x4, fcb032x4, 1.01911200); + fcb132x4 = vmlaq_n_f32(fcb132x4, fcr032x4, 0.1146035); + + float32x4_t fcr132x4 = vdupq_n_f32(0.0f); + fcr132x4 = vmlaq_n_f32(fcr132x4, fy032x4, 0.0001); + fcr132x4 = vmlaq_n_f32(fcr132x4, fcb032x4, 0.07534570); + fcr132x4 = vmlaq_n_f32(fcr132x4, fcr032x4, 1.0290932); + + int32x4_t iy32x4 = vcvtq_s32_f32(fy132x4); + int32x4_t icb32x4 = vcvtq_s32_f32(fcb132x4); + int32x4_t icr32x4 = vcvtq_s32_f32(fcr132x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, iy32x4); + vst1q_s32(arr32+4, icb32x4); + vst1q_s32(arr32+8, icr32x4); + + for(vx_uint8 i = 0; i < 4; i++) + { + y1[i] = usat8(arr32[i]); + cb1[2*i] = usat8(arr32[4 + i]); + cr1[2*i+1] = usat8(arr32[8 + i]); + } +} + + +static void yuv2rgb_bt601V(vx_float32* y, vx_float32* cb, vx_float32* cr, + vx_uint8 *rUint8, vx_uint8 *gUint8, vx_uint8 *bUint8) +{ + float32x4_t y32X4Value = vld1q_f32(y); + float32x4_t cb32X4Value = vld1q_f32(cb); + float32x4_t cr32X4Value = vld1q_f32(cr); + float32x4_t All128 = vdupq_n_f32(128.0f); + float32x4_t AllZero = vdupq_n_f32(0.0f); + float32x4_t rFloatValue, gFloatValue, bFloatValue; + int32x4_t rIntValue, gIntValue, bIntValue; + cb32X4Value = vsubq_f32(cb32X4Value,All128); + cr32X4Value = vsubq_f32(cr32X4Value,All128); + + // R'= Y' + 0.000*U' + 1.403*V' + // G'= Y' - 0.344*U' - 0.714*V' + // B'= Y' + 1.773*U' + 0.000*V' + rFloatValue = vmlaq_n_f32(y32X4Value, cr32X4Value, 1.403f); + + gFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, -0.344f); + gFloatValue = vmlaq_n_f32(gFloatValue, cr32X4Value, -0.714f); + + bFloatValue = vmlaq_n_f32(y32X4Value, 
cb32X4Value, 1.773f); + + rIntValue = vcvtq_s32_f32(rFloatValue); + gIntValue = vcvtq_s32_f32(gFloatValue); + bIntValue = vcvtq_s32_f32(bFloatValue); + + rUint8[0] = usat8(vgetq_lane_s32(rIntValue, 0)); + gUint8[0] = usat8(vgetq_lane_s32(gIntValue, 0)); + bUint8[0] = usat8(vgetq_lane_s32(bIntValue, 0)); + + rUint8[1] = usat8(vgetq_lane_s32(rIntValue, 1)); + gUint8[1] = usat8(vgetq_lane_s32(gIntValue, 1)); + bUint8[1] = usat8(vgetq_lane_s32(bIntValue, 1)); + + rUint8[2] = usat8(vgetq_lane_s32(rIntValue, 2)); + gUint8[2] = usat8(vgetq_lane_s32(gIntValue, 2)); + bUint8[2] = usat8(vgetq_lane_s32(bIntValue, 2)); + + rUint8[3] = usat8(vgetq_lane_s32(rIntValue, 3)); + gUint8[3] = usat8(vgetq_lane_s32(gIntValue, 3)); + bUint8[3] = usat8(vgetq_lane_s32(bIntValue, 3)); +} + +static void yuv2rgb_bt709V(vx_float32* y, vx_float32* cb, vx_float32* cr, + vx_uint8 *rUint8, vx_uint8 *gUint8, vx_uint8 *bUint8) +{ + float32x4_t y32X4Value = vld1q_f32(y); + float32x4_t cb32X4Value = vld1q_f32(cb); + float32x4_t cr32X4Value = vld1q_f32(cr); + float32x4_t All128 = vdupq_n_f32(128.0f); + float32x4_t AllZero = vdupq_n_f32(0.0f); + float32x4_t rFloatValue, gFloatValue, bFloatValue; + int32x4_t rIntValue, gIntValue, bIntValue; + cb32X4Value = vsubq_f32(cb32X4Value,All128); + cr32X4Value = vsubq_f32(cr32X4Value,All128); + + // R'= Y' + 0.000*U' + 1.403*V' + // G'= Y' - 0.344*U' - 0.714*V' + // B'= Y' + 1.773*U' + 0.000*V' + rFloatValue = vmlaq_n_f32(y32X4Value, cr32X4Value, 1.5748f); + + gFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, -0.1873f); + gFloatValue = vmlaq_n_f32(gFloatValue, cr32X4Value, -0.4681f); + + bFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, 1.8556f); + + rIntValue = vcvtq_s32_f32(rFloatValue); + gIntValue = vcvtq_s32_f32(gFloatValue); + bIntValue = vcvtq_s32_f32(bFloatValue); + + rUint8[0] = usat8(vgetq_lane_s32(rIntValue, 0)); + gUint8[0] = usat8(vgetq_lane_s32(gIntValue, 0)); + bUint8[0] = usat8(vgetq_lane_s32(bIntValue, 0)); + + rUint8[1] = usat8(vgetq_lane_s32(rIntValue, 1)); + gUint8[1] = usat8(vgetq_lane_s32(gIntValue, 1)); + bUint8[1] = usat8(vgetq_lane_s32(bIntValue, 1)); + + rUint8[2] = usat8(vgetq_lane_s32(rIntValue, 2)); + gUint8[2] = usat8(vgetq_lane_s32(gIntValue, 2)); + bUint8[2] = usat8(vgetq_lane_s32(bIntValue, 2)); + + rUint8[3] = usat8(vgetq_lane_s32(rIntValue, 3)); + gUint8[3] = usat8(vgetq_lane_s32(gIntValue, 3)); + bUint8[3] = usat8(vgetq_lane_s32(bIntValue, 3)); +} + + +void ConvertColor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + void *src_base[4] = {NULL}; + void *dst_base[4] = {NULL}; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 width = vxTileWidth(out, 0); + vx_uint32 height = vxTileHeight(out, 0); + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + src_base[0] = in->base[0]; + dst_base[0] = out->base[0]; + + src_base[1] = in->base[1]; + dst_base[1] = out->base[1]; + + src_base[2] = in->base[2]; + dst_base[2] = out->base[2]; + + vx_uint32 srcP0StrideX = in->addr[0].stride_x; + vx_uint32 srcP0StrideY = in->addr[0].stride_y; + vx_uint32 dstP0StrideX = out->addr[0].stride_x; + vx_uint32 dstP0StrideY = out->addr[0].stride_y; + + vx_uint32 srcP1StrideX = in->addr[1].stride_x; + vx_uint32 srcP1StrideY = in->addr[1].stride_y; + vx_uint32 dstP1StrideX = out->addr[1].stride_x; + vx_uint32 dstP1StrideY 
= out->addr[1].stride_y; + + vx_uint32 srcP2StrideX = in->addr[2].stride_x; + vx_uint32 srcP2StrideY = in->addr[2].stride_y; + vx_uint32 dstP2StrideX = out->addr[2].stride_x; + vx_uint32 dstP2StrideY = out->addr[2].stride_y; + + vx_df_image src_format, dst_format; + + src_format = in->image.format; + dst_format = out->image.format; + + vx_enum src_space = in->image.space; + + if ((src_format == VX_DF_IMAGE_RGB) || (src_format == VX_DF_IMAGE_RGBX)) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8x4_t s = vld4_u8(srcP0); + + uint8x8x3_t d; + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + + vst3_u8(dstP0, d); + } + } + } + else + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8x3_t s = vld3_u8(srcP0); + + uint8x8x4_t d; + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + d.val[3] = vdup_n_u8(255); + + vst4_u8(dstP0, d); + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbcr; + + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + cbcr = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + cbcr[0] = (cb[0] + cb[1] + cb[2] + cb[3]) / 4; + cbcr[1] = (cr[0] + cr[1] + cr[2] + cr[3]) / 4; + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *u[4]; + vx_uint8 *v[4]; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+3) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = 
(vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + u[0] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + u[1] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+1) * dstP1StrideX; + u[2] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+2) * dstP1StrideX; + u[3] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+3) * dstP1StrideX; + + v[0] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + v[1] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+1) * dstP2StrideX; + v[2] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+2) * dstP2StrideX; + v[3] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+3) * dstP2StrideX; + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + *u[0] = cb[0]; + *u[1] = cb[1]; + *u[2] = cb[2]; + *u[3] = cb[3]; + + *v[0] = cr[0]; + *v[1] = cr[1]; + *v[2] = cr[2]; + *v[3] = cr[3]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbp; + vx_uint8 *crp; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + cbp = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + crp = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + cbp[0] = (vx_uint8)(((vx_uint16)cb[0] + (vx_uint16)cb[1] + (vx_uint16)cb[2] + (vx_uint16)cb[3]) >> 2); + crp[0] = (vx_uint8)(((vx_uint16)cr[0] + (vx_uint16)cr[1] + (vx_uint16)cr[2] + (vx_uint16)cr[3]) >> 2); + } + } + } + } + else if (src_format == VX_DF_IMAGE_NV21 || src_format == VX_DF_IMAGE_NV12) + { + int u_pix = src_format == VX_DF_IMAGE_NV12 ? 0 : 1; + int v_pix = src_format == VX_DF_IMAGE_NV12 ? 
1 : 0; + if ((dst_format == VX_DF_IMAGE_RGB) || (dst_format == VX_DF_IMAGE_RGBX)) + { + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *crcb; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + luma[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + luma[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + luma[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + luma[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + + rgb[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + rgb[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + rgb[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + rgb[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb[0][3] = 255; + rgb[1][3] = 255; + rgb[2][3] = 255; + rgb[3][3] = 255; + + } + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601_neon(luma, crcb[u_pix], crcb[v_pix], rgb, rgb, rgb); + } + else + { + yuv2rgb_bt709_neon(luma, crcb[u_pix], crcb[v_pix], rgb, rgb, rgb); + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12 || dst_format == VX_DF_IMAGE_NV21) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x++) + { + vx_uint8 *luma[2] = {(vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX}; + + vx_uint8 *cbcr = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + vx_uint8 *crcb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + + yuv2yuv_601to709(luma[0][0],cbcr[0],cbcr[1],&luma[1][0],&crcb[1],&crcb[0]); + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8_t lumaV8 = vld1_u8(srcP0); + vst1_u8(dstP0, lumaV8); + } + } + + vx_uint8 *crcb = NULL; + vx_uint8 *cb[4] = { NULL }; + vx_uint8 *cr[4] = { NULL }; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + + cb[0] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + cb[1] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+1) * dstP1StrideX; + cb[2] = (vx_uint8 *)dst_base[1] + (y+1) * dstP1StrideY + x * dstP1StrideX; + cb[3] = (vx_uint8 *)dst_base[1] + (y+1) * dstP1StrideY + (x+1) * dstP1StrideX; + + cr[0] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + cr[1] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+1) * dstP2StrideX; + cr[2] = (vx_uint8 *)dst_base[2] + (y+1) * dstP2StrideY + x * dstP2StrideX; + cr[3] = (vx_uint8 *)dst_base[2] + (y+1) * dstP2StrideY + (x+1) * dstP2StrideX; + + cb[0][0] = crcb[u_pix]; + cb[1][0] = crcb[u_pix]; + cb[2][0] = crcb[u_pix]; + cb[3][0] = crcb[u_pix]; + + cr[0][0] = crcb[v_pix]; + cr[1][0] = crcb[v_pix]; + cr[2][0] = crcb[v_pix]; + cr[3][0] = crcb[v_pix]; + + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = 
(vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8_t lumaV8 = vld1_u8(srcP0); + vst1_u8(dstP0, lumaV8); + } + } + + vx_uint8 *crcb[4]; + vx_uint8 *cb[4]; + vx_uint8 *cr[4]; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + crcb[0] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 0) / 2); + crcb[1] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 1) / 2); + crcb[2] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 2) / 2); + crcb[3] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 3) / 2); + + cb[0] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 0) / 2); + cb[1] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 1) / 2); + cb[2] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 2) / 2); + cb[3] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 3) / 2); + + cr[0] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 0) / 2); + cr[1] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 1) / 2); + cr[2] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 2) / 2); + cr[3] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 3) / 2); + + + cb[0][0] = crcb[0][u_pix]; + cb[1][0] = crcb[1][u_pix]; + cb[2][0] = crcb[2][u_pix]; + cb[3][0] = crcb[3][u_pix]; + + cr[0][0] = crcb[0][v_pix]; + cr[1][0] = crcb[1][v_pix]; + cr[2][0] = crcb[2][v_pix]; + cr[3][0] = crcb[3][v_pix]; + } + } + } + } + else if (src_format == VX_DF_IMAGE_YUYV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *yuyv1 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + + vx_float32 yValue[4] = {(vx_float32)yuyv[0],(vx_float32)yuyv[2],(vx_float32)yuyv1[0],(vx_float32)yuyv1[2]}; + vx_float32 cbValue[4] = {(vx_float32)yuyv[1],(vx_float32)yuyv[1],(vx_float32)yuyv1[1],(vx_float32)yuyv1[1]}; + vx_float32 crValue[4] = {(vx_float32)yuyv[3],(vx_float32)yuyv[3],(vx_float32)yuyv1[3],(vx_float32)yuyv1[3]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if(src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + 
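Since the YUYV and UYVY branches lean on the BT.601/BT.709 conversions defined at the top of this file, one worked BT.601 data point helps when eyeballing output: the nominal "red" triple (Y, Cb, Cr) = (81, 90, 240) should come out as roughly (238, 14, 13) after the float math and usat8() clamping. A hypothetical spot check against the scalar yuv2rgb_bt601() defined earlier (the check function itself is not part of the patch):

/* Worked BT.601 example using the formulas above:
 *   R = 81 + 1.403 * (240 - 128)                       = 238.1 -> 238
 *   G = 81 - 0.344 * ( 90 - 128) - 0.714 * (240 - 128) =  14.1 ->  14
 *   B = 81 + 1.773 * ( 90 - 128)                       =  13.6 ->  13 */
static void check_bt601_red(void)
{
    vx_uint8 r, g, b;
    yuv2rgb_bt601(81, 90, 240, &r, &g, &b);
    /* expect r == 238, g == 14, b == 13 */
}
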
rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + vx_uint8 *yuyv[2]; + vx_uint8 *luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstCbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP0StrideY; + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t srcValue00 = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t srcValue01 = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue0 = vuzp_u8(srcValue00, srcValue01); + vst1_u8((dstLuma + x * dstP0StrideX),dstValue0.val[0]); + + uint8x8_t srcValue10 = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t srcValue11 = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue1 = vuzp_u8(srcValue10, srcValue11); + vst1_u8((dstLuma1 + x * dstP0StrideX),dstValue1.val[0]); + + uint16x8_t cbcrValue = vaddl_u8(dstValue0.val[1],dstValue1.val[1]); + + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX + 1) = cbcrValuek[kx + 1] / 2; + } + + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 2) + { + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + + luma[0] = yuyv[0]; + luma[1] = yuyv[2]; + cb[0] = yuyv[1]; + cr[0] = yuyv[3]; + cb[1] = yuyv[1]; + cr[1] = yuyv[3]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint32 x, y; + vx_uint8 *yuyv[2]; + vx_uint8 *_luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + (y >> 1) * dstP2StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t src00Value = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t src01Value = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst0Value = vuzp_u8(src00Value,src01Value); + vst1_u8((luma + x * dstP0StrideX),dst0Value.val[0]); + + uint8x8_t src10Value = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t src11Value = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst1Value = vuzp_u8(src10Value,src11Value); + vst1_u8((luma1 + x * dstP0StrideX),dst1Value.val[0]); + + uint16x8_t cbcrValue = vaddl_u8(dst0Value.val[1],dst1Value.val[1]); + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 
2) + { + *(cb + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(cr + ((x + kx) >> 1) * dstP2StrideX) = cbcrValuek[kx + 1] / 2; + } + } + } + } + } + else if (src_format == VX_DF_IMAGE_UYVY) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *uyvy1 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + + vx_float32 yValue[4] = {(vx_float32)uyvy[1],(vx_float32)uyvy[3],(vx_float32)uyvy1[1],(vx_float32)uyvy1[3]}; + vx_float32 cbValue[4] = {(vx_float32)uyvy[0],(vx_float32)uyvy[0],(vx_float32)uyvy1[0],(vx_float32)uyvy1[0]}; + vx_float32 crValue[4] = {(vx_float32)uyvy[2],(vx_float32)uyvy[2],(vx_float32)uyvy1[2],(vx_float32)uyvy1[2]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if(src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + vx_uint8 *uyvy[2]; + vx_uint8 *luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstCbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP0StrideY; + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t srcValue00 = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t srcValue01 = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue0 = vuzp_u8(srcValue00, srcValue01); + vst1_u8((dstLuma + x * dstP0StrideX),dstValue0.val[1]); + + uint8x8_t srcValue10 = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t srcValue11 = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue1 = vuzp_u8(srcValue10, srcValue11); + vst1_u8((dstLuma1 + x * dstP0StrideX),dstValue1.val[1]); + + uint16x8_t cbcrValue = vaddl_u8(dstValue0.val[0],dstValue1.val[0]); + + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(dstCbCr 
+ ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX + 1) = cbcrValuek[kx + 1] / 2; + } + + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 2) + { + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + + luma[0] = uyvy[1]; + luma[1] = uyvy[3]; + cb[0] = uyvy[0]; + cr[0] = uyvy[2]; + cb[1] = uyvy[0]; + cr[1] = uyvy[2]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint32 x, y; + vx_uint8 *uyvy[2]; + vx_uint8 *_luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + (y >> 1) * dstP2StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t src00Value = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t src01Value = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst0Value = vuzp_u8(src00Value,src01Value); + vst1_u8((luma + x * dstP0StrideX),dst0Value.val[1]); + + uint8x8_t src10Value = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t src11Value = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst1Value = vuzp_u8(src10Value,src11Value); + vst1_u8((luma1 + x * dstP0StrideX),dst1Value.val[1]); + + uint16x8_t cbcrValue = vaddl_u8(dst0Value.val[0], dst1Value.val[0]); + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek, cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(cb + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(cr + ((x + kx) >> 1) * dstP2StrideX) = cbcrValuek[kx + 1] / 2; + } + } + } + } + } + else if (src_format == VX_DF_IMAGE_IYUV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); + + vx_float32 yValue[4] = {(vx_float32)luma[0],(vx_float32)luma[1],(vx_float32)luma[2],(vx_float32)luma[3]}; + vx_float32 cbValue[4] = {(vx_float32)cb[0],(vx_float32)cb[0],(vx_float32)cb[1],(vx_float32)cb[1]}; + vx_float32 crValue[4] = {(vx_float32)cr[0],(vx_float32)cr[0],(vx_float32)cr[1],(vx_float32)cr[1]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = 
rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + (y >> 1) * srcP1StrideY; + vx_uint8 *cr = (vx_uint8 *)src_base[2] + (y >> 1) * srcP2StrideY; + vx_uint8 *nv12Y = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *nv12Y1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *nv12CbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t lumaValue = vld1_u8(luma + x * srcP0StrideX); + vst1_u8((nv12Y + x * dstP0StrideX), lumaValue); + + uint8x8_t luma1Value = vld1_u8(luma1 + x * srcP0StrideX); + vst1_u8((nv12Y1 + x * dstP0StrideX), luma1Value); + + uint8x8_t cbValue = vld1_u8(cb + (x >> 1) * srcP1StrideX); + uint8x8_t crValue = vld1_u8(cr + (x >> 1) * srcP2StrideX); + + uint8x8x2_t cbcrValue = vzip_u8(cbValue, crValue); + + vst1_u8((nv12CbCr + (x >> 1) * dstP1StrideX), cbcrValue.val[0]); + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + (y >> 1) * srcP1StrideY; + vx_uint8 *cr = (vx_uint8 *)src_base[2] + (y >> 1) * srcP2StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstcb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY; + vx_uint8 *dstcb1 = (vx_uint8 *)dst_base[1] + (y + 1) * dstP1StrideY; + vx_uint8 *dstcr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY; + vx_uint8 *dstcr1 = (vx_uint8 *)dst_base[2] + (y + 1) * dstP1StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t lumaValue = vld1_u8(luma + x * srcP0StrideX); + vst1_u8((dstLuma + x * dstP0StrideX), lumaValue); + + uint8x8_t luma1Value = vld1_u8(luma1 + x * srcP0StrideX); + vst1_u8((dstLuma1 + x * dstP0StrideX), luma1Value); + + uint8x8_t cbValue = vld1_u8(cb + (x >> 1) * srcP1StrideX); + uint8x8x2_t dstCbValue = vzip_u8(cbValue, cbValue); + vst1_u8((dstcb + x * dstP1StrideX), dstCbValue.val[0]); + vst1_u8((dstcb1 + x * dstP1StrideX), dstCbValue.val[0]); + + uint8x8_t crValue = vld1_u8(cr + (x >> 1) * srcP2StrideX); + uint8x8x2_t dstCrValue = vzip_u8(crValue, crValue); + vst1_u8((dstcr + x * dstP2StrideX), dstCrValue.val[0]); + vst1_u8((dstcr1 + x * dstP2StrideX), dstCrValue.val[0]); + } + } + } + } +} + + +#define RGBX_RGB(low_y, high_y, low_x) \ + 
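The IYUV to YUV4 branch that closes the fast path above upsamples chroma by zipping each half-resolution vector with itself (vzip_u8) and storing the result to two consecutive output rows. A scalar sketch of that nearest-neighbour upsample, with hypothetical names (not taken from the patch):

/* Scalar view of the vzip_u8 chroma upsample: each half-resolution chroma
 * sample is repeated across a 2x2 block of full-resolution pixels. */
static void upsample_chroma_rows_ref(const vx_uint8 *half, vx_uint8 *full_row0,
                                     vx_uint8 *full_row1, vx_uint32 out_width)
{
    vx_uint32 x;
    for (x = 0; x < out_width; x++)
    {
        full_row0[x] = half[x >> 1];
        full_row1[x] = half[x >> 1];
    }
}
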
for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + dstP0[0] = srcP0[0]; \ + dstP0[1] = srcP0[1]; \ + dstP0[2] = srcP0[2]; \ + } \ + } + +#define RGB_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + dstP0[0] = srcP0[0]; \ + dstP0[1] = srcP0[1]; \ + dstP0[2] = srcP0[2]; \ + dstP0[3] = 255; \ + } \ + } + +#define RGB_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x + 1) * srcP0StrideX; \ + rgb[2] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX; \ + rgb[3] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + (x + 1) * srcP0StrideX; \ + \ + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX; \ + luma[2] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX; \ + luma[3] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX; \ + \ + cbcr = (vx_uint8 *)dst_base[1] + y * dstP1StrideY / 2 + x * dstP1StrideX / 2; \ + \ + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; \ + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; \ + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; \ + \ + rgb2yuv_bt709(rgb[0][0], rgb[0][1], rgb[0][2], &luma[0][0], &cb[0], &cr[0]); \ + rgb2yuv_bt709(rgb[1][0], rgb[1][1], rgb[1][2], &luma[1][0], &cb[1], &cr[1]); \ + rgb2yuv_bt709(rgb[2][0], rgb[2][1], rgb[2][2], &luma[2][0], &cb[2], &cr[2]); \ + rgb2yuv_bt709(rgb[3][0], rgb[3][1], rgb[3][2], &luma[3][0], &cb[3], &cr[3]); \ + \ + cbcr[0] = (cb[0] + cb[1] + cb[2] + cb[3]) / 4; \ + cbcr[1] = (cr[0] + cr[1] + cr[2] + cr[3]) / 4; \ + } \ + } + +#define RGB_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *rgb = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + rgb2yuv_bt709(rgb[0], rgb[1], rgb[2], luma, cb, cr); \ + } \ + } + +#define RGB_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x + 1) * srcP0StrideX; \ + rgb[2] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX; \ + rgb[3] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + (x + 1) * srcP0StrideX; \ + \ + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + luma[1] = (vx_uint8 
*)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX; \ + luma[2] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX; \ + luma[3] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX; \ + \ + cbp = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + crp = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + rgb2yuv_bt709(rgb[0][0], rgb[0][1], rgb[0][2], &luma[0][0], &cb[0], &cr[0]); \ + rgb2yuv_bt709(rgb[1][0], rgb[1][1], rgb[1][2], &luma[1][0], &cb[1], &cr[1]); \ + rgb2yuv_bt709(rgb[2][0], rgb[2][1], rgb[2][2], &luma[2][0], &cb[2], &cr[2]); \ + rgb2yuv_bt709(rgb[3][0], rgb[3][1], rgb[3][2], &luma[3][0], &cb[3], &cr[3]); \ + \ + cbp[0] = (uint8_t)(((vx_uint16)cb[0] + (vx_uint16)cb[1] + (vx_uint16)cb[2] + (vx_uint16)cb[3]) >> 2); \ + crp[0] = (uint8_t)(((vx_uint16)cr[0] + (vx_uint16)cr[1] + (vx_uint16)cr[2] + (vx_uint16)cr[3]) >> 2); \ + } \ + } + +#define NV12_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (dst_format == VX_DF_IMAGE_RGBX) \ + rgb[3] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + yuv2rgb_bt601(luma[0], crcb[u_pix], crcb[v_pix], &rgb[0], &rgb[1], &rgb[2]); \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + yuv2rgb_bt709(luma[0], crcb[u_pix], crcb[v_pix], &rgb[0], &rgb[1], &rgb[2]); \ + } \ + } + +#define NV12_NV21(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX }; \ + \ + vx_uint8 *cbcr = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *crcb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + \ + yuv2yuv_601to709(luma[0][0], cbcr[0], cbcr[1], &luma[1][0], &crcb[1], &crcb[0]); \ + } \ + } + +#define NV12_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *yout = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + yout[0] = luma[0]; \ + cb[0] = crcb[u_pix]; \ + cr[0] = crcb[v_pix]; \ + } \ + } + +#define NV12_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *yout = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 0) / 2); \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 0) / 2); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + 
dstP2StrideX * ((x + 0) / 2); \ + \ + yout[0] = luma[0]; \ + cb[0] = crcb[u_pix]; \ + cr[0] = crcb[v_pix]; \ + } \ + } + +#define YUYV_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(yuyv[2], yuyv[1], yuyv[3], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(yuyv[2], yuyv[1], yuyv[3], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + } \ + } + + +#define YUYV_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + rgb[3] = rgb[7] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(yuyv[2], yuyv[1], yuyv[3], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(yuyv[2], yuyv[1], yuyv[3], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + } \ + } + + +#define YUYV_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cbcr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY + (x >> 1) * dstP1StrideX; \ + \ + luma[0][0] = yuyv[0][0]; \ + luma[1][0] = yuyv[0][2]; \ + luma[2][0] = yuyv[1][0]; \ + luma[3][0] = yuyv[1][2]; \ + cbcr[0] = (yuyv[0][1] + yuyv[1][1]) / 2; \ + cbcr[1] = (yuyv[0][3] + yuyv[1][3]) / 2; \ + } \ + } + + +#define YUYV_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + luma[0] = yuyv[0]; \ + luma[1] = yuyv[2]; \ + cb[0] = yuyv[1]; \ + cr[0] = yuyv[3]; \ + cb[1] = yuyv[1]; \ + cr[1] = yuyv[3]; \ + } \ + } + + +#define YUYV_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 
*luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + luma[0][0] = yuyv[0][0]; \ + luma[1][0] = yuyv[0][2]; \ + luma[2][0] = yuyv[1][0]; \ + luma[3][0] = yuyv[1][2]; \ + cb[0] = (yuyv[0][1] + yuyv[1][1]) / 2; \ + cr[0] = (yuyv[0][3] + yuyv[1][3]) / 2; \ + } \ + } + + +#define UYVY_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(uyvy[3], uyvy[0], uyvy[2], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(uyvy[3], uyvy[0], uyvy[2], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + } \ + } + + +#define UYVY_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + rgb[3] = rgb[7] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(uyvy[3], uyvy[0], uyvy[2], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(uyvy[3], uyvy[0], uyvy[2], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + } \ + } + + +#define UYVY_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cbcr = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + \ + luma[0][0] = uyvy[0][1]; \ + luma[1][0] = uyvy[0][3]; \ + luma[2][0] = uyvy[1][1]; \ + luma[3][0] = uyvy[1][3]; \ + cbcr[0] = (uyvy[0][0] + uyvy[1][0]) / 2; \ + cbcr[1] = (uyvy[0][2] + uyvy[1][2]) / 2; \ + } \ + } + + +#define UYVY_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 
*)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + luma[0] = uyvy[1]; \ + luma[1] = uyvy[3]; \ + cb[0] = uyvy[0]; \ + cr[0] = uyvy[2]; \ + cb[1] = uyvy[0]; \ + cr[1] = uyvy[2]; \ + } \ + } + + +#define UYVY_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + luma[0][0] = uyvy[0][1]; \ + luma[1][0] = uyvy[0][3]; \ + luma[2][0] = uyvy[1][1]; \ + luma[3][0] = uyvy[1][3]; \ + cb[0] = (uyvy[0][0] + uyvy[1][0]) / 2; \ + cr[0] = (uyvy[0][2] + uyvy[1][2]) / 2; \ + } \ + } + + +#define IYUV_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (dst_format == VX_DF_IMAGE_RGBX) \ + rgb[3] = 255; \ + \ + /*! \todo restricted range 601 ? 
*/ \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + yuv2rgb_bt601(luma[0], cb[0], cr[0], &rgb[0], &rgb[1], &rgb[2]); \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + yuv2rgb_bt709(luma[0], cb[0], cr[0], &rgb[0], &rgb[1], &rgb[2]); \ + } \ + } + + +#define IYUV_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); \ + vx_uint8 *nv12[2] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1) }; \ + nv12[0][0] = luma[0]; \ + nv12[1][0] = cb[0]; \ + nv12[1][1] = cr[0]; \ + } \ + } + + +#define IYUV_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX }; \ + vx_uint8 *cb[2] = { (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1), \ + (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX }; \ + vx_uint8 *cr[2] = { (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1), \ + (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX }; \ + \ + luma[1][0] = luma[0][0]; \ + cb[1][0] = cb[0][0]; \ + cr[1][0] = cr[0][0]; \ + } \ + } + +void ConvertColor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + void *src_base[4] = { NULL }; + void *dst_base[4] = { NULL }; + + src_base[0] = in->base[0]; + dst_base[0] = out->base[0]; + + src_base[1] = in->base[1]; + dst_base[1] = out->base[1]; + + src_base[2] = in->base[2]; + dst_base[2] = out->base[2]; + + vx_uint32 srcP0StrideX = in->addr[0].stride_x; + vx_uint32 srcP0StrideY = in->addr[0].stride_y; + vx_uint32 dstP0StrideX = out->addr[0].stride_x; + vx_uint32 dstP0StrideY = out->addr[0].stride_y; + + vx_uint32 srcP1StrideX = in->addr[1].stride_x; + vx_uint32 srcP1StrideY = in->addr[1].stride_y; + vx_uint32 dstP1StrideX = out->addr[1].stride_x; + vx_uint32 dstP1StrideY = out->addr[1].stride_y; + + vx_uint32 srcP2StrideX = in->addr[2].stride_x; + vx_uint32 srcP2StrideY = in->addr[2].stride_y; + vx_uint32 dstP2StrideX = out->addr[2].stride_x; + vx_uint32 dstP2StrideY = out->addr[2].stride_y; + + vx_df_image src_format, dst_format; + + src_format = in->image.format; + dst_format = out->image.format; + + vx_enum src_space = in->image.space; + + if ((src_format == VX_DF_IMAGE_RGB) || (src_format == VX_DF_IMAGE_RGBX)) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + RGBX_RGB(low_y, high_y, low_x) + } + else + { + RGBX_RGB(0, low_y, low_x) + RGBX_RGB(low_y, high_y, 0) + } + } + else + { + if (low_y == 0 && low_x == 0) + { + RGB_RGBX(low_y, high_y, low_x) + } + else + { + RGB_RGBX(0, low_y, low_x) + RGB_RGBX(low_y, 
high_y, 0) + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbcr; + + if (low_y == 0 && low_x == 0) + { + RGB_NV12(low_y, high_y, low_x) + } + else + { + RGB_NV12(0, low_y, low_x) + RGB_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *u[4]; + vx_uint8 *v[4]; + + if (low_y == 0 && low_x == 0) + { + RGB_YUV4(low_y, high_y, low_x) + } + else + { + RGB_YUV4(0, low_y, low_x) + RGB_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbp; + vx_uint8 *crp; + + if (low_y == 0 && low_x == 0) + { + RGB_IYUV(low_y, high_y, low_x) + } + else + { + RGB_IYUV(0, low_y, low_x) + RGB_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_NV21 || src_format == VX_DF_IMAGE_NV12) + { + int u_pix = src_format == VX_DF_IMAGE_NV12 ? 0 : 1; + int v_pix = src_format == VX_DF_IMAGE_NV12 ? 1 : 0; + if ((dst_format == VX_DF_IMAGE_RGB) || (dst_format == VX_DF_IMAGE_RGBX)) + { + if (low_y == 0 && low_x == 0) + { + NV12_RGB(low_y, high_y, low_x) + } + else + { + NV12_RGB(0, low_y, low_x) + NV12_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12 || dst_format == VX_DF_IMAGE_NV21) + { + if (low_y == 0 && low_x == 0) + { + NV12_NV21(low_y, high_y, low_x) + } + else + { + NV12_NV21(0, low_y, low_x) + NV12_NV21(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + NV12_YUV4(low_y, high_y, low_x) + } + else + { + NV12_YUV4(0, low_y, low_x) + NV12_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + NV12_IYUV(low_y, high_y, low_x) + } + else + { + NV12_IYUV(0, low_y, low_x) + NV12_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_YUYV) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + YUYV_RGB(low_y, high_y, low_x) + } + else + { + YUYV_RGB(0, low_y, low_x) + YUYV_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + YUYV_RGBX(low_y, high_y, low_x) + } + else + { + YUYV_RGBX(0, low_y, low_x) + YUYV_RGBX(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + YUYV_NV12(low_y, high_y, low_x) + } + else + { + YUYV_NV12(0, low_y, low_x) + YUYV_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + YUYV_YUV4(low_y, high_y, low_x) + } + else + { + YUYV_YUV4(0, low_y, low_x) + YUYV_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + YUYV_IYUV(low_y, high_y, low_x) + } + else + { + YUYV_IYUV(0, low_y, low_x) + YUYV_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_UYVY) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + UYVY_RGB(low_y, high_y, low_x) + } + else + { + UYVY_RGB(0, low_y, low_x) + UYVY_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + UYVY_RGBX(low_y, high_y, low_x) + } + else + { + UYVY_RGBX(0, low_y, low_x) + UYVY_RGBX(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + 
UYVY_NV12(low_y, high_y, low_x) + } + else + { + UYVY_NV12(0, low_y, low_x) + UYVY_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + UYVY_YUV4(low_y, high_y, low_x) + } + else + { + UYVY_YUV4(0, low_y, low_x) + UYVY_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + UYVY_IYUV(low_y, high_y, low_x) + } + else + { + UYVY_IYUV(0, low_y, low_x) + UYVY_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_IYUV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + IYUV_RGB(low_y, high_y, low_x) + } + else + { + IYUV_RGB(0, low_y, low_x) + IYUV_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + IYUV_NV12(low_y, high_y, low_x) + } + else + { + IYUV_NV12(0, low_y, low_x) + IYUV_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + IYUV_YUV4(low_y, high_y, low_x) + } + else + { + IYUV_YUV4(0, low_y, low_x) + IYUV_YUV4(low_y, high_y, 0) + } + } + } +} diff --git a/kernels/tiling/tiling_convertdepth.c b/kernels/tiling/tiling_convertdepth.c new file mode 100644 index 0000000..1eea83e --- /dev/null +++ b/kernels/tiling/tiling_convertdepth.c @@ -0,0 +1,173 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void ConvertDepth_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *policy = (vx_enum *)parameters[2]; + vx_int32 *shift = (vx_int32 *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (in->image.format == VX_DF_IMAGE_U8 && out->image.format == VX_DF_IMAGE_S16) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + + int16x8_t sh=vdupq_n_s16(*shift); + + for (y = low_y; y < high_y; y++) + { + vx_uint8* srcp = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_int16* dstp = (vx_int16 *)dst_base + y * out->addr->stride_y / 2; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t v_src = vld1q_u8(srcp); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dstp, vshlq_s16(v_dst0, sh)); + vst1q_s16(dstp+8, vshlq_s16(v_dst1, sh)); + + srcp+=16; + dstp+=16; + } + } + } + else if (in->image.format == VX_DF_IMAGE_S16 && out->image.format == VX_DF_IMAGE_U8) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + int16x8_t sh=vdupq_n_s16(-(*shift)); + + for (y = low_y; y < high_y; y++) + { + vx_int16* srcp = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8* dstp = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 16) + { + int16x8_t v_src0 = vld1q_s16(srcp); + int16x8_t v_src1 = vld1q_s16(srcp+8); + + if (*policy == VX_CONVERT_POLICY_SATURATE) + { + int16x8_t v_dst0= vqshlq_s16(v_src0,sh); + int16x8_t v_dst1= vqshlq_s16(v_src1,sh); + uint8x8_t v_dst00 = vqmovun_s16(v_dst0); + uint8x8_t v_dst01 = vqmovun_s16(v_dst1); + uint8x16_t v_dst = vcombine_u8(v_dst00,v_dst01); + + vst1q_u8(dstp, v_dst); + } + else if (*policy == VX_CONVERT_POLICY_WRAP) + { + int16x8_t v_dst0= vshlq_s16(v_src0,sh); + int16x8_t v_dst1= vshlq_s16(v_src1,sh); + uint8x16_t v_dst = vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(v_dst0)),vmovn_u16(vreinterpretq_u16_s16(v_dst1))); + + vst1q_u8(dstp, v_dst); + } + srcp+=16; + dstp+=16; + } + } + } +} + + +#define CONVERT_DEPTH(low_y, high_y, low_x, in_tile_x, out_tile_x) \ + if (in->image.format == VX_DF_IMAGE_U8 && out->image.format == VX_DF_IMAGE_S16) \ + { \ + vx_uint8 *src_base = in->base[0] + in_tile_x; \ + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out_tile_x; \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src = (vx_uint8 *)src_base + y * in->addr->stride_y; \ + vx_int16 *dst = (vx_int16 *)dst_base + y * out->addr->stride_y / 2; \ + for (x = low_x; x < high_x; x++) \ + { \ + *dst = ((vx_int16)(*src)) << (*shift); \ + \ + src++; \ + dst++; \ + } \ + } \ + } \ + else if (in->image.format == VX_DF_IMAGE_S16 && out->image.format == VX_DF_IMAGE_U8) \ + { \ + vx_int16 *src_base = (vx_int16 *)in->base[0] + in_tile_x; \ + vx_uint8 *dst_base = out->base[0] + out_tile_x; \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_int16 *src = (vx_int16 *)src_base + y * in->addr->stride_y / 2; \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*policy == VX_CONVERT_POLICY_WRAP) \ + { \ + *dst = (vx_uint8)((*src) >> (*shift)); \ + \ + src++; \ + dst++; \ + } \ + 
else if (*policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + vx_int16 value = (*src) >> (*shift); \ + value = (value < 0 ? 0 : value); \ + value = (value > UINT8_MAX ? UINT8_MAX : value); \ + *dst = (vx_uint8)value; \ + \ + src++; \ + dst++; \ + } \ + } \ + } \ + } + +void ConvertDepth_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *policy = (vx_enum *)parameters[2]; + vx_int32 *shift = (vx_int32 *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + CONVERT_DEPTH(low_y, high_y, low_x, in->tile_x, out->tile_x) + } + else + { + CONVERT_DEPTH(0, low_y, low_x, in->tile_x, out->tile_x) + CONVERT_DEPTH(low_y, high_y, 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_convolve.c b/kernels/tiling/tiling_convolve.c new file mode 100644 index 0000000..7f11277 --- /dev/null +++ b/kernels/tiling/tiling_convolve.c @@ -0,0 +1,1001 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include + +#include + +#include + +static vx_uint8 u32Tou8(vx_uint32 x) +{ + vx_uint8 ret = 0; + if (x == 0) + { + return 32; + } + if ((x & 0x0000FFFF) == 0) + { + ret = ret + 16; + x = x >> 16; + } + if ((x & 0x000000FF) == 0) + { + ret = ret + 8; + x = x >> 8; + } + if ((x & 0x0000000F) == 0) + { + ret = ret + 4; + x = x >> 4; + } + if ((x & 0x00000003) == 0) + { + ret = ret + 2; + x = x >> 2; + } + if ((x & 0x00000001) == 0) + { + ret = ret + 1; + } + return ret; +} + +static void s32ShiftR(int32x4_t *pv32x4, vx_int32 shift) +{ + switch(shift) + { + case 0: + break; + case 1: + *pv32x4 = vshrq_n_s32(*pv32x4, 1); + break; + case 2: + *pv32x4 = vshrq_n_s32(*pv32x4, 2); + break; + case 3: + *pv32x4 = vshrq_n_s32(*pv32x4, 3); + break; + case 4: + *pv32x4 = vshrq_n_s32(*pv32x4, 4); + break; + case 5: + *pv32x4 = vshrq_n_s32(*pv32x4, 5); + break; + case 6: + *pv32x4 = vshrq_n_s32(*pv32x4, 6); + break; + case 7: + *pv32x4 = vshrq_n_s32(*pv32x4, 7); + break; + case 8: + *pv32x4 = vshrq_n_s32(*pv32x4, 8); + break; + case 9: + *pv32x4 = vshrq_n_s32(*pv32x4, 9); + break; + case 10: + *pv32x4 = vshrq_n_s32(*pv32x4, 10); + break; + case 11: + *pv32x4 = vshrq_n_s32(*pv32x4, 11); + break; + case 12: + *pv32x4 = vshrq_n_s32(*pv32x4, 12); + break; + case 13: + *pv32x4 = vshrq_n_s32(*pv32x4, 13); + break; + case 14: + *pv32x4 = vshrq_n_s32(*pv32x4, 14); + break; + case 15: + *pv32x4 = vshrq_n_s32(*pv32x4, 15); + break; + case 16: + *pv32x4 = vshrq_n_s32(*pv32x4, 16); + break; + case 17: + *pv32x4 = vshrq_n_s32(*pv32x4, 17); + break; + case 18: + *pv32x4 = vshrq_n_s32(*pv32x4, 18); + break; + case 19: + *pv32x4 = vshrq_n_s32(*pv32x4, 19); + break; + case 20: + *pv32x4 = vshrq_n_s32(*pv32x4, 20); + break; + case 21: + *pv32x4 = vshrq_n_s32(*pv32x4, 21); + 
break; + case 22: + *pv32x4 = vshrq_n_s32(*pv32x4, 22); + break; + case 23: + *pv32x4 = vshrq_n_s32(*pv32x4, 23); + break; + case 24: + *pv32x4 = vshrq_n_s32(*pv32x4, 24); + break; + case 25: + *pv32x4 = vshrq_n_s32(*pv32x4, 25); + break; + case 26: + *pv32x4 = vshrq_n_s32(*pv32x4, 26); + break; + case 27: + *pv32x4 = vshrq_n_s32(*pv32x4, 27); + break; + case 28: + *pv32x4 = vshrq_n_s32(*pv32x4, 28); + break; + case 29: + *pv32x4 = vshrq_n_s32(*pv32x4, 29); + break; + case 30: + *pv32x4 = vshrq_n_s32(*pv32x4, 30); + break; + case 31: + *pv32x4 = vshrq_n_s32(*pv32x4, 31); + break; + case 32: + *pv32x4 = vshrq_n_s32(*pv32x4, 32); + break; + default: + break; + } + return; +} + +static void convStru8u8(int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3, + vx_uint8 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t out2 = *pvOut2; + int32x4_t out3 = *pvOut3; + int32x4_t vMaxu8 = vdupq_n_s32(UINT8_MAX); + int32x4_t vZero = vdupq_n_s32(0); + uint16x8_t vRetLow, vRetHigh; + vx_uint8 szTmp[16]; + + out0 = vminq_s32(out0, vMaxu8); + out1 = vminq_s32(out1, vMaxu8); + out2 = vminq_s32(out2, vMaxu8); + out3 = vminq_s32(out3, vMaxu8); + + out0 = vmaxq_s32(out0, vZero); + out1 = vmaxq_s32(out1, vZero); + out2 = vmaxq_s32(out2, vZero); + out3 = vmaxq_s32(out3, vZero); + + vRetLow = vreinterpretq_u16_s16(vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vRetHigh = vreinterpretq_u16_s16(vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + + if (16 == fillCnt) + { + vst1q_u8(dst, vcombine_u8(vqmovn_u16(vRetLow), vqmovn_u16(vRetHigh))); + } + else + { + vst1q_u8(szTmp, vcombine_u8(vqmovn_u16(vRetLow), vqmovn_u16(vRetHigh))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStru8s16(int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3, + vx_int16 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t out2 = *pvOut2; + int32x4_t out3 = *pvOut3; + int32x4_t vMaxs16 = vdupq_n_s32(INT16_MAX); + int32x4_t vMins16 = vdupq_n_s32(INT16_MIN); + vx_int16 szTmp[16]; + + out0 = vminq_s32(out0, vMaxs16); + out1 = vminq_s32(out1, vMaxs16); + out2 = vminq_s32(out2, vMaxs16); + out3 = vminq_s32(out3, vMaxs16); + + out0 = vmaxq_s32(out0, vMins16); + out1 = vmaxq_s32(out1, vMins16); + out2 = vmaxq_s32(out2, vMins16); + out3 = vmaxq_s32(out3, vMins16); + + if (16 == fillCnt) + { + vst1q_s16(dst, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vst1q_s16(dst + 8, vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + } + else + { + vst1q_s16(szTmp, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vst1q_s16(szTmp + 8, vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStrs16u8(int32x4_t *pvOut0, int32x4_t *pvOut1, vx_uint8 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t vMaxu8 = vdupq_n_s32(UINT8_MAX); + int32x4_t vZero = vdupq_n_s32(0); + int16x8_t vRet; + vx_uint8 szTmp[8]; + + out0 = vminq_s32(out0, vMaxu8); + out1 = vminq_s32(out1, vMaxu8); + + out0 = vmaxq_s32(out0, vZero); + out1 = vmaxq_s32(out1, vZero); + + vRet = vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1)); + if (8 == fillCnt) + { + vst1_u8(dst, vqmovn_u16(vreinterpretq_u16_s16(vRet))); + } + else + { + vst1_u8(szTmp, vqmovn_u16(vreinterpretq_u16_s16(vRet))); + for (vx_uint8 idx = 0; idx < fillCnt; 
idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStrs16s16(int32x4_t *pvOut0, int32x4_t *pvOut1, vx_int16 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t vMaxs16 = vdupq_n_s32(INT16_MAX); + int32x4_t vMins16 = vdupq_n_s32(INT16_MIN); + vx_int16 szTmp[8]; + + out0 = vminq_s32(out0, vMaxs16); + out1 = vminq_s32(out1, vMaxs16); + + out0 = vmaxq_s32(out0, vMins16); + out1 = vmaxq_s32(out1, vMins16); + + if (8 == fillCnt) + { + vst1q_s16(dst, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + } + else + { + vst1q_s16(szTmp, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } +} + +static void convRow3x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 15); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + return; +} + +static void convRow5x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 14); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = 
vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + return; +} + +static void convRow7x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 13); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[6]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[6]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[6]); + + vData = vextq_u8(vCur, vNxt, 3); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vextq_u8(vPrv, vCur, 14); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[5]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[5]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[5]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = 
vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + return; +} + +static void convRow9x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 12); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[8]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[8]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[8]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[8]); + + vData = vextq_u8(vCur, vNxt, 4); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vextq_u8(vPrv, vCur, 13); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[7]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[7]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[7]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[7]); + + vData = vextq_u8(vCur, vNxt, 3); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vextq_u8(vPrv, vCur, 14); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[6]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[6]); 
+ *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[6]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[5]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[5]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[5]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + return; +} + + +static void convRow3x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + return; +} + +static void convRow5x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + 
*pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + return; +} + +static void convRow7x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 5); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[6]); + + vData = vextq_s16(vCur, vNxt, 3); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + + vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[5]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + return; +} + +static void convRow9x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 4); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[8]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[8]); + + vData = vextq_s16(vCur, vNxt, 4); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vextq_s16(vPrv, vCur, 5); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[7]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[7]); + + vData = vextq_s16(vCur, vNxt, 3); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + + vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[6]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[5]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + return; +} + +void Convolve_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_convolution_t *conv = (vx_tile_convolution_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + 
vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + vx_size conv_width = (*conv).conv_width; + vx_size conv_height = (*conv).conv_height; + + vx_int32 conv_radius_x, conv_radius_y; + + conv_radius_x = (vx_int32)conv_width / 2; + conv_radius_y = (vx_int32)conv_height / 2; + + vx_uint32 src_format = in->image.format; + vx_uint32 dst_format = out->image.format; + + vx_int32 sum = 0, value = 0; + + vx_uint32 scale = (*conv).scale; + + vx_int16 conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + + memcpy(conv_mat, ((*conv).conv_mat), conv_width * conv_height * sizeof(vx_int16)); + + vx_int32 shift = (vx_int32)u32Tou8(scale); + + if ( high_y == vxTileHeight(out, 0) ) + { + uint8x16_t vPrv[C_MAX_CONVOLUTION_DIM]; + uint8x16_t vCur[C_MAX_CONVOLUTION_DIM]; + uint8x16_t vNxt[C_MAX_CONVOLUTION_DIM]; + int32x4_t out0 = vdupq_n_s32(0); + int32x4_t out1 = out0; + int32x4_t out2 = out0; + int32x4_t out3 = out0; + + vx_uint32 dstY = conv_radius_y; + vx_uint8 *dstTmp; + for (x = low_x; x < high_x; x += 16) + { + dstTmp = (vx_uint8 *)dst_base + x * out->addr->stride_x; + dstY = conv_radius_y; + if (0 == x) + { + for (y = 0; y < conv_height; y++) + { + vPrv[y] = vdupq_n_u8(0); + vCur[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + } + else + { + for (y = 0; y < conv_height; y++) + { + vPrv[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x - 16) * in->addr->stride_x); + vCur[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + } + + for (y = conv_height; y < high_y; (++y, dstY++)) + { + out0 = vdupq_n_s32(0); + out1 = out0; + out2 = out0; + out3 = out0; + for (vx_uint8 convY = 0; convY < conv_height; convY++) + { + if (3 == conv_width) + { + convRow3x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (5 == conv_width) + { + convRow5x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (7 == conv_width) + { + convRow7x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (9 == conv_width) + { + convRow9x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + } + + s32ShiftR(&out0, shift); + s32ShiftR(&out1, shift); + s32ShiftR(&out2, shift); + s32ShiftR(&out3, shift); + + if (dst_format == VX_DF_IMAGE_U8) + { + convStru8u8(&out0, &out1, &out2, &out3, dstTmp + dstY * out->addr->stride_y, 16); + } + else if (dst_format == VX_DF_IMAGE_S16) + { + convStru8s16(&out0, &out1, &out2, &out3, (vx_int16 *)(dstTmp + dstY * out->addr->stride_y), 16); + } + + //swap data and acquire next data + for (vx_uint8 convY = 0; convY < (conv_height - 1); convY++) + { + vPrv[convY] = vPrv[convY + 1]; + vCur[convY] = vCur[convY + 1]; + vNxt[convY] = vNxt[convY + 1]; + } + + if (0 == x) + { + vPrv[conv_height - 1] = vdupq_n_u8(0); + } + else + { + vPrv[conv_height 
- 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x - 16) * in->addr->stride_x); + } + vCur[conv_height - 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[conv_height - 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + + //process the last one + out0 = vdupq_n_s32(0); + out1 = out0; + out2 = out0; + out3 = out0; + for (vx_uint8 convY = 0; convY < conv_height; convY++) + { + if (3 == conv_width) + { + convRow3x1u8(&(vPrv[convY]), &(vCur[convY]), &(vNxt[convY]), + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (5 == conv_width) + { + convRow5x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (7 == conv_width) + { + convRow7x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (9 == conv_width) + { + convRow9x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + } + + s32ShiftR(&out0, shift); + s32ShiftR(&out1, shift); + s32ShiftR(&out2, shift); + s32ShiftR(&out3, shift); + + if (dst_format == VX_DF_IMAGE_U8) + { + convStru8u8(&out0, &out1, &out2, &out3, dstTmp + dstY * out->addr->stride_y, 16); + } + else if (dst_format == VX_DF_IMAGE_S16) + { + convStru8s16(&out0, &out1, &out2, &out3, (vx_int16 *)(dstTmp + dstY * out->addr->stride_y), 16); + } + } + } +} + + +static void vxReadRectangle_flexible(const void *base, const vx_imagepatch_addressing_t *addr, + vx_df_image type, vx_uint32 center_x, vx_uint32 center_y, + vx_uint32 radius_x, vx_uint32 radius_y, void *destination) +{ + vx_int32 width = (vx_int32)addr->dim_x, height = (vx_int32)addr->dim_y; + vx_int32 stride_y = addr->stride_y; + vx_int32 stride_x = addr->stride_x; + const vx_uint8 *ptr = (const vx_uint8 *)base; + vx_int32 ky, kx; + vx_uint32 dest_index = 0; + // kx, ky - kernel x and y + for (ky = -(int32_t)radius_y; ky <= (int32_t)radius_y; ++ky) + { + vx_int32 y = (vx_int32)(center_y + ky); + y = y < 0 ? 0 : y >= height ? height - 1 : y; + + for (kx = -(int32_t)radius_x; kx <= (int32_t)radius_x; ++kx, ++dest_index) + { + vx_int32 x = (int32_t)(center_x + kx); + x = x < 0 ? 0 : x >= width ? 
width - 1 : x; + + switch (type) + { + case VX_DF_IMAGE_U8: + ((vx_uint8*)destination)[dest_index] = *(vx_uint8*)(ptr + y*stride_y + x*stride_x); + break; + case VX_DF_IMAGE_S16: + case VX_DF_IMAGE_U16: + ((vx_uint16*)destination)[dest_index] = *(vx_uint16*)(ptr + y*stride_y + x*stride_x); + break; + } + } + } +} + +#define CONVOLVE(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + sum = 0; \ + if (src_format == VX_DF_IMAGE_U8) \ + { \ + vx_uint8 slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + \ + vxReadRectangle_flexible(src_base, in->addr, src_format, x, y, conv_radius_x, conv_radius_y, slice); \ + \ + for (i = 0; i < (vx_int32)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + } \ + else if (src_format == VX_DF_IMAGE_S16) \ + { \ + vx_int16 slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + \ + vxReadRectangle_flexible(src_base, in->addr, src_format, x, y, conv_radius_x, conv_radius_y, slice); \ + \ + for (i = 0; i < (vx_int32)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + } \ + \ + value = sum / (vx_int32)scale; \ + \ + if (dst_format == VX_DF_IMAGE_U8) \ + { \ + vx_uint8 *dstp = (vx_uint8 *)dst_base + y * out->addr->stride_y + x * out->addr->stride_x; \ + if (value < 0) *dstp = 0; \ + else if (value > UINT8_MAX) *dstp = UINT8_MAX; \ + else *dstp = value; \ + } \ + else if (dst_format == VX_DF_IMAGE_S16) \ + { \ + vx_int16 *dstp = (vx_int16 *)dst_base + y * out->addr->stride_y / 2 + x * out->addr->stride_x / 2; \ + if (value < INT16_MIN) *dstp = INT16_MIN; \ + else if (value > INT16_MAX) *dstp = INT16_MAX; \ + else *dstp = value; \ + } \ + } \ + } + +void Convolve_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, i; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_convolution_t *conv = (vx_tile_convolution_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_size conv_width = (*conv).conv_width; + vx_size conv_height = (*conv).conv_height; + + vx_int32 conv_radius_x, conv_radius_y; + + conv_radius_x = (vx_int32)conv_width / 2; + conv_radius_y = (vx_int32)conv_height / 2; + + vx_uint32 src_format = in->image.format; + vx_uint32 dst_format = out->image.format; + + vx_int32 sum = 0, value = 0; + + vx_uint32 scale = (*conv).scale; + + vx_int16 conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + + memcpy(conv_mat, ((*conv).conv_mat), conv_width * conv_height * sizeof(vx_int16)); + + if (low_y == 0 && low_x == 0) + { + CONVOLVE(low_y + conv_radius_y, high_y - conv_radius_y, low_x + conv_radius_x, high_x - conv_radius_x) + } + else + { + CONVOLVE(conv_radius_y, low_y, low_x, high_x - conv_radius_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + CONVOLVE(low_y, high_y, conv_radius_x, high_x - conv_radius_x) + } +} diff --git a/kernels/tiling/tiling_fast9.c b/kernels/tiling/tiling_fast9.c new file mode 100644 index 0000000..0f38824 --- /dev/null +++ b/kernels/tiling/tiling_fast9.c @@ -0,0 +1,860 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. 
+* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include + +#define PERMUTATIONS 16 +#define APERTURE 3 +#define PERM_SIZE 16 + +static const vx_uint8 permutations_table[PERMUTATIONS][PERM_SIZE] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 }, + { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 }, + { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 }, + { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 }, + { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 }, + { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 }, + { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 }, + { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 }, + { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 }, + { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 }, + { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 }, + { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 }, + { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 }, + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 }, + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 } + }; + +/* The following creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . + Where . is an irrelevant texel value + We want to retrieve all texels [0,F] + The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels() + The first table holds the top 4 rows of texels + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + The second table the bottom 3 rows of texels + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . +*/ +static const vx_uint8 top_right[8] = +{ + /* The register r.val[0] will be used to retrieve these texels: + . . . 0 1 . . . + . . . . . 2 . . + . . . . . . 3 . + . . . . . . 4 . + */ + 3 /* top table, first row, elem 4, value 0 in the diagram above */, + 4 /* top table, first row, elem 5, value 1 in the diagram above */, + 13 /* top table, second row, elem 6, value 2 in the diagram above */, + 22 /* top table, third row, elem 7, value 3 in the diagram above*/, + 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/, + 255, + 255, + 255 +}; + +static const vx_uint8 bottom_right[8] = +{ + /* The register r.val[1] will be used to retrieve these texels: + . . . . . . 5 . + . . . . . 6 . . + . . . . 7 . . . 
+ */ + 255, + 255, + 255, + 255, + 255, + 6 /* low table, first row, elem 7, value 5 in the diagram above*/, + 13 /* low table, second row, elem 6, value 6 in the diagram above*/, + 20 /* low table, third row, elem 5, value 7 in the diagram above*/ +}; + +static const vx_uint8 top_left[8] = +{ + /* The register r.val[2] will be used to retrieve these texels: + . . F . . . . . + . E . . . . . . + D . . . . . . . + C . . . . . . . + */ + 255, + 255, + 255, + 255, + 24 /* top table, fourth row, elem 1, value C in the diagram above */, + 16 /* top table, third row, elem 1, value D in the diagram above*/, + 9 /* top table, second row, elem 2, value E in the diagram above*/, + 2 /* top table, first row, elem 3, value F in the diagram above*/ +}; + +static const vx_uint8 bottom_left[8] = +{ + /* The register r.val[3] will be used to retrieve these texels: + B . . . . . . . + . A . . . . . . + . . 9 8 . . . . + */ + 19 /* low table, third row, elem 4, value 8 in the diagram above */, + 18 /* low table, third row, elem 3, value 9 in the diagram above */, + 9 /* low table, second row, elem 2, value A in the diagram above */, + 0 /* low table, first row, elem 1, value B in the diagram above */, + 255, + 255, + 255, + 255 +}; + +static void vxAddArrayItems_tiling(vx_tile_array_t *arr, vx_size count, const void *ptr, vx_size stride) +{ + if ((count > 0) && (ptr != NULL) && (stride >= arr->item_size)) + { + if (arr->num_items + count <= arr->capacity) + { + vx_size offset = arr->num_items * arr->item_size; + vx_uint8 *dst_ptr = (vx_uint8 *)arr->ptr + offset; + + vx_size i; + for (i = 0; i < count; ++i) + { + vx_uint8 *tmp = (vx_uint8 *)ptr; + memcpy(&dst_ptr[i * arr->item_size], &tmp[i * stride], arr->item_size); + } + + arr->num_items += count; + } + } +} + +static void addCorner(vx_int32 y, int16x8_t *pvX, uint8x8_t *pvPred, vx_uint8 *pStrength, vx_size dst_capacity, + vx_size *num_corners, vx_tile_array_t *points) +{ + uint8x8_t vPred = *pvPred; + int16x8_t vX = *pvX; + vx_keypoint_t kp; + if (0 != vget_lane_u8(vPred, 0) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 0); + kp.y = y; + kp.strength = pStrength[0]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 1) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 1); + kp.y = y; + kp.strength = pStrength[1]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 2) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 2); + kp.y = y; + kp.strength = pStrength[2]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 3) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 3); + kp.y = y; + kp.strength = pStrength[3]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 4) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 4); + kp.y = y; + kp.strength = pStrength[4]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + 
(void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 5) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 5); + kp.y = y; + kp.strength = pStrength[5]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 6) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 6); + kp.y = y; + kp.strength = pStrength[6]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 7) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 7); + kp.y = y; + kp.strength = pStrength[7]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } +} + +static void getPermIdx(vx_uint32 idx, uint8x8x2_t *pvPermIdx) +{ + uint8x8x2_t vPermIdx = { + {vld1_u8(permutations_table[idx]), vld1_u8(permutations_table[idx] + 8)}}; + *pvPermIdx = vPermIdx; +} + +static void getElemIdx(uint8x8x4_t *pvIdx) +{ + uint8x8x4_t reg = { + { + vld1_u8(top_right), + vld1_u8(bottom_right), + vld1_u8(top_left), + vld1_u8(bottom_left) + }}; + *pvIdx = reg; +} + +static vx_uint8 isFastCorner(uint8x8_t *pvVal, vx_uint8 p, vx_uint8 tolerance) +{ + uint8x8x4_t vIdx; + uint8x8x2_t vPermIdx; + uint8x8x4_t vTbl_hi = {{pvVal[0], pvVal[1], pvVal[2], pvVal[3]}}; + uint8x8x3_t vTbl_lo = {{pvVal[4], pvVal[5], pvVal[6]}}; + uint8x16_t vPG = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(tolerance)); + uint8x16_t vPL = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(tolerance)); + + getElemIdx(&vIdx); + + uint8x16_t vPixel = vcombine_u8(vtbx3_u8(vtbl4_u8(vTbl_hi, vIdx.val[0]), vTbl_lo, vIdx.val[1]), + vtbx3_u8(vtbl4_u8(vTbl_hi, vIdx.val[2]), vTbl_lo, vIdx.val[3])); + uint8x8x2_t vTmp = {{vget_low_u8(vPixel), vget_high_u8(vPixel)}}; + uint8x8_t vPermR = vdup_n_u8(0xFF); + vx_uint8 bPG = 0; + vx_uint8 bPL = 0; + + for (vx_uint8 idx = 0; idx < PERMUTATIONS; idx++) + { + getPermIdx(idx, &vPermIdx); + uint8x16_t vVal = vcombine_u8(vtbl2_u8(vTmp, vPermIdx.val[0]), + vtbx2_u8(vPermR, vTmp, vPermIdx.val[1])); + uint8x16_t vPred = vcgtq_u8(vVal, vPG); + uint64x1_t vRet = vreinterpret_u64_u8(vand_u8(vget_high_u8(vPred), vget_low_u8(vPred))); + bPG |= (vget_lane_u64(vRet, 0) == UINT64_MAX); + + vPred = vcltq_u8(vVal, vPL); + uint64x2_t vRet2 = vreinterpretq_u64_u8(vPred); + bPL |= ((vgetq_lane_u64(vRet2, 0) == UINT64_MAX) && (vgetq_lane_u64(vRet2, 1) == 0xFF)); + } + + return (bPG | bPL); +} + +static vx_uint8 getStrength(vx_uint8 bCorner, uint8x8_t *pvVal, vx_uint8 p, vx_uint8 tolerance) +{ + vx_uint8 a = 0, b = 255; + + if (bCorner) + { + a = tolerance; + while (b - a > 1) + { + vx_uint8 c = (a + b)/2; + if (isFastCorner(pvVal, p, c)) + a = c; + else + b = c; + } + } + + return a; +} + +static void fast9CornersPerRow(uint8x8_t *pvPrv, uint8x8_t *pvCur, uint8x8_t *pvNxt, int16x8_t *pvXStep, + vx_imagepatch_addressing_t *src_addr, vx_uint8 tolerance, vx_uint8 *pStrength) +{ + vx_uint8 bCorner; + int16x8_t vX = *pvXStep; + uint8x8_t vPrv[7], vCur[7], vNxt[7]; + uint8x8_t vTmp[7]; + + vx_int32 x; + for (x = 0; x < 7; x++) + { + vPrv[x] = pvPrv[x]; + vCur[x] = pvCur[x]; + vNxt[x] = pvNxt[x]; + } + + if (vgetq_lane_s16(vX, 0) >= APERTURE && vgetq_lane_s16(vX, 0) < (src_addr->dim_x - 
APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 5); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 0), tolerance); + pStrength[0] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 0), tolerance); + } + + if (vgetq_lane_s16(vX, 1) >= APERTURE && vgetq_lane_s16(vX, 1) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 6); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 1), tolerance); + pStrength[1] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 1), tolerance); + } + + if (vgetq_lane_s16(vX, 2) >= APERTURE && vgetq_lane_s16(vX, 2) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 7); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 2), tolerance); + pStrength[2] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 2), tolerance); + } + if (vgetq_lane_s16(vX, 3) >= APERTURE && vgetq_lane_s16(vX, 3) < (src_addr->dim_x - APERTURE)) + { + bCorner = isFastCorner(vCur, vget_lane_u8(vCur[3], 3), tolerance); + pStrength[3] = getStrength(bCorner, vCur, vget_lane_u8(vCur[3], 3), tolerance); + } + if (vgetq_lane_s16(vX, 4) >= APERTURE && vgetq_lane_s16(vX, 4) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 1); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 4), tolerance); + pStrength[4] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 4), tolerance); + } + if (vgetq_lane_s16(vX, 5) >= APERTURE && vgetq_lane_s16(vX, 5) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 2); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 5), tolerance); + pStrength[5] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 5), tolerance); + } + if (vgetq_lane_s16(vX, 6) >= APERTURE && vgetq_lane_s16(vX, 6) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 3); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 6), tolerance); + pStrength[6] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 6), tolerance); + } + if (vgetq_lane_s16(vX, 7) >= APERTURE && vgetq_lane_s16(vX, 7) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 4); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 7), tolerance); + pStrength[7] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 7), tolerance); + } +} + +static vx_uint8 indexes[PERMUTATIONS][9] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 15, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 14,15, 0, 1, 2, 3, 4, 5, 6 }, + { 13,14,15, 0, 1, 2, 3, 4, 5 }, + { 12,13,14,15, 0, 1, 2, 3, 4 }, + { 11,12,13,14,15, 0, 1, 2, 3 }, + { 10,11,12,13,14,15, 0, 1, 2 }, + { 9,10,11,12,13,14,15, 0, 1 }, + { 8, 9,10,11,12,13,14,15, 0 }, + { 7, 8, 9,10,11,12,13,14,15 }, + { 6, 7, 8, 9,10,11,12,13,14 }, + { 5, 6, 7, 8, 9,10,11,12,13 }, + { 4, 5, 6, 7, 8, 9,10,11,12 }, + { 3, 4, 5, 6, 7, 8, 9,10,11 }, + { 2, 3, 4, 5, 6, 7, 8, 9,10 }, + { 1, 2, 3, 4, 5, 6, 7, 8, 9 }, +}; + +/* offsets from "p" */ +static vx_int32 offsets[16][2] = +{ + { 0, -3 }, + { 1, -3 }, + { 2, -2 }, + { 3, -1 }, + { 3, 0 }, + { 3, 1 }, + { 2, 2 }, + { 1, 3 }, + { 0, 3 }, + { -1, 3 }, + { -2, 2 }, + { -3, 1 }, + { -3, 0 }, + { -3, -1 }, + { -2, -2 }, + { -1, -3 }, +}; + + +static vx_bool 
vxIsFastCorner(const vx_uint8* buf, vx_uint8 p, vx_uint8 tolerance) +{ + vx_int32 i, a; + for (a = 0; a < PERMUTATIONS; a++) + { + vx_bool isacorner = vx_true_e; + for (i = 0; i < dimof(indexes[a]); i++) + { + vx_uint8 j = indexes[a][i]; + vx_uint8 v = buf[j]; + if (v <= (p + tolerance)) + { + isacorner = vx_false_e; + } + } + if (isacorner == vx_true_e) + return isacorner; + isacorner = vx_true_e; + for (i = 0; i < dimof(indexes[a]); i++) + { + vx_uint8 j = indexes[a][i]; + vx_uint8 v = buf[j]; + if (v >= (p - tolerance)) + { + isacorner = vx_false_e; + } + } + if (isacorner == vx_true_e) + return isacorner; + } + return vx_false_e; +} + + +static vx_uint8 vxGetFastCornerStrength(vx_int32 x, vx_int32 y, void* src_base, + vx_imagepatch_addressing_t* src_addr, vx_uint8 tolerance) +{ + if (x < APERTURE || y < APERTURE || x >= (vx_int32)src_addr->dim_x - APERTURE || y >= (vx_int32)src_addr->dim_y - APERTURE) + return 0; + { + vx_uint8 p = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, src_addr); + vx_uint8 buf[16]; + vx_int32 j; + vx_uint8 a, b = 255; + + for (j = 0; j < 16; j++) + { + buf[j] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + offsets[j][0], y + offsets[j][1], src_addr); + } + + if (!vxIsFastCorner(buf, p, tolerance)) + return 0; + + a = tolerance; + while (b - a > 1) + { + vx_uint8 c = (a + b) / 2; + if (vxIsFastCorner(buf, p, c)) + a = c; + else + b = c; + } + return a; + } +} + +void Fast9Corners_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_float32 *sens = (vx_float32*)parameters[1]; + vx_bool *nonm = (vx_bool*)parameters[2]; + vx_tile_array_t *points = (vx_tile_array_t *)parameters[3]; + vx_scalar s_num_corners = (vx_scalar)parameters[4]; + + vx_keypoint_t kp; + + vx_size num_corners = 0; + + vx_uint8 *src_base = in->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_uint8 tolerance = (vx_uint8)(*sens); + vx_bool do_nonmax = *nonm; + vx_size dst_capacity = points->capacity; + + memset(&kp, 0, sizeof(kp)); + + vx_int32 w8 = ((in->image.width - 2 * APERTURE) >> 3) << 3; + vx_int16 szXStep[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vx_uint8 szStrength[8]; + int16x8_t vXStep = vld1q_s16(szXStep); + uint8x8_t vZero = vdup_n_u8(0); + uint8x8_t vPrv[7], vCur[7], vNxt[7]; + uint8x8_t vNMPrv[7], vNMCur[7], vNMNxt[7]; + + if (high_y == in->image.height && high_x == in->image.width) + { + for (y = APERTURE; y < in->image.height - APERTURE; y++) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vPrv[idx] = vdup_n_u8(0); + vCur[idx] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE + idx) * in->addr->stride_y); + } + for (x = 0; x < in->image.width - APERTURE; x += 8) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vNxt[idx] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE + idx) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + } + int16x8_t vX = vaddq_s16(vdupq_n_s16(x), vXStep); + + memset(szStrength, 0, 8); + fast9CornersPerRow(vPrv, vCur, vNxt, &vX, in->addr, tolerance, szStrength); + uint8x8_t vStrength = vld1_u8(szStrength); + uint8x8_t vPred = vcgt_u8(vStrength, vZero); + uint64x1_t vRetBit = vreinterpret_u64_u8(vPred); + + if (do_nonmax && (0 != vget_lane_u64(vRetBit, 0))) + { + vx_uint8 szNMStrength[8]; + uint8x8_t vTmpPrv[7], vTmpCur[7], vTmpNxt[7]; + uint8x8_t vNMStrength; + uint8x8_t vTmpPred; + 
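+                    /* Non-maximum suppression over the 3x3 neighbourhood: a candidate lane
+                       survives only if its strength is >= the strengths recomputed at the
+                       three neighbours in the row above and at the left neighbour, and
+                       strictly > the strengths at the right neighbour and the three
+                       neighbours in the row below (the same tie-break as the scalar
+                       FAST9CORNERS macro); failing lanes are cleared from vPred. */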
int16x8_t vNMX; + if ((y - 1) >= APERTURE) + { + if (x != 0) + { + vNMPrv[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + (x - 8) * in->addr->stride_x); + } + else + { + vNMPrv[0] = vdup_n_u8(0); + } + vNMCur[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + x * in->addr->stride_x); + vNMNxt[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + for (vx_uint8 idx = 1; idx < 7; idx++) + { + vNMPrv[idx] = vPrv[idx - 1]; + vNMCur[idx] = vCur[idx - 1]; + vNMNxt[idx] = vNxt[idx - 1]; + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vNMPrv[idx], 7); + vTmpCur[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 7); + vTmpNxt[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + + if (0 != vget_lane_u64(vRetBit, 0)) + { + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vNMPrv, vNMCur, vNMNxt, &vX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 1); + vTmpCur[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNMNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vPrv[idx], 7); + vTmpCur[idx] = vext_u8(vPrv[idx], vCur[idx], 7); + vTmpNxt[idx] = vext_u8(vCur[idx], vNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vPrv[idx], vCur[idx], 1); + vTmpCur[idx] = vext_u8(vCur[idx], vNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if ((y + 1) < (in->image.height - APERTURE)) + { + if (0 != vget_lane_u64(vRetBit, 0)) + { + if (x != 0) + { + vNMPrv[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + (x - 8) * in->addr->stride_x); + } + else + { + vNMPrv[6] = vdup_n_u8(0); + } + vNMCur[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + x * 
in->addr->stride_x); + vNMNxt[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + for (vx_uint8 idx = 0; idx < 6; idx++) + { + vNMPrv[idx] = vPrv[idx + 1]; + vNMCur[idx] = vCur[idx + 1]; + vNMNxt[idx] = vNxt[idx + 1]; + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vNMPrv[idx], 7); + vTmpCur[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 7); + vTmpNxt[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vNMPrv, vNMCur, vNMNxt, &vX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 1); + vTmpCur[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNMNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + } + } + } + + vRetBit = vreinterpret_u64_u8(vPred); + if (0 != vget_lane_u64(vRetBit, 0)) + { + addCorner(y, &vX, &vPred, szStrength, dst_capacity, &num_corners, points); + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + } + } + } + } +} + + +#define FAST9CORNERS(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 strength = vxGetFastCornerStrength(x, y, src_base, in->addr, tolerance); \ + if (strength > 0) \ + { \ + if (do_nonmax) \ + { \ + if (strength >= vxGetFastCornerStrength(x - 1, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x + 1, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x - 1, y, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x + 1, y, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x - 1, y + 1, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x, y + 1, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x + 1, y + 1, src_base, in->addr, tolerance)) \ + ; \ + else \ + continue; \ + } \ + if (num_corners < dst_capacity) \ + { \ + kp.x = x; \ + kp.y = y; \ + kp.strength = strength; \ + kp.scale = 0.0f; \ + kp.orientation = 0.0f; \ + kp.tracking_status = 1; \ + kp.error = 0.0f; \ + vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); \ + } \ + num_corners++; \ + } \ + } \ + } + + +void Fast9Corners_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_float32 *sens = (vx_float32*)parameters[1]; + vx_bool *nonm = (vx_bool*)parameters[2]; + 
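+    /* FAST-9 segment test: a pixel p is a corner when at least 9 contiguous pixels on the
+       16-pixel Bresenham circle of radius 3 are all brighter than p + t or all darker than
+       p - t, where t is the strength threshold.  The reported corner strength is the
+       largest t for which the test still passes, found by bisection in getStrength() /
+       vxGetFastCornerStrength(). */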
vx_tile_array_t *points = (vx_tile_array_t *)parameters[3]; + vx_scalar s_num_corners = (vx_scalar)parameters[4]; + + vx_keypoint_t kp; + + vx_size num_corners = 0; + + vx_uint8 *src_base = in->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 tolerance = (vx_uint8)(*sens); + vx_bool do_nonmax = *nonm; + vx_size dst_capacity = points->capacity; + + memset(&kp, 0, sizeof(kp)); + + if (low_y == 0 && low_x == 0) + { + FAST9CORNERS(low_y + APERTURE, high_y - APERTURE, low_x + APERTURE, high_x - APERTURE) + } + else + { + FAST9CORNERS(APERTURE, low_y, low_x, high_x - APERTURE) + FAST9CORNERS(low_y, high_y, APERTURE, high_x - APERTURE) + } + + if (s_num_corners) + vxCopyScalar(s_num_corners, &num_corners, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); +} diff --git a/kernels/tiling/tiling_filter.c b/kernels/tiling/tiling_filter.c new file mode 100644 index 0000000..7b3e780 --- /dev/null +++ b/kernels/tiling/tiling_filter.c @@ -0,0 +1,481 @@ +/* +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +#include + +void box3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f); + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst_u8 = (vx_uint8 *)dst + 1 + y * out->image.width; + vx_uint8* top_src = (vx_uint8 *)src + (y - 1) * in->image.width; + vx_uint8* mid_src = (vx_uint8 *)src + (y)* in->image.width; + vx_uint8* bot_src = (vx_uint8 *)src + (y + 1)* in->image.width; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t vOut = top_s16.val[0]; + //top mid + vOut = vaddq_s16(vOut, vextq_s16(top_s16.val[0], top_s16.val[1], 1)); + //top right + vOut = vaddq_s16(vOut, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + vOut = vaddq_s16(vOut, mid_s16.val[0]); + //mid mid + vOut = 
vaddq_s16(vOut, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1)); + //mid right + vOut = vaddq_s16(vOut, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2)); + //bot left + vOut = vaddq_s16(vOut, bot_s16.val[0]); + //bot mid + vOut = vaddq_s16(vOut, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1)); + //bot right + vOut = vaddq_s16(vOut, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + float32x4_t outfloathigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vOut))); + float32x4_t outfloatlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vOut))); + + outfloathigh = vmulq_f32(outfloathigh, oneovernine); + outfloatlow = vmulq_f32(outfloatlow, oneovernine); + + vOut = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(outfloatlow)), + vqmovn_s32(vcvtq_s32_f32(outfloathigh))); + + vst1_u8(dst_u8, vqmovun_s16(vOut)); + + top_src += 8; + mid_src += 8; + bot_src += 8; + dst_u8 += 8; + } + } +} + + +void box3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + + if (ty == 0 && tx == 0) + { + for (y = 1; y < vxTileHeight(out, 0); y++) + { + for (x = 1; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + } + else + { + for (y = 1; y < ty; y++) + { + for (x = tx; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + + for (y = ty; y < vxTileHeight(out, 0); y++) + { + for (x = 1; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + } +} + +static inline void sort(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + const uint8x8_t max = vmax_u8(*a, *b); + *a = min; + *b = max; +} + +void Median3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + 
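+        /* top/mid/bot point at the three source rows of the 3x3 window.  The chain of
+           sort() min/max exchanges below is the classic 19-exchange median-of-9 network:
+           after the final exchange p4 holds the median of the nine neighbourhood samples,
+           with no full sort required. */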
vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + sort(&p1, &p2); + sort(&p4, &p5); + sort(&p7, &p8); + + sort(&p0, &p1); + sort(&p3, &p4); + sort(&p6, &p7); + + sort(&p1, &p2); + sort(&p4, &p5); + sort(&p7, &p8); + + sort(&p0, &p3); + sort(&p5, &p8); + sort(&p4, &p7); + + sort(&p3, &p6); + sort(&p1, &p4); + sort(&p2, &p5); + + sort(&p4, &p7); + sort(&p4, &p2); + sort(&p6, &p4); + + sort(&p4, &p2); + + vst1_u8(dst, p4); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + + +static int vx_uint8_compare(const void *p1, const void *p2) +{ + vx_uint8 a = *(vx_uint8 *)p1; + vx_uint8 b = *(vx_uint8 *)p2; + if (a > b) + return 1; + else if (a == b) + return 0; + else + return -1; +} + + +#define Median3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 values[9]; \ + vx_uint32 count = 0; \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + values[count++] = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + qsort(values, dimof(values), sizeof(vx_uint8), vx_uint8_compare); \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = values[4]; \ + } \ + } + + +void Median3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Median3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Median3x3(1, low_y, low_x, high_x - 1) + Median3x3(low_y, high_y, 1, high_x - 1) + } +} + + +void Gaussian3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + int16x8_t two = vdupq_n_s16(2); + int16x8_t four = vdupq_n_s16(4); + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = 
(vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t out = top_s16.val[0]; + //top mid + out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two); + //top right + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out = vmlaq_s16(out, mid_s16.val[0], two); + //mid mid + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four); + //mid right + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out = vaddq_s16(out, bot_s16.val[0]); + //bot mid + out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1_u8(dst, vqshrun_n_s16(out, 4)); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + +#define Gaussian3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint32 sum = 0; \ + \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, -1) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, 0) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, 0) << 2; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, 0) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, +1) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + sum >>= 4; \ + if (sum > 255) \ + sum = 255; \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; \ + } \ + } + + +void Gaussian3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Gaussian3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Gaussian3x3(1, low_y, low_x, high_x - 1) + Gaussian3x3(low_y, high_y, 1, high_x - 1) + } +} diff --git a/kernels/tiling/tiling_hog.c b/kernels/tiling/tiling_hog.c new file mode 100644 index 0000000..a6289a6 --- /dev/null +++ b/kernels/tiling/tiling_hog.c @@ -0,0 +1,403 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <VX/vx_khr_tiling.h>
+#include <arm_neon.h>
+#include <math.h>
+
+#define min(a,b) (a<b?a:b)
+
+void HogCells_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size)
+{
+    vx_tile_t *in = (vx_tile_t *)parameters[0];
+    vx_int32 *cell_w = (vx_int32 *)parameters[1];
+    vx_int32 *cell_h = (vx_int32 *)parameters[2];
+    vx_int32 *num_orientations = (vx_int32 *)parameters[3];
+    void* magnitudes_data = parameters[4];
+    void* bins_data = parameters[5];
+
+    float num_div_360 = (float)(*num_orientations) / 360.0f;
+    vx_int32 num_cellw = (vx_int32)floor(((vx_float64)in->image.width) / ((vx_float64)(*cell_w)));
+    vx_uint32 low_height = in->tile_y;
+    vx_uint32 height = in->tile_y + in->tile_block.height;
+    vx_uint32 low_width = in->tile_x;
+    vx_uint32 width = in->tile_x + in->tile_block.width;
+
+    vx_float32 gx_0, gx_1, gx_2, gx_3;
+    vx_float32 gy_0, gy_1, gy_2, gy_3;
+    float32x4_t magnitude_f32x4;
+    float32x4_t orientation_f32x4;
+    float32x4_t fv_0_5_32x4 = vdupq_n_f32(0.5f);
+    float32x4_t num_div_360_f32x4 = vdupq_n_f32(num_div_360);
+    int32x4_t bin_s32x4;
+    vx_int32 cell_wxh = (*cell_w)*(*cell_h);
+    int32x4_t num_orientations_s32x4 = vdupq_n_s32((*num_orientations));
+    int32x4_t num_cellw_s32x4 = vdupq_n_s32(num_cellw);
+    float32_t pi_3_14 = 180 / 3.14159265;
+
+    for (vx_int32 j = low_height; j < height; j++)
+    {
+        int32x4_t celly_s32x4 = vdupq_n_s32(j/(*cell_h));
+        vx_int32 y1 = j - 1 < 0 ? 0 : j - 1;
+        vx_int32 y2 = j + 1 >= in->image.height ? in->image.height - 1 : j + 1;
+        vx_uint8 *src_base_y = (vx_uint8 *)in->base[0] + j*in->addr[0].stride_y;
+        vx_uint8 *src_base_y_y1 = (vx_uint8 *)in->base[0] + y1*in->addr[0].stride_y;
+        vx_uint8 *src_base_y_y2 = (vx_uint8 *)in->base[0] + y2*in->addr[0].stride_y;
+        for (int i = low_width; i < width; i+=4)
+        {
+            vx_int32 x1 = i - 1 < 0 ? 0 : i - 1;
+            vx_int32 x2 = i + 1 >= in->image.width ? in->image.width - 1 : i + 1;
+            gx_0 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i < 0 ? 0 : i;
+            x2 = i + 2 >= in->image.width ? in->image.width - 1 : i+2;
+            gx_1 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i + 1 < 0 ? 0 : i + 1;
+            x2 = i+3 >= in->image.width ? in->image.width - 1 : i+3;
+            gx_2 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i+2 < 0 ? 0 : i+2;
+            x2 = i+4 >= in->image.width ?
in->image.width - 1 : i+4; + gx_3 = *(src_base_y + x2) - *(src_base_y + x1); + gy_0 = *(src_base_y_y2 + i) - *(src_base_y_y1 + i); + gy_1 = *(src_base_y_y2 + i + 1) - *(src_base_y_y1 + i + 1); + gy_2 = *(src_base_y_y2 + i + 2) - *(src_base_y_y1 + i + 2); + gy_3 = *(src_base_y_y2 + i + 3) - *(src_base_y_y1 + i + 3); + + //calculating mag and orientation + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_0*gx_0 + gy_0*gy_0) / cell_wxh, magnitude_f32x4, 0); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_1*gx_1 + gy_1*gy_1) / cell_wxh, magnitude_f32x4, 1); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_2*gx_2 + gy_2*gy_2) / cell_wxh, magnitude_f32x4, 2); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_3*gx_3 + gy_3*gy_3) / cell_wxh, magnitude_f32x4, 3); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_0, gx_0) * pi_3_14, 360), orientation_f32x4, 0); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_1, gx_1) * pi_3_14, 360), orientation_f32x4, 1); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_2, gx_2) * pi_3_14, 360), orientation_f32x4, 2); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_3, gx_3) * pi_3_14, 360), orientation_f32x4, 3); + uint32x4_t lt0 = vcltq_f32(orientation_f32x4, vdupq_n_f32(0.0)); + float32x4_t orientation_f32x4_360 = vaddq_f32(orientation_f32x4, vdupq_n_f32(360.0)); + orientation_f32x4 = vbslq_f32(lt0, orientation_f32x4_360, orientation_f32x4); + + //calculating bin. + int32x4_t bin_s32x4 = vcvtq_s32_f32(vmulq_f32(orientation_f32x4, num_div_360_f32x4)); + + int32x4_t cellx_s32x4 = vsetq_lane_s32(i/(*cell_w), cellx_s32x4, 0); + cellx_s32x4 = vsetq_lane_s32((i+1)/(*cell_w), cellx_s32x4, 1); + cellx_s32x4 = vsetq_lane_s32((i+2)/(*cell_w), cellx_s32x4, 2); + cellx_s32x4 = vsetq_lane_s32((i+3)/(*cell_w), cellx_s32x4, 3); + int32x4_t magnitudes_index_s32x4 = vaddq_s32(vmulq_s32(celly_s32x4, num_cellw_s32x4), cellx_s32x4); + int32x4_t bins_index_s32x4 = vaddq_s32(vmulq_s32(magnitudes_index_s32x4, num_orientations_s32x4), bin_s32x4); + + void *mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 0)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 0); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 1)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 1); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 2)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 2); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 3)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 3); + vx_int8 *bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 0); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 0); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 1); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 1); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 2); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 2); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 3); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 3); + } + } +} + +#define HOGCELLS_SCALING(low_y, low_x, high_y, high_x, in_tile_x)\ + for (int j = low_y; j < high_y; j++) {\ + for (int i = low_x; i < high_x; i++) {\ + int x1 = i - 1 < 0 ? 0 : i - 1;\ + int x2 = i + 1 >= high_x ? 
high_x - 1 : i + 1;\ + vx_uint8 *gx1 = (vx_uint8 *)in->base[0] + in_tile_x + j * in->addr[0].stride_y + x1 * in->addr[0].stride_x;\ + vx_uint8 *gx2 = (vx_uint8 *)in->base[0] + in_tile_x + j * in->addr[0].stride_y + x2 * in->addr[0].stride_x;\ + gx = *gx2 - *gx1;\ + int y1 = j - 1 < 0 ? 0 : j - 1;\ + int y2 = j + 1 >= high_y ? high_y - 1 : j + 1;\ + vx_uint8 *gy1 = (vx_uint8 *)in->base[0] + in_tile_x + y1 * in->addr[0].stride_y + i * in->addr[0].stride_x;\ + vx_uint8 *gy2 = (vx_uint8 *)in->base[0] + in_tile_x + y2 * in->addr[0].stride_y + i * in->addr[0].stride_x;\ + gy = *gy2 - *gy1;\ + magnitude = sqrtf(powf(gx, 2) + powf(gy, 2));\ + orientation = fmod(atan2f(gy, gx + 0.00000000000001)\ + * (180 / 3.14159265), 360);\ + if (orientation < 0) {\ + orientation += 360;\ + }\ + bin = (vx_int8)floor(orientation * num_div_360);\ + vx_int32 cellx = i / (*cell_w);\ + vx_int32 celly = j / (*cell_h);\ + vx_int32 magnitudes_index = celly * num_cellw + cellx;\ + vx_int32 bins_index = (celly * num_cellw + cellx) * (*num_orientations) + bin;\ + vx_size magnitudes_pos = 2 * magnitudes_index;\ + vx_size bins_pos = bins_index;\ + void *mag_ptr = (vx_int8 *)magnitudes_data + magnitudes_pos;\ + void *bins_ptr = (vx_int8 *)bins_data + bins_pos;\ + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + magnitude / ((*cell_w) * (*cell_h));\ + *(vx_int8 *)(bins_ptr) = *(vx_int8 *)(bins_ptr) + magnitude / ((*cell_w) * (*cell_h));\ + }\ + }\ + +void HogCells_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_int32 *cell_w = (vx_int32 *)parameters[1]; + vx_int32 *cell_h = (vx_int32 *)parameters[2]; + vx_int32 *num_orientations = (vx_int32 *)parameters[3]; + void* magnitudes_data = parameters[4]; + void* bins_data = parameters[5]; + vx_float32 gx; + vx_float32 gy; + vx_float32 orientation; + vx_float32 magnitude; + vx_int8 bin; + + float num_div_360 = (float)(*num_orientations) / 360.0f; + vx_int32 num_cellw = (vx_int32)floor(((vx_float64)in->image.width) / ((vx_float64)(*cell_w))); + vx_uint32 ty = in->tile_y; + vx_uint32 tx = in->tile_x; + if (ty == 0 && tx == 0) + { + HOGCELLS_SCALING(0, 0, vxTileHeight(in, 0), vxTileWidth(in, 0), in->tile_x) + } + else + { + HOGCELLS_SCALING(0, tx, ty, vxTileWidth(in, 0), in->tile_x) + HOGCELLS_SCALING(ty, 0, vxTileHeight(in, 0), vxTileWidth(in, 0), 0) + } +} + +void HogFeatures_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_int32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + void *magnitudes_data = parameters[1]; + void * bins_data = parameters[2]; + vx_tile_array_t *hog_params = (vx_tile_array_t *)parameters[3]; + void * features_data = parameters[5]; + + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_int32 width, height; + + vx_hog_t *hog_params_t = (vx_hog_t *)hog_params->ptr; + + if (hog_params_t->num_bins > 0 && hog_params_t->num_bins < 360) + { + width = high_x; + height = high_y; + vx_int32 num_blockW = width / hog_params_t->cell_width - 1; + vx_int32 num_blockH = height / hog_params_t->cell_height - 1; + vx_int32 n_cellsx = width / hog_params_t->cell_width; + vx_int32 cells_per_block_w = hog_params_t->block_width / hog_params_t->cell_width; + vx_int32 cells_per_block_h = hog_params_t->block_height / hog_params_t->cell_height; + + vx_int16 *ptr_src = (vx_int16 *)magnitudes_data; + vx_int8 *ptr_bins = (vx_int8 *)bins_data; + vx_int16 
*ptr_dst = (vx_int16 *)features_data; + vx_int32 num_bins_s32 = hog_params_t->num_bins; + vx_int32 roiw4 = num_blockW * num_bins_s32 >= 3*num_bins_s32 ? num_blockW * num_bins_s32 : 0; + + for (y = 0; y < num_blockH; y++) + { + vx_int16 *src_r1 = ptr_src + (y + 0) * n_cellsx; + vx_int16 *src_r2 = ptr_src + (y + 1) * n_cellsx; + vx_int8 *bins_r1 = ptr_bins + (y + 0) * n_cellsx * hog_params_t->num_bins; + vx_int8 *bins_r2 = ptr_bins + (y + 1) * n_cellsx * hog_params_t->num_bins; + vx_int16 *dst_r1 = ptr_dst + y * (num_blockW + 1) * hog_params_t->num_bins; + for (x = 0; x < roiw4; x += 4*num_bins_s32) + { + int32x4_t bidx_s32x4; + vsetq_lane_s32(x / num_bins_s32, bidx_s32x4, 0); + vsetq_lane_s32((x + num_bins_s32) / num_bins_s32, bidx_s32x4, 1); + vsetq_lane_s32((x + 2 * num_bins_s32) / num_bins_s32, bidx_s32x4, 2); + vsetq_lane_s32((x + 3 * num_bins_s32) / num_bins_s32, bidx_s32x4, 3); + + float32x4_t sum_f32x4; + int16x4_t value1_s16x4; + int16x4_t value2_s16x4; + int16x4_t value3_s16x4; + int16x4_t value4_s16x4; + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 0)], value1_s16x4, 0); + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 1)], value1_s16x4, 1); + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 2)], value1_s16x4, 2); + + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 0) + 1], value2_s16x4, 0); + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 1) + 1], value2_s16x4, 1); + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 2) + 1], value2_s16x4, 2); + + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 0)], value3_s16x4, 0); + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 1)], value3_s16x4, 1); + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 2)], value3_s16x4, 2); + + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 0) + 1], value4_s16x4, 0); + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 1) + 1], value4_s16x4, 1); + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 2) + 1], value4_s16x4, 2); + + sum_f32x4 = vcvtq_f32_s32(vmovl_s16(vadd_s16(vadd_s16(vmul_s16(value1_s16x4, value1_s16x4), vmul_s16(value2_s16x4, value2_s16x4)), + vadd_s16(vmul_s16(value3_s16x4, value3_s16x4), vmul_s16(value4_s16x4, value4_s16x4))))); + + vx_float32 scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 0) + 0.00000000000001); + vx_int8 *bins1 = bins_r1 + (x + 0); + vx_int8 *bins2 = bins_r1 + (x + 1); + vx_int8 *bins3 = bins_r2 + (x + 0); + vx_int8 *bins4 = bins_r2 + (x + 1); + vx_int16 *dst = dst_r1 + x; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 1) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + num_bins_s32); + bins2 = bins_r1 + (x + 1 + num_bins_s32); + bins3 = bins_r2 + (x + 0 + num_bins_s32); + bins4 = bins_r2 + (x + 1 + num_bins_s32); + dst = dst_r1 + x + num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f 
/ sqrtf(vgetq_lane_f32(sum_f32x4, 2) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + 2*num_bins_s32); + bins2 = bins_r1 + (x + 1 + 2*num_bins_s32); + bins3 = bins_r2 + (x + 0 + 2*num_bins_s32); + bins4 = bins_r2 + (x + 1 + 2*num_bins_s32); + dst = dst_r1 + x + 2*num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 3) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + 3*num_bins_s32); + bins2 = bins_r1 + (x + 1 + 3*num_bins_s32); + bins3 = bins_r2 + (x + 0 + 3*num_bins_s32); + bins4 = bins_r2 + (x + 1 + 3*num_bins_s32); + dst = dst_r1 + x + 3*num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + } + } + } +} + + +#define HOGFEATURES(low_y, high_y, low_x) \ + for (vx_int32 blkH = 0; blkH < num_blockH; blkH++) \ + { \ + for (vx_int32 blkW = 0; blkW < num_blockW; blkW++) \ + { \ + vx_float32 sum = 0; \ + for (vx_int32 y = 0; y < cells_per_block_h; y++) \ + { \ + for (vx_int32 x = 0; x < cells_per_block_w; x++) \ + { \ + vx_int32 index = (blkH + y)*n_cellsx + (blkW + x); \ + void *mag_ptr = (vx_int8 *)magnitudes_data + index; \ + sum += (*(vx_int16 *)mag_ptr) * (*(vx_int16 *)mag_ptr); \ + } \ + } \ + sum = sqrtf(sum + 0.00000000000001); \ + for (vx_int32 y = 0; y < cells_per_block_h; y++) \ + { \ + for (vx_int32 x = 0; x < cells_per_block_w; x++) \ + { \ + for (vx_int32 k = 0; k < hog_params_t->num_bins; k++) \ + { \ + vx_int32 bins_index = (blkH + y)*n_cellsx * hog_params_t->num_bins + (blkW + x)*hog_params_t->num_bins + k; \ + vx_int32 block_index = blkH * num_blockW * hog_params_t->num_bins + blkW * hog_params_t->num_bins + k; \ + float hist = min((vx_int8)(*((vx_int8 *)bins_data + bins_index)) / sum, hog_params_t->threshold); \ + void *features_ptr = (vx_int8 *)features_data + block_index; \ + *(vx_int16 *)features_ptr = *(vx_int16 *)features_ptr + hist; \ + } \ + } \ + } \ + } \ + } + +void HogFeatures_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + void *magnitudes_data = parameters[1]; + void *bins_data = parameters[2]; + vx_tile_array_t *hog_params = (vx_tile_array_t *)parameters[3]; + void * features_data = parameters[5]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_int32 width = high_x, height = high_y; + + vx_hog_t *hog_params_t = (vx_hog_t *)hog_params->ptr; + + vx_int32 num_blockW = width / hog_params_t->cell_width - 1; + vx_int32 num_blockH = height / hog_params_t->cell_height - 1; + vx_int32 n_cellsx = width / hog_params_t->cell_width; + vx_int32 cells_per_block_w = hog_params_t->block_width / hog_params_t->cell_width; + vx_int32 cells_per_block_h = hog_params_t->block_height / hog_params_t->cell_height; + + if (hog_params_t->num_bins > 0 && hog_params_t->num_bins < 360) + { + if (low_y == 0 && low_x == 0) + { + HOGFEATURES(low_y, high_y, 
low_x) + } + else + { + HOGFEATURES(0, low_y, low_x) + HOGFEATURES(low_y, high_y, 0) + } + } +} diff --git a/kernels/tiling/tiling_integralimage.c b/kernels/tiling/tiling_integralimage.c new file mode 100644 index 0000000..62dfc96 --- /dev/null +++ b/kernels/tiling/tiling_integralimage.c @@ -0,0 +1,218 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +void IntegralImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint32 *dst_base = (vx_uint32 *)out->base[0] + out->tile_x; + + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *pixels_ptr = src_base + y * in->addr->stride_y; + vx_uint32 *sums = dst_base + y * out->addr->stride_y / 4; + + if (y == 0) + { + for (x = low_x; x < high_x; x += 16) + { + const uint8x16_t input_pixels = vld1q_u8(pixels_ptr); + + const uint16x8x2_t temp = + { + { + vmovl_u8(vget_low_u8(input_pixels)), + vmovl_u8(vget_high_u8(input_pixels)) + } + }; + + uint32x4x4_t pixels = + { + { + vmovl_u16(vget_low_u16(temp.val[0])), + vmovl_u16(vget_high_u16(temp.val[0])), + vmovl_u16(vget_low_u16(temp.val[1])), + vmovl_u16(vget_high_u16(temp.val[1])) + } + }; + + vst1q_u32(sums, pixels.val[0]); + + vst1q_u32(sums + 4, pixels.val[1]); + + vst1q_u32(sums + 8, pixels.val[2]); + + vst1q_u32(sums + 12, pixels.val[3]); + + if (x == 0) + { + sums[0] = pixels_ptr[0]; + + // Perform prefix summation + for (vx_int32 i = 1; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + else + { + // Perform prefix summation + for (vx_int32 i = 0; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + + pixels_ptr += 16; + sums += 16; + } + } + else + { + vx_uint32 *prev_sums_mid = dst_base + (y-1) * out->addr->stride_y / 4; //(0,-1) + vx_uint32 *prev_sums_left = dst_base + (y-1) * out->addr->stride_y / 4 - out->addr->stride_x / 4; //(-1,-1) + + for (x = low_x; x < high_x; x += 16) + { + const uint8x16_t input_pixels = vld1q_u8(pixels_ptr); + + const uint16x8x2_t temp = + { + { + vmovl_u8(vget_low_u8(input_pixels)), + vmovl_u8(vget_high_u8(input_pixels)) + } + }; + + uint32x4x4_t pixels = + { + { + vmovl_u16(vget_low_u16(temp.val[0])), + vmovl_u16(vget_high_u16(temp.val[0])), + vmovl_u16(vget_low_u16(temp.val[1])), + vmovl_u16(vget_high_u16(temp.val[1])) + } + }; + + // Add top mid pixel values + pixels.val[0] = vaddq_u32(vld1q_u32(prev_sums_mid), pixels.val[0]); + pixels.val[1] = vaddq_u32(vld1q_u32(prev_sums_mid + 4), pixels.val[1]); + pixels.val[2] = vaddq_u32(vld1q_u32(prev_sums_mid + 8), pixels.val[2]); + pixels.val[3] = vaddq_u32(vld1q_u32(prev_sums_mid + 12), pixels.val[3]); + + // Subtract top left diagonal 
values + pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(prev_sums_left)); + vst1q_u32(sums, pixels.val[0]); + + pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(prev_sums_left + 4)); + vst1q_u32(sums + 4, pixels.val[1]); + + pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(prev_sums_left + 8)); + vst1q_u32(sums + 8, pixels.val[2]); + + pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(prev_sums_left + 12)); + vst1q_u32(sums + 12, pixels.val[3]); + + if (x == 0) + { + sums[0] = prev_sums_mid[0] + pixels_ptr[0]; + // Perform prefix summation + for (vx_int32 i = 1; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + else + { + // Perform prefix summation + for (vx_int32 i = 0; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + + pixels_ptr += 16; + sums += 16; + prev_sums_mid += 16; + prev_sums_left += 16; + } + } + } +} + +#define INTEGRAL_IMAGE(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *pixels = (vx_uint8 *)src_base + y * in->addr->stride_y; \ + vx_uint32 *sums = (vx_uint32 *)dst_base + y * out->addr->stride_y / 4; \ + if (y == 0) \ + { \ + sums[0] = pixels[0]; \ + for (x = low_x; x < high_x; x++) \ + sums[x] = sums[x - 1] + pixels[x]; \ + } \ + else \ + { \ + vx_uint32 *prev_sums = (vx_uint32 *)dst_base + (y - 1) * out->addr->stride_y / 4; \ + sums[0] = prev_sums[0] + pixels[0]; \ + for (x = low_x; x < high_x; x++) \ + sums[x] = pixels[x] + sums[x - 1] + prev_sums[x] - prev_sums[x - 1]; \ + } \ + } + +void IntegralImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + INTEGRAL_IMAGE(low_y, high_y, low_x + 1) + } + else + { + INTEGRAL_IMAGE(0, low_y, low_x) + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + INTEGRAL_IMAGE(low_y, high_y, 1) + } +} diff --git a/kernels/tiling/tiling_lbp.c b/kernels/tiling/tiling_lbp.c new file mode 100644 index 0000000..0c1a53b --- /dev/null +++ b/kernels/tiling/tiling_lbp.c @@ -0,0 +1,804 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include +#include + +static void vxLBPStandard_tiling_fast(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + if(ksize == 3) + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == in->image.height) + { + high_y = high_y - 1; + } + if (high_x == in->image.width) + { + high_x = high_x - 1; + } + + uint8x16_t vPrv[3], vCur[3], vNxt[3]; + uint8x16_t vOne = vdupq_n_u8(1); + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint8x16_t vSum = vdupq_n_u8(0); + uint8x16_t vTmp = vextq_u8(vPrv[0], vCur[0], 15); + uint8x16_t vPred = vcgeq_u8(vTmp, vCur[1]); + uint8x16_t vVal = vandq_u8(vPred, vOne); + vSum = vVal; + + vPred = vcgeq_u8(vCur[0], vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 1); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[0], vNxt[0], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 2); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[1], vNxt[1], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 3); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[2], vNxt[2], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 4); + vSum = vaddq_u8(vVal, vSum); + + vPred = vcgeq_u8(vCur[2], vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 5); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[2], vCur[2], 15); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 6); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[1], vCur[1], 15); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 7); + vSum = vaddq_u8(vVal, vSum); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } + else if (ksize == 5) + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + uint8x16_t vPrv[5], vCur[5], vNxt[5]; + uint8x16_t vOne = vdupq_n_u8(1); + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint8x16_t vSum = vdupq_n_u8(0); + uint8x16_t vTmp = vextq_u8(vPrv[1], vCur[1], 15); + uint8x16_t vPred = vcgeq_u8(vTmp, 
vCur[2]); + uint8x16_t vVal = vandq_u8(vPred, vOne); + vSum = vVal; + + vPred = vcgeq_u8(vCur[0], vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 1); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[1], vNxt[1], 1); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 2); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[2], vNxt[2], 2); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 3); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[3], vNxt[3], 1); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 4); + vSum = vaddq_u8(vVal, vSum); + + vPred = vcgeq_u8(vCur[4], vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 5); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[3], vCur[3], 15); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 6); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[2], vCur[2], 14); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 7); + vSum = vaddq_u8(vVal, vSum); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } +} + +static void vxLBPModified_tiling_fast(vx_tile_t *in, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + uint8x16_t vPrv[3], vCur[3], vNxt[3], vG[8]; + vx_uint32 w16; + uint8x16_t vOne = vdupq_n_u8(1); + vx_uint8 szCoeff[8] = { 1 << 0, 1 << 1, 1 << 2, 1 << 3, + 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + + if (low_y == 0) + { + low_y = 2; + } + if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0, idxY = 0; idxY < 5; (idx++, idxY += 2)) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + uint8x16_t vAvg, vPred, vSum; + + vG[0] = vextq_u8(vPrv[0], vCur[0], 14); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[0], vNxt[0], 2); + vG[3] = vextq_u8(vCur[1], vNxt[1], 2); + vG[4] = vextq_u8(vCur[2], vNxt[2], 2); + vG[5] = vCur[2]; + vG[6] = vextq_u8(vPrv[2], vCur[2], 14); + vG[7] = vextq_u8(vPrv[1], vCur[1], 14); + + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vSumu16_lo = vaddq_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vG[idx]))); + vSumu16_hi = vaddq_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vG[idx]))); + } + + vSumu16_lo = vaddq_u16(vSumu16_lo, vdupq_n_u16(1)); + vSumu16_hi = vaddq_u16(vSumu16_hi, vdupq_n_u16(1)); + vSumu16_lo = vshrq_n_u16(vSumu16_lo, 3); + vSumu16_hi = vshrq_n_u16(vSumu16_hi, 3); + vAvg = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + + vSumu16_lo = vdupq_n_u16(0); + vSumu16_hi = 
vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgtq_u8(vG[idx], vAvg); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0, idxY = 0; idxY < 5; (idx++, idxY += 2)) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } +} + +static void vxLBPUniform_tiling_fast(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_uint8 szCoeff[8] = { 1 << 0, 1 << 1, 1 << 2, 1 << 3, + 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + + uint8x16_t vOne = vdupq_n_u8(1); + uint8x16_t vNine = vdupq_n_u8(9); + uint8x16_t vTwo = vdupq_n_u8(2); + + if(ksize == 3) + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == in->image.height) + { + high_y = high_y - 1; + } + if (high_x == in->image.width) + { + high_x = high_x - 1; + } + + uint8x16_t vPrv[3], vCur[3], vNxt[3], vG[8]; + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + vG[0] = vextq_u8(vPrv[0], vCur[0], 15); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[0], vNxt[0], 1); + vG[3] = vextq_u8(vCur[1], vNxt[1], 1); + vG[4] = vextq_u8(vCur[2], vNxt[2], 1); + vG[5] = vCur[2]; + vG[6] = vextq_u8(vPrv[2], vCur[2], 15); + vG[7] = vextq_u8(vPrv[1], vCur[1], 15); + + uint8x16_t vPred = vcgeq_u8(vG[7], vCur[1]); + uint8x16_t vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[0], vCur[1]); + uint8x16_t vU2 = vandq_u8(vPred, vOne); + uint8x16_t vAbs1 = vabdq_u8(vU1, vU2); + uint8x16_t vAbs2 = vdupq_n_u8(0); + + for (vx_uint8 idx = 1; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[1]); + vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[idx - 1], vCur[1]); + vU2 = vandq_u8(vPred, vOne); + vAbs2 = vaddq_u8(vAbs2, vabdq_u8(vU1, vU2)); + } + vAbs1 = vaddq_u8(vAbs1, vAbs2); + + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[1]); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + uint8x16_t vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vPred = vcleq_u8(vAbs1, vTwo); + vSum = vbslq_u8(vPred, vSum, vNine); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } + else if (ksize == 5) + { + if (low_y == 0) + { + low_y = 2; + } + 
if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + uint8x16_t vPrv[5], vCur[5], vNxt[5], vG[8]; + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + vG[0] = vextq_u8(vPrv[1], vCur[1], 15); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[1], vNxt[1], 1); + vG[3] = vextq_u8(vCur[2], vNxt[2], 2); + vG[4] = vextq_u8(vCur[3], vNxt[3], 1); + vG[5] = vCur[4]; + vG[6] = vextq_u8(vPrv[3], vCur[3], 15); + vG[7] = vextq_u8(vPrv[2], vCur[2], 14); + + uint8x16_t vPred = vcgeq_u8(vG[7], vCur[2]); + uint8x16_t vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[0], vCur[2]); + uint8x16_t vU2 = vandq_u8(vPred, vOne); + uint8x16_t vAbs1 = vabdq_u8(vU1, vU2); + uint8x16_t vAbs2 = vdupq_n_u8(0); + + for (vx_uint8 idx = 1; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[2]); + vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[idx - 1], vCur[2]); + vU2 = vandq_u8(vPred, vOne); + vAbs2 = vaddq_u8(vAbs2, vabdq_u8(vU1, vU2)); + } + vAbs1 = vaddq_u8(vAbs1, vAbs2); + + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[2]); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + uint8x16_t vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vPred = vcleq_u8(vAbs1, vTwo); + vSum = vbslq_u8(vPred, vSum, vNine); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } +} + +void LBP_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_enum *format = (vx_enum *)parameters[1]; + vx_int8 *size = (vx_int8 *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + switch (*format) + { + case VX_LBP: + vxLBPStandard_tiling_fast(in, *size, out); + break; + case VX_MLBP: + vxLBPModified_tiling_fast(in, out); + break; + case VX_ULBP: + vxLBPUniform_tiling_fast(in, *size, out); + break; + } +} + +vx_uint8 vx_lbp_s(vx_int16 x) +{ + if (x >= 0) + { + return 1; + } + else + { + return 0; + } +} + +vx_uint8 vx_lbp_u(vx_uint8 *g, vx_uint8 gc) +{ + vx_uint8 u1 = vx_lbp_s(g[7] - gc); + vx_uint8 u2 = vx_lbp_s(g[0] - gc); + + vx_uint8 abs1 = abs(u1 - u2); + + vx_uint8 abs2 = 0; + for (vx_int8 p = 1; p < 8; p++) + { + u1 = vx_lbp_s(g[p] - gc); + u2 = vx_lbp_s(g[p - 1] - gc); + abs2 += abs(u1 - u2); + } + + return abs1 + abs2; +} + +#define LBPSTANDARD_3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 1, in->addr); \ + g[2] = 
*(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 1, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc) * (1 << p); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +#define LBPSTANDARD_5x5(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc) * (1 << p); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + +static void vxLBPStandard_tiling_flexible(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 gc, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + if (ksize == 3) + LBPSTANDARD_3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + else if (ksize == 5) + LBPSTANDARD_5x5(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + if (ksize == 3) + { + LBPSTANDARD_3x3(1, low_y, low_x, high_x - 1) + LBPSTANDARD_3x3(low_y, high_y, 1, high_x - 1) + } + else if (ksize == 5) + { + LBPSTANDARD_5x5(2, low_y, low_x, high_x - 2) + LBPSTANDARD_5x5(low_y, high_y, 2, high_x - 2) + } + } +} + +#define LBPMODIFIED(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y - 2, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y - 2, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y + 2, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y + 2, 
in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + \ + avg = (g[0] + g[1] + g[2] + g[3] + g[4] + g[5] + g[6] + g[7] + 1) / 8; \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += ((g[p] > avg) * (1 << p)); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + +void vxLBPModified_tiling_flexible(vx_tile_t *in, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 avg, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + LBPMODIFIED(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + LBPMODIFIED(2, low_y, low_x, high_x - 2) + LBPMODIFIED(low_y, high_y, 2, high_x - 2) + } +} + +#define LBPUNIFORM_3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 1, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 1, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + if (vx_lbp_u(g, gc) <= 2) \ + { \ + for (vx_uint8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc)*(1 << p); \ + } \ + } \ + else \ + { \ + sum = 9; \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +#define LBPUNIFORM_5x5(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + if (vx_lbp_u(g, gc) <= 2) \ + { \ + for (vx_uint8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc)*(1 << p); \ + } \ + } \ + else \ + { \ + sum = 9; \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +void vxLBPUniform_tiling_flexible(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; 
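+    /* src_base/dst_base (declared here) address plane 0 of the input and output
+     * tiles. The uniform-LBP loops below compare the 8-neighbour ring against the
+     * centre pixel and write code 9 when the pattern has more than two 0/1
+     * transitions (see vx_lbp_u above). */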
+ vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 gc, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + if (ksize == 3) + LBPUNIFORM_3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + else if (ksize == 5) + LBPUNIFORM_5x5(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + if (ksize == 3) + { + LBPUNIFORM_3x3(1, low_y, low_x, high_x - 1) + LBPUNIFORM_3x3(low_y, high_y, 1, high_x - 1) + } + else if (ksize == 5) + { + LBPUNIFORM_5x5(2, low_y, low_x, high_x - 2) + LBPUNIFORM_5x5(low_y, high_y, 2, high_x - 2) + } + } +} + + +void LBP_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_enum *format = (vx_enum *)parameters[1]; + vx_int8 *size = (vx_int8 *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + switch (*format) + { + case VX_LBP: + vxLBPStandard_tiling_flexible(in, *size, out); + break; + case VX_MLBP: + vxLBPModified_tiling_flexible(in, out); + break; + case VX_ULBP: + vxLBPUniform_tiling_flexible(in, *size, out); + break; + } +} diff --git a/kernels/tiling/tiling_lut.c b/kernels/tiling/tiling_lut.c new file mode 100644 index 0000000..ac32f14 --- /dev/null +++ b/kernels/tiling/tiling_lut.c @@ -0,0 +1,230 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void TableLookup_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_array_t *lut = (vx_tile_array_t*)parameters[1]; + vx_tile_t *out = (vx_tile_t*)parameters[2]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_enum type = lut->item_type; + vx_size count = lut->num_items; + vx_uint32 offset = lut->offset; + + void *lut_ptr = lut->ptr; + + if (type == VX_TYPE_UINT8) + { + int32x4_t vOffset = vdupq_n_s32((vx_int32)offset); + int32x4_t vCnt = vdupq_n_s32((vx_int32)count); + int32x4_t vZero = vdupq_n_s32(0); + + for (y = low_y; y < high_y; y++) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + vx_uint8 *lut_tmp = (vx_uint8 *)lut_ptr; + uint8x16_t vSrc = vld1q_u8(ptr_src + x); + uint16x8_t vSrcs16_low = vmovl_u8(vget_low_u8(vSrc)); + uint16x8_t vSrcs16_high = vmovl_u8(vget_high_u8(vSrc)); + int32x4_t vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vSrcs16_low)))); + int32x4_t vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vSrcs16_low)))); + uint32x4_t vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + uint32x4_t vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + uint32x4_t vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint8x8_t vPredu8_low = vmovn_u16(vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high))); + + uint8x16_t vVal = vdupq_n_u8(0); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 0); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 1); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 2); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 3); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 4); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 5); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 6); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 7); + + vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vSrcs16_high)))); + vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vSrcs16_high)))); + vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint8x8_t vPredu8_high = vmovn_u16(vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high))); + + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 8); + vVal = 
vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 9); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 10); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 11); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 12); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 13); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 14); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 15); + + uint8x16_t vPredu8 = vcombine_u8(vPredu8_low, vPredu8_high); + uint8x16_t vDstOrg = vld1q_u8(ptr_dst + x); + vVal = vbslq_u8(vPredu8, vVal, vDstOrg); + vst1q_u8(ptr_dst + x, vVal); + } + } + } + else if (type == VX_TYPE_INT16) + { + int32x4_t vOffset = vdupq_n_s32((vx_int32)offset); + int32x4_t vCnt = vdupq_n_s32((vx_int32)count); + int32x4_t vZero = vdupq_n_s32(0); + + vx_int16 *lut_tmp = (vx_int16 *)lut_ptr; + for (y = low_y; y < high_y; y++) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 8) + { + int16x8_t vSrc = vld1q_s16((vx_int16 *)(ptr_src + x * in->addr->stride_x)); + int32x4_t vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vget_low_s16(vSrc))); + int32x4_t vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vget_high_s16(vSrc))); + uint32x4_t vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + uint32x4_t vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + uint32x4_t vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint16x8_t vPredu16 = vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high)); + + int16x8_t vVal = vdupq_n_s16(0); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 0); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 1); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 2); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 3); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 4); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 5); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 6); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 7); + + int16x8_t vDstOrg = vld1q_s16((vx_int16 *)(ptr_dst + x * out->addr->stride_x)); + vVal = vbslq_s16(vPredu16, vVal, vDstOrg); + vst1q_s16((vx_int16 *)(ptr_dst + x * out->addr->stride_x), vVal); + } + } + } +} + +#define TABLELOOKUP(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + type *dst_ptr = (type *)dst_base + y * out->addr->stride_y / type_size; \ + for (x = low_x; x < high_x; x++) \ + { \ + type *lut_tmp = (type *)lut_ptr; \ + vx_int32 index = (vx_int32)offset + (vx_int32)(*src_ptr); \ + if (index >= 0 && index < (vx_int32)count) \ + { \ + *dst_ptr = lut_tmp[index]; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } + +void TableLookup_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_array_t *lut = 
(vx_tile_array_t*)parameters[1]; + vx_tile_t *out = (vx_tile_t*)parameters[2]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_enum type = lut->item_type; + vx_size count = lut->num_items; + vx_uint32 offset = lut->offset; + + void *lut_ptr = lut->ptr; + + if (low_y == 0 && low_x == 0) + { + if (type == VX_TYPE_UINT8) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + TABLELOOKUP(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else if (type == VX_TYPE_INT16) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + TABLELOOKUP(vx_int16, low_y, high_y, low_x, high_x, 2) + } + } + else + { + if (type == VX_TYPE_UINT8) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + TABLELOOKUP(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + TABLELOOKUP(vx_uint8, low_y, high_y, 0, high_x, 1) + } + else if (type == VX_TYPE_INT16) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + TABLELOOKUP(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = (vx_int16 *)out->base[0]; + TABLELOOKUP(vx_int16, low_y, high_y, 0, high_x, 2) + } + } +} diff --git a/kernels/tiling/tiling_magnitude.c b/kernels/tiling/tiling_magnitude.c new file mode 100644 index 0000000..5a90266 --- /dev/null +++ b/kernels/tiling/tiling_magnitude.c @@ -0,0 +1,205 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +// nodeless version of the Magnitude kernel +void Magnitude_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x, value; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + vx_int16 *in_x = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *in_y = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t in_x16x8 = vld1q_s16(in_x); + int16x8_t in_y16x8 = vld1q_s16(in_y); + if (out->image.format == VX_DF_IMAGE_U8) + { + const int32x4x2_t low_grad = + { + { + vmovl_s16(vmul_s16(vget_low_s16(in_x16x8), vget_low_s16(in_x16x8))), + vmovl_s16(vmul_s16(vget_low_s16(in_y16x8), vget_low_s16(in_y16x8))) + } + }; + const int32x4x2_t top_grad = + { + { + vmovl_s16(vmul_s16(vget_high_s16(in_x16x8), vget_high_s16(in_x16x8))), + vmovl_s16(vmul_s16(vget_high_s16(in_y16x8), vget_high_s16(in_y16x8))) + } + }; + + vx_float64 sum1 = vgetq_lane_s32(low_grad.val[0], 0) + vgetq_lane_s32(low_grad.val[1], 0) ; + value = ((vx_int32)sqrt(sum1))/4; + *dstp = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum2 = vgetq_lane_s32(low_grad.val[0], 1) + vgetq_lane_s32(low_grad.val[1], 1) ; + value = ((vx_int32)sqrt(sum2))/4; + *(dstp+1) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum3 = vgetq_lane_s32(low_grad.val[0], 2) + vgetq_lane_s32(low_grad.val[1], 2) ; + value = ((vx_int32)sqrt(sum3))/4; + *(dstp+2) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum4 = vgetq_lane_s32(low_grad.val[0], 3) + vgetq_lane_s32(low_grad.val[1], 3) ; + value = ((vx_int32)sqrt(sum4))/4; + *(dstp+3) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum5 = vgetq_lane_s32(top_grad.val[0], 0) + vgetq_lane_s32(top_grad.val[1], 0) ; + value = ((vx_int32)sqrt(sum5))/4; + *(dstp+4) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum6 = vgetq_lane_s32(top_grad.val[0], 1) + vgetq_lane_s32(top_grad.val[1], 1) ; + value = ((vx_int32)sqrt(sum6))/4; + *(dstp+5) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum7 = vgetq_lane_s32(top_grad.val[0], 2) + vgetq_lane_s32(top_grad.val[1], 2) ; + value = ((vx_int32)sqrt(sum7))/4; + *(dstp+6) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum8 = vgetq_lane_s32(top_grad.val[0], 3) + vgetq_lane_s32(top_grad.val[1], 3) ; + value = ((vx_int32)sqrt(sum8))/4; + *(dstp+7) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + dstp += 8; + } + else if (out->image.format == VX_DF_IMAGE_S16) + { + vx_int16 tmpx1 = vgetq_lane_s16(in_x16x8 ,0); + vx_int16 tmpy1 = vgetq_lane_s16(in_y16x8 ,0); + vx_float64 grad1[2] = {(vx_float64)tmpx1*tmpx1, (vx_float64)tmpy1*tmpy1}; + vx_float64 sum1 = grad1[0] + grad1[1]; + value = (vx_int32)(sqrt(sum1) + 0.5); + *dstp_16 = (vx_int16)(value > INT16_MAX ? 
INT16_MAX : value); + + vx_int16 tmpx2 = vgetq_lane_s16(in_x16x8 ,1); + vx_int16 tmpy2 = vgetq_lane_s16(in_y16x8 ,1); + vx_float64 grad2[2] = {(vx_float64)tmpx2*tmpx2, (vx_float64)tmpy2*tmpy2}; + vx_float64 sum2 = grad2[0] + grad2[1]; + value = (vx_int32)(sqrt(sum2) + 0.5); + *(dstp_16+1) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx3 = vgetq_lane_s16(in_x16x8 ,2); + vx_int16 tmpy3 = vgetq_lane_s16(in_y16x8 ,2); + vx_float64 grad3[2] = {(vx_float64)tmpx3*tmpx3, (vx_float64)tmpy3*tmpy3}; + vx_float64 sum3 = grad3[0] + grad3[1]; + value = (vx_int32)(sqrt(sum3) + 0.5); + *(dstp_16+2) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx4 = vgetq_lane_s16(in_x16x8 ,3); + vx_int16 tmpy4 = vgetq_lane_s16(in_y16x8 ,3); + vx_float64 grad4[2] = {(vx_float64)tmpx4*tmpx4, (vx_float64)tmpy4*tmpy4}; + vx_float64 sum4 = grad4[0] + grad4[1]; + value = (vx_int32)(sqrt(sum4) + 0.5); + *(dstp_16+3) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx5 = vgetq_lane_s16(in_x16x8 ,4); + vx_int16 tmpy5 = vgetq_lane_s16(in_y16x8 ,4); + vx_float64 grad5[2] = {(vx_float64)tmpx5*tmpx5, (vx_float64)tmpy5*tmpy5}; + vx_float64 sum5 = grad5[0] + grad5[1]; + value = (vx_int32)(sqrt(sum5) + 0.5); + *(dstp_16+4) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx6 = vgetq_lane_s16(in_x16x8 ,5); + vx_int16 tmpy6 = vgetq_lane_s16(in_y16x8 ,5); + vx_float64 grad6[2] = {(vx_float64)tmpx6*tmpx6, (vx_float64)tmpy6*tmpy6}; + vx_float64 sum6 = grad6[0] + grad6[1]; + value = (vx_int32)(sqrt(sum6) + 0.5); + *(dstp_16+5) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx7 = vgetq_lane_s16(in_x16x8 ,6); + vx_int16 tmpy7 = vgetq_lane_s16(in_y16x8 ,6); + vx_float64 grad7[2] = {(vx_float64)tmpx7*tmpx7, (vx_float64)tmpy7*tmpy7}; + vx_float64 sum7 = grad7[0] + grad7[1]; + value = (vx_int32)(sqrt(sum7) + 0.5); + *(dstp_16+6) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx8 = vgetq_lane_s16(in_x16x8 ,7); + vx_int16 tmpy8 = vgetq_lane_s16(in_y16x8 ,7); + vx_float64 grad8[2] = {(vx_float64)tmpx8*tmpx8, (vx_float64)tmpy8*tmpy8}; + vx_float64 sum8 = grad8[0] + grad8[1]; + value = (vx_int32)(sqrt(sum8) + 0.5); + *(dstp_16+7) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + dstp_16 += 8; + } + in_x += 8; + in_y += 8; + } + } +} + +#define MAGNITUDE_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_int16 *in_x = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *in_y = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + vx_int32 grad[2] = {in_x[0]*in_x[0], in_y[0]*in_y[0]}; \ + vx_float64 sum = grad[0] + grad[1]; \ + value = ((vx_int32)sqrt(sum))/4; \ + *dstp = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); \ + dstp += 1; \ + } \ + else if (out->image.format == VX_DF_IMAGE_S16) \ + { \ + vx_float64 grad[2] = {(vx_float64)in_x[0]*in_x[0], (vx_float64)in_y[0]*in_y[0]}; \ + vx_float64 sum = grad[0] + grad[1]; \ + value = (vx_int32)(sqrt(sum) + 0.5); \ + *dstp_16 = (vx_int16)(value > INT16_MAX ? 
INT16_MAX : value); \ + dstp_16 += 1; \ + } \ + in_x += 1; \ + in_y += 1; \ + } \ + } \ + +void Magnitude_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x, value; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + MAGNITUDE_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MAGNITUDE_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MAGNITUDE_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_minmax.c b/kernels/tiling/tiling_minmax.c new file mode 100644 index 0000000..f446afc --- /dev/null +++ b/kernels/tiling/tiling_minmax.c @@ -0,0 +1,199 @@ +/* + + * Copyright (c) 2017-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +void Max_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (out->image.format) + { + case VX_DF_IMAGE_U8: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t vsrc0 = vld1q_u8( src0p + x); + uint8x16_t vsrc1 = vld1q_u8( src1p + x); + vst1q_u8( dstp + x, vmaxq_u8( vsrc0, vsrc1 ) ); + } + } + break; + case VX_DF_IMAGE_S16: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + 2*in_1->tile_x + y * in_1->addr->stride_y; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + 2*in_2->tile_x + y * in_2->addr->stride_y; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + 2*out->tile_x + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t vsrc0 = vld1q_s16( (vx_int16 *)(src0p + x * in_1->addr[0].stride_x)); + int16x8_t vsrc1 = vld1q_s16( (vx_int16 *)(src1p + x * in_2->addr[0].stride_x)); + vst1q_s16( (vx_int16 *)(dstp + x * out->addr[0].stride_x), vmaxq_s16( vsrc0, vsrc1 ) ); + } + } + break; + } +} + +#define MAX_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + switch (out->image.format) \ + { \ + case VX_DF_IMAGE_U8: \ + src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width + x * in_1->addr[0].stride_x; \ + src1p = 
(vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width + x * in_2->addr[0].stride_x; \ + dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width + x * out->addr[0].stride_x; \ + val0 = *(src0p); \ + val1 = *(src1p); \ + *dstp = val0 > val1 ? val0 : val1; \ + break; \ + case VX_DF_IMAGE_S16: \ + src0p = (vx_uint8 *)in_1->base[0] + 2*in_1_tile_x + y * in_1->addr->stride_y + x * in_1->addr[0].stride_x;\ + src1p = (vx_uint8 *)in_2->base[0] + 2*in_2_tile_x + y * in_2->addr->stride_y + x * in_2->addr[0].stride_x;\ + dstp = (vx_uint8 *)out->base[0] + 2*out_tile_x + y * out->addr->stride_y + x * out->addr[0].stride_x; \ + val0_16 = *(vx_int16 *)(src0p); \ + val1_16 = *(vx_int16 *)(src1p); \ + *(vx_int16 *)dstp = val0_16 > val1_16 ? val0_16 : val1_16; \ + break; \ + } \ + } \ + } \ + +void Max_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 *src0p, *src1p, *dstp; + vx_uint8 val0, val1; + vx_int16 val0_16, val1_16; + if (ty == 0 && tx == 0) + { + MAX_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MAX_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MAX_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + +void Min_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (out->image.format) + { + case VX_DF_IMAGE_U8: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t vsrc0 = vld1q_u8( src0p + x); + uint8x16_t vsrc1 = vld1q_u8( src1p + x); + vst1q_u8( dstp + x, vminq_u8( vsrc0, vsrc1 ) ); + } + } + break; + case VX_DF_IMAGE_S16: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + 2*in_1->tile_x + y * in_1->addr->stride_y; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + 2*in_2->tile_x + y * in_2->addr->stride_y; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + 2*out->tile_x + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t vsrc0 = vld1q_s16( (vx_int16 *)(src0p + x * in_1->addr[0].stride_x)); + int16x8_t vsrc1 = vld1q_s16( (vx_int16 *)(src1p + x * in_2->addr[0].stride_x)); + vst1q_s16( (vx_int16 *)(dstp + x * out->addr[0].stride_x), vminq_s16( vsrc0, vsrc1 ) ); + } + } + break; + } +} + +#define MIN_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + switch (out->image.format) \ + { \ + case VX_DF_IMAGE_U8: \ + src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width + x * in_1->addr[0].stride_x; \ + src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width + x * 
in_2->addr[0].stride_x; \ + dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width + x * out->addr[0].stride_x; \ + val0 = *(src0p); \ + val1 = *(src1p); \ + *dstp = val0 < val1 ? val0 : val1; \ + break; \ + case VX_DF_IMAGE_S16: \ + src0p = (vx_uint8 *)in_1->base[0] + 2*in_1_tile_x + y * in_1->addr->stride_y + x * in_1->addr[0].stride_x;\ + src1p = (vx_uint8 *)in_2->base[0] + 2*in_2_tile_x + y * in_2->addr->stride_y + x * in_2->addr[0].stride_x;\ + dstp = (vx_uint8 *)out->base[0] + 2*out_tile_x + y * out->addr->stride_y + x * out->addr[0].stride_x; \ + val0_16 = *(vx_int16 *)(src0p); \ + val1_16 = *(vx_int16 *)(src1p); \ + *(vx_int16 *)dstp = val0_16 < val1_16 ? val0_16 : val1_16; \ + break; \ + } \ + } \ + } \ + +void Min_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 *src0p, *src1p, *dstp; + vx_uint8 val0, val1; + vx_int16 val0_16, val1_16; + if (ty == 0 && tx == 0) + { + MIN_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MIN_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MIN_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_morphology.c b/kernels/tiling/tiling_morphology.c new file mode 100644 index 0000000..678b6c2 --- /dev/null +++ b/kernels/tiling/tiling_morphology.c @@ -0,0 +1,249 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +static inline void opt_max(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t max = vmax_u8(*a, *b); + *a = max; +} +static inline void opt_min(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + *a = min; +} + +void Erode3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + opt_min(&p0, &p1); + opt_min(&p0, &p2); + opt_min(&p0, &p3); + opt_min(&p0, &p4); + opt_min(&p0, &p5); + opt_min(&p0, &p6); + opt_min(&p0, &p7); + opt_min(&p0, &p8); + + vst1_u8(dst, p0); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + +#define Erode3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 min_pixel = vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + if (min_pixel < vxImagePixel(vx_uint8, in, 0, x, y, i, j)) \ + min_pixel = min_pixel; \ + else \ + min_pixel = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = min_pixel; \ + } \ + } + +void Erode3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Erode3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Erode3x3(1, low_y, low_x, high_x - 1) + Erode3x3(low_y, high_y, 1, high_x - 1) + } +} + + +void Dilate3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = 
(vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + opt_max(&p0, &p1); + opt_max(&p0, &p2); + opt_max(&p0, &p3); + opt_max(&p0, &p4); + opt_max(&p0, &p5); + opt_max(&p0, &p6); + opt_max(&p0, &p7); + opt_max(&p0, &p8); + + vst1_u8(dst, p0); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + + +#define Dilate3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 max_pixel = vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + if (max_pixel > vxImagePixel(vx_uint8, in, 0, x, y, i, j)) \ + max_pixel = max_pixel; \ + else \ + max_pixel = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = max_pixel; \ + } \ + } + + +void Dilate3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Dilate3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Dilate3x3(1, low_y, low_x, high_x - 1) + Dilate3x3(low_y, high_y, 1, high_x - 1) + } +} diff --git a/kernels/tiling/tiling_multiply.c b/kernels/tiling/tiling_multiply.c new file mode 100644 index 0000000..d0db034 --- /dev/null +++ b/kernels/tiling/tiling_multiply.c @@ -0,0 +1,267 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +// nodeless version of the Multiply kernel +void Multiply_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_float32 *scale = (vx_float32*)parameters[2]; + vx_enum *overflow_policy = (vx_enum*)parameters[3]; + vx_enum *rounding_policy = (vx_enum*)parameters[4]; + vx_tile_t *out = (vx_tile_t *)parameters[5]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + } + int32x4_t unscaled_unconverted_result1 = vmulq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vmulq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = 
vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 tmp_int32; + if(i == 0) + tmp_int32 = tmp0; + else if(i == 1) + tmp_int32 = tmp1; + else if(i == 2) + tmp_int32 = tmp2; + else if(i == 3) + tmp_int32 = tmp3; + else if(i == 4) + tmp_int32 = tmp4; + else if(i == 5) + tmp_int32 = tmp5; + else if(i == 6) + tmp_int32 = tmp6; + else if(i == 7) + tmp_int32 = tmp7; + vx_float64 unscaled_result = (vx_float64)tmp_int32; + vx_float64 scaled_result = (*scale) * unscaled_result; + vx_int32 int_typed_result = (vx_int32)scaled_result; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +#define MULTIPLY_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 src0 = in_1->image.format == VX_DF_IMAGE_U8 ? *src0p : *src0p_16; \ + vx_int32 src1 = in_2->image.format == VX_DF_IMAGE_U8 ? *src1p : *src1p_16; \ + src0p++; \ + src1p++; \ + src0p_16++; \ + src1p_16++; \ + vx_int32 unscaled_unconverted_result = src0 * src1; \ + vx_float64 unscaled_result = (vx_float64)unscaled_unconverted_result; \ + vx_float64 scaled_result = (*scale) * unscaled_result; \ + vx_int32 int_typed_result = (vx_int32)scaled_result; \ + vx_int32 final_result_value; \ + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + if (int_typed_result > UINT8_MAX) \ + final_result_value = UINT8_MAX; \ + else if (int_typed_result < 0) \ + final_result_value = 0; \ + else \ + final_result_value = int_typed_result; \ + } \ + else \ + { \ + if (int_typed_result > INT16_MAX) \ + final_result_value = INT16_MAX; \ + else if (int_typed_result < INT16_MIN) \ + final_result_value = INT16_MIN; \ + else \ + final_result_value = int_typed_result; \ + } \ + } \ + else \ + { \ + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? 
\ + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; \ + } \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + *dstp = (vx_uint8)final_result_value; \ + dstp++; \ + } \ + else \ + { \ + *dstp_16 = (vx_int16)final_result_value; \ + dstp_16++; \ + } \ + } \ + } + + +void Multiply_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_float32 *scale = (vx_float32*)parameters[2]; + vx_enum *overflow_policy = (vx_enum*)parameters[3]; + vx_enum *rounding_policy = (vx_enum*)parameters[4]; + vx_tile_t *out = (vx_tile_t *)parameters[5]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + MULTIPLY_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MULTIPLY_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MULTIPLY_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + diff --git a/kernels/tiling/tiling_nonlinearfilter.c b/kernels/tiling/tiling_nonlinearfilter.c new file mode 100644 index 0000000..2b57318 --- /dev/null +++ b/kernels/tiling/tiling_nonlinearfilter.c @@ -0,0 +1,1217 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include + +#include + +#include + +struct src_ptr +{ + vx_uint8* top2_src; + vx_uint8* top_src; + vx_uint8* mid_src; + vx_uint8* bot_src; + vx_uint8* bot2_src; +}; + + +static void sort(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + const uint8x8_t max = vmax_u8(*a, *b); + *a = min; + *b = max; +} + +static void sort_min(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + *a = min; +} + +static void sort_max(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t max = vmax_u8(*a, *b); + *a = max; +} + +// Calculations that do not affect the median were removed. 
+static void sort5_mid(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort(p0, p1); + sort(p2, p3); + sort(p0, p2); + sort(p1, p3); + sort(p1, p2); + sort(p0, p4); + sort(p1, p4); + sort(p2, p4); +} + +static void sort5_min(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort_min(p0, p1); + sort_min(p0, p2); + sort_min(p0, p3); + sort_min(p0, p4); +} + +static void sort5_max(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort_max(p0, p1); + sort_max(p0, p2); + sort_max(p0, p3); + sort_max(p0, p4); +} + +static void sort9_mid(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p1); + sort(p3, p4); + sort(p6, p7); + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p3); + sort(p5, p8); + sort(p4, p7); + sort(p3, p6); + sort(p1, p4); + sort(p2, p5); + sort(p4, p7); + sort(p4, p2); + sort(p6, p4); + sort(p4, p2); +} + +static void sort9_min(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort_min(p0, p1); + sort_min(p0, p2); + sort_min(p0, p3); + sort_min(p0, p4); + sort_min(p0, p5); + sort_min(p0, p6); + sort_min(p0, p7); + sort_min(p0, p8); +} + +static void sort9_max(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort_max(p0, p1); + sort_max(p0, p2); + sort_max(p0, p3); + sort_max(p0, p4); + sort_max(p0, p5); + sort_max(p0, p6); + sort_max(p0, p7); + sort_max(p0, p8); +} + +static void sort21_mid(uint8x8_t p[21]) +{ + sort(&p[0], &p[1]); + sort(&p[2], &p[3]); + sort(&p[4], &p[5]); + sort(&p[6], &p[7]); + sort(&p[8], &p[9]); + sort(&p[10], &p[11]); + sort(&p[12], &p[13]); + sort(&p[14], &p[15]); + sort(&p[16], &p[17]); + sort(&p[18], &p[19]); + sort(&p[0], &p[2]); + sort(&p[1], &p[3]); + sort(&p[4], &p[6]); + sort(&p[5], &p[7]); + sort(&p[8], &p[10]); + sort(&p[9], &p[11]); + sort(&p[12], &p[14]); + sort(&p[13], &p[15]); + sort(&p[16], &p[18]); + sort(&p[17], &p[19]); + sort(&p[1], &p[2]); + sort(&p[5], &p[6]); + sort(&p[0], &p[4]); + sort(&p[3], &p[7]); + sort(&p[9], &p[10]); + sort(&p[13], &p[14]); + sort(&p[8], &p[12]); + sort(&p[11], &p[15]); + sort(&p[17], &p[18]); + sort(&p[16], &p[20]); + sort(&p[1], &p[5]); + sort(&p[2], &p[6]); + sort(&p[9], &p[13]); + sort(&p[10], &p[14]); + sort(&p[0], &p[8]); + sort(&p[7], &p[15]); + sort(&p[17], &p[20]); + sort(&p[1], &p[4]); + sort(&p[3], &p[6]); + sort(&p[9], &p[12]); + sort(&p[11], &p[14]); + sort(&p[18], &p[20]); + sort(&p[0], &p[16]); + sort(&p[2], &p[4]); + sort(&p[3], &p[5]); + sort(&p[10], &p[12]); + sort(&p[11], &p[13]); + sort(&p[1], &p[9]); + sort(&p[6], &p[14]); + sort(&p[19], &p[20]); + sort(&p[3], &p[4]); + sort(&p[11], &p[12]); + sort(&p[1], &p[8]); + sort(&p[2], &p[10]); + sort(&p[5], &p[13]); + sort(&p[7], &p[14]); + sort(&p[3], &p[11]); + sort(&p[2], &p[8]); + sort(&p[4], &p[12]); + sort(&p[7], &p[13]); + sort(&p[1], &p[17]); + sort(&p[3], &p[10]); + sort(&p[5], &p[12]); + sort(&p[1], &p[16]); + sort(&p[2], &p[18]); + sort(&p[3], &p[9]); + sort(&p[6], &p[12]); + sort(&p[2], &p[16]); + sort(&p[3], &p[8]); + sort(&p[7], &p[12]); + sort(&p[5], &p[9]); + sort(&p[6], &p[10]); + sort(&p[4], &p[8]); + sort(&p[7], &p[11]); + sort(&p[3], &p[19]); + sort(&p[5], &p[8]); + sort(&p[7], &p[10]); + 
sort(&p[3], &p[18]); + sort(&p[4], &p[20]); + sort(&p[6], &p[8]); + sort(&p[7], &p[9]); + sort(&p[3], &p[17]); + sort(&p[5], &p[20]); + sort(&p[7], &p[8]); + sort(&p[3], &p[16]); + sort(&p[6], &p[20]); + sort(&p[5], &p[17]); + sort(&p[7], &p[20]); + sort(&p[4], &p[16]); + sort(&p[6], &p[18]); + sort(&p[5], &p[16]); + sort(&p[7], &p[19]); + sort(&p[7], &p[18]); + sort(&p[6], &p[16]); + sort(&p[7], &p[17]); + sort(&p[10], &p[18]); + sort(&p[7], &p[16]); + sort(&p[9], &p[17]); + sort(&p[8], &p[16]); + sort(&p[9], &p[16]); + sort(&p[10], &p[16]); +} + +static void sort21_min(uint8x8_t p[21]) +{ + sort_min(&p[0], &p[1]); + sort_min(&p[0], &p[2]); + sort_min(&p[0], &p[3]); + sort_min(&p[0], &p[4]); + sort_min(&p[0], &p[5]); + sort_min(&p[0], &p[6]); + sort_min(&p[0], &p[7]); + sort_min(&p[0], &p[8]); + sort_min(&p[0], &p[9]); + sort_min(&p[0], &p[10]); + sort_min(&p[0], &p[11]); + sort_min(&p[0], &p[12]); + sort_min(&p[0], &p[13]); + sort_min(&p[0], &p[14]); + sort_min(&p[0], &p[15]); + sort_min(&p[0], &p[16]); + sort_min(&p[0], &p[17]); + sort_min(&p[0], &p[18]); + sort_min(&p[0], &p[19]); + sort_min(&p[0], &p[20]); +} + + +static void sort21_max(uint8x8_t p[21]) +{ + sort_max(&p[0], &p[1]); + sort_max(&p[0], &p[2]); + sort_max(&p[0], &p[3]); + sort_max(&p[0], &p[4]); + sort_max(&p[0], &p[5]); + sort_max(&p[0], &p[6]); + sort_max(&p[0], &p[7]); + sort_max(&p[0], &p[8]); + sort_max(&p[0], &p[9]); + sort_max(&p[0], &p[10]); + sort_max(&p[0], &p[11]); + sort_max(&p[0], &p[12]); + sort_max(&p[0], &p[13]); + sort_max(&p[0], &p[14]); + sort_max(&p[0], &p[15]); + sort_max(&p[0], &p[16]); + sort_max(&p[0], &p[17]); + sort_max(&p[0], &p[18]); + sort_max(&p[0], &p[19]); + sort_max(&p[0], &p[20]); +} + +static void sort25_mid(uint8x8_t p[25]) +{ + sort(&p[1], &p[2]); + sort(&p[0], &p[1]); + sort(&p[1], &p[2]); + sort(&p[4], &p[5]); + sort(&p[3], &p[4]); + sort(&p[4], &p[5]); + sort(&p[0], &p[3]); + sort(&p[2], &p[5]); + sort(&p[2], &p[3]); + sort(&p[1], &p[4]); + sort(&p[1], &p[2]); + sort(&p[3], &p[4]); + sort(&p[7], &p[8]); + sort(&p[6], &p[7]); + sort(&p[7], &p[8]); + sort(&p[10], &p[11]); + sort(&p[9], &p[10]); + sort(&p[10], &p[11]); + sort(&p[6], &p[9]); + sort(&p[8], &p[11]); + sort(&p[8], &p[9]); + sort(&p[7], &p[10]); + sort(&p[7], &p[8]); + sort(&p[9], &p[10]); + sort(&p[0], &p[6]); + sort(&p[4], &p[10]); + sort(&p[4], &p[6]); + sort(&p[2], &p[8]); + sort(&p[2], &p[4]); + sort(&p[6], &p[8]); + sort(&p[1], &p[7]); + sort(&p[5], &p[11]); + sort(&p[5], &p[7]); + sort(&p[3], &p[9]); + sort(&p[3], &p[5]); + sort(&p[7], &p[9]); + sort(&p[1], &p[2]); + sort(&p[3], &p[4]); + sort(&p[5], &p[6]); + sort(&p[7], &p[8]); + sort(&p[9], &p[10]); + sort(&p[13], &p[14]); + sort(&p[12], &p[13]); + sort(&p[13], &p[14]); + sort(&p[16], &p[17]); + sort(&p[15], &p[16]); + sort(&p[16], &p[17]); + sort(&p[12], &p[15]); + sort(&p[14], &p[17]); + sort(&p[14], &p[15]); + sort(&p[13], &p[16]); + sort(&p[13], &p[14]); + sort(&p[15], &p[16]); + sort(&p[19], &p[20]); + sort(&p[18], &p[19]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[21], &p[23]); + sort(&p[22], &p[24]); + sort(&p[22], &p[23]); + sort(&p[18], &p[21]); + sort(&p[20], &p[23]); + sort(&p[20], &p[21]); + sort(&p[19], &p[22]); + sort(&p[22], &p[24]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[12], &p[18]); + sort(&p[16], &p[22]); + sort(&p[16], &p[18]); + sort(&p[14], &p[20]); + sort(&p[20], &p[24]); + sort(&p[14], &p[16]); + sort(&p[18], &p[20]); + sort(&p[22], &p[24]); + sort(&p[13], 
&p[19]); + sort(&p[17], &p[23]); + sort(&p[17], &p[19]); + sort(&p[15], &p[21]); + sort(&p[15], &p[17]); + sort(&p[19], &p[21]); + sort(&p[13], &p[14]); + sort(&p[15], &p[16]); + sort(&p[17], &p[18]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[0], &p[12]); + sort(&p[8], &p[20]); + sort(&p[8], &p[12]); + sort(&p[4], &p[16]); + sort(&p[16], &p[24]); + sort(&p[12], &p[16]); + sort(&p[2], &p[14]); + sort(&p[10], &p[22]); + sort(&p[10], &p[14]); + sort(&p[6], &p[18]); + sort(&p[6], &p[10]); + sort(&p[10], &p[12]); + sort(&p[1], &p[13]); + sort(&p[9], &p[21]); + sort(&p[9], &p[13]); + sort(&p[5], &p[17]); + sort(&p[13], &p[17]); + sort(&p[3], &p[15]); + sort(&p[11], &p[23]); + sort(&p[11], &p[15]); + sort(&p[7], &p[19]); + sort(&p[7], &p[11]); + sort(&p[11], &p[13]); + sort(&p[11], &p[12]); +} + + +static void sort25_min(uint8x8_t p[25]) +{ + sort_min(&p[0], &p[1]); + sort_min(&p[0], &p[2]); + sort_min(&p[0], &p[3]); + sort_min(&p[0], &p[4]); + sort_min(&p[0], &p[5]); + sort_min(&p[0], &p[6]); + sort_min(&p[0], &p[7]); + sort_min(&p[0], &p[8]); + sort_min(&p[0], &p[9]); + sort_min(&p[0], &p[10]); + sort_min(&p[0], &p[11]); + sort_min(&p[0], &p[12]); + sort_min(&p[0], &p[13]); + sort_min(&p[0], &p[14]); + sort_min(&p[0], &p[15]); + sort_min(&p[0], &p[16]); + sort_min(&p[0], &p[17]); + sort_min(&p[0], &p[18]); + sort_min(&p[0], &p[19]); + sort_min(&p[0], &p[20]); + sort_min(&p[0], &p[21]); + sort_min(&p[0], &p[22]); + sort_min(&p[0], &p[23]); + sort_min(&p[0], &p[24]); +} + +static void sort25_max(uint8x8_t p[25]) +{ + sort_max(&p[0], &p[1]); + sort_max(&p[0], &p[2]); + sort_max(&p[0], &p[3]); + sort_max(&p[0], &p[4]); + sort_max(&p[0], &p[5]); + sort_max(&p[0], &p[6]); + sort_max(&p[0], &p[7]); + sort_max(&p[0], &p[8]); + sort_max(&p[0], &p[9]); + sort_max(&p[0], &p[10]); + sort_max(&p[0], &p[11]); + sort_max(&p[0], &p[12]); + sort_max(&p[0], &p[13]); + sort_max(&p[0], &p[14]); + sort_max(&p[0], &p[15]); + sort_max(&p[0], &p[16]); + sort_max(&p[0], &p[17]); + sort_max(&p[0], &p[18]); + sort_max(&p[0], &p[19]); + sort_max(&p[0], &p[20]); + sort_max(&p[0], &p[21]); + sort_max(&p[0], &p[22]); + sort_max(&p[0], &p[23]); + sort_max(&p[0], &p[24]); +} + +static void filter_cross_3x3_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x8_t top_data = vld1_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x8_t bot_data = vld1_u8(src.bot_src); + + uint8x8_t p0 = top_data; + uint8x8_t p1 = vget_low_u8(mid_data); + uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p4 = bot_data; + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort5_min(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort5_max(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort5_mid(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p2); + break; + } + } + + dst += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + } +} + + +static void filter_cross_3x3(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = 
out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 0) + { + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + 1 + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 1 + (y + 2) * src_stride_y; + + filter_cross_3x3_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + 1 + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 1 + (y + 1) * src_stride_y; + + filter_cross_3x3_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_box_3x3_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort9_min(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort9_max(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort9_mid(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p4); + break; + } + } + + dst += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + } +} + + +static void filter_box_3x3(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 0) + { + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + 
filter_box_3x3_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + + filter_box_3x3_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_cross_5x5_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x8_t top2_data = vld1_u8(src.top2_src); + const uint8x8_t top_data = vld1_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x8_t bot_data = vld1_u8(src.bot_src); + const uint8x8_t bot2_data = vld1_u8(src.bot2_src); + + uint8x8_t p0 = top2_data; + uint8x8_t p1 = top_data; + uint8x8_t p2 = vget_low_u8(mid_data); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3); + uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4); + uint8x8_t p7 = bot_data; + uint8x8_t p8 = bot2_data; + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort9_min(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort9_max(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort9_mid(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p4); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_cross_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + 2 + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + 2 + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 2 + (y + 2)* src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + 2 + (y + 3)* src_stride_y; + + filter_cross_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + 2 + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + 2 + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + 
src.bot_src = (vx_uint8 *)src_base + 2 + (y + 1)* src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + 2 + (y + 2)* src_stride_y; + + filter_cross_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_box_5x5_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top2_data = vld1q_u8(src.top2_src); + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + const uint8x16_t bot2_data = vld1q_u8(src.bot2_src); + + const uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[25]; + for (vx_uint32 i = 0; i < 5; ++i) + { + const vx_uint32 idx_d = i * 2; + const vx_uint32 idx_p = i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort25_min(p); + vst1_u8(dst, p[0]); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort25_max(p); + vst1_u8(dst, p[0]); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort25_mid(p); + vst1_u8(dst, p[12]); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_box_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 3) * src_stride_y; + + filter_box_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + filter_box_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_disk_5x5_neon(struct src_ptr src, vx_uint8* dst, 
vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + const uint8x16_t zero = vdupq_n_u8(0); + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top2_data = vextq_u8(vld1q_u8(src.top2_src), zero, 1); + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + const uint8x16_t bot2_data = vextq_u8(vld1q_u8(src.bot2_src), zero, 1); + + uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[21]; + p[0] = d[0]; + p[1] = vext_u8(d[0], d[1], 1); + p[2] = vext_u8(d[0], d[1], 2); + p[18] = d[8]; + p[19] = vext_u8(d[8], d[9], 1); + p[20] = vext_u8(d[8], d[9], 2); + + for (vx_uint32 i = 0; i < 3; ++i) + { + const vx_uint32 idx_d = 2 + i * 2; + const vx_uint32 idx_p = 3 + i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort21_min(p); + vst1_u8(dst, p[0]); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort21_max(p); + vst1_u8(dst, p[0]); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort21_mid(p); + vst1_u8(dst, p[10]); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_disk_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 3) * src_stride_y; + + filter_disk_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + filter_disk_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + +void NonLinearFilter_image_tiling_fast(void * parameters[], void * tile_memory, vx_size 
tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_enum *func = (vx_enum *)parameters[0]; + vx_tile_t *in = (vx_tile_t *)parameters[1]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_size ry0 = mask->origin.y; + + vx_int32 count_mask = 0; + vx_int32 mask_index = 0; + + for (vx_uint32 r = 0; r < mask->rows; ++r) + { + for (vx_uint32 c = 0; c < mask->columns; ++c, ++mask_index) + { + if (mask->m[mask_index]) + { + ++count_mask; + } + } + } + + switch (mask->rows) + { + case 3: // mask = 3x3 + { + if (count_mask == 5) + filter_cross_3x3(in, out, *func, ry0); + else // count_mask = 9 + filter_box_3x3(in, out, *func, ry0); + + break; + } + + case 5: // mask = 5x5 + { + if (count_mask == 9) + filter_cross_5x5(in, out, *func, ry0); + else if (count_mask == 21) + filter_disk_5x5(in, out, *func, ry0); + else // count_mask = 25 + filter_box_5x5(in, out, *func, ry0); + break; + } + } +} + + +// helpers +static vx_int32 vx_uint8_compare(const void *p1, const void *p2) +{ + vx_uint8 a = *(vx_uint8 *)p1; + vx_uint8 b = *(vx_uint8 *)p2; + if (a > b) + return 1; + else if (a == b) + return 0; + else + return -1; +} + + +static vx_uint32 readMaskedRectangle_U8(const void *base, const vx_imagepatch_addressing_t *addr, vx_uint32 center_x, vx_uint32 center_y, + vx_uint32 left, vx_uint32 top, vx_uint32 right, vx_uint32 bottom, vx_uint8 *mask, vx_uint8 *destination) +{ + vx_int32 width = (vx_int32)addr->dim_x, height = (vx_int32)addr->dim_y; + vx_int32 stride_y = addr->stride_y; + vx_int32 stride_x = addr->stride_x; + const vx_uint8 *ptr = (const vx_uint8 *)base; + vx_int32 ky, kx; + vx_uint32 mask_index = 0; + vx_uint32 dest_index = 0; + + for (ky = -(vx_int32)top; ky <= (vx_int32)bottom; ++ky) + { + vx_int32 y = (vx_int32)(center_y + ky); + y = y < 0 ? 0 : y >= height ? height - 1 : y; + + for (kx = -(vx_int32)left; kx <= (vx_int32)right; ++kx, ++mask_index) + { + vx_int32 x = (vx_int32)(center_x + kx); + x = x < 0 ? 0 : x >= width ? 
width - 1 : x; + if (mask[mask_index]) + ((vx_uint8*)destination)[dest_index++] = *(vx_uint8*)(ptr + y*stride_y + x*stride_x); + } + } + + return dest_index; +} + + +#define NonLinearFilter(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y + x * out->addr->stride_x; \ + vx_uint32 count = (vx_uint32)readMaskedRectangle_U8(src_base, in->addr, x, y, (vx_uint32)rx0, (vx_uint32)ry0, (vx_uint32)rx1, (vx_uint32)ry1, mask->m, v); \ + \ + qsort(v, count, sizeof(vx_uint8), vx_uint8_compare); \ + \ + switch(*func) \ + { \ + case VX_NONLINEAR_FILTER_MIN : *dst = v[0]; break; \ + case VX_NONLINEAR_FILTER_MAX : *dst = v[count - 1]; break; \ + case VX_NONLINEAR_FILTER_MEDIAN : *dst = v[count / 2]; break; \ + } \ + } \ + } + + +void NonLinearFilter_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_enum *func = (vx_enum *)parameters[0]; + vx_tile_t *in = (vx_tile_t *)parameters[1]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 v[C_MAX_NONLINEAR_DIM * C_MAX_NONLINEAR_DIM]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_size rx0 = mask->origin.x; + vx_size ry0 = mask->origin.y; + vx_size rx1 = mask->columns - mask->origin.x - 1; + vx_size ry1 = mask->rows - mask->origin.y - 1; + + if (low_y == 0 && low_x == 0) + { + NonLinearFilter(low_y, high_y, low_x) + } + else + { + NonLinearFilter(0, low_y, low_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + NonLinearFilter(low_y, high_y, 0) + } + +} diff --git a/kernels/tiling/tiling_nonmaxsuppression.c b/kernels/tiling/tiling_nonmaxsuppression.c new file mode 100644 index 0000000..e758de9 --- /dev/null +++ b/kernels/tiling/tiling_nonmaxsuppression.c @@ -0,0 +1,445 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include + +static void nonMaxSuppression_U8(vx_uint8* src, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint32 srcStride, + vx_uint8* mask, + vx_uint32 maskStride, + vx_uint8* dst, + vx_uint32 dstStride, + vx_int32 windowSize, + vx_uint32 low_height, + vx_uint32 height, + vx_uint32 x_step) +{ + vx_uint32 radius = (vx_uint32)windowSize >> 1; + vx_uint8 *maskCurr = NULL; + vx_uint8 *maskLeftTop = NULL; + vx_uint32 x, y; + uint8x16_t vOne16 = vdupq_n_u8(1); + uint8x8_t vOne8 = vdup_n_u8(1); + for (y = low_height + radius; y < height - radius; y++) + { + vx_uint8 *srcCurr = src + y * srcStride + radius; + vx_uint8 *dstCurr = dst + y * dstStride + radius; + vx_uint8 *leftTop = src + (y - radius) * srcStride; + if (mask) + { + maskCurr = mask + y * maskStride + radius; + maskLeftTop = mask + (y - radius) * maskStride; + } + + for (x = 0; x < x_step; x += 16) + { + uint8x16_t vSrcCurr = vld1q_u8(srcCurr); + uint8x16_t vDstCurr = vld1q_u8(dstCurr); + uint8x16_t vMaskCurr = vdupq_n_u8(0); + if (maskCurr) + { + vMaskCurr = vld1q_u8(maskCurr); + } + uint8x16_t vNeighborCurr; + uint8x16_t vTempResult; + uint8x16_t vFlag = vdupq_n_u8(1); + for (vx_uint32 j = 0; j < windowSize; j++) + { + for (vx_uint32 i = 0; i < windowSize; i++) + { + if (j == radius && i == radius) + continue; + else + { + vNeighborCurr = vld1q_u8(leftTop + j * srcStride + i); + if (mask != NULL) + { + uint8x16_t vMaskNeighborCurr = vld1q_u8(maskLeftTop + j * maskStride + i); + vMaskNeighborCurr = vsubq_u8(vOne16, vorrq_u8(vMaskNeighborCurr, vMaskCurr)); + vNeighborCurr = vmulq_u8(vNeighborCurr, vMaskNeighborCurr); + } + vTempResult = (j < radius || (j == radius && i < radius)) ? vcgeq_u8(vSrcCurr, vNeighborCurr) : vcgtq_u8(vSrcCurr, vNeighborCurr); + vFlag = vmulq_u8(vFlag, vTempResult); + } + } + } + vDstCurr = vmulq_u8(vFlag, vSrcCurr); + vst1q_u8((vx_uint8 *)dstCurr, vDstCurr); + srcCurr += 16; + dstCurr += 16; + leftTop += 16; + if (mask) + { + maskCurr += 16; + maskLeftTop += 16; + } + } + } +} + +void NonMaxSuppression_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_uint8 mask_data = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *mask = (vx_tile_t *)parameters[1]; + vx_int32 *wsize = (vx_int32*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_df_image format = in->image.format; + vx_int32 border = *wsize / 2; + + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + vx_uint32 low_width = out->tile_x; + vx_uint32 width = out->tile_x + out->tile_block.width; + + if(low_height == 0) + { + low_height = low_height + border; + } + if(height == out->image.height) + { + height = height - border; + } + if (format == VX_DF_IMAGE_U8) + { + vx_uint8 *maskCurr = NULL; + vx_uint8 *maskLeftTop = NULL; + uint8x16_t vOne16 = vdupq_n_u8(1); + uint8x8_t vOne8 = vdup_n_u8(1); + for (y = low_height; y < height; y++) + { + vx_uint8 *srcCurr = (vx_uint8 *)in->base[0] + in->tile_x + y * in->addr[0].stride_y + border; + vx_uint8 *dstCurr = (vx_uint8 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y + border; + vx_uint8 *leftTop = (vx_uint8 *)in->base[0] + in->tile_x + (y - border) * in->addr[0].stride_y; + if (mask->base[0] != NULL) + { + maskCurr = mask->base[0] + y * mask->addr[0].stride_y + border; + maskLeftTop = mask->base[0] + (y - border) * mask->addr[0].stride_y; + } 
+ if(low_width == 0) + { + low_width = low_width + border; + } + if(width == out->image.width) + { + width = width - border; + } + for (x = low_width; x < width; x += 16) + { + uint8x16_t vSrcCurr = vld1q_u8(srcCurr); + uint8x16_t vDstCurr = vld1q_u8(dstCurr); + uint8x16_t vMaskCurr = vdupq_n_u8(0); + if (maskCurr) + { + vMaskCurr = vld1q_u8(maskCurr); + } + uint8x16_t vNeighborCurr; + uint8x16_t vTempResult; + uint8x16_t vFlag = vdupq_n_u8(1); + for (vx_uint32 j = 0; j < *wsize; j++) + { + for (vx_uint32 i = 0; i < *wsize; i++) + { + if (j == border && i == border) + continue; + else + { + vNeighborCurr = vld1q_u8(leftTop + j * in->addr[0].stride_y + i); + if (mask->base[0] != NULL) + { + uint8x16_t vMaskNeighborCurr = vld1q_u8(maskLeftTop + j * mask->addr[0].stride_y + i); + vMaskNeighborCurr = vsubq_u8(vOne16, vorrq_u8(vMaskNeighborCurr, vMaskCurr)); + vNeighborCurr = vmulq_u8(vNeighborCurr, vMaskNeighborCurr); + } + vTempResult = (j < border || (j == border && i < border)) ? vcgeq_u8(vSrcCurr, vNeighborCurr) : vcgtq_u8(vSrcCurr, vNeighborCurr); + vFlag = vmulq_u8(vFlag, vTempResult); + } + } + } + vDstCurr = vmulq_u8(vFlag, vSrcCurr); + vst1q_u8((vx_uint8 *)dstCurr, vDstCurr); + srcCurr += 16; + dstCurr += 16; + leftTop += 16; + if (mask->base[0] != NULL) + { + maskCurr += 16; + maskLeftTop += 16; + } + } + } + } + else + { + vx_int32 border = *wsize / 2; + for (vx_int32 y = low_height; y < height; y++) + { + vx_int32 x = 0; + for (x = low_width; x < width; x+=8) + { + uint8x8_t _mask_8x8_o; + vx_uint8 *_maskp_o; + if (mask->base[0] != NULL) + { + _maskp_o = (vx_uint8 *)mask->base[0] + y*mask->addr[0].stride_y + x*mask->addr[0].stride_x; + _mask_8x8_o = vld1_u8(_maskp_o); + } + else + { + _mask_8x8_o = vdup_n_u8(0); + } + vx_int16 *val_p = (vx_int16 *)((vx_uint8 *)in->base[0] + y*in->addr[0].stride_y + x*in->addr[0].stride_x); + vx_int16 *dest = (vx_int16 *)((vx_uint8 *)out->base[0] + y*out->addr[0].stride_y + x*out->addr[0].stride_x); + int16x8_t src_val_16x8 = vld1q_s16(val_p); + + int16x8_t dst_16x8; + uint8x8_t t_8x8 = vdup_n_u8(0); + uint8x8_t maskequal0_8x8_o = vceq_u8(_mask_8x8_o, vdup_n_u8(0)); + dst_16x8 = vbslq_s16(vmovl_u8(maskequal0_8x8_o), dst_16x8, src_val_16x8); + t_8x8 = vbsl_u8(maskequal0_8x8_o, t_8x8, vdup_n_u8(1)); + + for (vx_int32 j = -border; j <= border; j++) + { + for (vx_int32 i = -border; i <= border; i++) + { + vx_int16 *neighbor = (vx_int16 *)((vx_uint8 *)in->base[0] + + (y + j)*in->addr[0].stride_y + + (x + i)*in->addr[0].stride_x); + int16x8_t neighbor_val_16x8 = vld1q_s16(neighbor); + uint8x8_t _mask_8x8_i; + vx_uint8 *_maskp_i; + if (mask->base[0] != NULL) + { + _maskp_i = (vx_uint8 *)mask->base[0] + (y + j)*mask->addr[0].stride_y + (x + i)*mask->addr[0].stride_x; + _mask_8x8_i = vld1_u8(_maskp_i); + } + else + { + _mask_8x8_i = vdup_n_u8(0); + } + + uint8x8_t maskequal0_8x8_i = vceq_u8(_mask_8x8_i, vdup_n_u8(0));//(*_mask == 0) + uint16x8_t j1 = vdupq_n_u16(0); + if(j < 0 || (j == 0 && i <= 0)) + { + j1 = vdupq_n_u16(1); + } + uint16x8_t j2 = vdupq_n_u16(0); + if(j > 0 || (j == 0 && i > 0)) + { + j2 = vdupq_n_u16(1); + } + uint16x8_t srcltval = vcltq_s16(src_val_16x8, neighbor_val_16x8);//< + uint16x8_t srclqval = vcleq_s16(src_val_16x8, neighbor_val_16x8);//<= + + uint16x8_t result_16x8 = vandq_u16(vmovl_u8(maskequal0_8x8_i), + vorrq_u16(vandq_u16(j1, srcltval),vandq_u16(j2,srclqval))); + if(vgetq_lane_u16(result_16x8, 0) != 0 && vget_lane_u8(t_8x8, 0) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 0); + t_8x8 = vset_lane_u8(1, t_8x8, 0); + 
} + if(vget_lane_u8(t_8x8, 0) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 0), dst_16x8, 0); + } + + if(vgetq_lane_u16(result_16x8, 1) != 0 && vget_lane_u8(t_8x8, 1) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 1); + t_8x8 = vset_lane_u8(1, t_8x8, 1); + } + if(vget_lane_u8(t_8x8, 1) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 1), dst_16x8, 1); + } + + if(vgetq_lane_u16(result_16x8, 2) != 0 && vget_lane_u8(t_8x8, 2) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 2); + t_8x8 = vset_lane_u8(1, t_8x8, 2); + } + if(vget_lane_u8(t_8x8, 2) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 2), dst_16x8, 2); + } + + if(vgetq_lane_u16(result_16x8, 3) != 0 && vget_lane_u8(t_8x8, 3) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 3); + t_8x8 = vset_lane_u8(1, t_8x8, 3); + } + if(vget_lane_u8(t_8x8, 3) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 3), dst_16x8, 3); + } + + if(vgetq_lane_u16(result_16x8, 4) != 0 && vget_lane_u8(t_8x8, 4) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 4); + t_8x8 = vset_lane_u8(1, t_8x8, 4); + } + if(vget_lane_u8(t_8x8, 4) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 4), dst_16x8, 4); + } + + if(vgetq_lane_u16(result_16x8, 5) != 0 && vget_lane_u8(t_8x8, 5) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 5); + t_8x8 = vset_lane_u8(1, t_8x8, 5); + } + if(vget_lane_u8(t_8x8, 5) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 5), dst_16x8, 5); + } + + if(vgetq_lane_u16(result_16x8, 6) != 0 && vget_lane_u8(t_8x8, 6) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 6); + t_8x8 = vset_lane_u8(1, t_8x8, 6); + } + if(vget_lane_u8(t_8x8, 6) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 6), dst_16x8, 6); + } + + if(vgetq_lane_u16(result_16x8, 7) != 0 && vget_lane_u8(t_8x8, 7) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 7); + t_8x8 = vset_lane_u8(1, t_8x8, 7); + } + if(vget_lane_u8(t_8x8, 7) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 7), dst_16x8, 7); + } + } + } + vst1q_s16(dest, dst_16x8); + } + } + } +} + +#define NONMAXSUPPRESSION_FLEXIBLE(low_y, low_x, high_y, high_x, in_tile_x, out_tile_x)\ +for (vx_int32 y = low_y; y < high_y; y++)\ +{\ + for (vx_int32 x = low_x; x < high_x; x++)\ + {\ + vx_uint8 *_mask;\ + if (mask->base[0] != NULL)\ + {\ + _mask = (vx_uint8 *)mask->base[0] + mask->tile_x + y * mask->addr[0].stride_y + x * mask->addr[0].stride_x;\ + }\ + else\ + {\ + _mask = &mask_data;\ + }\ + void *val_p = (vx_uint8 *)in->base[0] + in_tile_x + y * in->addr[0].stride_y + x * in->addr[0].stride_x;\ + void *dest = (vx_uint8 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y + x * out->addr[0].stride_x;\ + vx_int32 src_val = format == VX_DF_IMAGE_U8 ? 
*(vx_uint8 *)val_p : *(vx_int16 *)val_p;\ + if (*_mask != 0)\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = (vx_uint8)src_val;\ + else\ + *(vx_int16 *)dest = (vx_int16)src_val;\ + }\ + else\ + {\ + vx_bool flag = 1;\ + for (vx_int32 i = -border; i <= border; i++)\ + {\ + for (vx_int32 j = -border; j <= border; j++)\ + {\ + void *neighbor = (vx_uint8 *)in->base[0] + in_tile_x + (y + j) * in->addr[0].stride_y + (x + i) * in->addr[0].stride_x;\ + if (mask->base[0] != NULL)\ + {\ + _mask = (vx_uint8 *)mask->base[0] + mask->tile_x + (y + j) * mask->addr[0].stride_y + (x + i) * mask->addr[0].stride_x;\ + }\ + else\ + {\ + _mask = &mask_data;\ + }\ + vx_int32 neighbor_val = format == VX_DF_IMAGE_U8 ? *(vx_uint8 *)neighbor : *(vx_int16 *)neighbor;\ + if ((*_mask == 0)\ + && (((j < 0 || (j == 0 && i <= 0)) && (src_val < neighbor_val))\ + || ((j > 0 || (j == 0 && i > 0)) && (src_val <= neighbor_val))))\ + {\ + flag = 0;\ + break;\ + }\ + }\ + if (flag == 0)\ + {\ + break;\ + }\ + }\ + if (flag)\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = (vx_uint8)src_val;\ + else\ + *(vx_int16 *)dest = (vx_int16)src_val;\ + }\ + else\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = 0;\ + else\ + *(vx_int16 *)dest = INT16_MIN;\ + }\ + }\ + }\ +}\ + + +void NonMaxSuppression_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_uint8 mask_data = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *mask = (vx_tile_t *)parameters[1]; + vx_int32 *wsize = (vx_int32*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_df_image format = in->image.format; + vx_int32 border = *wsize / 2; + + if (ty == 0 && tx == 0) + { + NONMAXSUPPRESSION_FLEXIBLE(border, border, (vxTileHeight(out, 0) - border), (vxTileWidth(out, 0) - border), in->tile_x, out->tile_x) + } + else + { + NONMAXSUPPRESSION_FLEXIBLE(border, tx, ty, (vxTileWidth(out, 0) - border), in->tile_x, out->tile_x) + NONMAXSUPPRESSION_FLEXIBLE(ty, border, (vxTileHeight(out, 0) - border), (vxTileWidth(out, 0) - border), 0, 0) + } +} diff --git a/kernels/tiling/tiling_phase.c b/kernels/tiling/tiling_phase.c new file mode 100644 index 0000000..e32dd6d --- /dev/null +++ b/kernels/tiling/tiling_phase.c @@ -0,0 +1,261 @@ +/* +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +#include +#include + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +#define DBL_EPSILON 2.2204460492503131e-016 /* smallest such that 1.0+DBL_EPSILON != 1.0 */ + +static float32x4_t vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +#define FASTATAN2CONST(scale) \ + vx_float32 P1 = ((vx_float32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \ + P3 = ((vx_float32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \ + P5 = ((vx_float32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \ + P7 = ((vx_float32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \ + A_90 = ((vx_float32)(90.f * scale)), \ + A_180 = ((vx_float32)(180.f * scale)), \ + A_360 = ((vx_float32)(360.f * scale)); \ + float32x4_t eps = (vdupq_n_f32((vx_float32)DBL_EPSILON)), \ + _90 = (vdupq_n_f32(A_90)), \ + _180 = (vdupq_n_f32(A_180)), \ + _360 = (vdupq_n_f32(A_360)), \ + z = (vdupq_n_f32(0.0f)), \ + p1 = (vdupq_n_f32(P1)), \ + p3 = (vdupq_n_f32(P3)), \ + p5 = (vdupq_n_f32(P5)), \ + p7 = (vdupq_n_f32(P7)); + +#define FASTATAN2SCALAR(y, x, a) \ + { \ + vx_float32 ax = abs(x), ay = abs(y); \ + vx_float32 c, c2; \ + if (ax >= ay) \ + { \ + c = ay / (ax + (vx_float32)DBL_EPSILON); \ + c2 = c * c; \ + a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + else \ + { \ + c = ax / (ay + (vx_float32)DBL_EPSILON); \ + c2 = c * c; \ + a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + if (x < 0) \ + a = A_180 - a; \ + if (y < 0) \ + a = A_360 - a; \ + } + +#define FASTATAN2VECTOR(v_y, v_x, a) \ + { \ + float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \ + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \ + float32x4_t c = vmulq_f32(tmin, vrecpq_f32(vaddq_f32(tmax, eps))); \ + float32x4_t c2 = vmulq_f32(c, c); \ + a = vmulq_f32(c2, p7); \ + \ + a = vmulq_f32(vaddq_f32(a, p5), c2); \ + a = vmulq_f32(vaddq_f32(a, p3), c2); \ + a = vmulq_f32(vaddq_f32(a, p1), c); \ + \ + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \ + a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \ + a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \ + \ + } + + +void Phase_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *grad_x = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base_x = grad_x->base[0]; + vx_uint8 *src_base_y = grad_y->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + FASTATAN2CONST(256.0f / 360.0f); + vx_uint32 roiw16 = high_x >= 15 ? high_x - 15 : 0; + vx_uint32 roiw8 = high_x >= 7 ? 
high_x - 7 : 0; + + float32x4_t v_05 = vdupq_n_f32(0.5f); + + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + x = low_x; + + for (; x < roiw16; x += 16) + { + int16x8_t v_src00 = vld1q_s16(src0 + x), v_src01 = vld1q_s16(src0 + x + 8); + int16x8_t v_src10 = vld1q_s16(src1 + x), v_src11 = vld1q_s16(src1 + x + 8); + + // 0 + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + // 1 + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1q_u8(dst + x, vcombine_u8(vmovn_u16(v_dst16s0), + vmovn_u16(v_dst16s1))); + } + + for (; x < roiw8; x += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + x); + int16x8_t v_src1 = vld1q_s16(src1 + x); + + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + + for (; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } +} + + +void Phase_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *grad_x = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base_x = grad_x->base[0]; + vx_uint8 *src_base_y = grad_y->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + FASTATAN2CONST(256.0f / 360.0f); + + if (low_y == 0 && low_x == 0) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for 
(x = low_x; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + } + else + { + for (y = 0; y < low_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = low_x; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + } +} diff --git a/kernels/tiling/tiling_scale.c b/kernels/tiling/tiling_scale.c new file mode 100644 index 0000000..37c3c07 --- /dev/null +++ b/kernels/tiling/tiling_scale.c @@ -0,0 +1,841 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +static vx_bool read_pixel(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_int32 x, vx_int32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + if (out_of_bounds) + { + return vx_false_e; + } + + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void read_pixel_v(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + int32x4_t x_32x4, int32x4_t y_32x4, vx_uint8 *dst) +{ + int32x4_t o_32x4 = vdupq_n_s32(0); + int32x4_t dimx_32x4 = vdupq_n_s32((vx_int32)src_width); + int32x4_t dimy_32x4 = vdupq_n_s32((vx_int32)src_height); + uint32x4_t out_of_bounds_32x4 = vorrq_u32(vorrq_u32(vcltq_s32(x_32x4, o_32x4), vcltq_s32(y_32x4, o_32x4)), + vorrq_u32(vcgeq_s32(x_32x4, dimx_32x4),vcgeq_s32(y_32x4, dimy_32x4))); + + char flag_1 = 0; + char flag_2 = 0; + char flag_3 = 0; + char flag_4 = 0; + if(vgetq_lane_u32(out_of_bounds_32x4, 0) == 0xFFFFFFFF) + { + flag_1 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 1) == 0xFFFFFFFF) + { + flag_2 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 2) == 0xFFFFFFFF) + { + flag_3 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 3) == 0xFFFFFFFF) + { + flag_4 = 1; + } + + vx_uint8 *bpixel = NULL; + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = 0; + if(flag_1 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 0) + vgetq_lane_s32(x_32x4, 0)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *dst = *bpixel; + } + if(flag_2 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 1) + vgetq_lane_s32(x_32x4, 1)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+1) = *bpixel; + } + if(flag_3 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 2) + vgetq_lane_s32(x_32x4, 2)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+2) = *bpixel; + } + if(flag_4 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 3) + vgetq_lane_s32(x_32x4, 3)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+3) = *bpixel; + } +} + +static vx_bool read_pixel_16s(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_int32 x, vx_int32 y, vx_int16 *pixel) +{ + vx_uint32 bx; + vx_uint32 by; + vx_int16* bpixel; + + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_int16 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_int16*)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void vxNearestScaling_fast(vx_tile_t *src_image, vx_tile_t *dst_image) +{ + vx_int32 x1,y1,x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + vx_df_image format = 0; + + w1 = src_image->image.width; + h1 = src_image->image.height; + format = src_image->image.format; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + vx_uint32 low_height = dst_image->tile_y; + vx_uint32 height = dst_image->tile_y + dst_image->tile_block.height; + + vx_uint32 low_width = dst_image->tile_x; + vx_uint32 width = dst_image->tile_x + dst_image->tile_block.width; + + float32x4_t fv_0_5_32x4 = vdupq_n_f32(0.5f); + float32x4_t fv_wr_32x4 = vdupq_n_f32(wr); + float32x4_t fv_hr_32x4 = vdupq_n_f32(hr); + for (y2 = low_height; y2 < height; y2++) + { + vx_uint8* dst_u8 = (vx_uint8 *)dst_image->base[0] + dst_image->tile_x + y2 * dst_image->addr[0].stride_y; + vx_int16* dst_base_s16 = (vx_int16*)dst_image->base[0] + dst_image->tile_x + y2 * dst_image->addr[0].stride_y/2; + float32x4_t y2_32x4 = vdupq_n_f32((float32_t)y2); + float32x4_t y_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(y2_32x4,fv_0_5_32x4), fv_hr_32x4), fv_0_5_32x4); + + int32x4_t y1_32x4 = vcvtq_s32_f32(vaddq_f32(y_src_32x4, fv_0_5_32x4)); + for (x2 = low_width; x2 < width; x2 += 8) + { + float32_t arr_int32[4]={(float32_t)x2, (float32_t)(x2+1), (float32_t)(x2+2), (float32_t)(x2+3)}; + float32x4_t x2_32x4 = vld1q_f32(arr_int32); + float32x4_t x_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4,fv_0_5_32x4), fv_wr_32x4), fv_0_5_32x4); + + arr_int32[0] = (float32_t)(x2+4); + arr_int32[1] = (float32_t)(x2+5); + arr_int32[2] = (float32_t)(x2+6); + arr_int32[3] = (float32_t)(x2+7); + float32x4_t x2_32x4_1 = vld1q_f32(arr_int32); + float32x4_t x_src_32x4_1 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4_1,fv_0_5_32x4), fv_wr_32x4), fv_0_5_32x4); + int32x4_t x1_32x4 = vcvtq_s32_f32(vaddq_f32(x_src_32x4, fv_0_5_32x4)); + int32x4_t x1_32x4_1 = vcvtq_s32_f32(vaddq_f32(x_src_32x4_1, fv_0_5_32x4)); + + if (VX_DF_IMAGE_U8 == format) + { + read_pixel_v((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1_32x4, y1_32x4, dst_u8); + read_pixel_v((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1_32x4_1, y1_32x4, (dst_u8+4)); + dst_u8 += 8; + } + else + { + vx_int16 v = 0; + vx_int16* dst = dst_base_s16 + dst_image->addr[0].stride_x/2*x2; + if (dst && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 0),vgetq_lane_s32(y1_32x4, 0),&v)) + *dst = v; + v=0; + if ((dst+1) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 1),vgetq_lane_s32(y1_32x4, 1),&v)) + *(dst+1) = v; + v=0; + if ((dst+2) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 2),vgetq_lane_s32(y1_32x4, 2),&v)) + *(dst+2) = v; + v=0; + if ((dst+3) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + 
src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 3),vgetq_lane_s32(y1_32x4, 3),&v)) + *(dst+3) = v; + + if ((dst+4) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 0),vgetq_lane_s32(y1_32x4, 0),&v)) + *(dst+4) = v; + v=0; + if ((dst+5) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 1),vgetq_lane_s32(y1_32x4, 1),&v)) + *(dst+5) = v; + v=0; + if ((dst+6) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 2),vgetq_lane_s32(y1_32x4, 2),&v)) + *(dst+6) = v; + v=0; + if ((dst+7) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 3),vgetq_lane_s32(y1_32x4, 3),&v)) + *(dst+7) = v; + } + } + } +} + +static void vxBilinearScaling_fast(vx_tile_t *src_image, vx_tile_t *dst_image) +{ + vx_int32 x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_imagepatch_addressing_t src_addr, dst_addr; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + + w1 = src_image->image.width; + h1 = src_image->image.height; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + vx_uint32 low_height = dst_image->tile_y; + vx_uint32 height = dst_image->tile_y + dst_image->tile_block.height; + + vx_uint32 low_width = dst_image->tile_x; + vx_uint32 width = dst_image->tile_x + dst_image->tile_block.width; + + for (y2 = low_height; y2 < height; y2++) + { + for (x2 = low_width; x2 < width; x2 += 8) + { + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0; + vx_uint8* dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + dst_image->addr[0].stride_x*x2; + float32x4_t x2_32x4; + x2_32x4 = vsetq_lane_f32((vx_float32)x2, x2_32x4, 0); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+1), x2_32x4, 1); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+2), x2_32x4, 2); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+3), x2_32x4, 3); + float32x4_t x_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4,vdupq_n_f32(0.5f)), vdupq_n_f32(wr)),vdupq_n_f32(0.5f)); + + float32x4_t x2_32x4_1; + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+4), x2_32x4_1, 0); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+5), x2_32x4_1, 1); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+6), x2_32x4_1, 2); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+7), x2_32x4_1, 3); + float32x4_t x_src_32x4_1 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4_1,vdupq_n_f32(0.5f)), vdupq_n_f32(wr)),vdupq_n_f32(0.5f)); + + float32x4_t y2_32x4 = vdupq_n_f32((vx_float32)y2); + float32x4_t y_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(y2_32x4,vdupq_n_f32(0.5f)), vdupq_n_f32(hr)),vdupq_n_f32(0.5f)); + + float32x4_t x_min_32x4; + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 0)), x_min_32x4, 0); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 1)), x_min_32x4, 1); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 2)), x_min_32x4, 2); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 3)), x_min_32x4, 3); + + float32x4_t x_min_32x4_1; + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 0)), x_min_32x4_1, 0); + x_min_32x4_1 = 
vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 1)), x_min_32x4_1, 1); + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 2)), x_min_32x4_1, 2); + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 3)), x_min_32x4_1, 3); + + float32x4_t y_min_32x4; + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 0)), y_min_32x4, 0); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 1)), y_min_32x4, 1); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 2)), y_min_32x4, 2); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 3)), y_min_32x4, 3); + + float32x4_t s_32x4 = vsubq_f32(x_src_32x4, x_min_32x4); + float32x4_t s_32x4_1 = vsubq_f32(x_src_32x4_1, x_min_32x4_1); + + float32_t s_0 = vgetq_lane_f32(s_32x4, 0); + float32_t s_1 = vgetq_lane_f32(s_32x4, 1); + float32_t s_2 = vgetq_lane_f32(s_32x4, 2); + float32_t s_3 = vgetq_lane_f32(s_32x4, 3); + + float32_t s_4 = vgetq_lane_f32(s_32x4_1, 0); + float32_t s_5 = vgetq_lane_f32(s_32x4_1, 1); + float32_t s_6 = vgetq_lane_f32(s_32x4_1, 2); + float32_t s_7 = vgetq_lane_f32(s_32x4_1, 3); + + float32x4_t t_32x4 = vsubq_f32(y_src_32x4, y_min_32x4); + + float32_t t_0 = vgetq_lane_f32(t_32x4, 0); + float32_t t_1 = vgetq_lane_f32(t_32x4, 1); + float32_t t_2 = vgetq_lane_f32(t_32x4, 2); + float32_t t_3 = vgetq_lane_f32(t_32x4, 3); + + // the first time + vx_bool defined_tl_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 0), &tl); + vx_bool defined_tr_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 0), &tr); + vx_bool defined_bl_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 0)+1, &bl); + vx_bool defined_br_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 0)+1, &br); + vx_bool defined_0 = defined_tl_0 & defined_tr_0 & defined_bl_0 & defined_br_0; + if (defined_0 == vx_false_e) + { + vx_bool defined_any = defined_tl_0 | defined_tr_0 | defined_bl_0 | defined_br_0; + if (defined_any) + { + if ((defined_tl_0 == vx_false_e || defined_tr_0 == vx_false_e) && fabs(t_0 - 1.0) <= 0.001) + defined_tl_0 = defined_tr_0 = vx_true_e; + else if ((defined_bl_0 == vx_false_e || defined_br_0 == vx_false_e) && fabs(t_0 - 0.0) <= 0.001) + defined_bl_0 = defined_br_0 = vx_true_e; + if ((defined_tl_0 == vx_false_e || defined_bl_0 == vx_false_e) && fabs(s_0 - 1.0) <= 0.001) + defined_tl_0 = defined_bl_0 = vx_true_e; + else if ((defined_tr_0 == vx_false_e || defined_br_0 == vx_false_e) && fabs(s_0 - 0.0) <= 0.001) + defined_tr_0 = defined_br_0 = vx_true_e; + defined_0 = defined_tl_0 & defined_tr_0 & defined_bl_0 & defined_br_0; + } + } + if (defined_0 == vx_true_e) + { + vx_float32 ref = + (1 - s_0) * (1 - t_0) * tl + + ( s_0) * (1 - t_0) * tr + + (1 - s_0) * ( t_0) * bl + + ( s_0) * ( t_0) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst) + *dst = ref_8u; + } + + // the second time + vx_bool defined_tl_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 1), &tl); + vx_bool defined_tr_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, 
(vx_int32)vgetq_lane_f32(x_min_32x4, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 1), &tr); + vx_bool defined_bl_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 1)+1, &bl); + vx_bool defined_br_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 1)+1, &br); + vx_bool defined_1 = defined_tl_1 & defined_tr_1 & defined_bl_1 & defined_br_1; + if (defined_1 == vx_false_e) + { + vx_bool defined_any = defined_tl_1 | defined_tr_1 | defined_bl_1 | defined_br_1; + if (defined_any) + { + if ((defined_tl_1 == vx_false_e || defined_tr_1 == vx_false_e) && fabs(t_1 - 1.0) <= 0.001) + defined_tl_1 = defined_tr_1 = vx_true_e; + else if ((defined_bl_1 == vx_false_e || defined_br_1 == vx_false_e) && fabs(t_1 - 0.0) <= 0.001) + defined_bl_1 = defined_br_1 = vx_true_e; + if ((defined_tl_1 == vx_false_e || defined_bl_1 == vx_false_e) && fabs(s_1 - 1.0) <= 0.001) + defined_tl_1 = defined_bl_1 = vx_true_e; + else if ((defined_tr_1 == vx_false_e || defined_br_1 == vx_false_e) && fabs(s_1 - 0.0) <= 0.001) + defined_tr_1 = defined_br_1 = vx_true_e; + defined_1 = defined_tl_1 & defined_tr_1 & defined_bl_1 & defined_br_1; + } + } + if (defined_1 == vx_true_e) + { + vx_float32 ref = + (1 - s_1) * (1 - t_1) * tl + + ( s_1) * (1 - t_1) * tr + + (1 - s_1) * ( t_1) * bl + + ( s_1) * ( t_1) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+1) + *(dst+1) = ref_8u; + } + + // the third time + vx_bool defined_tl_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 2), &tl); + vx_bool defined_tr_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 2), &tr); + vx_bool defined_bl_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 2)+1, &bl); + vx_bool defined_br_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 2)+1, &br); + vx_bool defined_2 = defined_tl_2 & defined_tr_2 & defined_bl_2 & defined_br_2; + if (defined_2 == vx_false_e) + { + vx_bool defined_any = defined_tl_2 | defined_tr_2 | defined_bl_2 | defined_br_2; + if (defined_any) + { + if ((defined_tl_2 == vx_false_e || defined_tr_2 == vx_false_e) && fabs(t_2 - 1.0) <= 0.001) + defined_tl_2 = defined_tr_2 = vx_true_e; + else if ((defined_bl_2 == vx_false_e || defined_br_2 == vx_false_e) && fabs(t_2 - 0.0) <= 0.001) + defined_bl_2 = defined_br_2 = vx_true_e; + if ((defined_tl_2 == vx_false_e || defined_bl_2 == vx_false_e) && fabs(s_2 - 1.0) <= 0.001) + defined_tl_2 = defined_bl_2 = vx_true_e; + else if ((defined_tr_2 == vx_false_e || defined_br_2 == vx_false_e) && fabs(s_2 - 0.0) <= 0.001) + defined_tr_2 = defined_br_2 = vx_true_e; + defined_2 = defined_tl_2 & defined_tr_2 & defined_bl_2 & defined_br_2; + } + } + if (defined_2 == vx_true_e) + { + vx_float32 ref = + (1 - s_2) * (1 - t_2) * tl + + ( s_2) * (1 - t_2) * tr + + (1 - s_2) * ( t_2) * bl + + ( s_2) * ( t_2) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+2) + *(dst+2) = ref_8u; + } + + // the fourth time + vx_bool defined_tl_3 
= read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_3 = defined_tl_3 & defined_tr_3 & defined_bl_3 & defined_br_3; + if (defined_3 == vx_false_e) + { + vx_bool defined_any = defined_tl_3 | defined_tr_3 | defined_bl_3 | defined_br_3; + if (defined_any) + { + if ((defined_tl_3 == vx_false_e || defined_tr_3 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_3 = defined_tr_3 = vx_true_e; + else if ((defined_bl_3 == vx_false_e || defined_br_3 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_3 = defined_br_3 = vx_true_e; + if ((defined_tl_3 == vx_false_e || defined_bl_3 == vx_false_e) && fabs(s_3 - 1.0) <= 0.001) + defined_tl_3 = defined_bl_3 = vx_true_e; + else if ((defined_tr_3 == vx_false_e || defined_br_3 == vx_false_e) && fabs(s_3 - 0.0) <= 0.001) + defined_tr_3 = defined_br_3 = vx_true_e; + defined_3 = defined_tl_3 & defined_tr_3 & defined_bl_3 & defined_br_3; + } + } + if (defined_3 == vx_true_e) + { + vx_float32 ref = + (1 - s_3) * (1 - t_3) * tl + + ( s_3) * (1 - t_3) * tr + + (1 - s_3) * ( t_3) * bl + + ( s_3) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+3) + *(dst+3) = ref_8u; + } + + // the fifth time + vx_bool defined_tl_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_4 = defined_tl_4 & defined_tr_4 & defined_bl_4 & defined_br_4; + if (defined_4 == vx_false_e) + { + vx_bool defined_any = defined_tl_4 | defined_tr_4 | defined_bl_4 | defined_br_4; + if (defined_any) + { + if ((defined_tl_4 == vx_false_e || defined_tr_4 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_4 = defined_tr_4 = vx_true_e; + else if ((defined_bl_4 == vx_false_e || defined_br_4 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_4 = defined_br_4 = vx_true_e; + if ((defined_tl_4 == vx_false_e || defined_bl_4 == vx_false_e) && fabs(s_4 - 1.0) <= 0.001) + defined_tl_4 = defined_bl_4 = vx_true_e; + else if ((defined_tr_4 == vx_false_e || defined_br_4 == vx_false_e) && fabs(s_4 - 0.0) <= 0.001) + defined_tr_4 = defined_br_4 = vx_true_e; + defined_4 = defined_tl_4 & defined_tr_4 & defined_bl_4 & defined_br_4; + } + } + if (defined_4 == vx_true_e) + { + vx_float32 ref = + (1 - s_4) * (1 - 
t_3) * tl + + ( s_4) * (1 - t_3) * tr + + (1 - s_4) * ( t_3) * bl + + ( s_4) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+4) + *(dst+4) = ref_8u; + } + + // the sixth time + vx_bool defined_tl_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_5 = defined_tl_5 & defined_tr_5 & defined_bl_5 & defined_br_5; + if (defined_5 == vx_false_e) + { + vx_bool defined_any = defined_tl_5 | defined_tr_5 | defined_bl_5 | defined_br_5; + if (defined_any) + { + if ((defined_tl_5 == vx_false_e || defined_tr_5 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_5 = defined_tr_5 = vx_true_e; + else if ((defined_bl_5 == vx_false_e || defined_br_5 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_5 = defined_br_5 = vx_true_e; + if ((defined_tl_5 == vx_false_e || defined_bl_5 == vx_false_e) && fabs(s_5 - 1.0) <= 0.001) + defined_tl_5 = defined_bl_5 = vx_true_e; + else if ((defined_tr_5 == vx_false_e || defined_br_5 == vx_false_e) && fabs(s_5 - 0.0) <= 0.001) + defined_tr_5 = defined_br_5 = vx_true_e; + defined_5 = defined_tl_5 & defined_tr_5 & defined_bl_5 & defined_br_5; + } + } + if (defined_5 == vx_true_e) + { + vx_float32 ref = + (1 - s_5) * (1 - t_3) * tl + + ( s_5) * (1 - t_3) * tr + + (1 - s_5) * ( t_3) * bl + + ( s_5) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+5) + *(dst+5) = ref_8u; + } + + // the seventh time + vx_bool defined_tl_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_6 = defined_tl_6 & defined_tr_6 & defined_bl_6 & defined_br_6; + if (defined_6 == vx_false_e) + { + vx_bool defined_any = defined_tl_6 | defined_tr_6 | defined_bl_6 | defined_br_6; + if (defined_any) + { + if ((defined_tl_6 == vx_false_e || defined_tr_6 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_6 = defined_tr_6 = vx_true_e; + else if ((defined_bl_6 == vx_false_e || defined_br_6 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_6 = defined_br_6 = vx_true_e; + if ((defined_tl_6 == vx_false_e || defined_bl_6 == vx_false_e) && fabs(s_6 - 1.0) <= 0.001) + defined_tl_6 = defined_bl_6 = vx_true_e; + else if ((defined_tr_6 == 
vx_false_e || defined_br_6 == vx_false_e) && fabs(s_6 - 0.0) <= 0.001) + defined_tr_6 = defined_br_6 = vx_true_e; + defined_6 = defined_tl_6 & defined_tr_6 & defined_bl_6 & defined_br_6; + } + } + if (defined_6 == vx_true_e) + { + vx_float32 ref = + (1 - s_6) * (1 - t_3) * tl + + ( s_6) * (1 - t_3) * tr + + (1 - s_6) * ( t_3) * bl + + ( s_6) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+6) + *(dst+6) = ref_8u; + } + + // the eighth time + vx_bool defined_tl_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_7 = defined_tl_7 & defined_tr_7 & defined_bl_7 & defined_br_7; + if (defined_7 == vx_false_e) + { + vx_bool defined_any = defined_tl_7 | defined_tr_7 | defined_bl_7 | defined_br_7; + if (defined_any) + { + if ((defined_tl_7 == vx_false_e || defined_tr_7 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_7 = defined_tr_7 = vx_true_e; + else if ((defined_bl_7 == vx_false_e || defined_br_7 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_7 = defined_br_7 = vx_true_e; + if ((defined_tl_7 == vx_false_e || defined_bl_7 == vx_false_e) && fabs(s_7 - 1.0) <= 0.001) + defined_tl_7 = defined_bl_7 = vx_true_e; + else if ((defined_tr_7 == vx_false_e || defined_br_7 == vx_false_e) && fabs(s_7 - 0.0) <= 0.001) + defined_tr_7 = defined_br_7 = vx_true_e; + defined_7 = defined_tl_7 & defined_tr_7 & defined_bl_7 & defined_br_7; + } + } + if (defined_7 == vx_true_e) + { + vx_float32 ref = + (1 - s_7) * (1 - t_3) * tl + + ( s_7) * (1 - t_3) * tr + + (1 - s_7) * ( t_3) * bl + + ( s_7) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+7) + *(dst+7) = ref_8u; + } + } + } +} + +void ScaleImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *type = (vx_enum*)parameters[2]; + if (*type == VX_INTERPOLATION_BILINEAR) + { + vxBilinearScaling_fast(in, out); + } + else if (*type == VX_INTERPOLATION_AREA) + { + vxNearestScaling_fast(in, out); + } + else if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vxNearestScaling_fast(in, out); + } +} + + +#define NEAREST_SCALING(low_y, low_x, high_y, high_x, src_image_tile_x, dst_image_tile_x) \ + for (y2 = low_y; y2 < high_y; y2++)\ + {\ + for (x2 = low_x; x2 < high_x; x2++)\ + {\ + if (VX_DF_IMAGE_U8 == format)\ + {\ + vx_uint8 v = 0;\ + vx_uint8 *dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + x2 * dst_image->addr[0].stride_x;\ + vx_float32 x_src = ((vx_float32)x2 + 0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2 + 0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + x1 = (vx_int32)x_min;\ + y1 = (vx_int32)y_min;\ + if (x_src - 
x_min >= 0.5f)\ + x1++;\ + if (y_src - y_min >= 0.5f)\ + y1++;\ + if (dst && vx_true_e == read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1, y1, &v))\ + *dst = v;\ + }\ + else\ + {\ + vx_int16 v = 0;\ + vx_int16 *dst = (vx_int16 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y / 2+ x2 * dst_image->addr[0].stride_x /2;\ + vx_float32 x_src = ((vx_float32)x2 + 0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2 + 0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + x1 = (vx_int32)x_min;\ + y1 = (vx_int32)y_min;\ + if (x_src - x_min >= 0.5f)\ + x1++;\ + if (y_src - y_min >= 0.5f)\ + y1++;\ + if (dst && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0], src_image->addr, h1, w1, x1, y1, &v))\ + *dst = v;\ + }\ + }\ + }\ + +static void vxNearestScaling(vx_tile_t *src_image, vx_tile_t *dst_image, vx_uint32 ty, vx_uint32 tx) +{ + vx_int32 x1,y1,x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + vx_df_image format = 0; + + w1 = src_image->image.width; + h1 = src_image->image.height; + format = src_image->image.format; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + if (ty == 0 && tx == 0) + { + NEAREST_SCALING(0, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + } + else + { + NEAREST_SCALING(0, tx, ty, vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + NEAREST_SCALING(ty, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), 0, 0) + } +} + +#define BILINEAR_SCALING(low_y, low_x, high_y, high_x, src_image_tile_x, dst_image_tile_x) \ + for (y2 = low_y; y2 < high_y; y2++)\ + {\ + for (x2 = low_x; x2 < high_x; x2++)\ + {\ + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0;\ + vx_uint8 *dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + x2 * dst_image->addr[0].stride_x;\ + vx_float32 x_src = ((vx_float32)x2+0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2+0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + vx_int32 x1 = (vx_int32)x_min;\ + vx_int32 y1 = (vx_int32)y_min;\ + vx_float32 s = x_src - x_min;\ + vx_float32 t = y_src - y_min;\ + vx_bool defined_tl = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 0, y1 + 0, &tl);\ + vx_bool defined_tr = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 1, y1 + 0, &tr);\ + vx_bool defined_bl = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 0, y1 + 1, &bl);\ + vx_bool defined_br = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 1, y1 + 1, &br);\ + vx_bool defined = defined_tl & defined_tr & defined_bl & defined_br;\ + if (defined == vx_false_e)\ + {\ + vx_bool defined_any = defined_tl | defined_tr | defined_bl | defined_br;\ + if (defined_any)\ + {\ + if ((defined_tl == vx_false_e || defined_tr == vx_false_e) && fabs(t - 1.0) <= 0.001)\ + defined_tl = defined_tr = vx_true_e;\ + else if ((defined_bl == vx_false_e || defined_br == vx_false_e) && fabs(t - 0.0) <= 0.001)\ + defined_bl = defined_br = vx_true_e;\ + if ((defined_tl == vx_false_e || defined_bl == vx_false_e) && fabs(s - 1.0) <= 0.001)\ + defined_tl = defined_bl = 
vx_true_e;\ + else if ((defined_tr == vx_false_e || defined_br == vx_false_e) && fabs(s - 0.0) <= 0.001)\ + defined_tr = defined_br = vx_true_e;\ + defined = defined_tl & defined_tr & defined_bl & defined_br;\ + }\ + }\ + if (defined == vx_true_e)\ + {\ + vx_float32 ref =\ + (1 - s) * (1 - t) * tl +\ + ( s) * (1 - t) * tr +\ + (1 - s) * ( t) * bl +\ + ( s) * ( t) * br;\ + vx_uint8 ref_8u;\ + if (ref > 255)\ + ref_8u = 255;\ + else\ + ref_8u = (vx_uint8)ref;\ + if (dst)\ + *dst = ref_8u;\ + }\ + }\ + }\ + + +static void vxBilinearScaling(vx_tile_t *src_image, vx_tile_t *dst_image, vx_uint32 ty, vx_uint32 tx) +{ + vx_int32 x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_imagepatch_addressing_t src_addr, dst_addr; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + + w1 = src_image->image.width; + h1 = src_image->image.height; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + if (ty == 0 && tx == 0) + { + BILINEAR_SCALING(0, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + } + else + { + BILINEAR_SCALING(0, tx, ty, vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + BILINEAR_SCALING(ty, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), 0, 0) + } +} + +void ScaleImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *type = (vx_enum*)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (*type == VX_INTERPOLATION_BILINEAR) + { + vxBilinearScaling(in, out, ty, tx); + } + else if (*type == VX_INTERPOLATION_AREA) + { + vxNearestScaling(in, out, ty, tx); + } + else if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vxNearestScaling(in, out, ty, tx); + } +} + + diff --git a/kernels/tiling/tiling_sobel3x3.c b/kernels/tiling/tiling_sobel3x3.c new file mode 100644 index 0000000..c5eeca4 --- /dev/null +++ b/kernels/tiling/tiling_sobel3x3.c @@ -0,0 +1,239 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +#define SOBEL3x3_VALUE \ + const uint8x16_t top_data = vld1q_u8(top_src); \ + const uint8x16_t mid_data = vld1q_u8(mid_src); \ + const uint8x16_t bot_data = vld1q_u8(bot_src); \ + \ + const int16x8x2_t top_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) \ + } \ + }; \ + \ + const int16x8x2_t mid_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) \ + } \ + }; \ + const int16x8x2_t bot_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) \ + } \ + }; + +void Sobel3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + int16x8_t two = vdupq_n_s16(2); + int16x8_t minustwo = vdupq_n_s16(-2); + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_x = (vx_tile_t *)parameters[1]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + + if (grad_x) + { + vx_int16 *grad_x_base = (vx_int16 *)grad_x->base[0] + grad_x->tile_x; + + vx_uint32 low_y = grad_x->tile_y; + vx_uint32 high_y = grad_x->tile_y + grad_x->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == grad_x->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_int16* dstp = (vx_int16 *)grad_x_base + 1 + y * grad_x->addr->stride_y / 2; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < grad_x->tile_block.width; x += 8) + { + SOBEL3x3_VALUE + //top left + int16x8_t out_x = vnegq_s16(top_s16.val[0]); + //top right + out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo); + //mid right + out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out_x = vsubq_s16(out_x, bot_s16.val[0]); + //bot right + out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(dstp, out_x); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dstp += 8; + } + } + } + if (grad_y) + { + vx_int16 *grad_y_base = (vx_int16 *)grad_y->base[0] + grad_y->tile_x; + + vx_uint32 low_y = grad_y->tile_y; + vx_uint32 high_y = grad_y->tile_y + grad_y->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == grad_y->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_int16* dstp = (vx_int16 *)grad_y_base + 1 + y * grad_y->addr->stride_y / 2; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < grad_y->tile_block.width; x += 8) + { + SOBEL3x3_VALUE + //top left + int16x8_t out_y = vnegq_s16(top_s16.val[0]); + //top mid + out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo); + //top right + out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //bot left + out_y = vaddq_s16(out_y, bot_s16.val[0]); + //bot mid + out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + 
out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(dstp, out_y); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dstp += 8; + } + } + } +} + + +#define SOBEL3x3_X(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 value = 0; \ + \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, 0) << 1; \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, 0) << 1; \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + \ + vxImagePixel(vx_int16, grad_x, 0, x, y, 0, 0) = (vx_int16)value; \ + } \ + } + +#define SOBEL3x3_Y(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 value = 0; \ + \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, 0, -1) << 1; \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, 0, +1) << 1; \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + \ + vxImagePixel(vx_int16, grad_y, 0, x, y, 0, 0) = (vx_int16)value; \ + } \ + } + + +void Sobel3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_x = (vx_tile_t *)parameters[1]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[2]; + + if (grad_x) + { + vx_uint32 low_y = grad_x->tile_y; + vx_uint32 high_y = vxTileHeight(grad_x, 0); + vx_uint32 low_x = grad_x->tile_x; + vx_uint32 high_x = vxTileWidth(grad_x, 0); + if (low_y == 0 && low_x == 0) + { + SOBEL3x3_X(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + SOBEL3x3_X(1, low_y, low_x, high_x - 1) + SOBEL3x3_X(low_y, high_y, 1, high_x - 1) + } + } + if (grad_y) + { + vx_uint32 low_y = grad_y->tile_y; + vx_uint32 high_y = vxTileHeight(grad_y, 0); + vx_uint32 low_x = grad_y->tile_x; + vx_uint32 high_x = vxTileWidth(grad_y, 0); + if (low_y == 0 && low_x == 0) + { + SOBEL3x3_Y(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + SOBEL3x3_Y(1, low_y, low_x, high_x - 1) + SOBEL3x3_Y(low_y, high_y, 1, high_x - 1) + } + } +} diff --git a/kernels/tiling/tiling_threshold.c b/kernels/tiling/tiling_threshold.c new file mode 100644 index 0000000..b8b8413 --- /dev/null +++ b/kernels/tiling/tiling_threshold.c @@ -0,0 +1,296 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +void Threshold_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_threshold_t *threshold = (vx_tile_threshold_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint8 true_value_u8 = threshold->true_value.U8; + vx_uint8 false_value_u8 = threshold->false_value.U8; + + vx_uint8 _threshold_u8 = threshold->value.U8; + vx_uint8 _lower_threshold_u8 = threshold->lower.U8; + vx_uint8 _upper_threshold_u8 = threshold->upper.U8; + + vx_int16 _threshold_s16 = threshold->value.S16; + vx_int16 _lower_threshold_s16 = threshold->lower.S16; + vx_int16 _upper_threshold_s16 = threshold->upper.S16; + + vx_int32 format = threshold->input_format; + vx_int32 type = threshold->thresh_type; + + if (format == VX_DF_IMAGE_S16) + {//case of input: VX_DF_IMAGE_S16 -> output: VX_DF_IMAGE_U8 + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + const uint8x8_t true_value = vdup_n_u8(true_value_u8); + const uint8x8_t false_value = vdup_n_u8(false_value_u8); + + const int16x8_t threshold = vdupq_n_s16(_threshold_s16); + const int16x8_t lower_threshold = vdupq_n_s16(_lower_threshold_s16); + const int16x8_t upper_threshold = vdupq_n_s16(_upper_threshold_s16); + + if (type == VX_THRESHOLD_TYPE_BINARY) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 *src_ptr = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const int16x8_t vSrc = vld1q_s16(src_ptr); + uint8x8_t mask = vmovn_u16(vcgtq_s16(vSrc, threshold)); + uint8x8_t dst_value = vbsl_u8(mask, true_value, false_value); + + vst1_u8(dst_ptr, dst_value); + + src_ptr += 8; + dst_ptr += 8; + } + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 *src_ptr = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + const int16x8_t vSrc = vld1q_s16(src_ptr); + uint16x8_t _mask = vcleq_s16(vSrc, upper_threshold); + _mask = vandq_u16(vcgeq_s16(vSrc, lower_threshold), _mask); + uint8x8_t mask = vmovn_u16(_mask); + vst1_u8(dst_ptr, vbsl_u8(mask, true_value, false_value)); + + src_ptr += 8; + dst_ptr += 8; + } + } + } + } + else + {//case of input: VX_DF_IMAGE_U8 -> output: VX_DF_IMAGE_U8 + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + const uint8x16_t true_value = vdupq_n_u8(true_value_u8); + const uint8x16_t false_value = vdupq_n_u8(false_value_u8); + + const uint8x16_t threshold = vdupq_n_u8(_threshold_u8); + const uint8x16_t lower_threshold = vdupq_n_u8(_lower_threshold_u8); + const uint8x16_t upper_threshold = vdupq_n_u8(_upper_threshold_u8); + + if (type == VX_THRESHOLD_TYPE_BINARY) + { + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *src_ptr = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 16) + { + const uint8x16_t vSrc = vld1q_u8(src_ptr); + uint8x16_t mask = vcgtq_u8(vSrc, threshold); + vst1q_u8(dst_ptr, vbslq_u8(mask, true_value, false_value)); + + src_ptr += 16; + 
dst_ptr += 16; + } + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *src_ptr = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 16) + { + const uint8x16_t vSrc = vld1q_u8(src_ptr); + uint8x16_t mask = vcleq_u8(vSrc, upper_threshold); + mask = vandq_u8(vcgeq_u8(vSrc, lower_threshold), mask); + vst1q_u8(dst_ptr, vbslq_u8(mask, true_value, false_value)); + + src_ptr += 16; + dst_ptr += 16; + } + } + } + } +} + + +#define vxThreshold_BINARY(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + const type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*src_ptr > _threshold_s16) \ + { \ + *dst_ptr = true_value_u8; \ + } \ + else \ + { \ + *dst_ptr = false_value_u8; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } \ + + +#define vxThreshold_RANGE(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + const type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*src_ptr > _upper_threshold_s16) \ + { \ + *dst_ptr = false_value_u8; \ + } \ + else if (*src_ptr < _lower_threshold_s16) \ + { \ + *dst_ptr = false_value_u8; \ + } \ + else \ + { \ + *dst_ptr = true_value_u8; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } \ + + +void Threshold_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_threshold_t *threshold = (vx_tile_threshold_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 true_value_u8 = threshold->true_value.U8; + vx_uint8 false_value_u8 = threshold->false_value.U8; + + vx_uint8 _threshold_u8 = threshold->value.U8; + vx_uint8 _lower_threshold_u8 = threshold->lower.U8; + vx_uint8 _upper_threshold_u8 = threshold->upper.U8; + + vx_int16 _threshold_s16 = threshold->value.S16; + vx_int16 _lower_threshold_s16 = threshold->lower.S16; + vx_int16 _upper_threshold_s16 = threshold->upper.S16; + + vx_int32 format = threshold->input_format; + vx_int32 type = threshold->thresh_type; + + if (format == VX_DF_IMAGE_S16) + {//case of input: VX_DF_IMAGE_S16 -> output: VX_DF_IMAGE_U8 + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + if (type == VX_THRESHOLD_TYPE_BINARY) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_BINARY(vx_int16, low_y, high_y, low_x, high_x, 2) + } + else + { + vxThreshold_BINARY(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = out->base[0]; + vxThreshold_BINARY(vx_int16, low_y, high_y, 0, high_x, 2) + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_RANGE(vx_int16, low_y, high_y, low_x, high_x, 2) + } + else + { + vxThreshold_RANGE(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = out->base[0]; + vxThreshold_RANGE(vx_int16, low_y, high_y, 0, 
high_x, 2) + } + } + } + else + {//case of input: VX_DF_IMAGE_U8 -> output: VX_DF_IMAGE_U8 + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + if (type == VX_THRESHOLD_TYPE_BINARY) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_BINARY(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else + { + vxThreshold_BINARY(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + vxThreshold_BINARY(vx_uint8, low_y, high_y, 0, high_x, 1) + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_RANGE(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else + { + vxThreshold_RANGE(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + vxThreshold_RANGE(vx_uint8, low_y, high_y, 0, high_x, 1) + } + } + } +} diff --git a/kernels/tiling/tiling_warp.c b/kernels/tiling/tiling_warp.c new file mode 100644 index 0000000..95e6527 --- /dev/null +++ b/kernels/tiling/tiling_warp.c @@ -0,0 +1,619 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +#include + +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +static vx_int32 * alignPtr(vx_int32* ptr, size_t n) +{ + return (vx_int32 *)(((size_t)ptr + n-1) & -n); +} + +static vx_float32 * alignPtr_f(vx_float32* ptr, size_t n) +{ + return (vx_float32 *)(((size_t)ptr + n-1) & -n); +} + +static void remapNearestNeighborConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32)); + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)dstBase + y * dstStride); + + for (size_t x = 0; x < width; ++x) + { + vx_int32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +static void remapLinearConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + const vx_float32 * coeffs, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32) * 4); + const vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * width * sizeof(vx_float32) * 2); + + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)(dstBase) + y * dstStride); + + size_t x = 0; + + for ( ; x + 8 < width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? 
srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? 
srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + for ( ; x < width; ++x) + { + int16_t src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + int16_t src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + int16_t src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + int16_t src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue; + + vx_float32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + vx_float32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +//BLOCK_SIZE is the same as tile_size set in "vx_warp.c" +#define BLOCK_SIZE 16 + +void WarpAffine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1), v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(mask->m_f32[0]); + float32x4_t v_m1 = vdupq_n_f32(mask->m_f32[1]); + float32x4_t v_m2 = vdupq_n_f32(mask->m_f32[2]); + float32x4_t v_m3 = vdupq_n_f32(mask->m_f32[3]); + float32x4_t v_m4 = vdupq_n_f32(mask->m_f32[4]); + float32x4_t v_m5 = vdupq_n_f32(mask->m_f32[5]); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + 
vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0], dstBase + j, dstStride, borderValue); + } + else if (*type == VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + int32x4_t v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + 
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + +static inline float32x4_t vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +void WarpPerspective_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1); + int32x4_t v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(mask->m_f32[0]); + float32x4_t v_m1 = vdupq_n_f32(mask->m_f32[1]); + float32x4_t v_m2 = vdupq_n_f32(mask->m_f32[2]); + float32x4_t v_m3 = vdupq_n_f32(mask->m_f32[3]); + float32x4_t v_m4 = vdupq_n_f32(mask->m_f32[4]); + float32x4_t v_m5 = vdupq_n_f32(mask->m_f32[5]); + float32x4_t v_m6 = vdupq_n_f32(mask->m_f32[6]); + float32x4_t v_m7 = vdupq_n_f32(mask->m_f32[7]); + float32x4_t v_m8 = vdupq_n_f32(mask->m_f32[8]); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y); + float32x4_t v_yy = vmlaq_f32(v_m7, 
v_m4, v_y); + float32x4_t v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0],dstBase + j, dstStride, borderValue); + } + else if (*type == VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + int32x4_t v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), 
v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + +static vx_bool read_pixel_8u_C1(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_float32 x, vx_float32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8 *)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void transform_affine(vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 m[], vx_float32 *src_x, vx_float32 *src_y) +{ + *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; + *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; +} + +static void transform_perspective(vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 m[], vx_float32 *src_x, vx_float32 *src_y) +{ + vx_float32 z = dst_x * m[2] + dst_y * m[5] + m[8]; + + *src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z; + *src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z; +} + +#define WARP(low_y, high_y, low_x, transform) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_float32 xf; \ + vx_float32 yf; \ + transform(x, y, mask->m_f32, &xf, &yf); \ + \ + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) \ + { \ + read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, xf, yf, dst); \ + dst++; \ + } \ + else if (*type == VX_INTERPOLATION_BILINEAR) \ + { \ + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0; \ + vx_bool defined = vx_true_e; \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf), floorf(yf), &tl); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf) + 1, floorf(yf), &tr); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf), floorf(yf) + 1, &bl); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf) + 1, floorf(yf) + 1, &br); \ + if (defined) \ + { \ + vx_float32 ar = xf - floorf(xf); \ + vx_float32 ab = yf - floorf(yf); \ + vx_float32 al = 1.0f - ar; \ + vx_float32 at = 1.0f - ab; \ + *dst = (vx_uint8)(tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab); \ + } \ + dst++; \ + } \ + } \ + } + + +void 
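/*
 * A scalar sketch of the per-pixel mapping that the NEON table builders above
 * vectorize four pixels at a time. It assumes the 3x3 matrix is stored
 * column-major, as implied by transform_perspective() earlier in this file
 * (row 0 is {m[0], m[3], m[6]}, row 1 is {m[1], m[4], m[7]}, row 2 is
 * {m[2], m[5], m[8]}), and a stride_x of 1 (U8 plane). The helper name
 * map_pixel_nearest() is illustrative only; it reproduces the -1 sentinel that
 * the remapNearestNeighborConst() stage resolves to the border value.
 */
static vx_int32 map_pixel_nearest(vx_uint32 dx, vx_uint32 dy, const vx_float32 m[9],
                                  vx_uint32 src_width, vx_uint32 src_height,
                                  vx_uint32 src_stride)
{
    vx_float32 w  = dx * m[2] + dy * m[5] + m[8];
    vx_float32 xf = (dx * m[0] + dy * m[3] + m[6]) / w;
    vx_float32 yf = (dx * m[1] + dy * m[4] + m[7]) / w;
    vx_int32 sx = (vx_int32)xf;
    vx_int32 sy = (vx_int32)yf;

    if (xf < 0.0f || yf < 0.0f ||
        sx > (vx_int32)(src_width - 1) || sy > (vx_int32)(src_height - 1))
        return -1;                               /* outside the source image */

    return sy * (vx_int32)src_stride + sx;       /* byte offset into the plane */
}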
WarpAffine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + WARP(low_y, high_y, low_x, transform_affine) + } + else + { + WARP(0, low_y, low_x, transform_affine) + + src_base = in->base[0]; + dst_base = out->base[0]; + WARP(low_y, high_y, 0, transform_affine) + } +} + +void WarpPerspective_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + WARP(low_y, high_y, low_x, transform_perspective) + } + else + { + WARP(0, low_y, low_x, transform_perspective) + + src_base = in->base[0]; + dst_base = out->base[0]; + WARP(low_y, high_y, 0, transform_perspective) + } +} diff --git a/sample/framework/vx_context.c b/sample/framework/vx_context.c old mode 100644 new mode 100755 index 1050759..69dfbbe --- a/sample/framework/vx_context.c +++ b/sample/framework/vx_context.c @@ -22,21 +22,45 @@ const vx_char implementation[VX_MAX_IMPLEMENTATION_NAME] = "khronos.sample"; vx_char targetModules[][VX_MAX_TARGET_NAME] = { +#if defined(OPENVX_USE_TILING) + "openvx-tiling_chaining", +#endif "openvx-c_model", +#if defined(EXPERIMENTAL_USE_VENUM) + "openvx-venum", +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) + "openvx-opencl", +#endif +#if defined(EXPERIMENTAL_USE_OPENMP) + "openvx-openmp" +#endif }; const vx_char extensions[] = #if defined(OPENVX_USE_TILING) OPENVX_KHR_TILING" " #endif -#if defined(OPENVX_USE_XML) +#if defined(EXPERIMENTAL_USE_XML) OPENVX_KHR_XML" " #endif +#if defined(EXPERIMENTAL_USE_OPENCL) + OPENVX_KHR_OPENCL" " +#endif +#if defined(EXPERIMENTAL_USE_NODE_MEMORY) + OPENVX_KHR_NODE_MEMORY" " +#endif #if defined(OPENVX_USE_S16) "vx_khr_s16 " #endif #if defined(EXPERIMENTAL_USE_DOT) OPENVX_KHR_DOT" " +#endif +#if defined(EXPERIMENTAL_USE_TARGET) + OPENVX_EXT_TARGET" " +#endif +#if defined(EXPERIMENTAL_USE_VARIANTS) + OPENVX_KHR_VARIANTS" " #endif " "; @@ -779,6 +803,18 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at status = VX_ERROR_INVALID_PARAMETERS; } break; +#if defined(EXPERIMENTAL_USE_TARGET) + case VX_CONTEXT_TARGETS: + if (VX_CHECK_PARAM(ptr, size, vx_uint32, 0x3)) + { + *(vx_uint32 *)ptr = context->num_targets; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif case VX_CONTEXT_IMPLEMENTATION: if (size <= VX_MAX_IMPLEMENTATION_NAME && ptr) { @@ -912,7 +948,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at { VX_PRINT(VX_ZONE_INFO, "Kernel %s is unique\n", 
context->targets[t].kernels[k].name); table[numk].enumeration = context->targets[t].kernels[k].enumeration; +#if defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANT) + // get the central string out + { + vx_uint32 c = 0; + strncpy(table[numk].name, context->targets[t].kernels[k].name, VX_MAX_KERNEL_NAME); + for (c = 0; table[numk].name[c] != '\0'; c++) { + if (table[numk].name[c] == ';') { + table[numk].name[c] = '\0'; + break; + } + } + } +#else strncpy(table[numk].name, context->targets[t].kernels[k].name, VX_MAX_KERNEL_NAME); +#endif numk++; } } @@ -932,8 +982,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at VX_API_ENTRY vx_status VX_API_CALL vxHint(vx_reference reference, vx_enum hint, const void* data, vx_size data_size) { vx_status status = VX_SUCCESS; - (void)data; - (void)data_size; /* reference param should be a valid OpenVX reference*/ if (ownIsValidContext((vx_context)reference) == vx_false_e && ownIsValidReference(reference) == vx_false_e) @@ -1064,7 +1112,7 @@ VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStructWithName(vx_context context { context->user_structs[i].type = VX_TYPE_USER_STRUCT_START + i; context->user_structs[i].size = size; - strncpy(context->user_structs[i].name, name, VX_MAX_STRUCT_NAME - 1); + strncpy(context->user_structs[i].name, name, VX_MAX_STRUCT_NAME); type = context->user_structs[i].type; break; } diff --git a/sample/framework/vx_kernel.c b/sample/framework/vx_kernel.c index 0893bda..8299b21 100644 --- a/sample/framework/vx_kernel.c +++ b/sample/framework/vx_kernel.c @@ -55,7 +55,7 @@ vx_kernel_t *ownAllocateKernel(vx_context context, if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS && kernel->base.type == VX_TYPE_KERNEL) { /* setup the kernel meta-data */ - strncpy(kernel->name, name, VX_MAX_KERNEL_NAME - 1); + strncpy(kernel->name, name, VX_MAX_KERNEL_NAME); kernel->enumeration = kenum; kernel->function = function; kernel->signature.num_parameters = numParams; @@ -105,7 +105,7 @@ vx_status ownInitializeKernel(vx_context context, ownIncrementReference(&kernel->base, VX_INTERNAL); // setup the kernel meta-data - strncpy(kernel->name, name, VX_MAX_KERNEL_NAME - 1); + strncpy(kernel->name, name, VX_MAX_KERNEL_NAME); kernel->enumeration = kenum; kernel->function = function; kernel->signature.num_parameters = numParams; @@ -238,7 +238,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxLoadKernels(vx_context context, const vx_ch } else { - strncpy(context->modules[m].name, name, VX_INT_MAX_PATH - 1); + strncpy(context->modules[m].name, name, VX_INT_MAX_PATH); context->modules[m].ref_count = 1; context->num_modules++; } @@ -349,6 +349,18 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const v vx_size colons = strncount(string, VX_MAX_KERNEL_NAME, ':'); vx_char targetName[VX_MAX_TARGET_NAME] = "default"; vx_char kernelName[VX_MAX_KERNEL_NAME]; +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME] = "default"; +#if defined(EXPERIMENTAL_USE_TARGET) + vx_char defaultTargets[][VX_MAX_TARGET_NAME] = { + "default", + "power", + "performance", + "memory", + "bandwidth", + }; +#endif +#endif #if defined(_WIN32) vx_char *nameBuffer = _strdup(string); #else @@ -356,23 +368,100 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const v #endif if (colons == 0) { - strncpy(kernelName, string, VX_MAX_KERNEL_NAME - 1); + strncpy(kernelName, string, VX_MAX_KERNEL_NAME); } - else + else if (colons == 1) { - /* There should be no colon */ +#if 
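/*
 * What the unique-kernel-table branch above does to each exported name when
 * the TARGET/VARIANT extensions are enabled: the name is cut at the first ';'
 * so only the portable kernel name is reported. A standalone sketch of the
 * same loop (the ";replicate" suffix is illustrative only):
 */
vx_char entry[VX_MAX_KERNEL_NAME] = "org.khronos.openvx.box_3x3;replicate";
vx_uint32 c;
for (c = 0; entry[c] != '\0'; c++)
{
    if (entry[c] == ';')
    {
        entry[c] = '\0';    /* entry now reads "org.khronos.openvx.box_3x3" */
        break;
    }
}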
defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) + /* could be either target:kernel or kernel:variant" */ + vx_char *front = strtok(nameBuffer, ":"); + vx_char *back = strtok(NULL, ":"); +#if defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) + vx_bool isTarget = vx_false_e; + /* does front match any targets? */ + for (t = 0u; t < context->num_targets; t++) + { + if (strncmp(front, context->targets[t].name, VX_MAX_TARGET_NAME) == 0) + { + isTarget = vx_true_e; + break; + } + } + if (isTarget == vx_false_e) + { + for (t = 0u; t < dimof(defaultTargets); t++) + { + if (strncmp(front, defaultTargets[t], VX_MAX_TARGET_NAME) == 0) + { + isTarget = vx_true_e; + break; + } + } + } + if (isTarget == vx_true_e) + { + strncpy(targetName, front, VX_MAX_TARGET_NAME); + strncpy(kernelName, back, VX_MAX_KERNEL_NAME); + } + else + { + strncpy(kernelName, front, VX_MAX_KERNEL_NAME); + strncpy(variantName, back, VX_MAX_VARIANT_NAME); + } +#elif defined(EXPERIMENTAL_USE_TARGET) + strncpy(targetName, front, VX_MAX_TARGET_NAME); + strncpy(kernelName, back, VX_MAX_KERNEL_NAME); +#elif defined(EXPERIMENTAL_USE_VARIANTS) + strncpy(kernelName, front, VX_MAX_KERNEL_NAME); + strncpy(variantName, back, VX_MAX_VARIANT_NAME); +#endif +#else /* defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) */ + /* If both TARGET and VARIANT extensions are disabled, there should be no colon */ /* Doing nothing will leave kern = NULL, causing error condition below */ VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain any ':' in this implementation\n"); +#endif /* defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) */ + } + else if (colons == 2) + { +#if defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) + /* target:kernel:variant */ + vx_char *target = strtok(nameBuffer, ":"); + vx_char *kernel = strtok(NULL, ":"); + vx_char *variant = strtok(NULL,":"); + strncpy(targetName, target, VX_MAX_TARGET_NAME); + strncpy(kernelName, kernel, VX_MAX_KERNEL_NAME); + strncpy(variantName, variant, VX_MAX_VARIANT_NAME); +#else /* defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) */ + /* If both TARGET and VARIANT extensions are disabled, there should be no colon */ + /* Doing nothing will leave kern = NULL, causing error condition below */ + VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain two ':' in this implementation\n"); +#endif /* defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) */ + } + else + { + /* no extension supports > 2 colons so far */ + /* Doing nothing will leave kern = NULL, causing error condition below */ + VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain more than two ':' in this implementation\n"); } free(nameBuffer); +#if defined(EXPERIMENTAL_USE_VARIANTS) + VX_PRINT(VX_ZONE_KERNEL, "Scanning in set of %u kernels on %u targets.\n" + "Target: %s\nKernel: %s\nVariant: %s\n", + context->num_kernels, context->num_targets, + targetName, kernelName, variantName); +#endif for (t = 0; t < context->num_targets && kern == NULL; t++) { vx_target_t *target = &context->targets[context->priority_targets[t]]; if (target == NULL || target->enabled == vx_false_e) continue; +#if defined(EXPERIMENTAL_USE_VARIANTS) + if (target->funcs.supports(target, targetName, kernelName, variantName, &k) == VX_SUCCESS) +#else if (target->funcs.supports(target, targetName, kernelName, &k) == VX_SUCCESS) +#endif { vx_kernel kernel = &target->kernels[k]; vxPrintKernel(kernel); @@ -433,6 +522,10 @@ 
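/*
 * A usage sketch of the name forms vxGetKernelByName() accepts with the
 * parsing above. The kernel string is the standard box filter name; the
 * "default" target and variant tokens come from the defaultTargets table and
 * the variantName fallback above, and 'context' is assumed to be a valid
 * vx_context. Which forms resolve depends on which of EXPERIMENTAL_USE_TARGET
 * and EXPERIMENTAL_USE_VARIANTS are defined.
 */
vx_kernel k0 = vxGetKernelByName(context, "org.khronos.openvx.box_3x3");                  /* kernel only           */
vx_kernel k1 = vxGetKernelByName(context, "default:org.khronos.openvx.box_3x3");          /* target:kernel         */
vx_kernel k2 = vxGetKernelByName(context, "org.khronos.openvx.box_3x3:default");          /* kernel:variant        */
vx_kernel k3 = vxGetKernelByName(context, "default:org.khronos.openvx.box_3x3:default");  /* target:kernel:variant */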
VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByEnum(vx_context context, vx_enum break; } } + /* Acquire the highest priority target */ + if (kernel != NULL) { + break; + } } if (kernel == NULL) { VX_PRINT(VX_ZONE_KERNEL, "Kernel enum %x not found.\n", kernelenum); @@ -583,11 +676,15 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddUserKernel(vx_context c, VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, vx_char name[VX_MAX_KERNEL_NAME], vx_enum enumeration, + vx_kernel_f function, vx_tiling_kernel_f flexible_func_ptr, vx_tiling_kernel_f fast_func_ptr, vx_uint32 num_params, + vx_kernel_validate_f validate, vx_kernel_input_validate_f input, - vx_kernel_output_validate_f output) + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) { vx_context_t *context = (vx_context_t *)c; vx_kernel kernel = 0; @@ -601,9 +698,7 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, VX_PRINT(VX_ZONE_ERROR, "Invalid Context\n"); return (vx_kernel)NULL; } - if ((flexible_func_ptr == NULL && fast_func_ptr == NULL) || - input == NULL || - output == NULL || + if ( ((validate == NULL) && (input == NULL || output == NULL)) || num_params > VX_INT_MAX_PARAMS || num_params == 0 || name == NULL || strncmp(name, "", VX_MAX_KERNEL_NAME) == 0) @@ -618,7 +713,7 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, index = strnindex(name, ':', VX_MAX_TARGET_NAME); if (index == VX_MAX_TARGET_NAME) { - strcpy(targetName,"khronos.any"); + strcpy(targetName,"khronos.tiling"); } else { @@ -636,9 +731,9 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, } if (target && target->funcs.addtilingkernel) { - kernel = target->funcs.addtilingkernel(target, name, enumeration, - flexible_func_ptr, fast_func_ptr, num_params, - input, output); + kernel = target->funcs.addtilingkernel(target, name, enumeration, function, + flexible_func_ptr, fast_func_ptr, num_params, validate, + input, output, initialize, deinitialize); VX_PRINT(VX_ZONE_KERNEL,"Added Kernel %s to Target %s ("VX_FMT_REF")\n", name, target->name, kernel); } else @@ -744,6 +839,28 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryKernel(vx_kernel kern, vx_enum attribu status = VX_ERROR_INVALID_PARAMETERS; } break; +#ifdef OPENVX_KHR_NODE_MEMORY + case VX_KERNEL_GLOBAL_DATA_SIZE: + if (VX_CHECK_PARAM(ptr, size, vx_size, 0x3)) + { + *(vx_size *)ptr = kernel->attributes.globalDataSize; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; + case VX_KERNEL_GLOBAL_DATA_PTR: + if (VX_CHECK_PARAM(ptr, size, vx_ptr_t, 0x1)) + { + *(vx_ptr_t *)ptr = kernel->attributes.globalDataPtr; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif #ifdef OPENVX_KHR_TILING case VX_KERNEL_INPUT_NEIGHBORHOOD: if (VX_CHECK_PARAM(ptr, size, vx_neighborhood_size_t, 0x3)) @@ -795,10 +912,17 @@ VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToKernel(vx_kernel kernel, if (index < kern->signature.num_parameters) { #ifdef OPENVX_KHR_TILING - if (kern->tiling_function) + if (kern->tilingfast_function) { if (((data_type != VX_TYPE_IMAGE) && - (data_type != VX_TYPE_SCALAR)) || + (data_type != VX_TYPE_SCALAR) && + (data_type != VX_TYPE_THRESHOLD) && + (data_type != VX_TYPE_REMAP) && + (data_type != VX_TYPE_CONVOLUTION) && + (data_type != VX_TYPE_TENSOR) && + (data_type != VX_TYPE_ARRAY) && + (data_type != VX_TYPE_LUT) && + (data_type != VX_TYPE_MATRIX)) || (ownIsValidDirection(dir) == vx_false_e) || (ownIsValidState(state) == vx_false_e)) { @@ -953,6 
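/*
 * A registration sketch for the extended vxAddTilingKernel() prototype above.
 * As the parameter check now reads, either the combined 'validate' callback or
 * the input/output validator pair must be non-NULL; the remaining callbacks
 * may be NULL. The kernel name, enumeration, and the box3x3_* callbacks are
 * placeholders for a user tiling kernel whose declarations are assumed to
 * exist.
 */
vx_kernel kernel = vxAddTilingKernel(context,
    "com.example.tiling_box_3x3",   /* placeholder kernel name                  */
    MY_TILING_BOX_ENUM,             /* placeholder kernel enumeration           */
    NULL,                           /* vx_kernel_f: no single-shot function     */
    box3x3_flexible,                /* flexible tile function                   */
    box3x3_fast,                    /* fast tile function                       */
    2,                              /* number of parameters                     */
    box3x3_validate,                /* combined validator                       */
    NULL, NULL,                     /* legacy input/output validators           */
    NULL, NULL);                    /* initialize / deinitialize                */
if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS)
{
    /* add parameters with vxAddParameterToKernel(), then finalize the kernel */
}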
+1077,29 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetKernelAttribute(vx_kernel k, vx_enum att status = VX_ERROR_INVALID_PARAMETERS; } break; +#ifdef EXPERIMENTAL_USE_NODE_MEMORY + case VX_KERNEL_GLOBAL_DATA_SIZE: + if (VX_CHECK_PARAM(ptr, size, vx_size, 0x3)) + { + kernel->attributes.globalDataSize = *(vx_size *)ptr; + VX_PRINT(VX_ZONE_KERNEL, "Set Global Data Size to "VX_FMT_SIZE" bytes\n", kernel->attributes.globalDataSize); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; + case VX_KERNEL_GLOBAL_DATA_PTR: + if (VX_CHECK_PARAM(ptr, size, vx_ptr_t, 0x1)) + { + kernel->attributes.globalDataPtr = *(vx_ptr_t *)ptr; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif #ifdef OPENVX_KHR_TILING case VX_KERNEL_INPUT_NEIGHBORHOOD: if (VX_CHECK_PARAM(ptr, size, vx_neighborhood_size_t, 0x3)) diff --git a/sample/framework/vx_node_api.c b/sample/framework/vx_node_api.c index 5089e9b..9bae748 100644 --- a/sample/framework/vx_node_api.c +++ b/sample/framework/vx_node_api.c @@ -15,12 +15,6 @@ * limitations under the License. */ -/*! - * \file - * \brief The Graph Mode Interface for all Base Kernels. - * \author Erik Rainey - */ - #include "vx_internal.h" VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image input, vx_image output) @@ -29,7 +23,11 @@ VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image inp (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, VX_KERNEL_COLOR_CONVERT_TILING, params, dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_COLOR_CONVERT, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxChannelExtractNode(vx_graph graph, @@ -66,10 +64,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxChannelCombineNode(vx_graph graph, (vx_reference)plane3, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_CHANNEL_COMBINE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_CHANNEL_COMBINE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, vx_image output_x, vx_image output_y) @@ -79,10 +84,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, (vx_reference)output_x, (vx_reference)output_y, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_SOBEL_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_SOBEL_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image mag) @@ -92,10 +104,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x (vx_reference)grad_y, (vx_reference)mag, }; + +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAGNITUDE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_MAGNITUDE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image orientation) @@ -105,10 +125,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx (vx_reference)grad_y, (vx_reference)orientation, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_PHASE_TILING, + params, + dimof(params)); +#else return 
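/*
 * A minimal sketch of the node-memory attributes handled above when
 * EXPERIMENTAL_USE_NODE_MEMORY is defined: the kernel's global data size is
 * set with vxSetKernelAttribute() and read back with vxQueryKernel().
 * 'my_kernel' and the 4096-byte size are illustrative only.
 */
vx_size global_size = 4096;
if (vxSetKernelAttribute(my_kernel, VX_KERNEL_GLOBAL_DATA_SIZE,
                         &global_size, sizeof(global_size)) == VX_SUCCESS)
{
    vx_size reported = 0;
    vxQueryKernel(my_kernel, VX_KERNEL_GLOBAL_DATA_SIZE, &reported, sizeof(reported));
}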
vxCreateNodeByStructure(graph, VX_KERNEL_PHASE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, vx_image dst, vx_enum type) @@ -120,10 +147,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, (vx_reference)dst, (vx_reference)stype, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_SCALE_IMAGE, - params, - dimof(params)); + vx_node node; + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SCALE_IMAGE_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SCALE_IMAGE, + params, + dimof(params)); + #endif vxReleaseScalar(&stype); return node; } @@ -135,10 +170,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxTableLookupNode(vx_graph graph, vx_image inpu (vx_reference)lut, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_TABLE_LOOKUP_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_TABLE_LOOKUP, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxHistogramNode(vx_graph graph, vx_image input, vx_distribution distribution) @@ -172,10 +214,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxAbsDiffNode(vx_graph graph, vx_image in1, vx_ (vx_reference)in2, (vx_reference)out, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_ABSDIFF_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_ABSDIFF, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxMeanStdDevNode(vx_graph graph, vx_image input, vx_scalar mean, vx_scalar stddev) @@ -198,10 +247,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxThresholdNode(vx_graph graph, vx_image input, (vx_reference)thesh, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_THRESHOLD_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_THRESHOLD, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image input, vx_image output) @@ -210,10 +266,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image in (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_INTEGRAL_IMAGE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_INTEGRAL_IMAGE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -222,10 +285,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_ERODE_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_ERODE_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -234,10 +304,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_DILATE_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_DILATE_3x3, params, dimof(params)); +#endif } VX_API_ENTRY 
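/*
 * The same build-time selection repeats in every node constructor in this
 * file: with OPENVX_USE_TILING defined the node is created against the
 * *_TILING kernel enum, otherwise against the regular one. A hypothetical
 * helper that captures the pattern in one place (VX_TILING_OR_DEFAULT is not
 * part of the API; it only illustrates the repetition):
 */
#if defined(OPENVX_USE_TILING)
#define VX_TILING_OR_DEFAULT(kenum) (kenum##_TILING)
#else
#define VX_TILING_OR_DEFAULT(kenum) (kenum)
#endif
/* e.g.: vxCreateNodeByStructure(graph, VX_TILING_OR_DEFAULT(VX_KERNEL_MEDIAN_3x3), params, dimof(params)); */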
vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -246,10 +323,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MEDIAN_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_MEDIAN_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -258,10 +342,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx (vx_reference)input, (vx_reference)output, }; + +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_BOX_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_BOX_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -270,10 +362,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image inpu (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_GAUSSIAN_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_GAUSSIAN_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxNonLinearFilterNode(vx_graph graph, vx_enum function, vx_image input, vx_matrix mask, vx_image output) @@ -286,11 +385,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxNonLinearFilterNode(vx_graph graph, vx_enum f (vx_reference)mask, (vx_reference)output, }; - +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_LINEAR_FILTER_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_NON_LINEAR_FILTER, params, dimof(params)); +#endif vxReleaseScalar(&func); return node; @@ -303,10 +408,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolveNode(vx_graph graph, vx_image input, (vx_reference)conv, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_CUSTOM_CONVOLUTION_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_CUSTOM_CONVOLUTION, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxGaussianPyramidNode(vx_graph graph, vx_image input, vx_pyramid gaussian) @@ -407,6 +519,31 @@ VX_API_ENTRY vx_node VX_API_CALL vxMinMaxLocNode(vx_graph graph, dimof(params)); } +VX_API_ENTRY vx_node VX_API_CALL vxWeightedAverageImageNode(vx_graph graph, + vx_image img1, + vx_scalar alpha, + vx_image img2, + vx_image output) +{ + vx_reference params[] = { + (vx_reference)img1, + (vx_reference)alpha, + (vx_reference)img2, + (vx_reference)output, + }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_WEIGHTED_AVERAGE_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_WEIGHTED_AVERAGE, + params, + dimof(params)); +#endif +} + VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image input, vx_image output, vx_enum policy, vx_scalar shift) { vx_scalar pol = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &policy); @@ -416,10 +553,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image inp (vx_reference)pol, (vx_reference)shift, }; +#if defined(OPENVX_USE_TILING) + 
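/*
 * A usage sketch for the vxWeightedAverageImageNode() constructor added above.
 * The graph, context, and the U8 images img1, img2, out_img are assumed to
 * exist already; the alpha scalar is assumed to be VX_TYPE_FLOAT32, as for the
 * standard weighted-average kernel, and 0.25f is illustrative only.
 */
vx_float32 a = 0.25f;
vx_scalar alpha = vxCreateScalar(context, VX_TYPE_FLOAT32, &a);
vx_node wavg = vxWeightedAverageImageNode(graph, img1, alpha, img2, out_img);
vxReleaseScalar(&alpha);   /* the node keeps its own reference, as in the other constructors */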
vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_CONVERTDEPTH_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_CONVERTDEPTH, params, dimof(params)); +#endif vxReleaseScalar(&pol); return node; } @@ -453,10 +597,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxAndNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_AND_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_AND, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) @@ -466,10 +617,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_OR_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_OR, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) @@ -479,10 +637,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_XOR_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_XOR, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_image output) @@ -491,10 +656,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_im (vx_reference)input, (vx_reference)output, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_NOT_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_NOT, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, vx_image in1, vx_image in2, vx_scalar scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out) @@ -510,10 +682,19 @@ VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, vx_image in1, vx (vx_reference)rpolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MULTIPLY, - params, - dimof(params)); + vx_node node; + + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_MULTIPLY_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_MULTIPLY, + params, + dimof(params)); + #endif vxReleaseScalar(&spolicy); vxReleaseScalar(&rpolicy); return node; @@ -529,10 +710,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxAddNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)spolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_ADD, - params, - dimof(params)); + vx_node node; +#if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_ADD_TILING, + params, + dimof(params)); +#else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_ADD, + params, + dimof(params)); +#endif vxReleaseScalar(&spolicy); return node; } @@ -547,10 +736,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxSubtractNode(vx_graph graph, vx_image in1, vx 
(vx_reference)spolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_SUBTRACT, - params, - dimof(params)); + vx_node node; +#if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SUBTRACT_TILING, + params, + dimof(params)); +#else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SUBTRACT, + params, + dimof(params)); +#endif vxReleaseScalar(&spolicy); return node; } @@ -565,10 +762,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxWarpAffineNode(vx_graph graph, vx_image input (vx_reference)stype, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_AFFINE_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_WARP_AFFINE, params, dimof(params)); +#endif vxReleaseScalar(&stype); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -592,10 +796,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxWarpPerspectiveNode(vx_graph graph, vx_image (vx_reference)stype, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_PERSPECTIVE_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_WARP_PERSPECTIVE, params, dimof(params)); +#endif vxReleaseScalar(&stype); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -650,10 +861,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxFastCornersNode(vx_graph graph, vx_image inpu (vx_reference)corners, (vx_reference)num_corners, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_FAST_CORNERS_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_FAST_CORNERS, params, dimof(params)); +#endif vxReleaseScalar(&nonmax); return node; } @@ -667,10 +885,19 @@ VX_API_ENTRY vx_node VX_API_CALL vxNonMaxSuppressionNode(vx_graph graph, vx_imag (vx_reference)wsize, (vx_reference)output, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_NON_MAX_SUPPRESSION, - params, - dimof(params)); +vx_node node; + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_MAX_SUPPRESSION_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_MAX_SUPPRESSION, + params, + dimof(params)); + #endif + vxReleaseScalar(&wsize); return node; } @@ -723,10 +950,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxRemapNode(vx_graph graph, (vx_reference)spolicy, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_REMAP_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_REMAP, params, dimof(params)); +#endif vxReleaseScalar(&spolicy); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -748,10 +982,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxHalfScaleGaussianNode(vx_graph graph, vx_imag (vx_reference)output, (vx_reference)ksize, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_HALFSCALE_GAUSSIAN_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HALFSCALE_GAUSSIAN, params, dimof(params)); +#endif vxReleaseScalar(&ksize); return node; } @@ -928,11 +1169,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMinNode(vx_graph graph, vx_image in1, vx_imag (vx_reference) in2, (vx_reference) out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MIN, - params, - dimof(params)); - 
return node; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MIN_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_MIN, + params, + dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxCopyNode(vx_graph graph, vx_reference input, vx_reference output) @@ -952,12 +1199,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMaxNode(vx_graph graph, vx_image in1, vx_imag (vx_reference) in2, (vx_reference) out, }; - - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MAX, - params, - dimof(params)); - return node; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAX_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAX, + params, + dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxLBPNode(vx_graph graph, @@ -972,10 +1224,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxLBPNode(vx_graph graph, (vx_reference)out, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_LBP_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_LBP, params, dimof(params)); +#endif vxReleaseScalar(&sformat); vxReleaseScalar(&ksize); @@ -1071,9 +1330,12 @@ vx_node VX_API_CALL vxHOGCellsNode(vx_graph graph, vx_image input, vx_int32 cell (vx_reference)magnitudes, (vx_reference)bins, }; - vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS, params, dimof(params)); - - vxReleaseScalar(&cell_width_scalar); + #if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS_TILING, params, dimof(params)); + #else + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS, params, dimof(params)); + #endif + vxReleaseScalar(&cell_width_scalar); vxReleaseScalar(&cell_height_scalar); vxReleaseScalar(&num_bins_scalar); @@ -1095,7 +1357,11 @@ vx_node VX_API_CALL vxHOGFeaturesNode(vx_graph graph, vx_image input, vx_tensor (vx_reference)hog_param_size_scalar, (vx_reference)features, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_FEATURES_TILING, param, dimof(param)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_FEATURES, param, dimof(param)); +#endif vxReleaseScalar(&hog_param_size_scalar); vxReleaseArray(&hog_param); diff --git a/sample/include/vx_internal.h b/sample/include/vx_internal.h old mode 100644 new mode 100755 index ba900ca..aab7726 --- a/sample/include/vx_internal.h +++ b/sample/include/vx_internal.h @@ -69,12 +69,24 @@ #define OPENVX_TILING_1_0 #include #endif +#if defined(EXPERIMENTAL_USE_NODE_MEMORY) +#include +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) +#include +#endif #if defined(EXPERIMENTAL_USE_DOT) #include #endif -#if defined(OPENVX_USE_XML) +#if defined(EXPERIMENTAL_USE_XML) #include #endif +#if defined(EXPERIMENTAL_USE_TARGET) +#include +#endif +#if defined(EXPERIMENTAL_USE_VARIANTS) +#include +#endif #include #if defined(OPENVX_USE_IX) #include @@ -108,10 +120,12 @@ */ #define VX_INT_MAX_PATH (256) +#ifndef EXPERIMENTAL_USE_TARGET /* Otherwise already defined in */ /*! \brief Defines the maximum number of characters in a target string. * \ingroup group_target */ #define VX_MAX_TARGET_NAME (64) +#endif #ifndef VX_MAX_STRUCT_NAME #define VX_MAX_STRUCT_NAME (64) @@ -125,7 +139,7 @@ /*! \brief Maximum number of references in the context. * \ingroup group_int_defines */ -#define VX_INT_MAX_REF (4096) +#define VX_INT_MAX_REF (1024) /*! 
\brief Maximum number of user defined structs/ * \ingroup group_int_defines @@ -195,7 +209,7 @@ /*! \brief Used to determine if a type is a struct. * \ingroup group_int_macros */ -#define VX_TYPE_IS_STRUCT(type) ((type) >= VX_TYPE_RECTANGLE && (type) < VX_TYPE_VENDOR_STRUCT_END) +#define VX_TYPE_IS_STRUCT(type) ((type) >= VX_TYPE_RECTANGLE && (type) < VX_TYPE_KHRONOS_STRUCT_MAX) /*! \brief Used to determine if a type is a data object. * \ingroup group_int_macros @@ -206,7 +220,7 @@ /*! \brief Used to determine if a type is an object. * \ingroup group_int_macros */ -#define VX_TYPE_IS_OBJECT(type) ((type) >= VX_TYPE_REFERENCE && (type) < VX_TYPE_VENDOR_OBJECT_END) +#define VX_TYPE_IS_OBJECT(type) ((type) >= VX_TYPE_REFERENCE && (type) < VX_TYPE_KHRONOS_OBJECT_END) /*! A parameter checker for size and alignment. * \ingroup group_int_macros @@ -388,7 +402,9 @@ typedef struct _vx_processor_t { // forward declarations struct _vx_threadpool_t; struct _vx_threadpool_worker_t; +#if !defined(EXPERIMENTAL_USE_TARGET) typedef struct _vx_target *vx_target; +#endif /*! \brief The function pointer to the worker function. * \param [in] worker The per-thread worker data structure. @@ -518,6 +534,10 @@ typedef struct _vx_reference { vx_int32 delay_slot_index; /*! \brief This indicates that if the object is virtual whether it is accessible at the moment or not */ vx_bool is_accessible; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief An OpenCL event that the framework can block upon for this object */ + cl_event event; +#endif /*! \brief The reference name */ char name[VX_MAX_REFERENCE_NAME]; } vx_reference_t; @@ -694,7 +714,8 @@ typedef struct _vx_kernel { vx_uint32 affinity; #ifdef OPENVX_KHR_TILING /*! \brief The tiling function pointer interface */ - vx_tiling_kernel_f tiling_function; + vx_tiling_kernel_f tilingfast_function; + vx_tiling_kernel_f tilingflexible_function; #endif } vx_kernel_t; @@ -730,6 +751,9 @@ typedef vx_status (*vx_target_deinit_f)(vx_target target); typedef vx_status (*vx_target_supports_f)(vx_target target, vx_char targetName[VX_MAX_TARGET_NAME], vx_char kernelName[VX_MAX_TARGET_NAME], +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME], +#endif vx_uint32 *pIndex); /*! \brief Processes the array of nodes supplied. @@ -784,6 +808,7 @@ typedef vx_kernel (*vx_target_addtilingkernel_f)(vx_target target, vx_tiling_kernel_f flexible_func_ptr, vx_tiling_kernel_f fast_func_ptr, vx_uint32 num_parameters, + vx_kernel_validate_f validate, vx_kernel_input_validate_f input, vx_kernel_output_validate_f output); #endif @@ -810,14 +835,32 @@ typedef struct _vx_target_funcs_t { #endif } vx_target_funcs_t; +#ifndef EXPERIMENTAL_USE_TARGET /* Otherwise already defined in */ enum vx_ext_target_type_e { VX_TYPE_TARGET = 0x816,/*!< \brief A \ref vx_target */ }; +#endif /*! \brief The priority list of targets. * \ingroup group_int_target */ enum vx_target_priority_e { +#if defined(OPENVX_USE_TILING) + /*! \brief Defines the priority of the TILING Target */ + VX_TARGET_PRIORITY_TILING, +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief Defines the priority of the OpenCL Target */ + VX_TARGET_PRIORITY_OPENCL, +#endif +#if defined(EXPERIMENTAL_USE_OPENMP) + /*! \brief Defines the priority of the OpenMP targets */ + VX_TARGET_PRIORITY_OPENMP, +#endif + /*! \brief Defines the priority of the VENUM targets */ +#if defined(EXPERIMENTAL_USE_VENUM) + VX_TARGET_PRIORITY_VENUM, +#endif /*! \brief Defines the priority of the C model target */ VX_TARGET_PRIORITY_C_MODEL, /*! 
\brief Defines the maximum priority */ @@ -976,6 +1019,19 @@ typedef struct _vx_context { } user_structs[VX_INT_MAX_USER_STRUCTS]; /*! \brief The worker pool used to parallelize the graph*/ vx_threadpool_t *workers; +#if defined(EXPERIMENTAL_USE_OPENCL) +#define CL_MAX_PLATFORMS (1) +#define CL_MAX_DEVICES (2) +#define CL_MAX_KERNELS (50) + /*! \brief The array of platform ids */ + cl_platform_id platforms[CL_MAX_PLATFORMS]; + /*! \brief The number of platform ids */ + cl_uint num_platforms; + cl_device_id devices[CL_MAX_PLATFORMS][CL_MAX_DEVICES]; + cl_uint num_devices[CL_MAX_PLATFORMS]; + cl_context global[CL_MAX_PLATFORMS]; + cl_command_queue queues[CL_MAX_PLATFORMS][CL_MAX_DEVICES]; +#endif /*! \brief The immediate mode border */ vx_border_t imm_border; /*! \brief The unsupported border mode policy for immediate mode functions */ @@ -1129,15 +1185,23 @@ typedef struct _vx_memory_t { /*! \brief The array of pointers (one per plane for images) */ vx_uint8* ptrs[VX_PLANE_MAX]; /*! \brief The number of dimensions per ptr */ - vx_uint32 ndims; + vx_int32 ndims; /*! \brief The dimensional values per ptr */ - vx_uint32 dims[VX_PLANE_MAX][VX_DIM_MAX]; + vx_int32 dims[VX_PLANE_MAX][VX_DIM_MAX]; /*! \brief The per ptr stride values per dimension */ vx_int32 strides[VX_PLANE_MAX][VX_DIM_MAX]; /*! \brief The write locks. Used by Access/Commit pairs on usages which have * VX_WRITE_ONLY or VX_READ_AND_WRITE flag parts. Only single writers are permitted. */ vx_sem_t locks[VX_PLANE_MAX]; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief This contains the OpenCL memory references */ + cl_mem hdls[VX_PLANE_MAX]; + /*! \brief This describes the type of memory allocated with OpenCL */ + cl_mem_object_type cl_type; + /*! \brief This describes the image format (if it is an image) */ + cl_image_format cl_format; +#endif } vx_memory_t; /*! \brief The internal representation of a \ref vx_image @@ -1174,6 +1238,10 @@ typedef struct _vx_image { vx_rectangle_t region; /*! \brief The memory type */ vx_enum memory_type; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief This describes the type of OpenCL Image that maps to this image (if applicable). */ + cl_image_format cl_format; +#endif } vx_image_t; /*! \brief The internal representation of a \ref vx_array diff --git a/sample/targets/CMakeLists.txt b/sample/targets/CMakeLists.txt index 0816dc0..2d4a98b 100644 --- a/sample/targets/CMakeLists.txt +++ b/sample/targets/CMakeLists.txt @@ -16,3 +16,7 @@ add_subdirectory( c_model ) +if (OPENVX_USE_TILING) + add_subdirectory( tiling ) +endif (OPENVX_USE_TILING) + diff --git a/sample/targets/c_model/vx_interface.c b/sample/targets/c_model/vx_interface.c index 2c19a7f..98dbc55 100644 --- a/sample/targets/c_model/vx_interface.c +++ b/sample/targets/c_model/vx_interface.c @@ -15,12 +15,6 @@ * limitations under the License. */ -/*! 
- * \file - * \brief The C-Model Target Interface - * \author Erik Rainey - */ - #include #include @@ -358,7 +352,7 @@ vx_kernel vxTargetAddTilingKernel(vx_target target, kernel = &(target->kernels[k]); if (kernel->enabled == vx_false_e) { - kernel->tiling_function = fast_func_ptr; + kernel->tilingfast_function = fast_func_ptr; ownInitializeKernel(target->base.context, kernel, enumeration, vxTilingKernel, name, @@ -511,7 +505,7 @@ vx_status VX_CALLBACK vxTilingKernel(vx_node node, const vx_reference parameters { //printf("Calling Tile{%u,%u} with %s\n", tx, ty, ((vx_node_t *)node)->kernel->name); tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; - ((vx_node_t *)node)->kernel->tiling_function(params, tile_memory, size); + ((vx_node_t *)node)->kernel->tilingfast_function(params, tile_memory, size); } else { diff --git a/sample/targets/opencl/CMakeLists.txt b/sample/targets/opencl/CMakeLists.txt new file mode 100644 index 0000000..b558883 --- /dev/null +++ b/sample/targets/opencl/CMakeLists.txt @@ -0,0 +1,55 @@ +# + +# Copyright (c) 2011-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +if ( EXPERIMENTAL_USE_OPENCL ) + # set target name + set( TARGET_NAME openvx-opencl ) + + include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/debug + ${OPENCL_INCLUDE_PATH} ) + + set( INVERTED_COMMA "\"" ) + set( CL_SOURCE_DIR ${INVERTED_COMMA}${CMAKE_SOURCE_DIR}/kernels/opencl${INVERTED_COMMA} ) + + add_definitions( -DVX_CL_SOURCE_DIR=${CL_SOURCE_DIR} ) + + FIND_SOURCES() + + if ((WIN32) OR (CYGWIN)) + set( DEF_FILE openvx-target.def ) + endif ((WIN32) OR (CYGWIN)) + + # add a target named ${TARGET_NAME} + add_library (${TARGET_NAME} SHARED ${SOURCE_FILES} ${DEF_FILE}) + + if (CYGWIN) + set_target_properties( ${TARGET_NAME} PROPERTIES LINK_FLAGS ${CMAKE_CURRENT_SOURCE_DIR}/${DEF_FILE} ) + endif (CYGWIN) + + target_link_libraries( ${TARGET_NAME} openvx ) + + install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION bin ) + +endif ( EXPERIMENTAL_USE_OPENCL ) \ No newline at end of file diff --git a/sample/targets/opencl/concerto.mak b/sample/targets/opencl/concerto.mak new file mode 100644 index 0000000..43cfbeb --- /dev/null +++ b/sample/targets/opencl/concerto.mak @@ -0,0 +1,38 @@ +# + +# Copyright (c) 2012-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + + +include $(PRELUDE) +TARGET := openvx-opencl +TARGETTYPE := dsmo +DEFFILE := openvx-target.def +CSOURCES = $(call all-c-files) +IDIRS += $(HOST_ROOT)/$(OPENVX_SRC)/include $(HOST_ROOT)/debug +SHARED_LIBS += openvx +DEFS += VX_CL_SOURCE_DIR="\"$(HOST_ROOT)/kernels/opencl\"" +ifeq ($(TARGET_BUILD),debug) +# This is to use the local headers instead of system defined ones it's temporary +DEFS += VX_INCLUDE_DIR="\"$(HOST_ROOT)/include\"" +endif +ifneq (,$(findstring EXPERIMENTAL_USE_OPENCL,$(SYSDEFS))) +USE_OPENCL:=true +else +SKIPBUILD:=1 +endif +include $(FINALE) + diff --git a/sample/targets/opencl/openvx-target.def b/sample/targets/opencl/openvx-target.def new file mode 100644 index 0000000..19bee4f --- /dev/null +++ b/sample/targets/opencl/openvx-target.def @@ -0,0 +1,9 @@ +LIBRARY "openvx-opencl.dll" +VERSION 1.0 +EXPORTS + vxTargetInit + vxTargetDeinit + vxTargetVerify + vxTargetProcess + vxTargetSupports + vxTargetAddKernel diff --git a/sample/targets/opencl/vx_bitwise.c b/sample/targets/opencl/vx_bitwise.c new file mode 100644 index 0000000..c900f48 --- /dev/null +++ b/sample/targets/opencl/vx_bitwise.c @@ -0,0 +1,409 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +/* + * The three bitwise kernels with binary parameters have the same parameter domain so + * let's just have one set of validators. 
+ */ + +static vx_status VX_CALLBACK vxBinaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + status = VX_SUCCESS; + vxReleaseImage(&images[1]); + vxReleaseImage(&images[0]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxBinaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + if (param0) + { + vx_image image0 = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &image0, sizeof(image0)); + /* + * When passing on the geometry to the output image, we only look at image 0, as + * both input images are verified to match, at input validation. 
+ */ + if (image0) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(image0, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image0, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&image0); + } + vxReleaseParameter(¶m0); + } + } + return status; +} + +static vx_param_description_t binary_bitwise_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = {0,0,0}; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + } + } + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + vx_reference ref; + /* enqueue a read on all output data */ + if (num == 3) + ref = node->parameters[2]; + else // Not kernel + ref = node->parameters[1]; + + vx_memory_t *memory = NULL; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == 
VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++],&ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxAndKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t and_kernel = { + { + VX_KERNEL_AND, + "org.khronos.openvx.and", + vxAndKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_and.cl", + "vx_and", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxOrKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t orr_kernel = { + { + VX_KERNEL_OR, + "org.khronos.openvx.or", + vxOrKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_orr.cl", + "vx_orr", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxXorKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t xor_kernel = { + { + VX_KERNEL_XOR, + "org.khronos.openvx.xor", + vxXorKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_xor.cl", + "vx_xor", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +/* The Not kernel is an unary operator, requiring separate validators. 
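+   They only accept a single VX_DF_IMAGE_U8 input and propagate its geometry to the output meta-format.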
*/ + +static vx_status VX_CALLBACK vxUnaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxUnaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image inimage = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &inimage, sizeof(inimage)); + if (inimage) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(inimage, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(inimage, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&inimage); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t unary_bitwise_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxNotKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t not_kernel = { + { + VX_KERNEL_NOT, + "org.khronos.openvx.not", + vxNotKernel, + unary_bitwise_kernel_params, dimof(unary_bitwise_kernel_params), + NULL, + vxUnaryBitwiseInputValidator, + vxUnaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_not.cl", + "vx_not", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_convolution.c b/sample/targets/opencl/vx_convolution.c new file mode 100644 index 0000000..33a3308 --- /dev/null +++ b/sample/targets/opencl/vx_convolution.c @@ -0,0 +1,340 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "vx_interface.h" + +#define C_MAX_CONVOLUTION_DIM (15) + +#if (C_MAX_CONVOLUTION_DIM != VX_INT_MAX_CONVOLUTION_DIM) +#if defined(_WIN32) +#pragma error("C Model does not support VX required Convolution Size") +#elif defined(__GNUC__) +#error "C Model does not support VX required Convolution Size" +#endif +#endif + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set conv_mat + vx_size conv_width, conv_height; + vx_int16 _conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + vx_uint32 scale = 1; + + vx_convolution conv = (vx_convolution)parameters[1]; + + status |= vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &conv_width, sizeof(conv_width)); + status |= vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &conv_height, sizeof(conv_height)); + status |= vxQueryConvolution(conv, VX_CONVOLUTION_SCALE, &scale, sizeof(scale)); + + status |= vxCopyConvolutionCoefficients(conv, _conv_mat, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &conv_width); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &conv_height); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &scale); + + short matrix_size = C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM * sizeof(short); + + cl_mem conv_mat = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, matrix_size, _conv_mat, &err); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + conv_mat, + CL_TRUE, + 0, + matrix_size, + _conv_mat, + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &conv_mat); + + //Set Output + ref = node->parameters[2]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + 
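+    /* the NDRange is sized from the output image, so one work-item is launched per destination pixel */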
work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + ref = node->parameters[0]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + /* enqueue a read on all output data */ + ref = node->parameters[2]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + ref = node->parameters[2]; + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + + clReleaseMemObject(conv_mat); + + return status; +} + +static vx_status VX_CALLBACK vxConvolveInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + +#if defined(EXPERIMENTAL_USE_S16) + if( (format == VX_DF_IMAGE_U8) || (format == VX_DF_IMAGE_S16) ) +#else + if (format == VX_DF_IMAGE_U8) +#endif + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + if (index == 1) + { + vx_image input = 0; + vx_convolution conv = 0; + + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, index); + + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param1, VX_PARAMETER_REF, &conv, sizeof(conv)); + if (input && conv) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_size dims[2] = { 0, 0 }; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &dims[0], sizeof(dims[0])); + vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &dims[1], sizeof(dims[1])); + + if ((dims[0] <= VX_INT_MAX_CONVOLUTION_DIM) && + (dims[1] <= VX_INT_MAX_CONVOLUTION_DIM) && + (width >= dims[0]) && + (height >= dims[1])) + { + status = VX_SUCCESS; + } + + vxReleaseImage(&input); + vxReleaseConvolution(&conv); + } + + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + + return status; +} + +static vx_status VX_CALLBACK vxConvolveOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter 
params[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)params[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)params[1]) == VX_SUCCESS)) + { + vx_image input = 0; + vx_image output = 0; + vxQueryParameter(params[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(params[1], VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + vx_df_image output_format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryImage(output, VX_IMAGE_FORMAT, &output_format, sizeof(output_format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = output_format == VX_DF_IMAGE_U8 ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶ms[0]); + vxReleaseParameter(¶ms[1]); + } + } + return status; +} + +static vx_status VX_CALLBACK vxConvolveKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL Convolve\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_param_description_t convolution_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_CONVOLUTION, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_cl_kernel_description_t convolution_kernel = { + { + VX_KERNEL_CUSTOM_CONVOLUTION, + "org.khronos.openvx.custom_convolution", + vxConvolveKernel, + convolution_kernel_params, dimof(convolution_kernel_params), + NULL, + vxConvolveInputValidator, + vxConvolveOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_convolve.cl", + "vx_Convolve", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + diff --git a/sample/targets/opencl/vx_filter.c b/sample/targets/opencl/vx_filter.c new file mode 100644 index 0000000..02f320e --- /dev/null +++ b/sample/targets/opencl/vx_filter.c @@ -0,0 +1,304 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set Output + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + 
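+    /* gather the events attached to the output/bidirectional references so they can be waited on below */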
re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + + +static vx_status VX_CALLBACK vxFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_param_description_t filter_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxBox3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t box3x3_clkernel = { + { + VX_KERNEL_BOX_3x3, + "org.khronos.openvx.box3x3", + vxBox3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_box3x3.cl", + "vx_box3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxGaussian3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t gaussian3x3_clkernel = { + { + VX_KERNEL_GAUSSIAN_3x3, + "org.khronos.openvx.gaussian3x3", + vxGaussian3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_gaussian3x3.cl", + "vx_gaussian3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxMedian3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + 
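The median descriptor below points at vx_median3x3.cl, which is not included in this excerpt. A minimal sketch of the entry point it would need, assuming the same argument order the launcher above pushes for every filter kernel (input strides and buffer, border mode, constant border value, output strides and buffer, matching vx_box3x3.cl), with border handling reduced to replicate-style clamping for brevity:

__kernel void vx_median3x3(int ssx, int ssy, __global uchar *src,
                           int bordermode, uchar const_value,
                           int dsx, int dsy, __global uchar *dst)
{
    /* sketch only: bordermode/const_value are accepted but ignored; coordinates are clamped instead */
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int w = (int)get_global_size(0);
    const int h = (int)get_global_size(1);
    uchar v[9];
    int i, j, idx = 0;

    /* collect the 3x3 neighbourhood, clamping at the image edges */
    for (j = -1; j <= 1; ++j)
        for (i = -1; i <= 1; ++i)
        {
            int xx = clamp(x + i, 0, w - 1);
            int yy = clamp(y + j, 0, h - 1);
            v[idx++] = src[yy * ssy + xx * ssx];
        }

    /* insertion sort of the nine samples; the median is the middle element */
    for (i = 1; i < 9; ++i)
    {
        uchar key = v[i];
        for (j = i - 1; j >= 0 && v[j] > key; --j)
            v[j + 1] = v[j];
        v[j + 1] = key;
    }

    dst[y * dsy + x * dsx] = v[4];
}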
+vx_cl_kernel_description_t median3x3_kernel = { + { + VX_KERNEL_MEDIAN_3x3, + "org.khronos.openvx.median_3x3", + vxMedian3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_median3x3.cl", + "vx_median3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_gradients.c b/sample/targets/opencl/vx_gradients.c new file mode 100644 index 0000000..8ccba30 --- /dev/null +++ b/sample/targets/opencl/vx_gradients.c @@ -0,0 +1,313 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set grad_x + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + int stride_x = memory->strides[pln][VX_DIM_X] / 2; + int stride_y 
= memory->strides[pln][VX_DIM_Y] / 2; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_y); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + + //Set grad_y + ref = node->parameters[2]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + int stride_x1 = memory->strides[pln][VX_DIM_X] / 2; + int stride_y1 = memory->strides[pln][VX_DIM_Y] / 2; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_x1); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_y1); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + ref = node->parameters[2]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_param_description_t sobel3x3_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +static vx_status VX_CALLBACK ownSobel3x3Kernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} /* ownSobel3x3Kernel() */ + +static +vx_status VX_CALLBACK own_sobel3x3_validator(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) +{ + vx_status status = 
VX_ERROR_INVALID_PARAMETERS; + + if (NULL != node && NULL != parameters && num == dimof(sobel3x3_kernel_params) && NULL != metas) + { + vx_parameter param1 = vxGetParameterByIndex(node, 0); + vx_parameter param2 = vxGetParameterByIndex(node, 1); + vx_parameter param3 = vxGetParameterByIndex(node, 2); + + if (VX_SUCCESS == vxGetStatus((vx_reference)param1) && + ( (VX_SUCCESS == vxGetStatus((vx_reference)param2)) || (VX_SUCCESS == vxGetStatus((vx_reference)param3)) )) + { + vx_uint32 src_width = 0; + vx_uint32 src_height = 0; + vx_df_image src_format = 0; + vx_image input = 0; + + status = vxQueryParameter(param1, VX_PARAMETER_REF, &input, sizeof(input)); + + status |= vxQueryImage(input, VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxQueryImage(input, VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxQueryImage(input, VX_IMAGE_FORMAT, &src_format, sizeof(src_format)); + + /* validate input image */ + if (VX_SUCCESS == status) + { + if (src_width >= 3 && src_height >= 3 && src_format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + else + status = VX_ERROR_INVALID_PARAMETERS; + } + + /* validate output images */ + if (VX_SUCCESS == status) + { + vx_enum dst_format = VX_DF_IMAGE_S16; + + if (NULL == metas[1] && NULL == metas[2]) + status = VX_ERROR_INVALID_PARAMETERS; + + if (VX_SUCCESS == status && NULL != metas[1]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + + if (VX_SUCCESS == status && NULL != metas[2]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + } + + if (NULL != input) + vxReleaseImage(&input); + + if (NULL != param1) + vxReleaseParameter(¶m1); + + if (NULL != param2) + vxReleaseParameter(¶m2); + + if (NULL != param3) + vxReleaseParameter(¶m3); + } + } /* if ptrs non NULL */ + + return status; +} /* own_sobel3x3_validator() */ + + +vx_cl_kernel_description_t sobel3x3_clkernel = { + { + VX_KERNEL_SOBEL_3x3, + "org.khronos.openvx.sobel_3x3", + ownSobel3x3Kernel, + sobel3x3_kernel_params, dimof(sobel3x3_kernel_params), + own_sobel3x3_validator, + NULL, + NULL, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_sobel3x3.cl", + "vx_sobel3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + diff --git a/sample/targets/opencl/vx_interface.c b/sample/targets/opencl/vx_interface.c new file mode 100644 index 0000000..9bdfdc6 --- /dev/null +++ b/sample/targets/opencl/vx_interface.c @@ -0,0 +1,817 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "vx_internal.h" +#include +#include +#include + +static const vx_char name[VX_MAX_TARGET_NAME] = "pc.opencl"; + +/*! \brief Prototype for assigning to kernel */ +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const const vx_reference *parameters, vx_uint32 num); + +static vx_cl_kernel_description_t *cl_kernels[] = +{ + &box3x3_clkernel, + &and_kernel, + &xor_kernel, + &orr_kernel, + ¬_kernel, + &gaussian3x3_clkernel, + &sobel3x3_clkernel, + &erode3x3_kernel, + &dilate3x3_kernel, + &median3x3_kernel, + &nonlinearfilter_kernel, + &phase_kernel, + &warp_affine_kernel, + &warp_perspective_kernel, + &convolution_kernel, +}; + +static vx_uint32 num_cl_kernels = dimof(cl_kernels); + +static void VX_CALLBACK vxcl_platform_notifier(const char *errinfo, + const void *private_info, + size_t cb, + void *user_data) +{ + //vx_target target = (vx_target)user_data; + VX_PRINT(VX_ZONE_ERROR, "%s\n", errinfo); +} + +vx_status vxTargetInit(vx_target_t *target) +{ + vx_status status = VX_ERROR_NO_RESOURCES; + cl_int err = 0; + vx_context context = target->base.context; + cl_uint p, d, k; + char *vx_incs = getenv("VX_CL_INCLUDE_DIR"); + //char *vx_incs = "/usr/include -I/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/include -I/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/include/VX"; + char *cl_dirs = getenv("VX_CL_SOURCE_DIR"); + //char *cl_dirs = "/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/kernels/opencl"; + char cl_args[1024]; + + if(NULL == vx_incs) + return VX_FAILURE; + + snprintf(cl_args, sizeof(cl_args), "-D VX_CL_KERNEL -I %s -I %s %s %s", vx_incs, cl_dirs, +#if !defined(__APPLE__) + "-D CL_USE_LUMINANCE", +#else + "", +#endif +#if defined(VX_INCLUDE_DIR) + "-I "VX_INCLUDE_DIR" " +#else + " " +#endif + ); + printf("flags: %s\n", cl_args); + if (cl_dirs == NULL) { +#ifdef VX_CL_SOURCE_DIR + const char *sdir = VX_CL_SOURCE_DIR; + int len = strlen(sdir); + cl_dirs = malloc(len); + strncpy(cl_dirs, sdir, len); +#else + return status; +#endif + } + + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_OPENCL; + + context->num_platforms = CL_MAX_PLATFORMS; + err = clGetPlatformIDs(CL_MAX_PLATFORMS, context->platforms, NULL); + if (err != CL_SUCCESS) + goto exit; + + for (p = 0; p < context->num_platforms; p++) { + err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, + 0, NULL, &context->num_devices[p]); + err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, + context->num_devices[p] > CL_MAX_DEVICES ? 
CL_MAX_DEVICES : context->num_devices[p], + context->devices[p], NULL); + if (err == CL_SUCCESS) { + cl_context_properties props[] = { + (cl_context_properties)CL_CONTEXT_PLATFORM, + (cl_context_properties)context->platforms[p], + (cl_context_properties)0, + }; + for (d = 0; d < context->num_devices[p]; d++) { + char deviceName[64]; + cl_bool compiler = CL_FALSE; + cl_bool available = CL_FALSE; + cl_bool image_support = CL_FALSE; + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_COMPILER_AVAILABLE, sizeof(cl_bool), &compiler, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + VX_PRINT(VX_ZONE_INFO, "Device %s (compiler=%s) (available=%s) (images=%s)\n", deviceName, (compiler?"TRUE":"FALSE"), (available?"TRUE":"FALSE"), (image_support?"TRUE":"FALSE")); + } + context->global[p] = clCreateContext(props, + context->num_devices[p], + context->devices[p], + vxcl_platform_notifier, + target, + &err); + if (err != CL_SUCCESS) + break; + + /* check for supported formats */ + if (err == CL_SUCCESS) { + cl_uint f,num_entries = 0u; + cl_image_format *formats = NULL; + cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR; + cl_mem_object_type type = CL_MEM_OBJECT_IMAGE2D; + + err = clGetSupportedImageFormats(context->global[p], flags, type, 0, NULL, &num_entries); + formats = (cl_image_format *)malloc(num_entries * sizeof(cl_image_format)); + err = clGetSupportedImageFormats(context->global[p], flags, type, num_entries, formats, NULL); + for (f = 0; f < num_entries; f++) { + char order[256]; + char datat[256]; + #define CASE_STRINGERIZE2(value, string) case value: strcpy(string, #value); break + switch(formats[f].image_channel_order) { + CASE_STRINGERIZE2(CL_R, order); + CASE_STRINGERIZE2(CL_A, order); + CASE_STRINGERIZE2(CL_RG, order); + CASE_STRINGERIZE2(CL_RA, order); + CASE_STRINGERIZE2(CL_RGB, order); + CASE_STRINGERIZE2(CL_RGBA, order); + CASE_STRINGERIZE2(CL_BGRA, order); + CASE_STRINGERIZE2(CL_ARGB, order); + CASE_STRINGERIZE2(CL_INTENSITY, order); + CASE_STRINGERIZE2(CL_LUMINANCE, order); + CASE_STRINGERIZE2(CL_Rx, order); + CASE_STRINGERIZE2(CL_RGx, order); + CASE_STRINGERIZE2(CL_RGBx, order); + #if defined(CL_VERSION_1_2) && defined(cl_khr_gl_depth_images) + CASE_STRINGERIZE2(CL_DEPTH, order); + CASE_STRINGERIZE2(CL_DEPTH_STENCIL, order); + #if defined(__APPLE__) + CASE_STRINGERIZE2(CL_1RGB_APPLE, order); + CASE_STRINGERIZE2(CL_BGR1_APPLE, order); + CASE_STRINGERIZE2(CL_SFIXED14_APPLE, order); + CASE_STRINGERIZE2(CL_BIASED_HALF_APPLE, order); + CASE_STRINGERIZE2(CL_YCbYCr_APPLE, order); + CASE_STRINGERIZE2(CL_CbYCrY_APPLE, order); + CASE_STRINGERIZE2(CL_ABGR_APPLE, order); + #endif + #endif + default: + sprintf(order, "%x", formats[f].image_channel_order); + break; + } + switch(formats[f].image_channel_data_type) { + CASE_STRINGERIZE2(CL_SNORM_INT8, datat); + CASE_STRINGERIZE2(CL_SNORM_INT16, datat); + CASE_STRINGERIZE2(CL_UNORM_INT8, datat); + CASE_STRINGERIZE2(CL_UNORM_INT16, datat); + CASE_STRINGERIZE2(CL_UNORM_SHORT_565, datat); + CASE_STRINGERIZE2(CL_UNORM_SHORT_555, datat); + CASE_STRINGERIZE2(CL_UNORM_INT_101010, datat); + 
CASE_STRINGERIZE2(CL_SIGNED_INT8, datat); + CASE_STRINGERIZE2(CL_SIGNED_INT16, datat); + CASE_STRINGERIZE2(CL_SIGNED_INT32, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT8, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT16, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT32, datat); + CASE_STRINGERIZE2(CL_HALF_FLOAT, datat); + CASE_STRINGERIZE2(CL_FLOAT, datat); + #if defined(CL_VERSION_2_0) + CASE_STRINGERIZE2(CL_UNORM_INT24, datat); + #endif + default: + sprintf(order, "%x", formats[f].image_channel_data_type); + break; + } + VX_PRINT(VX_ZONE_INFO, "%s : %s\n", order, datat); + } + } + + /* create a queue for each device */ + for (d = 0; d < context->num_devices[p]; d++) + { + context->queues[p][d] = clCreateCommandQueue(context->global[p], + context->devices[p][d], + CL_QUEUE_PROFILING_ENABLE, + &err); + if (err == CL_SUCCESS) { + } + } + + /* for each kernel */ + for (k = 0; k < num_cl_kernels; k++) + { + char *sources = NULL; + size_t programSze = 0; + + /* load the source file */ + VX_PRINT(VX_ZONE_INFO, "Joiner: %s\n", FILE_JOINER); + VX_PRINT(VX_ZONE_INFO, "Path: %s\n", VX_CL_SOURCEPATH); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] File: %s\n", k, cl_kernels[k]->sourcepath); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] Name: %s\n", k, cl_kernels[k]->kernelname); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] ID: %s\n", k, cl_kernels[k]->description.name); + sources = clLoadSources(cl_kernels[k]->sourcepath, &programSze); + /* create a program with this source */ + cl_kernels[k]->program[p] = clCreateProgramWithSource(context->global[p], + 1, + (const char **)&sources, + &programSze, + &err); + if (err == CL_SUCCESS) + { + err = clBuildProgram((cl_program)cl_kernels[k]->program[p], + 1, + (const cl_device_id *)context->devices, + (const char *)cl_args, + NULL, + NULL); + if (err != CL_SUCCESS) + { + CL_BUILD_MSG(err, "Build Error"); + if (err == CL_BUILD_PROGRAM_FAILURE) + { + char log[10][1024]; + size_t logSize = 0; + clGetProgramBuildInfo((cl_program)cl_kernels[k]->program[p], + (cl_device_id)context->devices[p][0], + CL_PROGRAM_BUILD_LOG, + sizeof(log), + log, + &logSize); + printf("%s\n", log); + VX_PRINT(VX_ZONE_ERROR, "%s", log); + } + } + else + { + cl_int k2 = 0; + cl_build_status bstatus = 0; + size_t bs = 0; + err = clGetProgramBuildInfo(cl_kernels[k]->program[p], + context->devices[p][0], + CL_PROGRAM_BUILD_STATUS, + sizeof(cl_build_status), + &bstatus, + &bs); + VX_PRINT(VX_ZONE_INFO, "Status = %d (%d)\n", bstatus, err); + /* get the cl_kernels from the program */ + cl_kernels[k]->num_kernels[p] = 1; + err = clCreateKernelsInProgram(cl_kernels[k]->program[p], + 1, + &cl_kernels[k]->kernels[p], + NULL); + VX_PRINT(VX_ZONE_INFO, "Found %u cl_kernels in %s (%d)\n", cl_kernels[k]->num_kernels[p], cl_kernels[k]->sourcepath, err); + for (k2 = 0; (err == CL_SUCCESS) && (k2 < (cl_int)cl_kernels[k]->num_kernels[p]); k2++) + { + char kName[VX_MAX_KERNEL_NAME]; + size_t size = 0; + err = clGetKernelInfo(cl_kernels[k]->kernels[p], + CL_KERNEL_FUNCTION_NAME, + 0, + NULL, + &size); + err = clGetKernelInfo(cl_kernels[k]->kernels[p], + CL_KERNEL_FUNCTION_NAME, + size, + kName, + NULL); + VX_PRINT(VX_ZONE_INFO, "Kernel %s\n", kName); + if (strncmp(kName, cl_kernels[k]->kernelname, VX_MAX_KERNEL_NAME) == 0) + { + vx_kernel_f kfunc = cl_kernels[k]->description.function; + VX_PRINT(VX_ZONE_INFO, "Linked Kernel %s on target %s\n", cl_kernels[k]->kernelname, target->name); + target->num_kernels++; + target->base.context->num_kernels++; + status = ownInitializeKernel(target->base.context, + &target->kernels[k], + 
cl_kernels[k]->description.enumeration, + (kfunc == NULL ? vxclCallOpenCLKernel : kfunc), + cl_kernels[k]->description.name, + cl_kernels[k]->description.parameters, + cl_kernels[k]->description.numParams, + cl_kernels[k]->description.validate, + cl_kernels[k]->description.input_validate, + cl_kernels[k]->description.output_validate, + cl_kernels[k]->description.initialize, + cl_kernels[k]->description.deinitialize); + if (ownIsKernelUnique(&target->kernels[k]) == vx_true_e) { + target->base.context->num_unique_kernels++; + } else { + VX_PRINT(VX_ZONE_KERNEL, "Kernel %s is NOT unqiue\n", target->kernels[k].name); + } + } + } + } + } + else + { + CL_ERROR_MSG(err, "Program"); + } + free(sources); + } + } + } +exit: + if (err == CL_SUCCESS) { + status = VX_SUCCESS; + } else { + status = VX_ERROR_NO_RESOURCES; + } + return status; +} + +vx_status vxTargetDeinit(vx_target_t *target) +{ + vx_context context = target->base.context; + if (vxGetStatus((vx_reference)context) == VX_SUCCESS) + { + cl_uint p = 0, d = 0; + vx_uint32 k = 0; + for (p = 0; p < context->num_platforms; p++) + { + for (k = 0; k < num_cl_kernels; k++) + { + ownDecrementReference(&target->kernels[k].base, VX_INTERNAL); + clReleaseKernel(cl_kernels[k]->kernels[p]); + clReleaseProgram(cl_kernels[k]->program[p]); + + } + for (d = 0; d < context->num_devices[p]; d++) + { + clReleaseCommandQueue(context->queues[p][d]); + } + clReleaseContext(context->global[p]); + } + } + return VX_SUCCESS; +} + +vx_status vxTargetSupports(vx_target_t *target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME], +#endif + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "default", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "performance", VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + if (strncmp(kernelName, target->kernels[k].name, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) *pIndex = k; + break; + } + } + } + return status; +} + +vx_action vxTargetProcess(vx_target_t *target, vx_node_t *nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + VX_PRINT(VX_ZONE_GRAPH,"Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->base.context->targets[nodes[n]->affinity].name); + + ownStartCapture(&nodes[n]->perf); + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + ownStopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH,"kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH,"callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +vx_status vxTargetVerify(vx_target_t *target, vx_node_t *node) +{ + vx_status status = VX_SUCCESS; + 
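+    /* no target-specific node verification is required for the OpenCL target */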
return status; +} + +vx_kernel vxTargetAddKernel(vx_target_t *target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, func_ptr, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + return (vx_kernel)kernel; +} + +vx_cl_kernel_description_t *vxclFindKernel(vx_enum enumeration) +{ + vx_cl_kernel_description_t *vxclk = NULL; + vx_uint32 k; + for (k = 0; k < num_cl_kernels; k++) + { + if (enumeration == cl_kernels[k]->description.enumeration) + { + vxclk = cl_kernels[k]; + break; + } + } + return vxclk; +} + +/*! \brief Calls an OpenCL kernel from OpenVX Graph. + * Steps: + * \arg Find the target + * \arg Get the vxcl context + * \arg Find the kernel (to get cl kernel information) + * \arg for each input parameter that is an object, enqueue write + * \arg wait for finish + * \arg for each parameter, SetKernelArg + * \arg call kernel + * \arg wait for finish + * \arg for each output parameter that is an object, enqueue read + * \arg wait for finish + * \note This implementation will attempt to use the External API as much as possible, + * but will cast to internal representation when needed (due to lack of API or + * need for secret information). This is not an optimal OpenCL invocation. 
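+ * \note All buffer traffic below uses blocking clEnqueueWriteBuffer/clEnqueueReadBuffer
+ * (and their image variants), so every execution pays one host-to-device and one
+ * device-to-host copy per buffer parameter.
+ * \note A node is routed to this target by the name registered in vxTargetInit, e.g.:
+ * \code
+ * vx_node node = vxBox3x3Node(graph, src, dst);
+ * vxSetNodeTarget(node, VX_TARGET_STRING, "pc.opencl");
+ * \endcode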
+ */ +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + static struct timeval start, start1, end; + gettimeofday(&start, NULL); + + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity]; + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = {0,0,0}; + size_t work_dim[3]; + //size_t local_dim[3]; + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + /* for each input/bi data object, enqueue it and set the kernel parameters */ + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_enum type = node->kernel->signature.types[pidx]; + vx_memory_t *memory = NULL; + + switch (type) + { + case VX_TYPE_ARRAY: + memory = &((vx_array)ref)->memory; + break; + case VX_TYPE_CONVOLUTION: + memory = &((vx_convolution)ref)->base.memory; + break; + case VX_TYPE_DISTRIBUTION: + memory = &((vx_distribution)ref)->memory; + break; + case VX_TYPE_IMAGE: + memory = &((vx_image)ref)->memory; + break; + case VX_TYPE_LUT: + memory = &((vx_lut_t*)ref)->memory; + break; + case VX_TYPE_MATRIX: + memory = &((vx_matrix)ref)->memory; + break; + //case VX_TYPE_PYRAMID: + // break; + case VX_TYPE_REMAP: + memory = &((vx_remap)ref)->memory; + break; + //case VX_TYPE_SCALAR: + //case VX_TYPE_THRESHOLD: + // break; + } + if (memory) { + for (pln = 0; pln < memory->nptrs; pln++) { + if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { + if (type == VX_TYPE_IMAGE) { + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + // width, height, stride_x, stride_y + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n"); + } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) { + vx_array arr = (vx_array)ref; + // sizeof item, active count, capacity + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output? 
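+                        // capacity and the element stride follow, then the buffer handle itself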
+ err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n"); + } else if (type == VX_TYPE_MATRIX) { + vx_matrix mat = (vx_matrix)ref; + // columns, rows + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows); + VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n"); + } else if (type == VX_TYPE_DISTRIBUTION) { + vx_distribution dist = (vx_distribution)ref; + // num, range, offset, num_bins + vx_uint32 num_bins = dist->memory.dims[0][VX_DIM_X]; + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->range_x); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&num_bins); + } else if (type == VX_TYPE_CONVOLUTION) { + vx_convolution conv = (vx_convolution)ref; + // columns, rows, scale + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale); + } + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + gettimeofday(&start1, NULL); + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + gettimeofday(&end, NULL); + + double costTime = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0) + - ((double)start1.tv_sec * 1000.0 + (double)start1.tv_usec / 1000.0); + + printf("opencl write DMA %f ms\n", costTime); + } + } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { + vx_rectangle_t rect = {0}; + vx_image image = (vx_image)ref; + vxGetValidRegionImage(image, &rect); + size_t origin[3] = {rect.start_x, rect.start_y, 0}; + size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; + /* set the work dimensions */ + work_dim[0] = rect.end_x-rect.start_x; + work_dim[1] = rect.end_y-rect.start_y; + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (err != CL_SUCCESS) { + VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx); + } + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteImage(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + origin, region, + memory->strides[pln][VX_DIM_Y], + 0, + memory->ptrs[pln], + 0, NULL, + NULL); + CL_ERROR_MSG(err, "clEnqueueWriteImage"); + } + } + } + } else { + if (type == VX_TYPE_SCALAR) { + vx_value_t value; // largest platform atomic + vx_size size = 0ul; + vx_scalar sc = 
(vx_scalar)ref; + vx_enum stype = VX_TYPE_INVALID; + vxCopyScalar(sc, &value, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(sc, VX_SCALAR_TYPE, &stype, sizeof(stype)); + size = ownSizeOfType(stype); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value); + } + else if (type == VX_TYPE_THRESHOLD) { + vx_enum ttype = 0; + vx_threshold th = (vx_threshold)ref; + vxQueryThreshold(th, VX_THRESHOLD_TYPE, &ttype, sizeof(ttype)); + if (ttype == VX_THRESHOLD_TYPE_BINARY) { + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value); + } else if (ttype == VX_THRESHOLD_TYPE_RANGE) { + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper); + } + } + } + } + we = 0; + for (pidx = 0; pidx < num; pidx++) { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { + memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); + } + } + //local_dim[0] = 1; + //local_dim[1] = 1; + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + vxclk->kernels[plidx], + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + /* enqueue a read on all output data */ + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_enum type = node->kernel->signature.types[pidx]; + + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + vx_memory_t *memory = NULL; + + switch (type) + { + case VX_TYPE_ARRAY: + memory = &((vx_array)ref)->memory; + break; + case VX_TYPE_CONVOLUTION: + memory = &((vx_convolution)ref)->base.memory; + break; + case VX_TYPE_DISTRIBUTION: + memory = &((vx_distribution)ref)->memory; + break; + case VX_TYPE_IMAGE: + memory = &((vx_image)ref)->memory; + break; + case VX_TYPE_LUT: + memory = &((vx_lut_t*)ref)->memory; + break; + case VX_TYPE_MATRIX: + memory = &((vx_matrix)ref)->memory; + break; + //case VX_TYPE_PYRAMID: + // break; + case VX_TYPE_REMAP: + memory = &((vx_remap)ref)->memory; + break; + //case VX_TYPE_SCALAR: + //case VX_TYPE_THRESHOLD: + // break; + } + if (memory) { + for (pln = 0; pln < memory->nptrs; pln++) { + if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { + gettimeofday(&start1, NULL); + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + gettimeofday(&end, NULL); + + double costTime = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0) + - ((double)start1.tv_sec * 1000.0 + (double)start1.tv_usec / 1000.0); + + printf("opencl read DMA %f ms\n", costTime); + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { + vx_rectangle_t rect = {0}; + vx_image image = (vx_image)ref; + vxGetValidRegionImage(image, &rect); + size_t origin[3] = {rect.start_x, rect.start_y, 0}; + size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; + /* set the work dimensions */ + work_dim[0] = rect.end_x-rect.start_x; + work_dim[1] = rect.end_y-rect.start_y; + err = clEnqueueReadImage(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + origin, region, + memory->strides[pln][VX_DIM_Y], + 0, + memory->ptrs[pln], + 1, &node->base.event, + &ref->event); + CL_ERROR_MSG(err, "clEnqueueReadImage"); + 
+ VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]);
+ }
+ }
+ }
+ }
+ }
+ re = 0;
+ for (pidx = 0; pidx < num; pidx++) {
+ vx_reference ref = node->parameters[pidx];
+ vx_enum dir = node->kernel->signature.directions[pidx];
+ if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) {
+ memcpy(&readEvents[re++],&ref->event, sizeof(cl_event));
+ }
+ }
+ err = clFlush(context->queues[plidx][didx]);
+ gettimeofday(&end, NULL);
+
+ double costTime1 = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0)
+ - ((double)start.tv_sec * 1000.0 + (double)start.tv_usec / 1000.0);
+
+ printf("box3x3 core %f ms\n", costTime1);
+ CL_ERROR_MSG(err, "Flush");
+ VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n");
+ clWaitForEvents(re, readEvents);
+ if (err == CL_SUCCESS)
+ status = VX_SUCCESS;
+//exit:
+ VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status);
+ return status;
+}
+
diff --git a/sample/targets/opencl/vx_interface.h b/sample/targets/opencl/vx_interface.h
new file mode 100644
index 0000000..1c8aa69
--- /dev/null
+++ b/sample/targets/opencl/vx_interface.h
@@ -0,0 +1,107 @@
+/*
+
+ * Copyright (c) 2012-2017 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _VX_INTERFACE_H_
+#define _VX_INTERFACE_H_
+
+#include "vx_internal.h"
+
+#if defined(DARWIN)
+#include <OpenCL/opencl.h> /* assumed: the include target was lost in this copy of the patch; Apple's standard OpenCL header */
+#else
+#include <CL/cl.h> /* assumed: the include target was lost in this copy of the patch; the standard Khronos OpenCL header */
+#endif
+
+#include
+
+/*! \brief The maximum number of platforms */
+#define VX_CL_MAX_PLATFORMS (1)
+
+/*! \brief The maximum number of CL devices in the system */
+#define VX_CL_MAX_DEVICES (2)
+
+/*! \brief The maximum number of characters on a line of OpenCL source code */
+#define VX_CL_MAX_LINE_WIDTH (160)
+
+/*! \brief The maximum path name */
+#define VX_CL_MAX_PATH (256)
+
+#ifndef VX_CL_ARGS
+#define VX_CL_ARGS "-I."
+#endif + +#ifndef VX_CL_SOURCEPATH +#define VX_CL_SOURCEPATH "" +#endif + +typedef void (*cl_notifier_f)(cl_program program, void *args); + +typedef void (*cl_platform_notifier_f)(const char *errinfo, + const void *private_info, + size_t cb, + void *user_data); + +typedef struct _vx_cl_context_t { + cl_uint num_platforms; + cl_uint num_devices[VX_CL_MAX_PLATFORMS]; + cl_platform_id platform[VX_CL_MAX_PLATFORMS]; + cl_device_id devices[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + cl_context context[VX_CL_MAX_PLATFORMS]; + cl_context_properties context_props; + cl_command_queue queues[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + struct _vx_cl_kernel_description_t **kernels; + vx_uint32 num_kernels; +} vx_cl_context_t; + +#define INIT_PROGRAMS {0} +#define INIT_KERNELS {0} +#define INIT_NUMKERNELS {0} +#define INIT_RETURNS {{0,0}} + +typedef struct _vx_cl_kernel_description_t { + vx_kernel_description_t description; + char sourcepath[VX_CL_MAX_PATH]; + char kernelname[VX_MAX_KERNEL_NAME]; + cl_program program[VX_CL_MAX_PLATFORMS]; + cl_kernel kernels[VX_CL_MAX_PLATFORMS]; + cl_uint num_kernels[VX_CL_MAX_PLATFORMS]; + cl_int returns[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + void *reserved; /* for additional data */ +} vx_cl_kernel_description_t; + +vx_cl_kernel_description_t *vxclFindKernel(vx_enum enumeration); + +extern vx_cl_kernel_description_t box3x3_clkernel; +extern vx_cl_kernel_description_t add_clkernel; +extern vx_cl_kernel_description_t and_kernel; +extern vx_cl_kernel_description_t xor_kernel; +extern vx_cl_kernel_description_t orr_kernel; +extern vx_cl_kernel_description_t not_kernel; +extern vx_cl_kernel_description_t gaussian3x3_clkernel; +extern vx_cl_kernel_description_t sobel3x3_clkernel; +extern vx_cl_kernel_description_t erode3x3_kernel; +extern vx_cl_kernel_description_t dilate3x3_kernel; +extern vx_cl_kernel_description_t median3x3_kernel; +extern vx_cl_kernel_description_t nonlinearfilter_kernel; +extern vx_cl_kernel_description_t phase_kernel; +extern vx_cl_kernel_description_t warp_affine_kernel; +extern vx_cl_kernel_description_t warp_perspective_kernel; +extern vx_cl_kernel_description_t convolution_kernel; + +#endif + + diff --git a/sample/targets/opencl/vx_morphology.c b/sample/targets/opencl/vx_morphology.c new file mode 100644 index 0000000..2b1a2c2 --- /dev/null +++ b/sample/targets/opencl/vx_morphology.c @@ -0,0 +1,280 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set Output + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + 
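+ /* gather the per-parameter events recorded for each output so the host can block on them before returning */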
re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxErode3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + + +static vx_status VX_CALLBACK vxDilate3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxMorphologyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMorphologyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t morphology_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_cl_kernel_description_t erode3x3_kernel = { + { + VX_KERNEL_ERODE_3x3, + "org.khronos.openvx.erode_3x3", + vxErode3x3Kernel, + morphology_kernel_params, dimof(morphology_kernel_params), + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_erode3x3.cl", + "vx_erode3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +vx_cl_kernel_description_t dilate3x3_kernel = { + { + VX_KERNEL_DILATE_3x3, + "org.khronos.openvx.dilate_3x3", + vxDilate3x3Kernel, + morphology_kernel_params, dimof(morphology_kernel_params), + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_dilate3x3.cl", + "vx_dilate3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + + diff --git a/sample/targets/opencl/vx_nonlinearfilter.c b/sample/targets/opencl/vx_nonlinearfilter.c 
new file mode 100644 index 0000000..b20b19f --- /dev/null +++ b/sample/targets/opencl/vx_nonlinearfilter.c @@ -0,0 +1,366 @@ +/* + + * Copyright (c) 2016-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +#define C_MAX_NONLINEAR_DIM (9) + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + argidx = 0; + + // Input function + vx_reference ref = node->parameters[0]; + vx_value_t value; // largest platform atomic + vx_size size = 0ul; + vx_scalar sc = (vx_scalar)ref; + vx_enum stype = VX_TYPE_INVALID; + vxCopyScalar(sc, &value, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(sc, VX_SCALAR_TYPE, &stype, sizeof(stype)); + size = ownSizeOfType(stype); + err = clSetKernelArg(kernel, argidx++, size, &value); + + + // Input src + ref = node->parameters[1]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + + // Input mask + ref = node->parameters[2]; + memory = &((vx_matrix)ref)->memory; + + size = ownComputeMemorySize(memory, pln); + + memory->hdls[pln] = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, memory->ptrs[pln], &err); + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + size, + memory->ptrs[pln], + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + // Origin matrix + vx_matrix mask = (vx_matrix)parameters[2]; + vx_coordinates2d_t origin; + status |= vxQueryMatrix(mask, VX_MATRIX_ORIGIN, &origin, sizeof(origin)); + + vx_matrix mat = (vx_matrix)ref; + vx_size rx0 = origin.x; + vx_size ry0 = origin.y; + vx_size rx1 = 
mat->columns - origin.x - 1; + vx_size ry1 = mat->rows - origin.y - 1; + + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &rx0); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &ry0); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &rx1); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &ry1); + + vx_uint8 m[C_MAX_NONLINEAR_DIM * C_MAX_NONLINEAR_DIM]; + status |= vxCopyMatrix(mask, m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + int mask_index = 0; + int count_mask = 0; + for (int r = 0; r < mat->rows; ++r) + { + for (int c = 0; c < mat->columns; ++c, ++mask_index) + { + if (m[mask_index]) + ++count_mask; + } + } + + err = clSetKernelArg(kernel, argidx++, sizeof(int), &mat->rows); + err = clSetKernelArg(kernel, argidx++, sizeof(int), &count_mask); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + + //Set Output + ref = node->parameters[3]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + + // Input src + ref = node->parameters[1]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[3]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + ref = node->parameters[3]; + + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + 
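+ /* parameter 0 must be a VX_TYPE_ENUM scalar selecting the nonlinear filter function (median, min or max); anything else is rejected below */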
vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum function = 0; + vxCopyScalar(scalar, &function, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((function == VX_NONLINEAR_FILTER_MEDIAN) || + (function == VX_NONLINEAR_FILTER_MIN) || + (function == VX_NONLINEAR_FILTER_MAX)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size cols = 0, rows = 0; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &cols, sizeof(cols)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + if ((rows <= VX_INT_MAX_NONLINEAR_DIM) && + (cols <= VX_INT_MAX_NONLINEAR_DIM) && + (data_type == VX_TYPE_UINT8)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 1); /* we reference the input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t filter_kernel_params[] = { + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +vx_cl_kernel_description_t nonlinearfilter_kernel = { + { + VX_KERNEL_NON_LINEAR_FILTER, + "org.khronos.openvx.non_linear_filter", + vxNonLinearFilterKernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxNonLinearFilterInputValidator, + vxNonLinearFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_nonlinearfilter.cl", + "vx_nonlinearfilter", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_phase.c b/sample/targets/opencl/vx_phase.c new file mode 100644 index 0000000..c38d30a --- /dev/null +++ b/sample/targets/opencl/vx_phase.c @@ -0,0 +1,271 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + } + } + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + vx_reference ref; + /* enqueue a read on all output data */ + ref = node->parameters[2]; + + vx_memory_t *memory = NULL; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = 
clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static +vx_param_description_t phase_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +static +vx_status VX_CALLBACK vxPhaseKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} /* vxPhaseKernel() */ + +static vx_status VX_CALLBACK vxPhaseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16 || format == VX_DF_IMAGE_F32) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + + vxReleaseImage(&input0); + } + + vxReleaseParameter(¶m0); + } + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +static vx_status VX_CALLBACK vxPhaseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 2) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +vx_cl_kernel_description_t phase_kernel = +{ + { + VX_KERNEL_PHASE, + "org.khronos.openvx.phase", + vxPhaseKernel, + phase_kernel_params, dimof(phase_kernel_params), + NULL, + vxPhaseInputValidator, + vxPhaseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_phase.cl", + "vx_phase", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + + diff --git a/sample/targets/opencl/vx_support.c b/sample/targets/opencl/vx_support.c new file mode 100644 index 0000000..aebbd44 --- /dev/null +++ b/sample/targets/opencl/vx_support.c @@ -0,0 +1,264 @@ +/* + + * Copyright (c) 2011-2017 The 
Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define CASE_STRINGERIZE(err, label, function, file, line) \ + case err: \ + fprintf(stderr, "%s: OpenCL error "#err" at %s in %s:%d\n", label, function, file, line); \ + break + +static size_t flen(FILE *fp) +{ + size_t len = 0; + fseek(fp, 0, SEEK_END); + len = ftell(fp); + fseek(fp, 0, SEEK_SET); + return len; +} + +static size_t flines(FILE *fp) +{ + size_t numLines = 0; + if (fp) { + char line[CL_MAX_LINESIZE]; + fseek(fp, 0, SEEK_SET); + while (fgets(line, sizeof(line), fp) != NULL) { + numLines++; + } + //printf("%lu lines in file %p\n",numLines,fp); + fseek(fp, 0, SEEK_SET); + } + return numLines; +} + +cl_int clBuildError(cl_int build_status, const char *label, const char *function, const char *file, int line) +{ + switch (build_status) + { + case CL_BUILD_SUCCESS: + fprintf(stdout, "%s: Build Successful!\n", label); + break; + CASE_STRINGERIZE(CL_BUILD_NONE, label, function, file, line); + CASE_STRINGERIZE(CL_BUILD_ERROR, label, function, file, line); + CASE_STRINGERIZE(CL_BUILD_IN_PROGRESS, label, function, file, line); + default: + fprintf(stderr, "%s: Unknown build error %d at %s in %s:%d\n", label, build_status, function, file, line); + break; + } + return build_status; +} + +cl_int clPrintError(cl_int err, const char *label, const char *function, const char *file, int line) +{ + switch (err) + { + //CASE_STRINGERIZE(CL_SUCCESS, label, function, file, line); + case CL_SUCCESS: + break; + CASE_STRINGERIZE(CL_BUILD_PROGRAM_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_COMPILER_NOT_AVAILABLE, label, function, file, line); + CASE_STRINGERIZE(CL_DEVICE_NOT_AVAILABLE, label, function, file, line); + CASE_STRINGERIZE(CL_DEVICE_NOT_FOUND, label, function, file, line); + CASE_STRINGERIZE(CL_IMAGE_FORMAT_MISMATCH, label, function, file, line); + CASE_STRINGERIZE(CL_IMAGE_FORMAT_NOT_SUPPORTED, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_INDEX, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_VALUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BINARY, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BUFFER_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BUILD_OPTIONS, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_COMMAND_QUEUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_CONTEXT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_DEVICE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_DEVICE_TYPE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_EVENT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_EVENT_WAIT_LIST, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_GL_OBJECT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_GLOBAL_OFFSET, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_HOST_PTR, label, function, file, line); + 
CASE_STRINGERIZE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_IMAGE_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_NAME, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_ARGS, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_DEFINITION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_MEM_OBJECT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_OPERATION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PLATFORM, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PROGRAM, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PROGRAM_EXECUTABLE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_QUEUE_PROPERTIES, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_SAMPLER, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_VALUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_DIMENSION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_GROUP_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_ITEM_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_MAP_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_MEM_OBJECT_ALLOCATION_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_MEM_COPY_OVERLAP, label, function, file, line); + CASE_STRINGERIZE(CL_OUT_OF_HOST_MEMORY, label, function, file, line); + CASE_STRINGERIZE(CL_OUT_OF_RESOURCES, label, function, file, line); + CASE_STRINGERIZE(CL_PROFILING_INFO_NOT_AVAILABLE, label, function, file, line); + default: + fprintf(stderr, "%s: Unknown error %d at %s in %s:%d\n", label, err, function, file, line); + break; + } + return err; +} + +char *clLoadSources(char *filename, size_t *programSize) +{ + FILE *pFile = NULL; + char *programSource = NULL; + VX_PRINT(VX_ZONE_INFO, "Reading source file %s\n", filename); + pFile = fopen((char *)filename, "rb"); + if (pFile != NULL && programSize) + { + // obtain file size: + fseek(pFile, 0, SEEK_END); + *programSize = ftell(pFile); + rewind(pFile); + + int size = *programSize + 1; + programSource = (char*)malloc(sizeof(char)*(size)); + if (programSource == NULL) + { + fclose(pFile); + free(programSource); + return NULL; + } + + fread(programSource, sizeof(char), *programSize, pFile); + programSource[*programSize] = '\0'; + fclose(pFile); + } + return programSource; +} + +#if defined(EXPERIMENTAL_USE_FNMATCH) +static int vx_source_filter(const struct dirent *de) +{ + if (de && 0 == fnmatch("vx_*.cl", de->d_name, +#if defined(__QNX__) || defined(__APPLE__) + FNM_PERIOD|FNM_PATHNAME)) +#else + FNM_PERIOD|FNM_FILE_NAME)) +#endif + return 1; + else + return 0; +} + +#if defined(__QNX__) +typedef int (*sorting_f)(const void *, const void *); +#else +typedef int (*sorting_f)(const struct dirent **, const struct dirent **); +#endif + +static int name_sort(const struct dirent **a, const struct dirent **b) +{ + return strcmp((*a)->d_name, (*b)->d_name); +} + +cl_program vxLoadProgram(cl_context context, const char *src_dir, cl_int *perr) +{ + cl_program program; + struct dirent **names = NULL; + int i, f, num_lines = 0, cur_line = 0; + int num_files = scandir(src_dir, &names, &vx_source_filter, &name_sort); + size_t *lengths = NULL, lineSize = CL_MAX_LINESIZE; + char **source = NULL; + printf("Matched %d files\n", num_files); + for (f = 0; f < num_files; f++) { + if (names[f]->d_name) { + char 
pathname[CL_MAX_LINESIZE]; + sprintf(pathname, "%s%s", src_dir, names[f]->d_name); + FILE *fp = fopen(pathname, "r"); + if (fp) { + num_lines += flines(fp); + fclose(fp); + } + } + } + printf("Total Number Lines: %d\n", num_lines); + // allocate big array of lines + source = ALLOC(char *, num_lines); + lengths = ALLOC(size_t, num_lines); + for (i = 0; i < num_lines; i++) { + source[i] = ALLOC(char, lineSize); + lengths[i] = lineSize; + } + // load all source into a single array + for (f = 0; f < num_files; f++) { + if (names[f]->d_name) { + char pathname[CL_MAX_LINESIZE]; + sprintf(pathname, "%s%s", src_dir, names[f]->d_name); + FILE *fp = fopen(pathname, "r"); + if (fp) { + printf("Reading from file %s\n", pathname); + do { + if (fgets(source[cur_line], lengths[cur_line], fp) == NULL) + break; + // trim to exact lengths + lengths[cur_line] = strlen(source[cur_line]); + cur_line++; + } while (1); + printf("@ %u lines\n", cur_line); + fclose(fp); + } + } + } + if (num_lines != cur_line) { + fprintf(stderr, "Failed to read in all lines from source files!\n"); + return 0; + } +#if 1 + for (i = 0; i < num_lines; i++) { + printf("%4d [%4zu] %s", i, lengths[i], source[i]); + } +#endif + program = clCreateProgramWithSource(context, num_lines, (const char **)source, lengths, perr); + CL_ERROR_MSG(*perr, "clCreateProgramWithSource"); +#if 0 + if (perr != CL_SUCCESS) { + cl_int err = 0; + size_t src_size = 0; + char *src = NULL; + err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &src_size); + CL_ERROR_MSG(err, "clGetProgramInfo"); + printf("Source Code has %zu bytes\n", src_size); + src = (char *)malloc(src_size); + err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, src_size, src, NULL); + CL_ERROR_MSG(err, "clGetProgramInfo"); + printf("%s", src); + free(src); + } +#endif + return program; +} + +#elif defined(_WIN32) + +cl_program vxLoadProgram(cl_context context, const char *src_dir, cl_int *perr) { + return 0; +} + +#endif + + + diff --git a/sample/targets/opencl/vx_support.h b/sample/targets/opencl/vx_support.h new file mode 100644 index 0000000..64aa1f5 --- /dev/null +++ b/sample/targets/opencl/vx_support.h @@ -0,0 +1,46 @@ +/* + + * Copyright (c) 2011-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "vx_internal.h" + +#if defined(__ANDROID__) || defined(__linux__) || defined(__QNX__) || defined(__CYGWIN__) || defined(__APPLE__) +#if !defined(__QNX__) && !defined(__APPLE__) +#include +#else +#define __EXT_UNIX_MISC //Needed by QNX version of dirent.h to include scandir() +#endif +#include +#if defined(__APPLE__) +#include +#endif +#include +#include +#define EXPERIMENTAL_USE_FNMATCH +#elif defined(_WIN32) +#define snprintf _snprintf +#endif + +#define CL_MAX_LINESIZE (1024) + +#define ALLOC(type,count) (type *)calloc(count, sizeof(type)) +#define CL_ERROR_MSG(err, string) clPrintError(err, string, __FUNCTION__, __FILE__, __LINE__) +#define CL_BUILD_MSG(err, string) clBuildError(err, string, __FUNCTION__, __FILE__, __LINE__) + +char *clLoadSources(char *filename, size_t *programSize); +cl_int clBuildError(cl_int build_status, const char *label, const char *function, const char *file, int line); +cl_int clPrintError(cl_int err, const char *label, const char *function, const char *file, int line); diff --git a/sample/targets/opencl/vx_warp.c b/sample/targets/opencl/vx_warp.c new file mode 100644 index 0000000..243515e --- /dev/null +++ b/sample/targets/opencl/vx_warp.c @@ -0,0 +1,395 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "vx_interface.h" + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + argidx = 0; + + vx_reference ref; + + // Input src + ref = node->parameters[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + vx_size in_step_x = 1; + vx_size in_step_y = 1; + vx_size in_offset_first_element_in_bytes = 0; + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + + //stride_x, step_x, stride_y, step_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_step_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_step_y); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_offset_first_element_in_bytes); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 5 parameters\n"); + + vx_int32 src_width = memory->dims[pln][VX_DIM_X]; + vx_int32 src_height = memory->dims[pln][VX_DIM_Y]; + + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set Output + ref = node->parameters[3]; + memory = &((vx_image)ref)->memory; + + vx_size out_step_x = 4; + vx_size out_step_y = memory->strides[pln][VX_DIM_Y]; + vx_size out_offset_first_element_in_bytes = 0; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X] / 4; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + + //stride_x, step_x, stride_y, step_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_step_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_step_y); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_offset_first_element_in_bytes); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 5 parameters\n"); + + int width = memory->dims[pln][VX_DIM_X]; + int height = memory->dims[pln][VX_DIM_Y]; + //width, height + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &src_width); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &src_height); + + CL_ERROR_MSG(err, "clSetKernelArg"); + + vx_matrix mask = (vx_matrix)parameters[1]; + + vx_size matrix_size = 9; + + //vx_float32 *m = (vx_float32 *)malloc(matrix_size * sizeof(vx_float32)); + vx_float32 m[9]; + + status |= vxCopyMatrix(mask, m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + cl_mem mat = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 
matrix_size * sizeof(vx_float32), m, &err); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + mat, + CL_TRUE, + 0, + matrix_size * sizeof(vx_float32), + m, + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &mat); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set type + vx_scalar stype = (vx_scalar)parameters[2]; + vx_int32 type = 0; + status |= vxCopyScalar(stype, &type, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &type); + + we = 0; + + // Input src + ref = node->parameters[0]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + /* enqueue a read on all output data */ + ref = node->parameters[3]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + + clReleaseMemObject(mat); + + return status; +} + +static vx_status vxWarpInputValidator(vx_node node, vx_uint32 index, vx_size mat_columns) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size rows = 0ul, columns = 0ul; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &columns, sizeof(columns)); + if ((data_type == VX_TYPE_FLOAT32) && (columns == mat_columns) && (rows == 3)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp 
= 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpAffineInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 2); +} + +static vx_status VX_CALLBACK vxWarpPerspectiveInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 3); +} + +static vx_status VX_CALLBACK vxWarpOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) + { + vx_image dst = 0; + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if (dst) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(dst, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + /* output can not be virtual */ + if ((w1 != 0) && (h1 != 0) && (f1 == VX_DF_IMAGE_U8)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w1; + ptr->dim.image.height = h1; + status = VX_SUCCESS; + } + vxReleaseImage(&dst); + } + vxReleaseParameter(&dst_param); + } + } + return status; +} + +static vx_param_description_t warp_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxWarpAffineKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL WarpAffine\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxWarpPerspectiveKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL WarpPerspective\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t warp_affine_kernel = { + { + VX_KERNEL_WARP_AFFINE, + "org.khronos.openvx.warp_affine", + vxWarpAffineKernel, + warp_kernel_params, dimof(warp_kernel_params), + NULL, + vxWarpAffineInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_warp_affine.cl", + "warp_affine", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +vx_cl_kernel_description_t warp_perspective_kernel = { + { + VX_KERNEL_WARP_PERSPECTIVE, + "org.khronos.openvx.warp_perspective", + vxWarpPerspectiveKernel, + warp_kernel_params, dimof(warp_kernel_params), + NULL, + vxWarpPerspectiveInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_warp_perspective.cl", + "warp_perspective", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/tiling/CMakeLists.txt b/sample/targets/tiling/CMakeLists.txt new file mode 100644 index 0000000..5dd957a --- /dev/null 
+++ b/sample/targets/tiling/CMakeLists.txt @@ -0,0 +1,50 @@ +# +# Copyright (c) 2011-2018 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# set target name +set( TARGET_NAME openvx-tiling_chaining ) + +include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${VX_HEADER_DIR} + ${CMAKE_SOURCE_DIR}/kernels/tiling + ${CMAKE_SOURCE_DIR}/debug + ${CMAKE_SOURCE_DIR}/utils + ) + +FIND_SOURCES() + +if ((WIN32) OR (CYGWIN)) + set( DEF_FILE openvx-target.def ) +endif ((WIN32) OR (CYGWIN)) + +# add a target named ${TARGET_NAME} +add_library (${TARGET_NAME} SHARED ${SOURCE_FILES} ${DEF_FILE}) + +if (CYGWIN) + set_target_properties( ${TARGET_NAME} PROPERTIES LINK_FLAGS ${CMAKE_CURRENT_SOURCE_DIR}/${DEF_FILE} ) +endif (CYGWIN) + +target_link_libraries( ${TARGET_NAME} openvx-debug-lib openvx-extras-lib openvx-helper openvx-tiling_chaining-lib openvx vxu half) + +install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION bin ) + +set_target_properties( ${TARGET_NAME} PROPERTIES FOLDER ${SAMPLE_TARGETS_FOLDER} ) diff --git a/sample/targets/tiling/openvx-target.def b/sample/targets/tiling/openvx-target.def new file mode 100644 index 0000000..ac029d2 --- /dev/null +++ b/sample/targets/tiling/openvx-target.def @@ -0,0 +1,12 @@ +LIBRARY "openvx-tiling_chaining.dll" +VERSION 1.0 +EXPORTS + vxTargetInit + vxTargetDeinit + vxTargetVerify + vxTargetProcess + vxTargetSupports + vxTargetAddKernel + vxTargetAddTilingKernel + vxPublishKernels + vxUnpublishKernels diff --git a/sample/targets/tiling/vx_absdiff.c b/sample/targets/tiling/vx_absdiff.c new file mode 100644 index 0000000..9ab8d44 --- /dev/null +++ b/sample/targets/tiling/vx_absdiff.c @@ -0,0 +1,146 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxAbsDiffInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0 ) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 + || format == VX_DF_IMAGE_S16 +#if defined(OPENVX_USE_S16) + || format == VX_DF_IMAGE_U16 +#endif + ) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + { + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxAbsDiffOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS)) + { + vx_image images[2]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format = 0; + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + if (width[0] == width[1] && height[0] == height[1] && + (format == VX_DF_IMAGE_U8 + || format == VX_DF_IMAGE_S16 +#if defined(OPENVX_USE_S16) + || format == VX_DF_IMAGE_U16 +#endif + )) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width[0]; + ptr->dim.image.height = height[1]; + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + } + return status; +} +vx_tiling_kernel_t absdiff_kernel = +{ + "org.khronos.openvx.tiling_absdiff", + VX_KERNEL_ABSDIFF_TILING, + NULL, + AbsDiff_image_tiling_flexible, + AbsDiff_image_tiling_fast, + 3, + { { 
VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAbsDiffInputValidator, + vxAbsDiffOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_addsub.c b/sample/targets/tiling/vx_addsub.c new file mode 100644 index 0000000..5c99f8e --- /dev/null +++ b/sample/targets/tiling/vx_addsub.c @@ -0,0 +1,213 @@ +/* + + * Copyright (c) 2013-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxAddSubtractInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format1; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format1, sizeof(format1)); + if (width[0] == width[1] && height[0] == height[1] && + (format1 == VX_DF_IMAGE_U8 || format1 == VX_DF_IMAGE_S16)) + status = VX_SUCCESS; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) /* overflow_policy: truncate or saturate. 
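+                 Roughly, for VX_DF_IMAGE_U8 data: WRAP keeps only the low 8 bits
+                 (200 + 100 -> 44), while SATURATE clamps to the type range
+                 (200 + 100 -> 255). The check below only verifies that the scalar
+                 is a VX_TYPE_ENUM holding one of the two VX_CONVERT_POLICY_*
+                 values; the policy itself is applied by the tiling functions at
+                 run time.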
*/ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxAddSubtractOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + /* + * We need to look at both input images, but only for the format: + * if either is S16 or the output type is not U8, then it's S16. + * The geometry of the output image is copied from the first parameter: + * the input images are known to match from input parameters validation. + */ + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + + /* + * When passing on the geometry to the output image, we only look at + * image 0, as both input images are verified to match, at input + * validation. 
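+     * For example: two U8 inputs with a U8 output image stay U8, while two U8
+     * inputs with an S16 output, or any S16 input, select S16; the code below
+     * accepts either case and only rewrites the output meta format accordingly.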
+ */ + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else + { + outformat = VX_DF_IMAGE_S16; + status = VX_SUCCESS; + } + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} + +vx_tiling_kernel_t add_kernel = { + "org.khronos.openvx.tiling_add", + VX_KERNEL_ADD_TILING, + NULL, + Addition_image_tiling_flexible, + Addition_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAddSubtractInputValidator, + vxAddSubtractOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t subtract_kernel = { + "org.khronos.openvx.tiling_subtract", + VX_KERNEL_SUBTRACT_TILING, + NULL, + Subtraction_image_tiling_flexible, + Subtraction_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAddSubtractInputValidator, + vxAddSubtractOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_bitwise.c b/sample/targets/tiling/vx_bitwise.c new file mode 100644 index 0000000..69dc6fa --- /dev/null +++ b/sample/targets/tiling/vx_bitwise.c @@ -0,0 +1,236 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxBinaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + status = VX_SUCCESS; + vxReleaseImage(&images[1]); + vxReleaseImage(&images[0]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxBinaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + if (param0) + { + vx_image image0 = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &image0, sizeof(image0)); + /* + * When passing on the geometry to the output image, we only look at image 0, as + * both input images are verified to match, at input validation. 
+ */ + if (image0) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(image0, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image0, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&image0); + } + vxReleaseParameter(¶m0); + } + } + return status; +} + +vx_tiling_kernel_t And_kernel = +{ + "org.khronos.openvx.tiling_and", + VX_KERNEL_AND_TILING, + NULL, + And_image_tiling_flexible, + And_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Or_kernel = +{ + "org.khronos.openvx.tiling_or", + VX_KERNEL_OR_TILING, + NULL, + Or_image_tiling_flexible, + Or_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Xor_kernel = +{ + "org.khronos.openvx.tiling_xor", + VX_KERNEL_XOR_TILING, + NULL, + Xor_image_tiling_flexible, + Xor_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +/* The Not kernel is an unary operator, requiring separate validators. 
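+   Only parameter 0 (a U8 input image) and parameter 1 (the output image) exist,
+   so the cross-image width/height/format checks done at index 1 for the binary
+   operators above do not apply here.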
*/ +static vx_status VX_CALLBACK vxUnaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxUnaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image inimage = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &inimage, sizeof(inimage)); + if (inimage) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(inimage, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(inimage, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&inimage); + } + vxReleaseParameter(¶m); + } + } + return status; +} +vx_tiling_kernel_t Not_kernel = +{ + "org.khronos.openvx.tiling_not", + VX_KERNEL_NOT_TILING, + NULL, + Not_image_tiling_flexible, + Not_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxUnaryBitwiseInputValidator, + vxUnaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_channelcombine.c b/sample/targets/tiling/vx_channelcombine.c new file mode 100644 index 0000000..1add231 --- /dev/null +++ b/sample/targets/tiling/vx_channelcombine.c @@ -0,0 +1,189 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxChannelCombineInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index < 4) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &image, sizeof(image)); + if (image) + { + vx_df_image format = 0; + vxQueryImage(image, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&image); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxChannelCombineOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 4) + { + vx_uint32 p, width = 0, height = 0; + vx_uint32 uv_x_scale = 0, uv_y_scale = 0; + vx_parameter params[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, 2), + vxGetParameterByIndex(node, 3), + vxGetParameterByIndex(node, index) + }; + vx_bool planes_present[4] = { vx_false_e, vx_false_e, vx_false_e, vx_false_e }; + /* check for equal plane sizes and determine plane presence */ + for (p = 0; p < index; p++) + { + if (vxGetStatus((vx_reference)params[p]) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(params[p], VX_PARAMETER_REF, &image, sizeof(image)); + planes_present[p] = image != 0; + + if (image) + { + uint32_t w = 0, h = 0; + vxQueryImage(image, VX_IMAGE_WIDTH, &w, sizeof(w)); + vxQueryImage(image, VX_IMAGE_HEIGHT, &h, sizeof(h)); + if (width == 0 && height == 0) + { + width = w; + height = h; + } + else if (uv_x_scale == 0 && uv_y_scale == 0) + { + uv_x_scale = width == w ? 1 : (width == 2*w ? 2 : 0); + uv_y_scale = height == h ? 1 : (height == 2*h ? 
2 : 0); + if (uv_x_scale == 0 || uv_y_scale == 0 || uv_y_scale > uv_x_scale) + { + status = VX_ERROR_INVALID_DIMENSION; + vxAddLogEntry((vx_reference)image, status, "Input image channel %u does not match in dimensions!\n", p); + goto exit; + } + } + else if (width != w * uv_x_scale || height != h * uv_y_scale) + { + status = VX_ERROR_INVALID_DIMENSION; + vxAddLogEntry((vx_reference)image, status, "Input image channel %u does not match in dimensions!\n", p); + goto exit; + } + vxReleaseImage(&image); + } + } + } + if (params[index]) + { + vx_image output = 0; + vxQueryParameter(params[index], VX_PARAMETER_REF, &output, sizeof(output)); + if (output) + { + vx_df_image format = VX_DF_IMAGE_VIRT; + vx_bool supported_format = vx_true_e; + vx_bool correct_planes = planes_present[0] && planes_present[1] && planes_present[2]; + + vxQueryImage(output, VX_IMAGE_FORMAT, &format, sizeof(format)); + switch (format) + { + case VX_DF_IMAGE_RGB: + case VX_DF_IMAGE_YUV4: + correct_planes = correct_planes && uv_y_scale == 1 && uv_x_scale == 1; + break; + case VX_DF_IMAGE_RGBX: + correct_planes = correct_planes && planes_present[3] && uv_y_scale == 1 && uv_x_scale == 1; + break; + case VX_DF_IMAGE_YUYV: + case VX_DF_IMAGE_UYVY: + correct_planes = correct_planes && uv_y_scale == 1 && uv_x_scale == 2; + break; + case VX_DF_IMAGE_NV12: + case VX_DF_IMAGE_NV21: + case VX_DF_IMAGE_IYUV: + correct_planes = correct_planes && uv_y_scale == 2 && uv_x_scale == 2; + break; + default: + supported_format = vx_false_e; + } + if (supported_format) + { + if (correct_planes) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + } + else + { + VX_PRINT(VX_ZONE_API, "Valid format but missing planes!\n"); + } + } + vxReleaseImage(&output); + } + } +exit: + for (p = 0; p < dimof(params); p++) + { + if (params[p]) + { + vxReleaseParameter(¶ms[p]); + } + } + } + VX_PRINT(VX_ZONE_API, "%s:%u returned %d\n", __FUNCTION__, index, status); + return status; +} + +vx_tiling_kernel_t channelcombine_kernel = +{ + "org.khronos.openvx.tiling_channel_combine", + VX_KERNEL_CHANNEL_COMBINE_TILING, + NULL, + ChannelCombine_image_tiling_flexible, + ChannelCombine_image_tiling_fast, + 5, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxChannelCombineInputValidator, + vxChannelCombineOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_colorconvert.c b/sample/targets/tiling/vx_colorconvert.c new file mode 100644 index 0000000..eefab0b --- /dev/null +++ b/sample/targets/tiling/vx_colorconvert.c @@ -0,0 +1,190 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxColorConvertInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_SUCCESS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &image, sizeof(image)); + if (image) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(image, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(image, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image, VX_IMAGE_HEIGHT, &height, sizeof(height)); + // check to make sure the input format is supported. + switch (format) + { + case VX_DF_IMAGE_RGB: /* 8:8:8 interleaved */ + case VX_DF_IMAGE_RGBX: /* 8:8:8:8 interleaved */ + case VX_DF_IMAGE_NV12: /* 4:2:0 co-planar*/ + case VX_DF_IMAGE_NV21: /* 4:2:0 co-planar*/ + case VX_DF_IMAGE_IYUV: /* 4:2:0 planar */ + if (height & 1) + { + status = VX_ERROR_INVALID_DIMENSION; + break; + } + /* no break */ + case VX_DF_IMAGE_YUYV: /* 4:2:2 interleaved */ + case VX_DF_IMAGE_UYVY: /* 4:2:2 interleaved */ + if (width & 1) + { + status = VX_ERROR_INVALID_DIMENSION; + } + break; + default: + status = VX_ERROR_INVALID_FORMAT; + break; + } + vxReleaseImage(&image); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseParameter(¶m); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + return status; +} + +static vx_df_image color_combos[][2] = { + /* {src, dst} */ + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_NV21}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_YUV4}, +}; + +static vx_status VX_CALLBACK vxColorConvertOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, 1); + if ((vxGetStatus((vx_reference)param0) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param1) == VX_SUCCESS)) + { + vx_image output = 0, input = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + 
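+            /* The (input, output) format pair is accepted only if it appears in
+             * the color_combos table above; for example NV12 -> RGB is listed and
+             * passes, while YUV4 only ever appears as a destination, so a YUV4
+             * source falls through and keeps VX_ERROR_INVALID_PARAMETERS. */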
vxQueryParameter(param1, VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_df_image src = VX_DF_IMAGE_VIRT; + vx_df_image dst = VX_DF_IMAGE_VIRT; + vxQueryImage(input, VX_IMAGE_FORMAT, &src, sizeof(src)); + vxQueryImage(output, VX_IMAGE_FORMAT, &dst, sizeof(dst)); + if (dst != VX_DF_IMAGE_VIRT) /* can't be a unspecified format */ + { + vx_uint32 i = 0; + for (i = 0; i < dimof(color_combos); i++) + { + if ((color_combos[i][0] == src) && + (color_combos[i][1] == dst)) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = dst; + vxQueryImage(input, VX_IMAGE_WIDTH, &ptr->dim.image.width, sizeof(ptr->dim.image.width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &ptr->dim.image.height, sizeof(ptr->dim.image.height)); + status = VX_SUCCESS; + break; + } + } + } + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + } + VX_PRINT(VX_ZONE_API, "%s:%u returned %d\n", __FUNCTION__, index, status); + return status; +} + +/*! \brief The exported kernel table entry */ +vx_tiling_kernel_t colorconvert_kernel = +{ + "org.khronos.openvx.tiling_color_convert", + VX_KERNEL_COLOR_CONVERT_TILING, + NULL, + ConvertColor_image_tiling_flexible, + ConvertColor_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxColorConvertInputValidator, + vxColorConvertOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_convertdepth.c b/sample/targets/tiling/vx_convertdepth.c new file mode 100644 index 0000000..619c57f --- /dev/null +++ b/sample/targets/tiling/vx_convertdepth.c @@ -0,0 +1,210 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxConvertDepthInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if ((status == VX_SUCCESS) && input) + { + vx_df_image format = 0; + status = vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((status != VX_SUCCESS) || + (format == VX_DF_IMAGE_U8) || +#if defined(EXPERIMENTAL_USE_S16) + (format == VX_DF_IMAGE_U16) || + (format == VX_DF_IMAGE_U32) || + (format == VX_DF_IMAGE_S32) || + (format == VX_DF_IMAGE_F32) || +#endif + (format == VX_DF_IMAGE_S16)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + printf("Overflow given as %08x\n", overflow_policy); + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (status == VX_SUCCESS) + { + vx_enum type = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + vx_int32 shift = 0; + status = vxCopyScalar(scalar, &shift, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status == VX_SUCCESS) + { + /*! 
\internal Allowing \f$ 0 \le shift < 32 \f$ could + * produce weird results for smaller bit depths */ + if (shift < 0 || shift >= 32) + { + status = VX_ERROR_INVALID_VALUE; + } + /* status should be VX_SUCCESS from call */ + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxConvertDepthOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS)) + { + vx_image images[2] = {0,0}; + status = VX_SUCCESS; + status |= vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + status |= vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if ((status == VX_SUCCESS) && (images[0]) && (images[1])) + { + vx_uint32 width = 0, height = 0; + vx_df_image format[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + status |= vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + status |= vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + status |= vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + status |= vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_S16)) || +#if defined(EXPERIMENTAL_USE_S16) + ((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_U16)) || + ((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_U32)) || + ((format[0] == VX_DF_IMAGE_U16) && (format[1] == VX_DF_IMAGE_U8)) || + ((format[0] == VX_DF_IMAGE_U16) && (format[1] == VX_DF_IMAGE_U32)) || + ((format[0] == VX_DF_IMAGE_S16) && (format[1] == VX_DF_IMAGE_S32)) || + ((format[0] == VX_DF_IMAGE_U32) && (format[1] == VX_DF_IMAGE_U8)) || + ((format[0] == VX_DF_IMAGE_U32) && (format[1] == VX_DF_IMAGE_U16)) || + ((format[0] == VX_DF_IMAGE_S32) && (format[1] == VX_DF_IMAGE_S16)) || + ((format[0] == VX_DF_IMAGE_F32) && (format[1] == VX_DF_IMAGE_U8)) || /* non-specification */ +#endif + ((format[0] == VX_DF_IMAGE_S16) && (format[1] == VX_DF_IMAGE_U8))) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format[1]; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + } + return status; +} + + +vx_tiling_kernel_t convertdepth_kernel = +{ + "org.khronos.openvx.tiling_convertdepth", + VX_KERNEL_CONVERTDEPTH_TILING, + NULL, + ConvertDepth_image_tiling_flexible, + ConvertDepth_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxConvertDepthInputValidator, + vxConvertDepthOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_convolution.c b/sample/targets/tiling/vx_convolution.c new file mode 100644 index 0000000..60f9d3e --- /dev/null +++ b/sample/targets/tiling/vx_convolution.c @@ -0,0 +1,154 @@ 
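The kernel tables in these files only register the tiling kernels with the sample target; nothing in this patch calls them directly. A minimal client-side sketch of how one of them could be exercised, assuming this target library is built and loaded so that vxGetKernelByName() resolves the published name (the helper function and variable names here are illustrative, not part of the patch):

    #include <VX/vx.h>

    /* Illustrative only: run the tiling absdiff kernel from a client graph. */
    static vx_status run_tiling_absdiff(vx_context context,
                                        vx_image in0, vx_image in1, vx_image out)
    {
        vx_status status = VX_FAILURE;
        vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.tiling_absdiff");
        if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS)
        {
            vx_graph graph = vxCreateGraph(context);
            vx_node node = vxCreateGenericNode(graph, kernel);
            vxSetParameterByIndex(node, 0, (vx_reference)in0);
            vxSetParameterByIndex(node, 1, (vx_reference)in1);
            vxSetParameterByIndex(node, 2, (vx_reference)out);
            status = vxVerifyGraph(graph);      /* invokes the validators defined above */
            if (status == VX_SUCCESS)
                status = vxProcessGraph(graph);
            vxReleaseNode(&node);
            vxReleaseGraph(&graph);
            vxReleaseKernel(&kernel);
        }
        return status;
    }

vxVerifyGraph() is where the input/output validators above run, so an unsupported input format (for instance U16 without OPENVX_USE_S16) is reported at verification rather than at process time.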
+/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxConvolveInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + +#if defined(EXPERIMENTAL_USE_S16) + if( (format == VX_DF_IMAGE_U8) || (format == VX_DF_IMAGE_S16) ) +#else + if (format == VX_DF_IMAGE_U8) +#endif + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + if (index == 1) + { + vx_image input = 0; + vx_convolution conv = 0; + + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, index); + + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param1, VX_PARAMETER_REF, &conv, sizeof(conv)); + if (input && conv) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_size dims[2] = { 0, 0 }; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &dims[0], sizeof(dims[0])); + vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &dims[1], sizeof(dims[1])); + + if ((dims[0] <= VX_INT_MAX_CONVOLUTION_DIM) && + (dims[1] <= VX_INT_MAX_CONVOLUTION_DIM) && + (width >= dims[0]) && + (height >= dims[1])) + { + status = VX_SUCCESS; + } + + vxReleaseImage(&input); + vxReleaseConvolution(&conv); + } + + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + + return status; +} + +static vx_status VX_CALLBACK vxConvolveOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter params[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)params[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)params[1]) == VX_SUCCESS)) + { + vx_image input = 0; + vx_image output = 0; + vxQueryParameter(params[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(params[1], VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + vx_df_image output_format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryImage(output, VX_IMAGE_FORMAT, &output_format, sizeof(output_format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = output_format == VX_DF_IMAGE_U8 ? 
VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶ms[0]); + vxReleaseParameter(¶ms[1]); + } + } + return status; +} + +vx_tiling_kernel_t convolution_kernel = +{ + "org.khronos.openvx.tiling_custom_convolution", + VX_KERNEL_CUSTOM_CONVOLUTION_TILING, + NULL, + Convolve_image_tiling_flexible, + Convolve_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_CONVOLUTION, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxConvolveInputValidator, + vxConvolveOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_fast9.c b/sample/targets/tiling/vx_fast9.c new file mode 100644 index 0000000..1cfff14 --- /dev/null +++ b/sample/targets/tiling/vx_fast9.c @@ -0,0 +1,156 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxFast9InputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if ((status == VX_SUCCESS) && (input)) + { + vx_df_image format = 0; + status = vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((status == VX_SUCCESS) && (format == VX_DF_IMAGE_U8)) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar sens = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &sens, sizeof(sens)); + if ((status == VX_SUCCESS) && (sens)) + { + vx_enum type = VX_TYPE_INVALID; + vxQueryScalar(sens, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_FLOAT32) + { + vx_float32 k = 0.0f; + status = vxCopyScalar(sens, &k, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((status == VX_SUCCESS) && (k > 0) && (k < 256)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&sens); + } + vxReleaseParameter(¶m); + } + } + if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar s_nonmax = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &s_nonmax, sizeof(s_nonmax)); + if ((status == VX_SUCCESS) && (s_nonmax)) + { + vx_enum type = VX_TYPE_INVALID; + vxQueryScalar(s_nonmax, VX_SCALAR_TYPE, &type, 
sizeof(type)); + if (type == VX_TYPE_BOOL) + { + vx_bool nonmax = vx_false_e; + status = vxCopyScalar(s_nonmax, &nonmax, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((status == VX_SUCCESS) && ((nonmax == vx_false_e) || + (nonmax == vx_true_e))) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&s_nonmax); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxFast9OutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + ptr->type = VX_TYPE_ARRAY; + ptr->dim.array.item_type = VX_TYPE_KEYPOINT; + ptr->dim.array.capacity = 0; /* no defined capacity requirement */ + status = VX_SUCCESS; + } + else if (index == 4) + { + ptr->dim.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + return status; +} + +vx_tiling_kernel_t fast9_kernel = +{ + "org.khronos.openvx.tiling_fast_corners", + VX_KERNEL_FAST_CORNERS_TILING, + NULL, + Fast9Corners_image_tiling_flexible, + Fast9Corners_image_tiling_fast, + 5, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL } }, + NULL, + vxFast9InputValidator, + vxFast9OutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_filter.c b/sample/targets/tiling/vx_filter.c new file mode 100644 index 0000000..a6a1d3b --- /dev/null +++ b/sample/targets/tiling/vx_filter.c @@ -0,0 +1,139 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "vx_interface.h" + +#include + +static vx_status VX_CALLBACK vxFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format meta) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference an input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = VX_DF_IMAGE_U8; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxSetMetaFormatAttribute(meta, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxSetMetaFormatAttribute(meta, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxSetMetaFormatAttribute(meta, VX_IMAGE_FORMAT, &format, sizeof(format)); + + vxReleaseImage(&input); + + status = VX_SUCCESS; + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t box_3x3_kernels = +{ + "org.khronos.openvx.tiling_box_3x3", + VX_KERNEL_BOX_3x3_TILING, + NULL, + box3x3_image_tiling_flexible, + box3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t median3x3_kernel = +{ + "org.khronos.openvx.tiling_median_3x3", + VX_KERNEL_MEDIAN_3x3_TILING, + NULL, + Median3x3_image_tiling_flexible, + Median3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t gaussian3x3_kernel = +{ + "org.khronos.openvx.tiling_gaussian_3x3", + VX_KERNEL_GAUSSIAN_3x3_TILING, + NULL, + Gaussian3x3_image_tiling_flexible, + Gaussian3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_gradients.c b/sample/targets/tiling/vx_gradients.c new file mode 100644 index 0000000..520e201 --- /dev/null +++ b/sample/targets/tiling/vx_gradients.c @@ -0,0 +1,124 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "vx_interface.h" + +#include + +static vx_param_description_t sobel3x3_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, +}; + +static vx_status VX_CALLBACK own_sobel3x3_validator(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (NULL != node && NULL != parameters && num == dimof(sobel3x3_kernel_params) && NULL != metas) + { + vx_parameter param1 = vxGetParameterByIndex(node, 0); + vx_parameter param2 = vxGetParameterByIndex(node, 1); + vx_parameter param3 = vxGetParameterByIndex(node, 2); + + if (VX_SUCCESS == vxGetStatus((vx_reference)param1) && + ((VX_SUCCESS == vxGetStatus((vx_reference)param2)) || (VX_SUCCESS == vxGetStatus((vx_reference)param3)))) + { + vx_uint32 src_width = 0; + vx_uint32 src_height = 0; + vx_df_image src_format = 0; + vx_image input = 0; + + status = vxQueryParameter(param1, VX_PARAMETER_REF, &input, sizeof(input)); + + status |= vxQueryImage(input, VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxQueryImage(input, VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxQueryImage(input, VX_IMAGE_FORMAT, &src_format, sizeof(src_format)); + + /* validate input image */ + if (VX_SUCCESS == status) + { + if (src_width >= 3 && src_height >= 3 && src_format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + else + status = VX_ERROR_INVALID_PARAMETERS; + } + + /* validate output images */ + if (VX_SUCCESS == status) + { + vx_enum dst_format = VX_DF_IMAGE_S16; + + if (NULL == metas[1] && NULL == metas[2]) + status = VX_ERROR_INVALID_PARAMETERS; + + if (VX_SUCCESS == status && NULL != metas[1]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + + if (VX_SUCCESS == status && NULL != metas[2]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + } + + if (NULL != input) + vxReleaseImage(&input); + + if (NULL != param1) + vxReleaseParameter(¶m1); + + if (NULL != param2) + vxReleaseParameter(¶m2); + + if (NULL != param3) + vxReleaseParameter(¶m3); + } + } /* if ptrs non NULL */ + + return status; +} /* own_sobel3x3_validator() */ + +vx_tiling_kernel_t sobel3x3_kernel = +{ + "org.khronos.openvx.tiling_sobel_3x3", + VX_KERNEL_SOBEL_3x3_TILING, + NULL, + Sobel3x3_image_tiling_flexible, + Sobel3x3_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, 
VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL } }, + own_sobel3x3_validator, + NULL, + NULL, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_hog.c b/sample/targets/tiling/vx_hog.c new file mode 100644 index 0000000..3872b5f --- /dev/null +++ b/sample/targets/tiling/vx_hog.c @@ -0,0 +1,318 @@ +/* +* Copyright (c) 2016-2017 The Khronos Group Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and/or associated documentation files (the +* "Materials"), to deal in the Materials without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Materials, and to +* permit persons to whom the Materials are furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Materials. +* +* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT +* https://www.khronos.org/registry/ +* +* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+*/ + +#include "vx_interface.h" +#include "vx_internal.h" +#include "tiling.h" + + +static vx_status VX_CALLBACK vxHogCellsInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1 || index == 2 || index == 3) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum type = -1; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + vx_int32 para = 0; + if ((vxCopyScalar(scalar, ¶, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) && + (para >= 0)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHogCellsOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + vx_enum format; + vx_tensor tensor; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &tensor, sizeof(tensor)); + if (tensor && index == 4) + { + format = VX_TYPE_INT16; + vx_uint8 fixed_point_pos1 = 8; + vx_size out_num_dims; + vx_size out_dims[2]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + else if (tensor && index == 5) + { + format = VX_TYPE_INT8; + vx_uint8 fixed_point_pos1 = 0; + vx_size out_num_dims; + vx_size out_dims[3]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + vxReleaseTensor(&tensor); + vxReleaseParameter(¶m); + return status; +} + +static vx_status VX_CALLBACK vxHogFeaturesInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, 
VX_PARAMETER_ATTRIBUTE_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_tensor mag = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &mag, sizeof(mag)); + if (mag) + { + vx_enum format = -1; + vxQueryTensor(mag, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + if (format == VX_TYPE_INT16) + { + + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseTensor(&mag); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_tensor mag = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &mag, sizeof(mag)); + if (mag) + { + vx_enum format = -1; + vxQueryTensor(mag, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + if (format == VX_TYPE_INT8) + { + + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseTensor(&mag); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_array arr = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &arr, sizeof(arr)); + if (arr) + { + vx_enum item_type = 0; + vxQueryArray(arr, VX_ARRAY_ITEMTYPE, &item_type, sizeof(item_type)); + if (item_type == VX_TYPE_HOG_PARAMS) + { + status = VX_SUCCESS; + } + vxReleaseArray(&arr); + } + vxReleaseParameter(¶m); + } + } + else if (index == 4) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar hog_param_size = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &hog_param_size, sizeof(hog_param_size)); + if ((status == VX_SUCCESS) && (hog_param_size)) + { + vx_enum type = 0; + vxQueryScalar(hog_param_size, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&hog_param_size); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHogFeaturesOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + vx_enum format; + vx_tensor tensor; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &tensor, sizeof(tensor)); + if (tensor && index == 5) + { + format = VX_TYPE_INT16; + vx_uint8 fixed_point_pos1 = 8; + vx_size out_num_dims; + vx_size out_dims[3]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + vxReleaseTensor(&tensor); + vxReleaseParameter(¶m); + return status; 
+} + +vx_tiling_kernel_t hogcells_kernel = +{ + "org.khronos.openvx.tiling_hogcells", + VX_KERNEL_HOG_CELLS_TILING, + NULL, + HogCells_image_tiling_flexible, + HogCells_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxHogCellsInputValidator, + vxHogCellsOutputValidator, + NULL, + NULL, + { 32, 32 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t hogfeatures_kernel = +{ + "org.khronos.openvx.tiling_hogfeatures", + VX_KERNEL_HOG_FEATURES_TILING, + NULL, + HogFeatures_image_tiling_flexible, + HogFeatures_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxHogFeaturesInputValidator, + vxHogFeaturesOutputValidator, + NULL, + NULL, + { 32, 32 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_integralimage.c b/sample/targets/tiling/vx_integralimage.c new file mode 100644 index 0000000..3e836e4 --- /dev/null +++ b/sample/targets/tiling/vx_integralimage.c @@ -0,0 +1,95 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxIntegralInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxIntegralOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U32; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t integral_image_kernel = +{ + "org.khronos.openvx.tiling_integral_image", + VX_KERNEL_INTEGRAL_IMAGE_TILING, + NULL, + IntegralImage_image_tiling_flexible, + IntegralImage_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxIntegralInputValidator, + vxIntegralOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_interface.c b/sample/targets/tiling/vx_interface.c new file mode 100644 index 0000000..05b64f7 --- /dev/null +++ b/sample/targets/tiling/vx_interface.c @@ -0,0 +1,706 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_internal.h" +#include + +vx_status VX_CALLBACK vxTilingKernel(vx_node node, vx_reference parameters[], vx_uint32 num); + +static const vx_char name[VX_MAX_TARGET_NAME] = "khronos.tiling"; + +vx_tiling_kernel_t *tiling_kernels[] = +{ + &box_3x3_kernels, + &phase_kernel, + &And_kernel, + &Or_kernel, + &Xor_kernel, + &Not_kernel, + &threshold_kernel, + &colorconvert_kernel, + &Multiply_kernel, + &nonlinearfilter_kernel, + &Magnitude_kernel, + &erode3x3_kernel, + &dilate3x3_kernel, + &median3x3_kernel, + &sobel3x3_kernel, + &Max_kernel, + &Min_kernel, + &gaussian3x3_kernel, + &add_kernel, + &subtract_kernel, + &convertdepth_kernel, + &warp_affine_kernel, + &warp_perspective_kernel, + &weightedaverage_kernel, + &absdiff_kernel, + &integral_image_kernel, + &remap_kernel, + &convolution_kernel, + &hogfeatures_kernel, + &fast9_kernel, + &lbp_kernel, + &scale_image_kernel, + &lut_kernel, + &channelcombine_kernel, + &halfscale_gaussian_kernel, + &nonmaxsuppression_kernel, + &hogcells_kernel, +}; + +/*! \brief The Entry point into a user defined kernel module */ +vx_status VX_API_CALL vxPublishKernels(vx_context context) +{ + // tag::publish_function[] + vx_status status = VX_SUCCESS; + vx_uint32 k = 0; + for (k = 0; k < dimof(tiling_kernels); k++) + { + vx_kernel kernel = vxAddTilingKernel(context, + tiling_kernels[k]->name, + tiling_kernels[k]->enumeration, + tiling_kernels[k]->function, + tiling_kernels[k]->flexible_function, + tiling_kernels[k]->fast_function, + tiling_kernels[k]->num_params, + tiling_kernels[k]->validate, + tiling_kernels[k]->input_validator, + tiling_kernels[k]->output_validator, + tiling_kernels[k]->initialize, + tiling_kernels[k]->deinitialize); + if (kernel) + { + vx_uint32 p = 0; + for (p = 0; p < tiling_kernels[k]->num_params; p++) + { + status |= vxAddParameterToKernel(kernel, p, + tiling_kernels[k]->parameters[p].direction, + tiling_kernels[k]->parameters[p].data_type, + tiling_kernels[k]->parameters[p].state); + } + status |= vxSetKernelAttribute(kernel, VX_KERNEL_INPUT_NEIGHBORHOOD, + &tiling_kernels[k]->nbhd, sizeof(vx_neighborhood_size_t)); + status |= vxSetKernelAttribute(kernel, VX_KERNEL_OUTPUT_TILE_BLOCK_SIZE, + &tiling_kernels[k]->block, sizeof(vx_tile_block_size_t)); + status |= vxSetKernelAttribute(kernel, VX_KERNEL_BORDER, + &tiling_kernels[k]->border, sizeof(vx_border_t)); + if (status != VX_SUCCESS) + { + vxRemoveKernel(kernel); + } + else + { + status = vxFinalizeKernel(kernel); + } + if (status != VX_SUCCESS) + { + printf("Failed to publish kernel %s\n", tiling_kernels[k]->name); + break; + } + } + } + // end::publish_function[] + return status; +} + + +/*VX_API_ENTRY*/ vx_status VX_API_CALL vxUnpublishKernels(vx_context context) +{ + vx_status status = VX_FAILURE; + + vx_uint32 k = 0; + for (k = 0; k < dimof(tiling_kernels); k++) + { + vx_kernel kernel = vxGetKernelByName(context, tiling_kernels[k]->name); + + if (kernel) + { + kernel->user_kernel = 1; + vx_kernel kernelcpy = kernel; + status = vxReleaseKernel(&kernelcpy); + if (status != VX_SUCCESS) + { + vxAddLogEntry((vx_reference)context, status, "Failed to release kernel[%u]=%s\n", k, tiling_kernels[k]->name); + } + else + { + kernelcpy = kernel; + status = vxRemoveKernel(kernelcpy); + if (status != VX_SUCCESS) + { + vxAddLogEntry((vx_reference)context, status, "Failed to remove kernel[%u]=%s\n", k, tiling_kernels[k]->name); + } + } + } + else + { + vxAddLogEntry((vx_reference)context, status, "Failed to get added kernel %s\n",
tiling_kernels[k]->name); + } + } + + return status; +} + +vx_status vxTargetInit(vx_target target) +{ + if (target) + { + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_TILING; + } + return vxPublishKernels(target->base.context); +} + +vx_status vxTargetDeinit(vx_target target) +{ + return vxUnpublishKernels(target->base.context); +} + +vx_status vxTargetSupports(vx_target target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "default", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "power", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "performance", VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + vx_char targetKernelName[VX_MAX_KERNEL_NAME]; + vx_char *kernel; + vx_char def[8] = "default"; + + strncpy(targetKernelName, target->kernels[k].name, VX_MAX_KERNEL_NAME); + kernel = strtok(targetKernelName, ":"); + if (kernel == NULL) + kernel = def; + + if (strncmp(kernelName, kernel, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) *pIndex = k; + break; + } + } + } + return status; +} + +vx_action vxTargetProcess(vx_target target, vx_node_t *nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + vx_context context = vxGetContext((vx_reference)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->base.context->targets[nodes[n]->affinity].name); + + if (context->perf_enabled) + ownStartCapture(&nodes[n]->perf); + + if (nodes[n]->is_replicated == vx_true_e) + { + vx_size num_replicas = 0; + vx_uint32 param; + vx_uint32 num_parameters = nodes[n]->kernel->signature.num_parameters; + vx_reference parameters[VX_INT_MAX_PARAMS] = { NULL }; + + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + vx_size numItems = 0; + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + numItems = pyr->numLevels; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + numItems = arr->num_items; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + + if (num_replicas == 0) + num_replicas = numItems; + else if (numItems != num_replicas) + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + } + else + { + parameters[param] = nodes[n]->parameters[param]; + } + } + + if (status == VX_SUCCESS) + { + vx_size replica; + for (replica = 0; replica < num_replicas; ++replica) + { + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)pyr->levels[replica]; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + 
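+                            /* Bind this replica's element from the parent object array before the kernel function is invoked below. */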
parameters[param] = (vx_reference)arr->items[replica]; + } + } + } + + status = nodes[n]->kernel->function((vx_node)nodes[n], + parameters, + num_parameters); + } + } + } + else + { + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + } + + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + + if (context->perf_enabled) + ownStopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH, "kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +vx_status vxTargetVerify(vx_target target, vx_node_t *node) +{ + vx_status status = VX_SUCCESS; + return status; +} + +vx_kernel vxTargetAddKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + // ownSemWait(&target->base.lock); + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, func_ptr, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + // ownSemPost(&target->base.lock); + return (vx_kernel)kernel; +} + +#ifdef OPENVX_KHR_TILING +vx_kernel vxTargetAddTilingKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f function, + vx_tiling_kernel_f flexible_func_ptr, + vx_tiling_kernel_f fast_func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + kernel->tilingfast_function = fast_func_ptr; + kernel->tilingflexible_function = flexible_func_ptr; + + if (function == NULL) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, vxTilingKernel, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + } + else //Kernel with more than one node like HalfScaleGaussian + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, function, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + } + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + return (vx_kernel)kernel; +} + +static vx_status vxGetPatchToTile(vx_image image, vx_rectangle_t *rect, vx_tile_t *tile) +{ + vx_status status = VX_SUCCESS; + vx_uint32 p = 0; + vx_image_t *img = (vx_image_t *)image; + + for (p = 0; p < img->planes; p++) + { + 
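+        /* Map each plane of the image into the tile: vxAccessImagePatch fills tile->addr[p] and returns the plane base pointer that the tiling kernels index directly; constant images are mapped read-only, all others read-and-write. */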
tile->base[p] = NULL; + if(image->constant == 1) + status = vxAccessImagePatch(image, rect, p, &tile->addr[p], (void **)&tile->base[p], VX_READ_ONLY); + else + status = vxAccessImagePatch(image, rect, p, &tile->addr[p], (void **)&tile->base[p], VX_READ_AND_WRITE); + } + + return status; +} + +static vx_status vxSetTileToPatch(vx_image image, vx_rectangle_t *rect, vx_tile_t *tile) +{ + vx_image_t *img = (vx_image_t *)image; + vx_uint32 p = 0; + vx_status status = VX_SUCCESS; + + for (p = 0; p < img->planes; p++) + { + status = vxCommitImagePatch(image, rect, p, &tile->addr[p], tile->base[p]); + } + + return status; +} + +static void* ownAllocateTensorMemory_tiling(vx_tensor tensor) +{ + vx_size total_size = ownSizeOfType(tensor->data_type); + + if (tensor->addr == NULL) + { + for (vx_uint32 i = 0; i < tensor->number_of_dimensions; i++) + { + total_size *= tensor->dimensions[i]; + } + tensor->addr = calloc(total_size, 1); + } + return tensor->addr; +} + +vx_status VX_CALLBACK vxTilingKernel(vx_node node, vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + vx_image images[VX_INT_MAX_PARAMS]; + vx_uint32 ty = 0u, tx = 0u, p = 0u; + vx_rectangle_t rect; + vx_tile_t tiles[VX_INT_MAX_PARAMS]; + void *params[VX_INT_MAX_PARAMS] = {NULL}; + vx_enum dirs[VX_INT_MAX_PARAMS]; + vx_enum types[VX_INT_MAX_PARAMS]; + size_t scalars[VX_INT_MAX_PARAMS]; + vx_uint32 index = UINT32_MAX; + vx_uint32 tile_size_y = 0u, tile_size_x = 0u; + vx_uint32 block_multiple = 64; + vx_uint32 height = 0u, width = 0u; + vx_border_t borders = {VX_BORDER_UNDEFINED, 0}; + vx_neighborhood_size_t nbhd; + void *tile_memory = NULL; + vx_size size = 0; + + vx_tile_threshold_t threshold[VX_INT_MAX_PARAMS]; + vx_tile_matrix_t mask[VX_INT_MAX_PARAMS]; + vx_tile_convolution_t conv[VX_INT_MAX_PARAMS]; + vx_tensor tensor[VX_INT_MAX_PARAMS]; + vx_tile_array_t array_t[VX_INT_MAX_PARAMS]; + vx_array arrays[VX_INT_MAX_PARAMS]; + + /* Do the following: + * \arg find out each parameters direction + * \arg assign each image from the parameters + * \arg assign the block/neighborhood info + */ + for (p = 0u; p < num; p++) + { + vx_parameter param = vxGetParameterByIndex(node, p); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_DIRECTION, &dirs[p], sizeof(dirs[p])); + vxQueryParameter(param, VX_PARAMETER_TYPE, &types[p], sizeof(types[p])); + vxReleaseParameter(¶m); + } + if (types[p] == VX_TYPE_IMAGE) + { + vxQueryNode(node, VX_NODE_OUTPUT_TILE_BLOCK_SIZE, &tiles[p].tile_block, sizeof(vx_tile_block_size_t)); + vxQueryNode(node, VX_NODE_INPUT_NEIGHBORHOOD, &tiles[p].neighborhood, sizeof(vx_neighborhood_size_t)); + images[p] = (vx_image)parameters[p]; + vxQueryImage(images[p], VX_IMAGE_WIDTH, &tiles[p].image.width, sizeof(vx_uint32)); + vxQueryImage(images[p], VX_IMAGE_HEIGHT, &tiles[p].image.height, sizeof(vx_uint32)); + vxQueryImage(images[p], VX_IMAGE_FORMAT, &tiles[p].image.format, sizeof(vx_df_image)); + vxQueryImage(images[p], VX_IMAGE_SPACE, &tiles[p].image.space, sizeof(vx_enum)); + vxQueryImage(images[p], VX_IMAGE_RANGE, &tiles[p].image.range, sizeof(vx_enum)); + params[p] = &tiles[p]; + if ((dirs[p] == VX_OUTPUT) && (index == UINT32_MAX)) + { + index = p; + } + } + else if (types[p] == VX_TYPE_SCALAR) + { + vxCopyScalar((vx_scalar)parameters[p], (void *)&scalars[p], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + params[p] = &scalars[p]; + } + else if (types[p] == VX_TYPE_THRESHOLD) + { + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_TYPE, 
&threshold[p].thresh_type, sizeof(threshold[p].thresh_type)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_VALUE, &threshold[p].value, sizeof(threshold[p].value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_LOWER, &threshold[p].lower, sizeof(threshold[p].lower)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_UPPER, &threshold[p].upper, sizeof(threshold[p].upper)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_TRUE_VALUE, &threshold[p].true_value, sizeof(threshold[p].true_value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_FALSE_VALUE, &threshold[p].false_value, sizeof(threshold[p].false_value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_INPUT_FORMAT, &threshold[p].input_format, sizeof(threshold[p].input_format)); + + params[p] = &threshold[p]; + } + else if (types[p] == VX_TYPE_MATRIX) + { + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_ROWS, &mask[p].rows, sizeof(mask[p].rows)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_COLUMNS, &mask[p].columns, sizeof(mask[p].columns)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_TYPE, &mask[p].data_type, sizeof(mask[p].data_type)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_ORIGIN, &mask[p].origin, sizeof(mask[p].origin)); + + if ((mask[p].data_type != VX_TYPE_UINT8) || (sizeof(mask[p].m) < mask[p].rows * mask[p].columns)) + status = VX_ERROR_INVALID_PARAMETERS; + + vxCopyMatrix((vx_matrix)parameters[p], mask[p].m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyMatrix((vx_matrix)parameters[p], mask[p].m_f32, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + params[p] = &mask[p]; + } + else if (types[p] == VX_TYPE_REMAP) + { + vx_remap map = (vx_remap)parameters[p]; + params[p] = &map; + } + else if (types[p] == VX_TYPE_CONVOLUTION) + { + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_COLUMNS, &conv[p].conv_width, sizeof(conv[p].conv_width)); + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_ROWS, &conv[p].conv_height, sizeof(conv[p].conv_height)); + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_SCALE, &conv[p].scale, sizeof(conv[p].scale)); + + vxCopyConvolutionCoefficients((vx_convolution)parameters[p], conv[p].conv_mat, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + params[p] = &conv[p]; + } + else if (types[p] == VX_TYPE_TENSOR) + { + tensor[p] = (vx_tensor)parameters[p]; + + if (tensor[p]->addr == NULL) + ownAllocateTensorMemory_tiling(tensor[p]); + + params[p] = tensor[p]->addr; + } + else if (types[p] == VX_TYPE_ARRAY || types[p] == VX_TYPE_LUT) + { + arrays[p] = (vx_array)parameters[p]; + + array_t[p].ptr = ((vx_array)parameters[p])->memory.ptrs[0]; + array_t[p].capacity = ((vx_array)parameters[p])->capacity; + array_t[p].item_size = ((vx_array)parameters[p])->item_size; + array_t[p].item_type = ((vx_array)parameters[p])->item_type; + array_t[p].num_items = ((vx_array)parameters[p])->num_items; + array_t[p].offset = ((vx_array)parameters[p])->offset; + + params[p] = &array_t[p]; + } + } + + if (index == UINT32_MAX) + index = 0; + + /* choose the index of the first output image to base the tiling on */ + status |= vxQueryImage(images[index], VX_IMAGE_WIDTH, &width, sizeof(width)); + status |= vxQueryImage(images[index], VX_IMAGE_HEIGHT, &height, sizeof(height)); + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + status |= vxQueryNode(node, VX_NODE_INPUT_NEIGHBORHOOD, &nbhd, sizeof(nbhd)); + status |= vxQueryNode(node,
VX_NODE_TILE_MEMORY_SIZE, &size, sizeof(size)); + + tile_size_y = tiles[index].tile_block.height; + tile_size_x = tiles[index].tile_block.width; + + if ((borders.mode != VX_BORDER_UNDEFINED) && + (borders.mode != VX_BORDER_MODE_SELF)) + { + return VX_ERROR_NOT_SUPPORTED; + } + + status = VX_SUCCESS; + + rect.start_x = 0; + rect.start_y = 0; + rect.end_x = width; + rect.end_y = height; + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE && images[p] != NULL) + { + tiles[p].tile_x = 0; + tiles[p].tile_y = 0; + status |= vxGetPatchToTile(images[p], &rect, &tiles[p]); + } + } + + vx_uint32 blkCntY = (height / tile_size_y) * tile_size_y; + vx_uint32 blkCntX = (width / tile_size_x) * tile_size_x; + + //tiling fast function + if (((vx_node_t *)node)->kernel->tilingfast_function) + { + for (ty = 0u; (ty < blkCntY) && (status == VX_SUCCESS); ty += tile_size_y) + { + for (tx = 0u; tx < blkCntX; tx += tile_size_x) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingfast_function(params, tile_memory, size); + } + } + + if (((vx_node_t *)node)->kernel->tilingflexible_function && ((blkCntY < height) || (blkCntX < width))) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingflexible_function(params, tile_memory, size); + } + } + //tiling flexible function + else if (((vx_node_t *)node)->kernel->tilingflexible_function) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingflexible_function(params, tile_memory, size); + } + + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + if (dirs[p] == VX_INPUT && images[p] != NULL) + { + status |= vxSetTileToPatch(images[p], 0, &tiles[p]); + } + else if (dirs[p] == VX_OUTPUT) + { + status |= vxSetTileToPatch(images[p], &rect, &tiles[p]); + } + } + else if (types[p] == VX_TYPE_ARRAY && dirs[p] == VX_OUTPUT) + { + arrays[p]->memory.ptrs[0] = array_t[p].ptr; + arrays[p]->num_items = array_t[p].num_items; + } + } + + return status; +} +#endif diff --git a/sample/targets/tiling/vx_interface.h b/sample/targets/tiling/vx_interface.h new file mode 100644 index 0000000..db17e06 --- /dev/null +++ b/sample/targets/tiling/vx_interface.h @@ -0,0 +1,62 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _OPENVX_INTERFACE_H_ +#define _OPENVX_INTERFACE_H_ + +#include + +extern vx_tiling_kernel_t box_3x3_kernels; +extern vx_tiling_kernel_t phase_kernel; +extern vx_tiling_kernel_t And_kernel; +extern vx_tiling_kernel_t Or_kernel; +extern vx_tiling_kernel_t Xor_kernel; +extern vx_tiling_kernel_t Not_kernel; +extern vx_tiling_kernel_t threshold_kernel; +extern vx_tiling_kernel_t colorconvert_kernel; +extern vx_tiling_kernel_t Multiply_kernel; +extern vx_tiling_kernel_t nonlinearfilter_kernel; +extern vx_tiling_kernel_t Magnitude_kernel; +extern vx_tiling_kernel_t erode3x3_kernel; +extern vx_tiling_kernel_t dilate3x3_kernel; +extern vx_tiling_kernel_t median3x3_kernel; +extern vx_tiling_kernel_t sobel3x3_kernel; +extern vx_tiling_kernel_t Max_kernel; +extern vx_tiling_kernel_t Min_kernel; +extern vx_tiling_kernel_t gaussian3x3_kernel; +extern vx_tiling_kernel_t add_kernel; +extern vx_tiling_kernel_t subtract_kernel; +extern vx_tiling_kernel_t convertdepth_kernel; +extern vx_tiling_kernel_t warp_affine_kernel; +extern vx_tiling_kernel_t warp_perspective_kernel; +extern vx_tiling_kernel_t weightedaverage_kernel; +extern vx_tiling_kernel_t absdiff_kernel; +extern vx_tiling_kernel_t integral_image_kernel; +extern vx_tiling_kernel_t remap_kernel; +extern vx_tiling_kernel_t convolution_kernel; +extern vx_tiling_kernel_t hogfeatures_kernel; +extern vx_tiling_kernel_t fast9_kernel; +extern vx_tiling_kernel_t lbp_kernel; +extern vx_tiling_kernel_t scale_image_kernel; +extern vx_tiling_kernel_t lut_kernel; +extern vx_tiling_kernel_t channelcombine_kernel; +extern vx_tiling_kernel_t halfscale_gaussian_kernel; +extern vx_tiling_kernel_t nonmaxsuppression_kernel; +extern vx_tiling_kernel_t hogcells_kernel; + +#endif + diff --git a/sample/targets/tiling/vx_lbp.c b/sample/targets/tiling/vx_lbp.c new file mode 100644 index 0000000..c1a5251 --- /dev/null +++ b/sample/targets/tiling/vx_lbp.c @@ -0,0 +1,195 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxLBPInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum format = 0; + vxCopyScalar(scalar, &format, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((format == VX_LBP) || + (format == VX_MLBP) || + (format == VX_ULBP)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } //end if(scalar) + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_enum format = 0; + vx_parameter param_format = vxGetParameterByIndex(node, 1); + if (vxGetStatus((vx_reference)param_format) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param_format, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vxCopyScalar(scalar, &format, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m_format); + } + + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar value = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &value, sizeof(value)); + if (value) + { + vx_enum stype = 0; + vxQueryScalar(value, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_INT8) + { + vx_int8 gs = 0; + vxCopyScalar(value, &gs, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ( (format == VX_LBP || format == VX_ULBP) && + (gs == 3 || gs == 5)) + { + status = VX_SUCCESS; + } + else if ( format == VX_MLBP && gs == 5 ) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&value); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxLBPOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + /* output is equal type and size */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + 
status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t lbp_kernel = +{ + "org.khronos.openvx.tiling_lbp", + VX_KERNEL_LBP_TILING, + NULL, + LBP_image_tiling_flexible, + LBP_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxLBPInputValidator, + vxLBPOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + + diff --git a/sample/targets/tiling/vx_lut.c b/sample/targets/tiling/vx_lut.c new file mode 100644 index 0000000..1f6aeed --- /dev/null +++ b/sample/targets/tiling/vx_lut.c @@ -0,0 +1,118 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxTableLookupInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + vx_lut lut = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &lut, sizeof(lut)); + if (lut) + { + vx_enum type = 0; + vxQueryLUT(lut, VX_LUT_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_UINT8 || type == VX_TYPE_INT16) + { + status = VX_SUCCESS; + } + vxReleaseLUT(&lut); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxTableLookupOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + /* output is equal type and size */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t lut_kernel = +{ + 
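+    /* Initializer order follows vx_tiling_kernel_t as consumed by vxPublishKernels(): name, kernel enum, optional vx_kernel_f (NULL selects the generic vxTilingKernel wrapper), flexible and fast tiling functions, parameter count, parameter descriptions, validate callback, input/output validators, initialize/deinitialize callbacks, output tile block size, input neighborhood, and border mode. */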
"org.khronos.openvx.tiling_table_lookup", + VX_KERNEL_TABLE_LOOKUP_TILING, + NULL, + TableLookup_image_tiling_flexible, + TableLookup_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_LUT, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxTableLookupInputValidator, + vxTableLookupOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_magnitude.c b/sample/targets/tiling/vx_magnitude.c new file mode 100644 index 0000000..8cbce62 --- /dev/null +++ b/sample/targets/tiling/vx_magnitude.c @@ -0,0 +1,118 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxMagnitudeInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + vxReleaseImage(&input0); + } + vxReleaseParameter(¶m0); + } + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMagnitudeOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t Magnitude_kernel = +{ + 
"org.khronos.openvx.tiling_magnitude", + VX_KERNEL_MAGNITUDE_TILING, + NULL, + Magnitude_image_tiling_flexible, + Magnitude_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMagnitudeInputValidator, + vxMagnitudeOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_minmax.c b/sample/targets/tiling/vx_minmax.c new file mode 100644 index 0000000..bf35f2a --- /dev/null +++ b/sample/targets/tiling/vx_minmax.c @@ -0,0 +1,153 @@ +/* + + * Copyright (c) 2017-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxMinMaxInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMinMaxOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], 
sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else if (informat[0] == VX_DF_IMAGE_S16 && informat[1] == VX_DF_IMAGE_S16 && outformat == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} +vx_tiling_kernel_t Max_kernel = +{ + "org.khronos.openvx.tiling_max", + VX_KERNEL_MAX_TILING, + NULL, + Max_image_tiling_flexible, + Max_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMinMaxInputValidator, + vxMinMaxOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Min_kernel = +{ + "org.khronos.openvx.tiling_min", + VX_KERNEL_MIN_TILING, + NULL, + Min_image_tiling_flexible, + Min_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMinMaxInputValidator, + vxMinMaxOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_morphology.c b/sample/targets/tiling/vx_morphology.c new file mode 100644 index 0000000..adfecab --- /dev/null +++ b/sample/targets/tiling/vx_morphology.c @@ -0,0 +1,121 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxMorphologyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMorphologyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t morphology_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_tiling_kernel_t erode3x3_kernel = +{ + "org.khronos.openvx.tiling_erode_3x3", + VX_KERNEL_ERODE_3x3_TILING, + NULL, + Erode3x3_image_tiling_flexible, + Erode3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t dilate3x3_kernel = +{ + "org.khronos.openvx.tiling_dilate_3x3", + VX_KERNEL_DILATE_3x3_TILING, + NULL, + Dilate3x3_image_tiling_flexible, + Dilate3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_multiply.c b/sample/targets/tiling/vx_multiply.c new file mode 100644 index 0000000..b69db1b --- /dev/null +++ b/sample/targets/tiling/vx_multiply.c @@ -0,0 +1,263 @@ +/* + + * Copyright (c) 2013-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxMultiplyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format1; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format1, sizeof(format1)); + if (width[0] == width[1] && height[0] == height[1] && + (format1 == VX_DF_IMAGE_U8 || format1 == VX_DF_IMAGE_S16)) + status = VX_SUCCESS; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) /* scale: must be non-negative. */ + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum type = -1; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_FLOAT32) + { + vx_float32 scale = 0.0f; + if ((vxCopyScalar(scalar, &scale, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) && + (scale >= 0)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) /* overflow_policy: truncate or saturate. */ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 4) /* rounding_policy: truncate or saturate. 
*/ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum rouding_policy = 0; + vxCopyScalar(scalar, &rouding_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((rouding_policy == VX_ROUND_POLICY_TO_ZERO) || + (rouding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxMultiplyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 5) + { + /* + * We need to look at both input images, but only for the format: + * if either is S16 or the output type is not U8, then it's S16. + * The geometry of the output image is copied from the first parameter: + * the input images are known to match from input parameters validation. + */ + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + + /* + * When passing on the geometry to the output image, we only look at + * image 0, as both input images are verified to match, at input + * validation. 
+ */ + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else + { + status = VX_SUCCESS; + outformat = VX_DF_IMAGE_S16; + } + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} +vx_tiling_kernel_t Multiply_kernel = +{ + "org.khronos.openvx.tiling_multiply", + VX_KERNEL_MULTIPLY_TILING, + NULL, + Multiply_image_tiling_flexible, + Multiply_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMultiplyInputValidator, + vxMultiplyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_nonlinearfilter.c b/sample/targets/tiling/vx_nonlinearfilter.c new file mode 100644 index 0000000..0f93f7b --- /dev/null +++ b/sample/targets/tiling/vx_nonlinearfilter.c @@ -0,0 +1,160 @@ +/* + + * Copyright (c) 2016-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_internal.h" + +#include "tiling.h" + +#if (C_MAX_NONLINEAR_DIM != VX_INT_MAX_NONLINEAR_DIM) +#if defined(_WIN32) +#pragma error("C Model does not support VX required nonlinear filter size") +#elif defined(__GNUC__) +#error "C Model does not support VX required nonlinear filter size" +#endif +#endif + +static vx_status VX_CALLBACK vxNonLinearFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum function = 0; + vxCopyScalar(scalar, &function, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((function == VX_NONLINEAR_FILTER_MEDIAN) || + (function == VX_NONLINEAR_FILTER_MIN) || + (function == VX_NONLINEAR_FILTER_MAX)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size cols = 0, rows = 0; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &cols, sizeof(cols)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + if ((rows <= VX_INT_MAX_NONLINEAR_DIM) && + (cols <= VX_INT_MAX_NONLINEAR_DIM) && + (data_type == VX_TYPE_UINT8)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 1); /* we reference the input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t nonlinearfilter_kernel = +{ + "org.khronos.openvx.tiling_non_linear_filter", + VX_KERNEL_NON_LINEAR_FILTER_TILING, + NULL, + NonLinearFilter_image_tiling_flexible, + NonLinearFilter_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, 
VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxNonLinearFilterInputValidator, + vxNonLinearFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_nonmaxsuppression.c b/sample/targets/tiling/vx_nonmaxsuppression.c new file mode 100644 index 0000000..bf53f83 --- /dev/null +++ b/sample/targets/tiling/vx_nonmaxsuppression.c @@ -0,0 +1,168 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include +#include +#include +#include + +static vx_status VX_CALLBACK vxNonMaxSuppressionInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format, sizeof(format)); + if (width[0] == width[1] && height[0] == height[1] && format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) + { + vx_scalar win_size = 0; + vx_image input = 0; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 2) + }; + + vxQueryParameter(param[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param[1], VX_PARAMETER_REF, &win_size, sizeof(win_size)); + if (input && win_size) + { + vx_enum type = 0; + vx_uint32 width, height; + vx_int32 wsize = 0; + vxCopyScalar(win_size, &wsize, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(win_size, VX_SCALAR_TYPE, &type, sizeof(type)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + if (type == VX_TYPE_INT32) + { + if ( (wsize <= width) && (wsize <= height) && (wsize % 2 == 1)) + { + status = VX_SUCCESS; + } + } + else + { + status = 
VX_ERROR_INVALID_TYPE; + } + + vxReleaseScalar(&win_size); + vxReleaseImage(&input); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxNonMaxSuppressionOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image img = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &img, sizeof(img)); + if (img) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + + vxQueryImage(img, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(img, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(img, VX_IMAGE_FORMAT, &format, sizeof(format)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + vxReleaseImage(&img); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t nonmaxsuppression_kernel = +{ + "org.khronos.openvx.tiling_nonmaxsuppression", + VX_KERNEL_NON_MAX_SUPPRESSION_TILING, + NULL, + NonMaxSuppression_image_tiling_flexible, + NonMaxSuppression_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },}, + NULL, + vxNonMaxSuppressionInputValidator, + vxNonMaxSuppressionOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_phase.c b/sample/targets/tiling/vx_phase.c new file mode 100644 index 0000000..d0c3a03 --- /dev/null +++ b/sample/targets/tiling/vx_phase.c @@ -0,0 +1,132 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxPhaseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16 || format == VX_DF_IMAGE_F32) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + + vxReleaseImage(&input0); + } + + vxReleaseParameter(¶m0); + } + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +static vx_status VX_CALLBACK vxPhaseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 2) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +vx_tiling_kernel_t phase_kernel = +{ + "org.khronos.openvx.tiling_phase", + VX_KERNEL_PHASE_TILING, + NULL, + Phase_image_tiling_flexible, + Phase_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxPhaseInputValidator, + vxPhaseOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_remap.c b/sample/targets/tiling/vx_remap.c new file mode 100644 index 0000000..78c6578 --- /dev/null +++ b/sample/targets/tiling/vx_remap.c @@ -0,0 +1,553 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +#include "tiling.h" + +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +static vx_int32 * alignPtr(vx_int32* ptr, size_t n) +{ + return (vx_int32 *)(((size_t)ptr + n-1) & -n); +} + +static vx_float32 * alignPtr_f(vx_float32* ptr, size_t n) +{ + return (vx_float32 *)(((size_t)ptr + n-1) & -n); +} + + +static void remapNearestNeighborConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32)); + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)dstBase + y * dstStride); + + for (size_t x = 0; x < width; ++x) + { + vx_int32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +static void remapLinearConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + const vx_float32 * coeffs, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32) * 4); + const vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * width * sizeof(vx_float32) * 2); + + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)(dstBase) + y * dstStride); + + size_t x = 0; + + for ( ; x + 8 < width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? 
srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + for ( ; x < width; ++x) + { + int16_t src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + int16_t src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + int16_t src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + int16_t src11 = map_row[(x << 2) + 3] >= 0 ? 
srcBase[map_row[(x << 2) + 3]] : borderValue; + + vx_float32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + vx_float32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + + +//BLOCK_SIZE is the same as tile_size set in "vx_remap.c" +#define BLOCK_SIZE 16 + +void Remap_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_remap *table = (vx_remap *)parameters[1]; + vx_scalar *stype = (vx_scalar *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_int32 policy = (vx_int32)*stype; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1), v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + size_t tableStride = (&(*table)->memory)->strides[0][VX_DIM_Y]; + vx_float32 * tableBase = (vx_float32 *)(&(*table)->memory)->ptrs[0]; + + if (policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + int32x2_t v_m1_2 = vdup_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + float32x2_t v_zero2 = vdup_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const vx_float32 * table_row = (vx_float32 *)((vx_int8 *)(tableBase) + (i + y) * tableStride) + (j << 1); + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vcvtq_s32_f32(v_table1.val[0]); + v_dst_y = vcvtq_s32_f32(v_table1.val[1]); + v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0], dstBase + j, dstStride, borderValue); + } + else if (policy == 
VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const vx_float32 * table_row = (vx_float32 *)((vx_int8 *)(tableBase) + (i + y) * tableStride) + (j << 1); + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + + +static vx_bool read_pixel(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_float32 x, vx_float32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8 *)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +#define REMAP(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_float32 src_x = 0.0f; \ + vx_float32 src_y = 0.0f; \ + \ + vxGetRemapPoint(*table, x, y, &src_x, &src_y); \ + \ + if (policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) \ + { \ + read_pixel(src_base, in->addr, in->image.height, in->image.width, src_x + 0.5f, src_y + 0.5f, dst); \ + dst++; \ + } \ + else if (policy == VX_INTERPOLATION_BILINEAR) \ + { \ + vx_uint8 tl = 0; \ + vx_uint8 tr = 0; \ + vx_uint8 bl = 0; \ + vx_uint8 br = 0; \ + vx_float32 xf = floorf(src_x); \ + vx_float32 yf = floorf(src_y); \ + vx_float32 dx = src_x - xf; \ + vx_float32 dy = src_y - yf; \ + vx_float32 a[] = { (1.0f - dx) * (1.0f - dy), (1.0f - dx) * (dy), (dx)* (1.0f - dy), (dx)* (dy), }; \ + vx_bool defined = vx_true_e; \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 0, yf + 0, &tl); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 1, yf + 0, &tr); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 0, yf + 1, &bl); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 1, yf + 1, &br); \ + if (defined) \ + *dst = (vx_uint8)(a[0] * tl + a[2] * tr + a[1] * bl + a[3] * br); \ + dst++; \ + } \ + } \ + } + + +void Remap_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_remap *table = (vx_remap *)parameters[1]; + vx_scalar *stype = (vx_scalar *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_int32 policy = (vx_int32)*stype; + + if (low_y == 0 && low_x == 0) + { + REMAP(low_y, high_y, low_x) + } + else + { + REMAP(0, low_y, low_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + REMAP(low_y, high_y, 0) + } +} + +static vx_status VX_CALLBACK vxRemapInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_remap table; + vxQueryParameter(param, VX_PARAMETER_REF, &table, sizeof(table)); + if (table) + { + /* \todo what are we checking? 
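+                 *       Nothing about the table is validated here; the remap's
+                 *       source geometry is cross-checked against the input
+                 *       image in vxRemapOutputValidator, so the reference is
+                 *       accepted as soon as it resolves.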
*/ + status = VX_SUCCESS; + vxReleaseRemap(&table); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum policy = 0; + vxCopyScalar(scalar, &policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (policy == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxRemapOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter tbl_param = vxGetParameterByIndex(node, 1); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)tbl_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vx_remap tbl = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + vxQueryParameter(tbl_param, VX_PARAMETER_REF, &tbl, sizeof(tbl)); + if ((src) && (dst) && (tbl)) + { + vx_uint32 w1 = 0, h1 = 0; + vx_uint32 w2 = 0, h2 = 0; + vx_uint32 w3 = 0, h3 = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryRemap(tbl, VX_REMAP_SOURCE_WIDTH, &w2, sizeof(w2)); + vxQueryRemap(tbl, VX_REMAP_SOURCE_HEIGHT, &h2, sizeof(h2)); + vxQueryRemap(tbl, VX_REMAP_DESTINATION_WIDTH, &w3, sizeof(w3)); + vxQueryRemap(tbl, VX_REMAP_DESTINATION_HEIGHT, &h3, sizeof(h3)); + + if ((w1 == w2) && (h1 == h2)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w3; + ptr->dim.image.height = h3; + status = VX_SUCCESS; + } + vxReleaseImage(&src); + vxReleaseRemap(&tbl); + vxReleaseImage(&dst); + } + vxReleaseParameter(&src_param); + vxReleaseParameter(&tbl_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +vx_tiling_kernel_t remap_kernel = +{ + "org.khronos.openvx.tiling_remap", + VX_KERNEL_REMAP_TILING, + NULL, + Remap_image_tiling_flexible, + Remap_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_REMAP, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxRemapInputValidator, + vxRemapOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_scale.c b/sample/targets/tiling/vx_scale.c new file mode 100644 index 0000000..29bfd6b --- /dev/null +++ b/sample/targets/tiling/vx_scale.c @@ -0,0 +1,523 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include +#include +#include + +static vx_status VX_CALLBACK vxScaleImageInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + + if (input) + { + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else if (format == VX_DF_IMAGE_S16) + { + /* enable internal S16 format support (needed for laplacian pyramid reconstruction) */ + vx_scalar scalar = 0; + vx_parameter param1 = vxGetParameterByIndex(node, 2); + vxQueryParameter(param1, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (VX_TYPE_ENUM == stype) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (VX_INTERPOLATION_NEAREST_NEIGHBOR == interp) + { + /* only NN interpolation is required for laplacian pyramid */ + status = VX_SUCCESS; + } + } + + vxReleaseScalar(&scalar); + } + + vxReleaseParameter(¶m1); + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR) || + (interp == VX_INTERPOLATION_AREA)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxScaleImageOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if ((src) && (dst)) + { + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT, f2 = VX_DF_IMAGE_VIRT; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, 
VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_WIDTH, &w2, sizeof(w2)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h2, sizeof(h2)); + vxQueryImage(src, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f2, sizeof(f2)); + /* output can not be virtual */ + if ((w2 != 0) && (h2 != 0) && (f2 != VX_DF_IMAGE_VIRT) && (f1 == f2)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = f2; + ptr->dim.image.width = w2; + ptr->dim.image.height = h2; + status = VX_SUCCESS; + } + vxReleaseImage(&src); + vxReleaseImage(&dst); + } + vxReleaseParameter(&src_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +/* scale image kernel */ +static vx_param_description_t scale_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, +}; + +static vx_status VX_CALLBACK vxScaleImageInitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (num == dimof(scale_kernel_params)) + { + vx_image src = (vx_image)parameters[0]; + vx_image dst = (vx_image)parameters[1]; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; +#if AREA_SCALE_ENABLE + vx_uint32 gcd_w = 0, gcd_h = 0; +#endif + vx_size size = 0; + vx_size kernel_data_size = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_WIDTH, &w2, sizeof(w2)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h2, sizeof(h2)); + + /* AREA interpolation requires a scratch buffer, however, if AREA + * implementation is disabled, then no scratch buffer is required, and + * size can be 0 (setting to 1 so that checks can pass in the kernel) */ +#if AREA_SCALE_ENABLE + gcd_w = math_gcd(w1, w2); + gcd_h = math_gcd(h1, h2); + /* printf("%ux%u => %ux%u :: GCD_w %u GCD_h %u\n", w1,h1, w2,h2, gcd_w, gcd_h); */ + if (gcd_w != 0 && gcd_h != 0) + { + size = (w1 / gcd_w) * (w2 / gcd_w) * (h1 / gcd_h) * (h2 / gcd_h) * sizeof(vx_float64); + } + /* printf("Requesting "VX_FMT_SIZE" bytes for resizer\n", size); */ +#else + size = 1; +#endif + vxQueryKernel(node->kernel, VX_KERNEL_LOCAL_DATA_SIZE, &kernel_data_size, sizeof(kernel_data_size)); + if (kernel_data_size == 0) + { + node->attributes.localDataSize = size; + } + status = VX_SUCCESS; + } + return status; +} + +vx_tiling_kernel_t scale_image_kernel = +{ + "org.khronos.openvx.tiling_scale_image", + VX_KERNEL_SCALE_IMAGE_TILING, + NULL, + ScaleImage_image_tiling_flexible, + ScaleImage_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }}, + NULL, + vxScaleImageInputValidator, + vxScaleImageOutputValidator, + vxScaleImageInitializer, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +/* half scale gaussian kernel */ +static vx_status VX_CALLBACK vxHalfscaleGaussianKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + + if (num == dimof(scale_kernel_params)) + { + vx_graph subgraph = ownGetChildGraphOfNode(node); + status = vxProcessGraph(subgraph); + } + + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianInputValidator(vx_node node, vx_uint32 index) +{ + vx_status 
status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_INT32) + { + vx_int32 ksize = 0; + vxCopyScalar(scalar, &ksize, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((ksize == 1) || (ksize == 3) || (ksize == 5)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if ((src) && (dst)) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(src, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = f1; + ptr->dim.image.width = (w1 + 1) / 2; + ptr->dim.image.height = (h1 + 1) / 2; + status = VX_SUCCESS; + } + if (src) vxReleaseImage(&src); + if (dst) vxReleaseImage(&dst); + vxReleaseParameter(&src_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +static const vx_uint32 gaussian5x5scale = 256; +static const vx_int16 gaussian5x5[5][5] = +{ + {1, 4, 6, 4, 1}, + {4, 16, 24, 16, 4}, + {6, 24, 36, 24, 6}, + {4, 16, 24, 16, 4}, + {1, 4, 6, 4, 1} +}; + +static vx_convolution vxCreateGaussian5x5Convolution(vx_context context) +{ + vx_convolution conv = vxCreateConvolution(context, 5, 5); + vx_status status = vxCopyConvolutionCoefficients(conv, (vx_int16 *)gaussian5x5, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + vxReleaseConvolution(&conv); + return NULL; + } + + status = vxSetConvolutionAttribute(conv, VX_CONVOLUTION_SCALE, (void *)&gaussian5x5scale, sizeof(vx_uint32)); + if (status != VX_SUCCESS) + { + vxReleaseConvolution(&conv); + return NULL; + } + return conv; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianInitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (num == dimof(scale_kernel_params)) + { + vx_context context = vxGetContext((vx_reference)node); + + vx_graph subgraph = node->child; + if (subgraph) + 
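+        /* node->child is non-NULL when the node is re-initialized (for
+         * example when the graph is verified again); the stale child graph
+         * is released before a fresh one is built below. */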
{ + /* deallocate subgraph resources */ + status = vxReleaseGraph(&subgraph); + if (VX_SUCCESS != status) + return status; + + status = ownSetChildGraphOfNode(node, 0); + if (VX_SUCCESS != status) + return status; + } + + /* allocate subgraph resources */ + subgraph = vxCreateGraph(context); + + status = vxGetStatus((vx_reference)subgraph); + if (status == VX_SUCCESS) + { + vx_uint32 i; + vx_image input = (vx_image)parameters[0]; + vx_image output = (vx_image)parameters[1]; + vx_int32 kernel_size = 3; + vx_convolution convolution = 0; + + /* We have a child-graph; we want to make sure the parent + graph is recognized as a valid scope for sake of virtual + image parameters. */ + subgraph->parentGraph = node->graph; + + status |= vxCopyScalar((vx_scalar)parameters[2], &kernel_size, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (kernel_size == 1) + { + vx_node nodes[] = + { + vxScaleImageNode(subgraph, input, output, VX_INTERPOLATION_NEAREST_NEIGHBOR), + }; + + vx_border_t borders; + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + for (i = 0; i < dimof(nodes); i++) + { + status |= vxSetNodeAttribute(nodes[i], VX_NODE_BORDER, &borders, sizeof(borders)); + } + + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 0); /* input image */ + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 1); /* output image */ + + status |= vxVerifyGraph(subgraph); + + /* release our references, the graph will hold it's own */ + for (i = 0; i < dimof(nodes); i++) + { + status |= vxReleaseNode(&nodes[i]); + } + + status |= ownSetChildGraphOfNode(node, subgraph); + } + else if (kernel_size == 3 || kernel_size == 5) + { + if (kernel_size == 5) + { + convolution = vxCreateGaussian5x5Convolution(context); + } + if (kernel_size == 3 || convolution) + { + vx_image virt = vxCreateVirtualImage(subgraph, 0, 0, VX_DF_IMAGE_U8); + vx_node nodes[] = + { + kernel_size == 3 ? 
vxGaussian3x3Node(subgraph, input, virt) : vxConvolveNode(subgraph, input, convolution, virt), + vxScaleImageNode(subgraph, virt, output, VX_INTERPOLATION_NEAREST_NEIGHBOR), + }; + + vx_border_t borders; + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + for (i = 0; i < dimof(nodes); i++) + { + status |= vxSetNodeAttribute(nodes[i], VX_NODE_BORDER, &borders, sizeof(borders)); + } + + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 0); /* input image */ + status |= vxAddParameterToGraphByIndex(subgraph, nodes[1], 1); /* output image */ + status |= vxAddParameterToGraphByIndex(subgraph, node, 2); /* gradient size - refer to self to quiet sub-graph validator */ + + status |= vxVerifyGraph(subgraph); + + /* release our references, the graph will hold it's own */ + for (i = 0; i < dimof(nodes); i++) + { + status |= vxReleaseNode(&nodes[i]); + } + + if (convolution) + status |= vxReleaseConvolution(&convolution); + + status |= vxReleaseImage(&virt); + + status |= ownSetChildGraphOfNode(node, subgraph); + } + } + } + } + + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianDeinitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (num == dimof(scale_kernel_params)) + { + vx_graph subgraph = ownGetChildGraphOfNode(node); + + status = VX_SUCCESS; + + status |= vxReleaseGraph(&subgraph); + + /* set subgraph to "null" */ + status |= ownSetChildGraphOfNode(node, 0); + } + + return status; +} + + +vx_tiling_kernel_t halfscale_gaussian_kernel = +{ + "org.khronos.openvx.tiling_halfscale_gaussian", + VX_KERNEL_HALFSCALE_GAUSSIAN_TILING, + vxHalfscaleGaussianKernel, + NULL, + NULL, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL } }, + NULL, + vxHalfscaleGaussianInputValidator, + vxHalfscaleGaussianOutputValidator, + vxHalfscaleGaussianInitializer, + vxHalfscaleGaussianDeinitializer, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_threshold.c b/sample/targets/tiling/vx_threshold.c new file mode 100644 index 0000000..87f669d --- /dev/null +++ b/sample/targets/tiling/vx_threshold.c @@ -0,0 +1,138 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxThresholdInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_FORMAT; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_threshold threshold = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &threshold, sizeof(threshold)); + if (threshold) + { + vx_enum type = 0; + vxQueryThreshold(threshold, VX_THRESHOLD_TYPE, &type, sizeof(type)); + if ((type == VX_THRESHOLD_TYPE_BINARY) || + (type == VX_THRESHOLD_TYPE_RANGE)) + { + vx_enum data_type = 0; + vxQueryThreshold(threshold, VX_THRESHOLD_DATA_TYPE, &data_type, sizeof(data_type)); + if (data_type == VX_TYPE_UINT8) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseThreshold(&threshold); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxThresholdOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t threshold_kernel = +{ + "org.khronos.openvx.tiling_threshold", + VX_KERNEL_THRESHOLD_TILING, + NULL, + Threshold_image_tiling_flexible, + Threshold_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_THRESHOLD, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxThresholdInputValidator, + vxThresholdOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_warp.c b/sample/targets/tiling/vx_warp.c new file mode 100644 index 0000000..84056bc --- /dev/null +++ b/sample/targets/tiling/vx_warp.c @@ -0,0 +1,200 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status vxWarpInputValidator(vx_node node, vx_uint32 index, vx_size mat_columns) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size rows = 0ul, columns = 0ul; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &columns, sizeof(columns)); + if ((data_type == VX_TYPE_FLOAT32) && (columns == mat_columns) && (rows == 3)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) + { + vx_image dst = 0; + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if (dst) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(dst, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + /* output can not be virtual */ + if ((w1 != 0) && (h1 != 0) && (f1 == VX_DF_IMAGE_U8)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w1; + ptr->dim.image.height = h1; + status = VX_SUCCESS; + } + vxReleaseImage(&dst); + } + 
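+            /* Note that the meta data is taken from the destination image
+             * itself rather than from the input: warp kernels may legally
+             * produce an output whose size differs from the source image. */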
vxReleaseParameter(&dst_param); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpAffineInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 2); +} + +static vx_status VX_CALLBACK vxWarpPerspectiveInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 3); +} + +static vx_param_description_t warp_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_tiling_kernel_t warp_affine_kernel = +{ + "org.khronos.openvx.tiling_warp_affine", + VX_KERNEL_WARP_AFFINE_TILING, + NULL, + WarpAffine_image_tiling_flexible, + WarpAffine_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxWarpAffineInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + +vx_tiling_kernel_t warp_perspective_kernel = +{ + "org.khronos.openvx.tiling_warp_perspective", + VX_KERNEL_WARP_PERSPECTIVE_TILING, + NULL, + WarpPerspective_image_tiling_flexible, + WarpPerspective_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxWarpPerspectiveInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_weighted_average.c b/sample/targets/tiling/vx_weighted_average.c new file mode 100644 index 0000000..87760f1 --- /dev/null +++ b/sample/targets/tiling/vx_weighted_average.c @@ -0,0 +1,163 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#include "vx_interface.h"
+#include "vx_internal.h"
+#include
+
+static vx_status VX_CALLBACK vxWeightedAverageInputValidator(vx_node node, vx_uint32 index)
+{
+    vx_status status = VX_ERROR_INVALID_PARAMETERS;
+    if (index == 0)
+    {
+        vx_image input = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+
+        vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input));
+        if (input)
+        {
+            vx_df_image format = 0;
+            vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format));
+            if (format == VX_DF_IMAGE_U8)
+            {
+                status = VX_SUCCESS;
+            }
+            vxReleaseImage(&input);
+        }
+        vxReleaseParameter(&param);
+    }
+    else if (index == 1)
+    {
+        vx_scalar scalar = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+        if (vxGetStatus((vx_reference)param) == VX_SUCCESS)
+        {
+            vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar));
+            if (scalar)
+            {
+                vx_enum type = -1;
+                vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type));
+                if (type == VX_TYPE_FLOAT32)
+                {
+                    vx_float32 scale = 0.0f;
+                    if ((vxCopyScalar(scalar, &scale, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) &&
+                        (scale >= 0) && (scale <= 1.0))
+                    {
+                        status = VX_SUCCESS;
+                    }
+                    else
+                    {
+                        status = VX_ERROR_INVALID_VALUE;
+                    }
+                }
+                else
+                {
+                    status = VX_ERROR_INVALID_TYPE;
+                }
+                vxReleaseScalar(&scalar);
+            }
+            vxReleaseParameter(&param);
+        }
+    }
+    else if (index == 2)
+    {
+        vx_image input = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+        vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input));
+        if (input)
+        {
+            vx_df_image format = 0;
+            vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format));
+            if (format == VX_DF_IMAGE_U8)
+            {
+                status = VX_SUCCESS;
+            }
+            vxReleaseImage(&input);
+        }
+        vxReleaseParameter(&param);
+    }
+    return status;
+}
+
+static vx_status VX_CALLBACK vxWeightedAverageOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr)
+{
+    vx_status status = VX_ERROR_INVALID_PARAMETERS;
+    if (index == 3)
+    {
+        vx_parameter param[] = {
+            vxGetParameterByIndex(node, 0),
+            vxGetParameterByIndex(node, 2),
+            vxGetParameterByIndex(node, index),
+        };
+        if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) &&
+            (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) &&
+            (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS))
+        {
+            vx_image images[3];
+            vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0]));
+            vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1]));
+            vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2]));
+            if (images[0] && images[1] && images[2])
+            {
+                vx_uint32 width = 0, height = 0;
+                vx_df_image informat[2] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT };
+                vx_df_image outformat = VX_DF_IMAGE_VIRT;
+                vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width));
+                vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height));
+                vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0]));
+                vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1]));
+                vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat));
+                if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8)
+                {
+                    status = VX_SUCCESS;
+                }
+                ptr->type = VX_TYPE_IMAGE;
+                ptr->dim.image.format = outformat;
+                ptr->dim.image.width = width;
+                ptr->dim.image.height = height;
+                vxReleaseImage(&images[0]);
+                vxReleaseImage(&images[1]);
+                vxReleaseImage(&images[2]);
+            }
+            vxReleaseParameter(&param[0]);
+            vxReleaseParameter(&param[1]);
+            vxReleaseParameter(&param[2]);
+        }
+    }
+    return status;
+}
+
+vx_tiling_kernel_t weightedaverage_kernel =
+{
+    "org.khronos.openvx.tiling_weightedaverage",
+    VX_KERNEL_WEIGHTED_AVERAGE_TILING,
+    NULL,
+    WeightedAverage_image_tiling_flexible,
+    WeightedAverage_image_tiling_fast,
+    4,
+    { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },
+      { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
+      { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },
+      { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } },
+    NULL,
+    vxWeightedAverageInputValidator,
+    vxWeightedAverageOutputValidator,
+    NULL,
+    NULL,
+    { 16, 16 },
+    { -1, 1, -1, 1 },
+    { VX_BORDER_MODE_UNDEFINED, 0 },
+};
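
For context, a minimal usage sketch (not part of the patch) of how an application might invoke the tiling warp-affine kernel once the target has published it. The kernel name string and the 2-column / 3-row VX_TYPE_FLOAT32 matrix requirement come from warp_affine_kernel and vxWarpAffineInputValidator above; the image sizes, matrix coefficients, and the main() wrapper are placeholder assumptions.

#include <VX/vx.h>

int main(void)
{
    vx_context context = vxCreateContext();
    vx_graph graph = vxCreateGraph(context);

    /* Placeholder U8 images; the validators only require VX_DF_IMAGE_U8. */
    vx_image src = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
    vx_image dst = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);

    /* Affine validator expects VX_TYPE_FLOAT32, 2 columns, 3 rows. */
    vx_matrix mat = vxCreateMatrix(context, VX_TYPE_FLOAT32, 2, 3);
    vx_float32 coeffs[3][2] = { {1.0f, 0.0f}, {0.0f, 1.0f}, {10.0f, 5.0f} };
    vxCopyMatrix(mat, coeffs, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);

    /* Look up the kernel by the name registered in warp_affine_kernel. */
    vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.tiling_warp_affine");
    vx_node node = vxCreateGenericNode(graph, kernel);
    vxSetParameterByIndex(node, 0, (vx_reference)src);
    vxSetParameterByIndex(node, 1, (vx_reference)mat);
    /* Index 2 (interpolation scalar) is optional and left unset here. */
    vxSetParameterByIndex(node, 3, (vx_reference)dst);

    if (vxVerifyGraph(graph) == VX_SUCCESS)
        vxProcessGraph(graph);

    vxReleaseNode(&node);
    vxReleaseKernel(&kernel);
    vxReleaseMatrix(&mat);
    vxReleaseImage(&src);
    vxReleaseImage(&dst);
    vxReleaseGraph(&graph);
    vxReleaseContext(&context);
    return 0;
}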