diff --git a/examples/vx_tiling_ext.c b/examples/vx_tiling_ext.c index aa3adc6..a367da4 100644 --- a/examples/vx_tiling_ext.c +++ b/examples/vx_tiling_ext.c @@ -220,7 +220,8 @@ static vx_status VX_CALLBACK vxAlphaOutputValidator(vx_node node, vx_uint32 inde return status; } - +//Move this struct into "include/VX/vx_khr_tiling.h" +#if 0 /*! [publish_support] */ typedef struct _vx_tiling_kernel_t { /*! kernel name */ @@ -246,6 +247,7 @@ typedef struct _vx_tiling_kernel_t { /*! border information. */ vx_border_t border; } vx_tiling_kernel_t; +#endif static vx_tiling_kernel_t tiling_kernels[] = { {"org.khronos.openvx.tiling_gaussian_3x3", @@ -255,6 +257,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { 2, {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxFilterInputValidator, vxFilterOutputValidator, {1, 1}, @@ -269,6 +272,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxAlphaInputValidator, vxAlphaOutputValidator, {1, 1}, @@ -282,6 +286,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { 2, {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxFilterInputValidator, vxFilterOutputValidator, {1, 1}, @@ -296,6 +301,7 @@ static vx_tiling_kernel_t tiling_kernels[] = { {{VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}}, + NULL, vxAddInputValidator, vxAddOutputValidator, {1, 1}, @@ -319,6 +325,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxPublishKernels(vx_context context) tiling_kernels[k].flexible_function, tiling_kernels[k].fast_function, tiling_kernels[k].num_params, + tiling_kernels[k].validate, tiling_kernels[k].input_validator, tiling_kernels[k].output_validator); if (kernel) diff --git a/kernels/CMakeLists.txt b/kernels/CMakeLists.txt index 75d1f72..99ee945 100644 --- a/kernels/CMakeLists.txt +++ b/kernels/CMakeLists.txt @@ -19,4 +19,7 @@ add_subdirectory( c_model ) add_subdirectory( debug ) add_subdirectory( extras ) +if (OPENVX_USE_TILING) + add_subdirectory( tiling ) +endif (OPENVX_USE_TILING) diff --git a/kernels/opencl/vx_and.cl b/kernels/opencl/vx_and.cl new file mode 100644 index 0000000..6a08e00 --- /dev/null +++ b/kernels/opencl/vx_and.cl @@ -0,0 +1,10 @@ + +__kernel void vx_and(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] & b[y * bsy + x * bsx]; +} diff --git a/kernels/opencl/vx_box3x3.cl b/kernels/opencl/vx_box3x3.cl new file mode 100644 index 0000000..3ee0d3d --- /dev/null +++ b/kernels/opencl/vx_box3x3.cl @@ -0,0 +1,87 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define BOX3x3 sum += (uint)src[x_top * ssx + y_top * ssy]; \ + sum += (uint)src[x * ssx + y_top * ssy]; \ + sum += (uint)src[x_bot * ssx + y_top * ssy]; \ + 
sum += (uint)src[x_top * ssx + y * ssy]; \ + sum += (uint)src[x * ssx + y * ssy]; \ + sum += (uint)src[x_bot * ssx + y * ssy]; \ + sum += (uint)src[x_top * ssx + y_bot * ssy]; \ + sum += (uint)src[x * ssx + y_bot * ssy]; \ + sum += (uint)src[x_bot * ssx + y_bot * ssy]; \ + sum = sum / 9; \ + dst[x * dsx + y * dsy] = (uchar)sum; \ + + +__kernel void vx_box3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + + sum = pixel[0] + pixel[1] + pixel[2] + pixel[3] + pixel[4] + pixel[5] + pixel[6] + pixel[7] + pixel[8]; + + sum = sum / 9; + dst[x * dsx + y * dsy] = (uchar)sum; + } + else + { + BOX3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + BOX3x3; + } +} diff --git a/kernels/opencl/vx_convolve.cl b/kernels/opencl/vx_convolve.cl new file mode 100644 index 0000000..84fec9d --- /dev/null +++ b/kernels/opencl/vx_convolve.cl @@ -0,0 +1,93 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define C_MAX_CONVOLUTION_DIM (15) +#define UINT8_MAX 255 + +#define Convolve \ + uchar slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + uint center_x = x, center_y = y; \ + int width = high_x, height = high_y; \ + int ky, kx; \ + uint dest_index = 0; \ + \ + if( bordermode == VX_BORDER_REPLICATE || bordermode == VX_BORDER_UNDEFINED ) \ + { \ + for (ky = -(int)conv_radius_y; ky <= (int)conv_radius_y; ++ky) \ + { \ + int yy = (int)(center_y + ky); \ + yy = yy < 0 ? 0 : yy >= height ? height - 1 : yy; \ + \ + for (kx = -(int)conv_radius_x; kx <= (int)conv_radius_x; ++kx, ++dest_index) \ + { \ + int xx = (int)(center_x + kx); \ + xx = xx < 0 ? 0 : xx >= width ? 
width - 1 : xx; \ + slice[dest_index] = src[xx * ssx + yy * ssy]; \ + } \ + } \ + } \ + else if( bordermode == VX_BORDER_CONSTANT ) \ + { \ + for (ky = -(int)conv_radius_y; ky <= (int)conv_radius_y; ++ky) \ + { \ + int yy = (int)(center_y + ky); \ + int ccase_y = yy < 0 || yy >= height; \ + \ + for (kx = -(int)conv_radius_x; kx <= (int)conv_radius_x; ++kx, ++dest_index) \ + { \ + int xx = (int)(center_x + kx); \ + int ccase = ccase_y || xx < 0 || xx >= width; \ + if( !ccase ) \ + slice[dest_index] = src[xx * ssx + yy * ssy]; \ + else \ + slice[dest_index] = (uchar)const_vaule; \ + } \ + } \ + } \ + \ + for (int i = 0; i < (int)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + \ + value = sum / (int)scale; \ + \ + if (value < 0) dst[x * dsx + y * dsy] = 0; \ + else if (value > UINT8_MAX) dst[x * dsx + y * dsy] = UINT8_MAX; \ + else dst[x * dsx + y * dsy] = value; + +__kernel void vx_Convolve(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + uint conv_width, uint conv_height, + uint scale, __global short *conv_mat, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + int low_x = 0, low_y = 0; + int high_x = get_global_size(0); + int high_y = get_global_size(1); + int sum = 0; + int value = 0; + + int conv_radius_x, conv_radius_y; + conv_radius_x = (int)conv_width / 2; + conv_radius_y = (int)conv_height / 2; + + if (bordermode == VX_BORDER_UNDEFINED) + { + low_x = conv_radius_x; + high_x = ((high_x >= (uint)conv_radius_x) ? high_x - conv_radius_x : 0); + low_y = conv_radius_y; + high_y = ((high_y >= (uint)conv_radius_y) ? high_y - conv_radius_y : 0); + } + + Convolve; + +} diff --git a/kernels/opencl/vx_dilate3x3.cl b/kernels/opencl/vx_dilate3x3.cl new file mode 100644 index 0000000..076bc7f --- /dev/null +++ b/kernels/opencl/vx_dilate3x3.cl @@ -0,0 +1,93 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar max_op(uchar a, uchar b) +{ + return a > b ? 
a : b; +} + +#define DILATE3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + max_value = pixels[0]; \ + for (i = 1; i < 9; i++) \ + max_value = max_op(max_value, pixels[i]); \ + dst[x * dsx + y * dsy] = max_value; \ + +__kernel void vx_dilate3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx, i; + uint dest_index = 0; + uchar pixels[9], max_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + max_value = pixels[0]; + for (i = 1; i < 9; i++) + max_value = max_op(max_value, pixels[i]); + + dst[x * dsx + y * dsy] = max_value; + } + else + { + DILATE3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + DILATE3x3; + } +} diff --git a/kernels/opencl/vx_erode3x3.cl b/kernels/opencl/vx_erode3x3.cl new file mode 100644 index 0000000..a694be9 --- /dev/null +++ b/kernels/opencl/vx_erode3x3.cl @@ -0,0 +1,93 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? 
a : b; +} + +#define ERODE3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + min_value = pixels[0]; \ + for (i = 1; i < 9; i++) \ + min_value = min_op(min_value, pixels[i]); \ + dst[x * dsx + y * dsy] = min_value; \ + +__kernel void vx_erode3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx, i; + uint dest_index = 0; + uchar pixels[9], min_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + min_value = pixels[0]; + for (i = 1; i < 9; i++) + min_value = min_op(min_value, pixels[i]); + + dst[x * dsx + y * dsy] = min_value; + } + else + { + ERODE3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? 
high_x - 1 : x + 1; + } + + ERODE3x3; + } +} diff --git a/kernels/opencl/vx_gaussian3x3.cl b/kernels/opencl/vx_gaussian3x3.cl new file mode 100644 index 0000000..112adf6 --- /dev/null +++ b/kernels/opencl/vx_gaussian3x3.cl @@ -0,0 +1,86 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define GAUSSIAN3x3 sum += (uint)src[x_top * ssx + y_top * ssy]; \ + sum += 2*(uint)src[x * ssx + y_top * ssy]; \ + sum += (uint)src[x_bot * ssx + y_top * ssy]; \ + sum += 2*(uint)src[x_top * ssx + y * ssy]; \ + sum += 4*(uint)src[x * ssx + y * ssy]; \ + sum += 2*(uint)src[x_bot * ssx + y * ssy]; \ + sum += (uint)src[x_top * ssx + y_bot * ssy]; \ + sum += 2*(uint)src[x * ssx + y_bot * ssy]; \ + sum += (uint)src[x_bot * ssx + y_bot * ssy]; \ + sum = sum / 16; \ + dst[x * dsx + y * dsy] = (uchar)sum; \ + +__kernel void vx_gaussian3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + + sum = pixel[0] + 2*pixel[1] + pixel[2] + 2*pixel[3] + 4*pixel[4] + 2*pixel[5] + pixel[6] + 2*pixel[7] + pixel[8]; + + sum = sum / 16; + dst[x * dsx + y * dsy] = (uchar)sum; + } + else + { + GAUSSIAN3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + GAUSSIAN3x3; + } +} diff --git a/kernels/opencl/vx_median3x3.cl b/kernels/opencl/vx_median3x3.cl new file mode 100644 index 0000000..2ae7a61 --- /dev/null +++ b/kernels/opencl/vx_median3x3.cl @@ -0,0 +1,123 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? a : b; +} + +uchar max_op(uchar a, uchar b) +{ + return a > b ? 
a : b; +} + +void sort_mid(uchar *a, uchar *b) +{ + const uchar min = min_op(*a, *b); + const uchar max = max_op(*a, *b); + + *a = min; + *b = max; +} + +#define SORT sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[4], &pixels[2]); \ + sort_mid(&pixels[6], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[2]); \ + +#define MEDIAN3x3 pixels[0] = src[x_top * ssx + y_top * ssy]; \ + pixels[1] = src[x * ssx + y_top * ssy]; \ + pixels[2] = src[x_bot * ssx + y_top * ssy]; \ + pixels[3] = src[x_top * ssx + y * ssy]; \ + pixels[4] = src[x * ssx + y * ssy]; \ + pixels[5] = src[x_bot * ssx + y * ssy]; \ + pixels[6] = src[x_top * ssx + y_bot * ssy]; \ + pixels[7] = src[x * ssx + y_bot * ssy]; \ + pixels[8] = src[x_bot * ssx + y_bot * ssy]; \ + SORT; \ + dst[x * dsx + y * dsy] = pixels[4]; \ + +__kernel void vx_median3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + uint sum = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + uchar pixels[9]; + + if (bordermode == VX_BORDER_CONSTANT) + { + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixels[dest_index] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index] = const_vaule; + } + } + + SORT; + + dst[x * dsx + y * dsy] = pixels[4]; + } + else + { + MEDIAN3x3; + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? 
high_x - 1 : x + 1; + } + + MEDIAN3x3; + } +} diff --git a/kernels/opencl/vx_nonlinearfilter.cl b/kernels/opencl/vx_nonlinearfilter.cl new file mode 100644 index 0000000..a9d1958 --- /dev/null +++ b/kernels/opencl/vx_nonlinearfilter.cl @@ -0,0 +1,531 @@ +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_NONLINEAR 0x16 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define VX_NONLINEAR_FILTER_MEDIAN VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x0 +#define VX_NONLINEAR_FILTER_MIN VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x1 +#define VX_NONLINEAR_FILTER_MAX VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_NONLINEAR) + 0x2 + +uchar min_op(uchar a, uchar b) +{ + return a < b ? a : b; +} + +uchar max_op(uchar a, uchar b) +{ + return a > b ? a : b; +} + +void sort_mid(uchar *a, uchar *b) +{ + const uchar min = min_op(*a, *b); + const uchar max = max_op(*a, *b); + + *a = min; + *b = max; +} + +#define SORT_MID_CROSS_3x3 sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[0], &pixels[2]); \ + sort_mid(&pixels[1], &pixels[3]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[0], &pixels[4]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[4]); \ + +#define SORT_MID_3x3 sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[4], &pixels[7]); \ + sort_mid(&pixels[4], &pixels[2]); \ + sort_mid(&pixels[6], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[2]); \ + + +#define SORT_MID_DISK_5x5 sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[8], &pixels[9]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[12], &pixels[13]); \ + sort_mid(&pixels[14], &pixels[15]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[18], &pixels[19]); \ + sort_mid(&pixels[0], &pixels[2]); \ + sort_mid(&pixels[1], &pixels[3]); \ + sort_mid(&pixels[4], &pixels[6]); \ + sort_mid(&pixels[5], &pixels[7]); \ + sort_mid(&pixels[8], &pixels[10]); \ + sort_mid(&pixels[9], &pixels[11]); \ + sort_mid(&pixels[12], &pixels[14]); \ + sort_mid(&pixels[13], &pixels[15]); \ + sort_mid(&pixels[16], &pixels[18]); \ + sort_mid(&pixels[17], &pixels[19]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[5], &pixels[6]); \ + sort_mid(&pixels[0], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[7]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[8], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[15]); \ + sort_mid(&pixels[17], &pixels[18]); \ + sort_mid(&pixels[16], &pixels[20]); \ + sort_mid(&pixels[1], &pixels[5]); \ + sort_mid(&pixels[2], 
&pixels[6]); \ + sort_mid(&pixels[9], &pixels[13]); \ + sort_mid(&pixels[10], &pixels[14]); \ + sort_mid(&pixels[0], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[15]); \ + sort_mid(&pixels[17], &pixels[20]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[6]); \ + sort_mid(&pixels[9], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[14]); \ + sort_mid(&pixels[18], &pixels[20]); \ + sort_mid(&pixels[0], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[4]); \ + sort_mid(&pixels[3], &pixels[5]); \ + sort_mid(&pixels[10], &pixels[12]); \ + sort_mid(&pixels[11], &pixels[13]); \ + sort_mid(&pixels[1], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[14]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[11], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[8]); \ + sort_mid(&pixels[2], &pixels[10]); \ + sort_mid(&pixels[5], &pixels[13]); \ + sort_mid(&pixels[7], &pixels[14]); \ + sort_mid(&pixels[3], &pixels[11]); \ + sort_mid(&pixels[2], &pixels[8]); \ + sort_mid(&pixels[4], &pixels[12]); \ + sort_mid(&pixels[7], &pixels[13]); \ + sort_mid(&pixels[1], &pixels[17]); \ + sort_mid(&pixels[3], &pixels[10]); \ + sort_mid(&pixels[5], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[18]); \ + sort_mid(&pixels[3], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[12]); \ + sort_mid(&pixels[2], &pixels[16]); \ + sort_mid(&pixels[3], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[12]); \ + sort_mid(&pixels[5], &pixels[9]); \ + sort_mid(&pixels[6], &pixels[10]); \ + sort_mid(&pixels[4], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[11]); \ + sort_mid(&pixels[3], &pixels[19]); \ + sort_mid(&pixels[5], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[10]); \ + sort_mid(&pixels[3], &pixels[18]); \ + sort_mid(&pixels[4], &pixels[20]); \ + sort_mid(&pixels[6], &pixels[8]); \ + sort_mid(&pixels[7], &pixels[9]); \ + sort_mid(&pixels[3], &pixels[17]); \ + sort_mid(&pixels[5], &pixels[20]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[3], &pixels[16]); \ + sort_mid(&pixels[6], &pixels[20]); \ + sort_mid(&pixels[5], &pixels[17]); \ + sort_mid(&pixels[7], &pixels[20]); \ + sort_mid(&pixels[4], &pixels[16]); \ + sort_mid(&pixels[6], &pixels[18]); \ + sort_mid(&pixels[5], &pixels[16]); \ + sort_mid(&pixels[7], &pixels[19]); \ + sort_mid(&pixels[7], &pixels[18]); \ + sort_mid(&pixels[6], &pixels[16]); \ + sort_mid(&pixels[7], &pixels[17]); \ + sort_mid(&pixels[10], &pixels[18]); \ + sort_mid(&pixels[7], &pixels[16]); \ + sort_mid(&pixels[9], &pixels[17]); \ + sort_mid(&pixels[8], &pixels[16]); \ + sort_mid(&pixels[9], &pixels[16]); \ + sort_mid(&pixels[10], &pixels[16]); \ + + + + #define SORT_MID_BOX_5x5 sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[0], &pixels[1]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[4], &pixels[5]); \ + sort_mid(&pixels[0], &pixels[3]); \ + sort_mid(&pixels[2], &pixels[5]); \ + sort_mid(&pixels[2], &pixels[3]); \ + sort_mid(&pixels[1], &pixels[4]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[6], &pixels[7]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[10], &pixels[11]); \ + sort_mid(&pixels[6], &pixels[9]); \ + sort_mid(&pixels[8], &pixels[11]); \ + sort_mid(&pixels[8], &pixels[9]); \ + 
sort_mid(&pixels[7], &pixels[10]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[0], &pixels[6]); \ + sort_mid(&pixels[4], &pixels[10]); \ + sort_mid(&pixels[4], &pixels[6]); \ + sort_mid(&pixels[2], &pixels[8]); \ + sort_mid(&pixels[2], &pixels[4]); \ + sort_mid(&pixels[6], &pixels[8]); \ + sort_mid(&pixels[1], &pixels[7]); \ + sort_mid(&pixels[5], &pixels[11]); \ + sort_mid(&pixels[5], &pixels[7]); \ + sort_mid(&pixels[3], &pixels[9]); \ + sort_mid(&pixels[3], &pixels[5]); \ + sort_mid(&pixels[7], &pixels[9]); \ + sort_mid(&pixels[1], &pixels[2]); \ + sort_mid(&pixels[3], &pixels[4]); \ + sort_mid(&pixels[5], &pixels[6]); \ + sort_mid(&pixels[7], &pixels[8]); \ + sort_mid(&pixels[9], &pixels[10]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[12], &pixels[13]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[16], &pixels[17]); \ + sort_mid(&pixels[12], &pixels[15]); \ + sort_mid(&pixels[14], &pixels[17]); \ + sort_mid(&pixels[14], &pixels[15]); \ + sort_mid(&pixels[13], &pixels[16]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[18], &pixels[19]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[21], &pixels[23]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[22], &pixels[23]); \ + sort_mid(&pixels[18], &pixels[21]); \ + sort_mid(&pixels[20], &pixels[23]); \ + sort_mid(&pixels[20], &pixels[21]); \ + sort_mid(&pixels[19], &pixels[22]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[12], &pixels[18]); \ + sort_mid(&pixels[16], &pixels[22]); \ + sort_mid(&pixels[16], &pixels[18]); \ + sort_mid(&pixels[14], &pixels[20]); \ + sort_mid(&pixels[20], &pixels[24]); \ + sort_mid(&pixels[14], &pixels[16]); \ + sort_mid(&pixels[18], &pixels[20]); \ + sort_mid(&pixels[22], &pixels[24]); \ + sort_mid(&pixels[13], &pixels[19]); \ + sort_mid(&pixels[17], &pixels[23]); \ + sort_mid(&pixels[17], &pixels[19]); \ + sort_mid(&pixels[15], &pixels[21]); \ + sort_mid(&pixels[15], &pixels[17]); \ + sort_mid(&pixels[19], &pixels[21]); \ + sort_mid(&pixels[13], &pixels[14]); \ + sort_mid(&pixels[15], &pixels[16]); \ + sort_mid(&pixels[17], &pixels[18]); \ + sort_mid(&pixels[19], &pixels[20]); \ + sort_mid(&pixels[21], &pixels[22]); \ + sort_mid(&pixels[23], &pixels[24]); \ + sort_mid(&pixels[0], &pixels[12]); \ + sort_mid(&pixels[8], &pixels[20]); \ + sort_mid(&pixels[8], &pixels[12]); \ + sort_mid(&pixels[4], &pixels[16]); \ + sort_mid(&pixels[16], &pixels[24]); \ + sort_mid(&pixels[12], &pixels[16]); \ + sort_mid(&pixels[2], &pixels[14]); \ + sort_mid(&pixels[10], &pixels[22]); \ + sort_mid(&pixels[10], &pixels[14]); \ + sort_mid(&pixels[6], &pixels[18]); \ + sort_mid(&pixels[6], &pixels[10]); \ + sort_mid(&pixels[10], &pixels[12]); \ + sort_mid(&pixels[1], &pixels[13]); \ + sort_mid(&pixels[9], &pixels[21]); \ + sort_mid(&pixels[9], &pixels[13]); \ + sort_mid(&pixels[5], &pixels[17]); \ + sort_mid(&pixels[13], &pixels[17]); \ + sort_mid(&pixels[3], &pixels[15]); \ + sort_mid(&pixels[11], &pixels[23]); \ + sort_mid(&pixels[11], &pixels[15]); \ + sort_mid(&pixels[7], &pixels[19]); \ + sort_mid(&pixels[7], &pixels[11]); \ + 
sort_mid(&pixels[11], &pixels[13]); \ + sort_mid(&pixels[11], &pixels[12]); \ + + +#define FILTER_VALUE_3x3 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_3x3; \ + \ + dst[x * dsx + y * dsy] = pixels[4]; \ + \ + break; \ + } \ + } \ + + +#define FILTER_CROSS_3x3 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_CROSS_3x3; \ + \ + dst[x * dsx + y * dsy] = pixels[2]; \ + \ + break; \ + } \ + } \ + + +#define FILTER_DISK_5x5 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_DISK_5x5; \ + \ + dst[x * dsx + y * dsy] = pixels[10]; \ + \ + break; \ + } \ + } \ + +#define FILTER_BOX_5x5 switch (function) \ + { \ + case VX_NONLINEAR_FILTER_MIN: \ + { \ + min_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + min_value = min_op(min_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = min_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MAX: \ + { \ + max_value = pixels[0]; \ + for (i = 1; i < count_mask; i++) \ + max_value = max_op(max_value, pixels[i]); \ + \ + dst[x * dsx + y * dsy] = max_value; \ + \ + break; \ + } \ + case VX_NONLINEAR_FILTER_MEDIAN: \ + { \ + SORT_MID_BOX_5x5; \ + \ + dst[x * dsx + y * dsy] = pixels[12]; \ + \ + break; \ + } \ + } \ + + +__kernel void vx_nonlinearfilter(uint function, int ssx, int ssy, __global uchar *src, + __global uchar *mask, uint left, uint top, uint right, uint bottom, + int mat_rows, int count_mask, int bordermode, uchar const_vaule, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + + int ky, kx, i; + uint dest_index = 0; + uint mask_index = 0; + uchar pixels[25], min_value, max_value; + + if (bordermode == VX_BORDER_CONSTANT) + { + for (ky = -(int)top; ky <= (int)bottom; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -(int)left; kx <= (int)right; ++kx, ++mask_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (mask[mask_index]) + { + if (!ccase) + pixels[dest_index++] = src[xx * ssx + yy * ssy]; + else + pixels[dest_index++] = const_vaule; + } + } + } + + switch 
(mat_rows) + { + case 3 : //mask = 3x3 + { + if (count_mask == 5) + { + FILTER_CROSS_3x3; + } + else //count_mask = 9 + { + FILTER_VALUE_3x3; + } + break; + } + case 5 : //mask = 5x5 + { + if (count_mask == 9) + { + FILTER_VALUE_3x3; + } + else if (count_mask == 21) + { + FILTER_DISK_5x5; + } + else //count_mask = 25 + { + FILTER_BOX_5x5; + } + break; + } + } + } + else + { + for (ky = -(int)top; ky <= (int)bottom; ++ky) + { + int yy = y + ky; + yy = yy < 0 ? 0 : yy >= high_y ? high_y - 1 : yy; + + for (kx = -(int)left; kx <= (int)right; ++kx, ++mask_index) + { + int xx = x + kx; + xx = xx < 0 ? 0 : xx >= high_x ? high_x - 1 : xx; + if (mask[mask_index]) + pixels[dest_index++] = src[xx * ssx + yy * ssy]; + } + } + + switch (mat_rows) + { + case 3 : //mask = 3x3 + { + if (count_mask == 5) + { + FILTER_CROSS_3x3; + } + else //count_mask = 9 + { + FILTER_VALUE_3x3; + } + break; + } + case 5 : //mask = 5x5 + { + if (count_mask == 9) + { + FILTER_VALUE_3x3; + } + else if (count_mask == 21) + { + FILTER_DISK_5x5; + } + else //count_mask = 25 + { + FILTER_BOX_5x5; + } + break; + } + } + } +} \ No newline at end of file diff --git a/kernels/opencl/vx_not.cl b/kernels/opencl/vx_not.cl new file mode 100644 index 0000000..106c21c --- /dev/null +++ b/kernels/opencl/vx_not.cl @@ -0,0 +1,9 @@ + +__kernel void vx_not(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + b[y * bsy + x * bsx] = ~a[y * asy + x * asx]; +} diff --git a/kernels/opencl/vx_orr.cl b/kernels/opencl/vx_orr.cl new file mode 100644 index 0000000..6b7195f --- /dev/null +++ b/kernels/opencl/vx_orr.cl @@ -0,0 +1,10 @@ + +__kernel void vx_orr(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] | b[y * bsy + x * bsx]; +} diff --git a/kernels/opencl/vx_phase.cl b/kernels/opencl/vx_phase.cl new file mode 100644 index 0000000..b624c49 --- /dev/null +++ b/kernels/opencl/vx_phase.cl @@ -0,0 +1,56 @@ + +#define DBL_EPSILON 2.2204460492503131e-016 + +#define M_PI 3.1415926535897932384626433832795 + +#define ABS(x) ((x) > 0 ? (x) : -(x)) + +#define FLOOR(x) (x > 0 ? 
(int)(x) : (int)(x - 0.99)) + +__kernel void vx_phase(int ssx0, int ssy0, __global short *src0, + int ssx1, int ssy1, __global short *src1, + int dsx, int dsy, __global uchar *dst) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + + float scale = 256.0f / 360.0f; + + float P1 = ((float)( 0.9997878412794807 * (180.0 / M_PI) * scale)), + P3 = ((float)(-0.3258083974640975 * (180.0 / M_PI) * scale)), + P5 = ((float)( 0.1555786518463281 * (180.0 / M_PI) * scale)), + P7 = ((float)(-0.04432655554792128 * (180.0 / M_PI) * scale)), + A_90 = ((float)(90.f * scale)), + A_180 = ((float)(180.f * scale)), + A_360 = ((float)(360.f * scale)); + + /* -M_PI to M_PI */ + float val_x; + float val_y; + + val_x = (float)(src0[x * ssx0 / 2 + y * ssy0 / 2]); + val_y = (float)(src1[x * ssx1 / 2 + y * ssy1 / 2]); + + float arct; + + float ax = ABS(val_x), ay = ABS(val_y); + float c, c2; + if (ax >= ay) + { + c = ay / (ax + (float)DBL_EPSILON); + c2 = c * c; + arct = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; + } + else + { + c = ax / (ay + (float)DBL_EPSILON); + c2 = c * c; + arct = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; + } + if (val_x < 0) + arct = A_180 - arct; + if (val_y < 0) + arct = A_360 - arct; + + dst[x * dsx + y * dsy] = (uchar)(int)floor(arct + 0.5f); +} diff --git a/kernels/opencl/vx_sobel3x3.cl b/kernels/opencl/vx_sobel3x3.cl new file mode 100644 index 0000000..31fe8c6 --- /dev/null +++ b/kernels/opencl/vx_sobel3x3.cl @@ -0,0 +1,114 @@ + +//Define 3 types of border +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_BORDER 0x0C +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) + +#define VX_BORDER_UNDEFINED VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x0 +#define VX_BORDER_CONSTANT VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x1 +#define VX_BORDER_REPLICATE VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_BORDER) + 0x2 + +#define SOBEL3x3_gx sx -= (uint)src[x_top * ssx + y_top * ssy]; \ + sx -= 2 * (uint)src[x_top * ssx + y * ssy]; \ + sx -= (uint)src[x_top * ssx + y_bot * ssy]; \ + sx += (uint)src[x_bot * ssx + y_top * ssy]; \ + sx += 2 * (uint)src[x_bot * ssx + y * ssy]; \ + sx += (uint)src[x_bot * ssx + y_bot * ssy]; \ + gx[x * dsx1 + y * dsy1] = (short)sx; \ + +#define SOBEL3x3_gy sy -= (uint)src[x_top * ssx + y_top * ssy]; \ + sy -= 2 * (uint)src[x * ssx + y_top * ssy]; \ + sy -= (uint)src[x_bot * ssx + y_top * ssy]; \ + sy += (uint)src[x_top * ssx + y_bot * ssy]; \ + sy += 2 * (uint)src[x * ssx + y_bot * ssy]; \ + sy += (uint)src[x_bot * ssx + y_bot * ssy]; \ + gy[x * dsx2 + y * dsy2] = (short)sy; \ + + +__kernel void vx_sobel3x3(int ssx, int ssy, __global uchar *src, + int bordermode, uchar const_vaule, + int dsx1, int dsy1, __global short *gx, + int dsx2, int dsy2, __global short *gy) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const size_t high_x = get_global_size(0); + const size_t high_y = get_global_size(1); + int sx = 0, sy = 0; + + int y_top = y - 1; + int y_bot = y + 1; + int x_top = x - 1; + int x_bot = x + 1; + + int ky, kx; + uint dest_index = 0; + + if (bordermode == VX_BORDER_CONSTANT) + { + uchar pixel[9]; + // Calculate border + if (y == 0 || x == 0 || x == high_x - 1 || y == high_y - 1) + { + for (ky = -1; ky <= 1; ++ky) + { + int yy = y + ky; + int ccase_y = yy < 0 || yy >= high_y; + + for (kx = -1; kx <= 1; ++kx, ++dest_index) + { + int xx = x + kx; + int ccase = ccase_y || xx < 0 || xx >= high_x; + + if (!ccase) + pixel[dest_index] = src[xx * ssx + yy * ssy]; + else + pixel[dest_index] = const_vaule; + } + } + 
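+            // Descriptive note: at this point pixel[0..8] holds the 3x3 neighbourhood in
+            // row-major order (pixel[0] = top-left, pixel[4] = centre, pixel[8] = bottom-right),
+            // with out-of-image samples replaced by const_vaule. The sums below are the
+            // standard Sobel responses: gx weights the right column (1,2,1) against the
+            // left column, gy weights the bottom row against the top row.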
+ if (gx) + { + sx = pixel[8] + 2*pixel[5] - pixel[6] - pixel[0] - 2*pixel[3] + pixel[2]; + + gx[x * dsx1 + y * dsy1] = (short)sx; + } + if (gy) + { + sy = pixel[6] + 2*pixel[7] + pixel[8] - pixel[0] - 2*pixel[1] - pixel[2]; + + gy[x * dsx2 + y * dsy2] = (short)sy; + } + } + else + { + if (gx) + { + SOBEL3x3_gx; + } + if (gy) + { + SOBEL3x3_gy; + } + } + } + else + { + if (bordermode == VX_BORDER_REPLICATE) + { + y_top = y_top < 0 ? 0 : y - 1; + y_bot = y_bot >= high_y ? high_y - 1 : y + 1; + x_top = x_top < 0 ? 0 : x - 1; + x_bot = x_bot >= high_x ? high_x - 1 : x + 1; + } + + if (gx) + { + SOBEL3x3_gx; + } + if (gy) + { + SOBEL3x3_gy; + } + } +} diff --git a/kernels/opencl/vx_warp_affine.cl b/kernels/opencl/vx_warp_affine.cl new file mode 100644 index 0000000..e0009be --- /dev/null +++ b/kernels/opencl/vx_warp_affine.cl @@ -0,0 +1,273 @@ + +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_INTERPOLATION 0x04 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#define VX_INTERPOLATION_NEAREST_NEIGHBOR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0 +#define VX_INTERPOLATION_BILINEAR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1 + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, \ + uint name##_stride_x, \ + uint name##_step_x, \ + uint name##_stride_y, \ + uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. 
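+* It is normally invoked through the CONVERT_TO_IMAGE_STRUCT / CONVERT_TO_IMAGE_STRUCT_NO_STEP
+* macros above; the pointer is advanced by offset_first_element_in_bytes plus
+* get_global_id(0) * step_x + get_global_id(1) * step_y, so that offset(&img, x, y)
+* afterwards addresses pixels relative to this work item.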
+* +* @param[in] ptr Pointer to the starting postion of the buffer +* @param[in] offset_first_element_in_bytes The offset of the first element in the source image +* @param[in] stride_x Stride of the image in X dimension (in bytes) +* @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) +* @param[in] stride_y Stride of the image in Y dimension (in bytes) +* @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) +* +* @return An image object +*/ +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Get the pointer position of a Image +* +* @param[in] img Pointer to the starting position of the buffer +* @param[in] x Relative X position +* @param[in] y Relative Y position +*/ +inline __global uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Clamps the given coordinates to the borders according to the border size. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size of the image + * + */ +inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +{ + const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); + const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); + + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Clamps the given coordinates to the borders. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * + */ +inline const float8 clamp_to_border(float8 coords, const float width, const float height) +{ + return clamp_to_border_with_size(coords, width, height, 0); +} + +/** Reads four texels from the input image. The coords vector is used to determine which texels to be read. + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of coordinates to be read from the image. + */ +inline const VEC_DATA_TYPE(uchar, 4) read_texels4(const Image *in, const int8 coords) +{ + return (VEC_DATA_TYPE(uchar, 4))(*((__global uchar *)offset(in, coords.s0, coords.s1)), + *((__global uchar *)offset(in, coords.s2, coords.s3)), + *((__global uchar *)offset(in, coords.s4, coords.s5)), + *((__global uchar *)offset(in, coords.s6, coords.s7))); +} + +/** Returns the current thread coordinates. */ +inline const float2 get_current_coords() +{ + return (float2)(get_global_id(0) * 4, get_global_id(1)); +} + +/** Transforms 4 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * + * @param[in] coord 2D coordinate to transform. 
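+ *                  Only the coordinate of the first pixel is given; the X positions
+ *                  x, x+1, x+2, x+3 of the four processed pixels are derived internally.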
+ * @param[in] mtx affine matrix + * + * @return a int8 containing 4 2D transformed values. + */ +inline const float8 apply_affine_transform(const float2 coord, const float8 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // transform [x,x+1,x+2,x+3] + const float4 new_x = mad(/*A*/ in_x_coords, (float4)(mtx.s0) /*B*/, mad((float4)(coord.s1), (float4)(mtx.s2), (float4)(mtx.s4))); + // transform [y,y+1,y+2,y+3] + const float4 new_y = mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s5))); + + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + + +/** Given a texel coordinates this function will return the following array of coordinates: + * [ P, right neighbour, below neighbour, below right neighbour ] + * + * @note No checks to see if the coordinates are out of the image are done here. + * + * @param[in] coord Input coordinates + * + * @return vector of 8 floats with the coordinates, even positions are x and odd y. + */ +inline const float8 get_neighbour_coords(const float2 coord) +{ + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); +} + +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +{ + // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. 
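+    // Each of the four sample positions is floored and expanded to its 2x2 neighbourhood
+    // (tl, tr, bl, br); the result is the usual bilinear blend
+    //   value = tl*(1-ax)*(1-ay) + tr*ax*(1-ay) + bl*(1-ax)*ay + br*ax*ay
+    // where (ax, ay) = coords - floor(coords) is the fractional part of the coordinate.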
+ + // Sets the 4x4 coordinates for each of the four input texels + const float8 fc = floor(coords); + const float16 c1 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + + // Loads the values from the input image + const float16 t = (float16)( + /* tl, tr, bl, br */ + * ((__global uchar *)offset(in, c1.s0, c1.s1)), *((__global uchar *)offset(in, c1.s2, c1.s3)), + *((__global uchar *)offset(in, c1.s4, c1.s5)), *((__global uchar *)offset(in, c1.s6, c1.s7)), + *((__global uchar *)offset(in, c1.s8, c1.s9)), *((__global uchar *)offset(in, c1.sa, c1.sb)), + *((__global uchar *)offset(in, c1.sc, c1.sd)), *((__global uchar *)offset(in, c1.se, c1.sf)), + *((__global uchar *)offset(in, c2.s0, c2.s1)), *((__global uchar *)offset(in, c2.s2, c2.s3)), + *((__global uchar *)offset(in, c2.s4, c2.s5)), *((__global uchar *)offset(in, c2.s6, c2.s7)), + *((__global uchar *)offset(in, c2.s8, c2.s9)), *((__global uchar *)offset(in, c2.sa, c2.sb)), + *((__global uchar *)offset(in, c2.sc, c2.sd)), *((__global uchar *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = (float4)( + ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + return CONVERT(fr, VEC_DATA_TYPE(uchar, 4)); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +{ + return bilinear_interpolate_with_border(in, coords, width, height, 1); +} + +/** Performs an affine transform on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. + * + * This kernel performs an affine transform with a 2x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * output(x,y) = input(x0,y0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=1 -DMAT3=2 -DMAT4=4 -DMAT5=2 "\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_affine( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height, + __global float matrix[9], + const uchar constValue, + const int type) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + float8 mat = (float8)(matrix[0], matrix[1], matrix[2], matrix[3], matrix[4], matrix[5], 0.0, 0.0); + float8 coords = apply_affine_transform(get_current_coords(), mat); + + if (type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(coords, width, height))), 0, out.ptr); + else if (type == VX_INTERPOLATION_BILINEAR) + vstore4(bilinear_interpolate(&in, coords, width, height), 0, out.ptr); + + if (coords.even.s0 < 0 || coords.odd.s0 < 0 || coords.even.s0 >= width || coords.odd.s0 >= height) + { + out.ptr[0] = constValue; + } + if (coords.even.s1 < 0 || coords.odd.s1 < 0 || coords.even.s1 >= width || coords.odd.s1 >= height) + { + out.ptr[1] = constValue; + } + if (coords.even.s2 < 0 || coords.odd.s2 < 0 || coords.even.s2 >= width || coords.odd.s2 >= height) + { + out.ptr[2] = constValue; + } + if (coords.even.s3 < 0 || coords.odd.s3 < 0 || coords.even.s3 >= width || coords.odd.s3 >= height) + { + out.ptr[3] = constValue; + } +} diff --git a/kernels/opencl/vx_warp_perspective.cl b/kernels/opencl/vx_warp_perspective.cl new file mode 100644 index 0000000..405dfb6 --- /dev/null +++ b/kernels/opencl/vx_warp_perspective.cl @@ -0,0 +1,277 @@ +#define VX_ID_KHRONOS 0x000 +#define VX_ENUM_INTERPOLATION 0x04 +#define VX_ENUM_BASE(vendor, id) (((vendor) << 20) | (id << 12)) +#define VX_INTERPOLATION_NEAREST_NEIGHBOR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x0 +#define VX_INTERPOLATION_BILINEAR VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_INTERPOLATION) + 0x1 + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, \ + uint name##_stride_x, \ + uint name##_step_x, \ + uint name##_stride_y, \ + uint name##_step_y, \ + uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, 
name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's data. +* +* @param[in] ptr Pointer to the starting postion of the buffer +* @param[in] offset_first_element_in_bytes The offset of the first element in the source image +* @param[in] stride_x Stride of the image in X dimension (in bytes) +* @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) +* @param[in] stride_y Stride of the image in Y dimension (in bytes) +* @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) +* +* @return An image object +*/ +inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Get the pointer position of a Image +* +* @param[in] img Pointer to the starting position of the buffer +* @param[in] x Relative X position +* @param[in] y Relative Y position +*/ +inline __global uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Clamps the given coordinates to the borders according to the border size. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size of the image + * + */ +inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size) +{ + const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); + const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); + + return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Clamps the given coordinates to the borders. + * + * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. + * @param[in] width Width of the image + * @param[in] height Height of the image + * + */ +inline const float8 clamp_to_border(float8 coords, const float width, const float height) +{ + return clamp_to_border_with_size(coords, width, height, 1); +} + +/** Reads four texels from the input image. The coords vector is used to determine which texels to be read. + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of coordinates to be read from the image. 
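+ *                   Even elements are X coordinates and odd elements are Y coordinates,
+ *                   i.e. (coords.s0, coords.s1) is the first texel, (coords.s2, coords.s3)
+ *                   the second, and so on.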
+ */ +inline const VEC_DATA_TYPE(uchar, 4) read_texels4(const Image *in, const int8 coords) +{ + return (VEC_DATA_TYPE(uchar, 4))(*((__global uchar *)offset(in, coords.s0, coords.s1)), + *((__global uchar *)offset(in, coords.s2, coords.s3)), + *((__global uchar *)offset(in, coords.s4, coords.s5)), + *((__global uchar *)offset(in, coords.s6, coords.s7))); +} + +/** Returns the current thread coordinates. */ +inline const float2 get_current_coords() +{ + return (float2)(get_global_id(0) * 4, get_global_id(1)); +} + +/** Transforms four 2D coordinates using the formula: + * + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + * + * (x0/z0,y0/z0) + * + * @param[in] coord 2D coordinate to transform. + * @param[in] mtx perspective matrix + * + * @return a vector float8 containing four 2D transformed values. + */ +inline const float8 apply_perspective_transform(const float2 coord, const float16 mtx) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + // transform [z,z+1,z+2,z+3] + const float4 z = (float4)mad(in_x_coords, (float4)(mtx.s2), mad((float4)(coord.s1), (float4)(mtx.s5), (float4)(mtx.s8))); + // NOTE: Do not multiply x&y by 1.f/Z as this will result in loss of accuracy and mismatches with VX reference implementation + // transform [x,x+1,x+2,x+3] + const float4 new_x = (float4)mad(in_x_coords, (float4)(mtx.s0), mad((float4)(coord.s1), (float4)(mtx.s3), (float4)(mtx.s6))) / z; + // transform [y,y+1,y+2,y+3] + const float4 new_y = (float4)mad(in_x_coords, (float4)(mtx.s1), mad((float4)(coord.s1), (float4)(mtx.s4), (float4)(mtx.s7))) / z; + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +} + + +/** Given a texel coordinates this function will return the following array of coordinates: + * [ P, right neighbour, below neighbour, below right neighbour ] + * + * @note No checks to see if the coordinates are out of the image are done here. + * + * @param[in] coord Input coordinates + * + * @return vector of 8 floats with the coordinates, even positions are x and odd y. + */ +inline const float8 get_neighbour_coords(const float2 coord) +{ + return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); +} + +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. 
+ * @param[in] width Width of the image + * @param[in] height Height of the image + * @param[in] border_size Border size + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size) +{ + // Sets the 4x4 coordinates for each of the four input texels + const float8 fc = floor(coords); + const float16 c1 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size)); + const float16 c2 = (float16)( + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size), + clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size)); + + // Loads the values from the input image + const float16 t = (float16)( + /* tl, tr, bl, br */ + * ((__global uchar *)offset(in, c1.s0, c1.s1)), *((__global uchar *)offset(in, c1.s2, c1.s3)), + *((__global uchar *)offset(in, c1.s4, c1.s5)), *((__global uchar *)offset(in, c1.s6, c1.s7)), + *((__global uchar *)offset(in, c1.s8, c1.s9)), *((__global uchar *)offset(in, c1.sa, c1.sb)), + *((__global uchar *)offset(in, c1.sc, c1.sd)), *((__global uchar *)offset(in, c1.se, c1.sf)), + *((__global uchar *)offset(in, c2.s0, c2.s1)), *((__global uchar *)offset(in, c2.s2, c2.s3)), + *((__global uchar *)offset(in, c2.s4, c2.s5)), *((__global uchar *)offset(in, c2.s6, c2.s7)), + *((__global uchar *)offset(in, c2.s8, c2.s9)), *((__global uchar *)offset(in, c2.sa, c2.sb)), + *((__global uchar *)offset(in, c2.sc, c2.sd)), *((__global uchar *)offset(in, c2.se, c2.sf))); + const float8 a = coords - fc; + const float8 b = ((float8)(1.f)) - a; + const float4 fr = (float4)( + ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)), + ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)), + ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)), + ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7))); + return CONVERT(fr, VEC_DATA_TYPE(uchar, 4)); +} + +/* FIXME(COMPMID-682): Clamp border properly in UNDEFINED border mode in Warp, Scale, Remap */ +/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values + * + * @param[in] in Pointer to the source image. + * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. + * @param[in] width Width of the image + * @param[in] height Height of the image + */ +inline const VEC_DATA_TYPE(uchar, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height) +{ + return bilinear_interpolate_with_border(in, coords, width, height, 1); +} + +/** Performs perspective transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8. 
+ * + * This kernel performs perspective transform with a 3x3 Matrix M with this method of pixel coordinate translation: + * x0 = M[1][1] * x + M[1][2] * y + M[1][3] + * y0 = M[2][1] * x + M[2][2] * y + M[2][3] + * z0 = M[3][1] * x + M[3][2] * y + M[3][3] + * + * output(x,y) = input(x0/z0,y0/z0) + * + * @attention The matrix coefficients need to be passed at compile time:\n + * const char build_options [] = "-DMAT0=1 -DMAT1=2 -DMAT2=3 -DMAT3=4 -DMAT4=5 -DMAT5=6 -DMAT6=7 -DMAT7=8 -DMAT8=9"\n + * clBuildProgram( program, 0, NULL, build_options, NULL, NULL); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] width Width of the destination image + * @param[in] height Height of the destination image + */ +__kernel void warp_perspective( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const int width, + const int height, + __global float matrix[9], + const uchar constValue, + const int type) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + + float16 mat = (float16)(matrix[0], matrix[1], matrix[2], matrix[3], matrix[4], matrix[5], matrix[6], matrix[7], matrix[8], 0, 0, 0, (float4)0); + float8 coords = apply_perspective_transform(get_current_coords(), mat); + + if (type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + vstore4(read_texels4(&in, convert_int8_rtn(clamp_to_border(coords, width, height))), 0, out.ptr); + else if (type == VX_INTERPOLATION_BILINEAR) + vstore4(bilinear_interpolate(&in, coords, width, height), 0, out.ptr); + + if (coords.even.s0 < 0 || coords.odd.s0 < 0 || coords.even.s0 >= width || coords.odd.s0 >= height) + { + out.ptr[0] = constValue; + } + if (coords.even.s1 < 0 || coords.odd.s1 < 0 || coords.even.s1 >= width || coords.odd.s1 >= height) + { + out.ptr[1] = constValue; + } + if (coords.even.s2 < 0 || coords.odd.s2 < 0 || coords.even.s2 >= width || coords.odd.s2 >= height) + { + out.ptr[2] = constValue; + } + if (coords.even.s3 < 0 || coords.odd.s3 < 0 || coords.even.s3 >= width || coords.odd.s3 >= height) + { + out.ptr[3] = constValue; + } +} diff --git a/kernels/opencl/vx_xor.cl b/kernels/opencl/vx_xor.cl new file mode 100644 index 0000000..81ad9a7 --- /dev/null +++ b/kernels/opencl/vx_xor.cl @@ -0,0 +1,10 @@ + +__kernel void vx_xor(int asx, int asy, __global uchar *a, + int bsx, int bsy, __global uchar *b, + int csx, int csy, __global uchar *c) +{ + int x = get_global_id(0); + int y = get_global_id(1); + + c[y * csy + x * csx] = a[y * asy + x * asx] ^ b[y * bsy + x * bsx]; +} 
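Editorial note, not part of the patch: the element-wise kernels such as vx_xor above all take the same argument pattern -- x stride and y stride in bytes followed by the plane pointer, repeated for both inputs and the output -- and run one work-item per pixel. The sketch below shows one way a host could wire that up, assuming a densely packed U8 plane (stride_x = 1 byte, stride_y = row pitch); the helper name enqueue_vx_xor and the surrounding setup (context, program build, buffer creation, real error handling) are hypothetical and omitted here.

#include <CL/cl.h>

/* Hypothetical helper: launches vx_xor over a width x height U8 plane.
 * Arguments follow the kernel signature
 *   vx_xor(asx, asy, a, bsx, bsy, b, csx, csy, c)
 * with all three planes assumed densely packed. */
static cl_int enqueue_vx_xor(cl_command_queue queue, cl_kernel kernel,
                             cl_mem a, cl_mem b, cl_mem c,
                             size_t width, size_t height, cl_int row_pitch)
{
    cl_int sx = 1;          /* byte stride between neighbouring pixels */
    cl_int sy = row_pitch;  /* byte stride between rows                */
    cl_int err = CL_SUCCESS;

    err |= clSetKernelArg(kernel, 0, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &a);
    err |= clSetKernelArg(kernel, 3, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 4, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &b);
    err |= clSetKernelArg(kernel, 6, sizeof(cl_int), &sx);
    err |= clSetKernelArg(kernel, 7, sizeof(cl_int), &sy);
    err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &c);

    /* One work-item per pixel: get_global_id(0) is x, get_global_id(1) is y. */
    size_t global[2] = { width, height };
    err |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, NULL,
                                  0, NULL, NULL);
    return err;  /* crude accumulation: nonzero means at least one call failed */
}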
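Editorial note, not part of the patch: the coordinate mapping that apply_perspective_transform vectorises in vx_warp_perspective.cl can be hard to read back out of the float16 lane arithmetic. A minimal scalar restatement is sketched below, assuming the nine coefficients arrive in the order the kernel reads them (mtx.s0..s8, i.e. column-major M, so m[0..2] is the first column); perspective_map is a hypothetical helper name, not something defined in this patch.

/* Scalar form of the mapping used by apply_perspective_transform above. */
static void perspective_map(const float m[9], float x, float y,
                            float *src_x, float *src_y)
{
    /* x0 = M[1][1]*x + M[1][2]*y + M[1][3], and likewise for y0 and z0 */
    const float x0 = m[0] * x + m[3] * y + m[6];
    const float y0 = m[1] * x + m[4] * y + m[7];
    const float z0 = m[2] * x + m[5] * y + m[8];
    /* output(x, y) samples input(x0/z0, y0/z0); the kernel divides x and y
     * by z separately rather than multiplying by 1/z, to match the VX
     * reference implementation. */
    *src_x = x0 / z0;
    *src_y = y0 / z0;
}

As a quick sanity check, with the identity matrix (m[0] = m[4] = m[8] = 1, all other entries 0) this reduces to src_x = x, src_y = y.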
diff --git a/kernels/tiling/CMakeLists.txt b/kernels/tiling/CMakeLists.txt new file mode 100644 index 0000000..66af1b7 --- /dev/null +++ b/kernels/tiling/CMakeLists.txt @@ -0,0 +1,40 @@ +# + +# Copyright (c) 2011-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# set target name +set( TARGET_NAME openvx-tiling_chaining-lib ) + +include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/utils + ${CMAKE_SOURCE_DIR}/debug ) + +FIND_SOURCES() + +# add a target named ${TARGET_NAME} +add_library (${TARGET_NAME} ${SOURCE_FILES}) + +target_link_libraries( ${TARGET_NAME} openvx ) + +install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION bin + LIBRARY DESTINATION bin ) + +set_target_properties( ${TARGET_NAME} PROPERTIES FOLDER ${KERNELS_FOLDER} ) diff --git a/kernels/tiling/tiling.h b/kernels/tiling/tiling.h new file mode 100644 index 0000000..3537538 --- /dev/null +++ b/kernels/tiling/tiling.h @@ -0,0 +1,123 @@ +/* + +* Copyright (c) 2011-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +void box3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void box3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Phase_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Phase_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void And_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void And_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Or_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Or_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Xor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Xor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Not_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Not_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Threshold_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Threshold_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ConvertColor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ConvertColor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Multiply_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Multiply_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void NonLinearFilter_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void NonLinearFilter_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Magnitude_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Magnitude_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Erode3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Erode3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Dilate3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Dilate3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Median3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Median3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Sobel3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Sobel3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Max_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Max_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Min_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Min_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Gaussian3x3_image_tiling_fast(void * 
parameters[], void * tile_memory, vx_size tile_memory_size); +void Gaussian3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Addition_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Addition_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Subtraction_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Subtraction_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ConvertDepth_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ConvertDepth_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WarpAffine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WarpAffine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WarpPerspective_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WarpPerspective_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void WeightedAverage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void WeightedAverage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void AbsDiff_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void AbsDiff_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void IntegralImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void IntegralImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Convolve_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Convolve_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void HogFeatures_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void HogFeatures_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void Fast9Corners_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void Fast9Corners_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void LBP_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void LBP_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ScaleImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ScaleImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void TableLookup_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void TableLookup_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void ChannelCombine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void ChannelCombine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); + +void NonMaxSuppression_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void NonMaxSuppression_image_tiling_flexible(void * parameters[], void * 
tile_memory, vx_size tile_memory_size); + +void HogCells_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size); +void HogCells_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size); diff --git a/kernels/tiling/tiling_absdiff.c b/kernels/tiling/tiling_absdiff.c new file mode 100644 index 0000000..6a28534 --- /dev/null +++ b/kernels/tiling/tiling_absdiff.c @@ -0,0 +1,193 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +void AbsDiff_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (in_1->image.format) { + case VX_DF_IMAGE_U8: + { + for (y = low_height; y < height; y++) { + const vx_uint8* src1R = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y; + const vx_uint8* src2R = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y; + vx_uint8* dstR = (vx_uint8 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y;; + for (x = 0; x < out->tile_block.width; x+=16) { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vDiff = vabdq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vDiff); + src2R += 16* in_1->addr[0].stride_x; + src1R += 16* in_2->addr[0].stride_x; + dstR += 16* out->addr[0].stride_x; + } + } + } + break; + + case VX_DF_IMAGE_S16: + { + uint16x8_t vMaxs16 = vdupq_n_u16(0x7FFF); + for (y = low_height; y < height; y++) { + const vx_int16* src1R = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y /2;// + x * in_1->addr[0].stride_x / 2; + const vx_int16* src2R = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y /2;// + x * in_2->addr[0].stride_x / 2; + vx_int16* dstR = (vx_int16 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y /2;// + x * in_1->addr[0].stride_x / 2; + if (out->image.format == VX_DF_IMAGE_S16) { + for (x = 0; x < out->tile_block.width; x+=8) { + int16x8_t vSrc1R = vld1q_s16(src1R); + int16x8_t vSrc2R = vld1q_s16(src2R); + uint16x8_t vDiff = (uint16x8_t)vabdq_s16(vSrc1R, vSrc2R); + vDiff = vminq_u16(vDiff, vMaxs16); + vst1q_s16(dstR, (int16x8_t)vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + }else if (out->image.format == VX_DF_IMAGE_U16) { + for (x = 0; x < out->tile_block.width; x+=8) { + int16x8_t vSrc1R = vld1q_s16(src1R); + int16x8_t vSrc2R = vld1q_s16(src2R); + uint16x8_t vDiff = vabdq_u16((uint16x8_t)vSrc1R, (uint16x8_t)vSrc2R); + vst1q_u16((vx_uint16 *)dstR, vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + } + } + 
} + break; + + case VX_DF_IMAGE_U16: + { + for (y = low_height; y < height; y++) { + const vx_uint16* src1R = (vx_uint16 *)in_1->base[0] + in_1->tile_x + y * in_1->addr[0].stride_y / 2; + const vx_uint16* src2R = (vx_uint16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y / 2; + vx_uint16* dstR = (vx_uint16 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y / 2; + for (x = 0; x < out->tile_block.width; x+=8) { + uint16x8_t vSrc1R = vld1q_u16(src1R); + uint16x8_t vSrc2R = vld1q_u16(src2R); + uint16x8_t vDiff = vabdq_u16(vSrc1R, vSrc2R); + vst1q_u16(dstR, vDiff); + src2R += 8 * in_1->addr[0].stride_x / 2; + src1R += 8 * in_2->addr[0].stride_x / 2; + dstR += 8 * out->addr[0].stride_x / 2; + } + } + } + break; + + default: + break; + } +} + +#define ABSDIFF_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + switch (in_1->image.format)\ + {\ + case VX_DF_IMAGE_U8:\ + {\ + for (y = low_y; y < high_y; y++) {\ + vx_uint8* src1R = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y;\ + vx_uint8* src2R = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->addr[0].stride_y;\ + vx_uint8* dstR = (vx_uint8 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y;\ + for (x = low_x; x < high_x; x++) \ + {\ + vx_int16 tmp = (*src1R) - (*src2R);\ + *dstR = (vx_uint8)(tmp < 0 ? (-tmp) : tmp); \ + src1R++;\ + src2R++;\ + dstR++;\ + }\ + }\ + }\ + break;\ + default:\ + for (y = low_y; y < high_y; y++)\ + {\ + for (x = low_x; x < high_x; x++)\ + {\ + if (in_1->image.format == VX_DF_IMAGE_S16)\ + {\ + vx_int16 *src[2] = \ + {\ + (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y /2 + x * in_1->addr[0].stride_x / 2,\ + (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->addr[0].stride_y /2 + x * in_2->addr[0].stride_x / 2,\ + };\ + if (out->image.format == VX_DF_IMAGE_S16)\ + {\ + vx_int16 *dst = (vx_int16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y / 2 + x * out->addr[0].stride_x / 2;\ + vx_uint32 val;\ + if (*src[0] > *src[1])\ + val = *src[0] - *src[1];\ + else\ + val = *src[1] - *src[0];\ + *dst = (vx_int16)((val > 32767) ? 
32767 : val);\ + }\ + else if (out->image.format == VX_DF_IMAGE_U16) {\ + vx_uint16 *dst = (vx_uint16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y / 2+ x * out->addr[0].stride_x /2;\ + if (*src[0] > *src[1])\ + *dst = *src[0] - *src[1];\ + else\ + *dst = *src[1] - *src[0];\ + }\ + }\ + else if (in_1->image.format == VX_DF_IMAGE_U16)\ + {\ + vx_uint16 *src[2] = \ + {\ + (vx_uint16 *)in_1->base[0] + in_1_tile_x + y * in_1->addr[0].stride_y / 2 + x * in_1->addr[0].stride_x / 2,\ + (vx_uint16 *)in_2->base[0] + in_2->tile_x + y * in_2->addr[0].stride_y / 2 + x * in_2->addr[0].stride_x / 2,\ + };\ + vx_uint16 *dst = (vx_uint16 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y + x * out->addr[0].stride_x;\ + if (*src[0] > *src[1])\ + *dst = *src[0] - *src[1];\ + else\ + *dst = *src[1] - *src[0];\ + }\ + }\ + }\ + break;\ + }\ + + +void AbsDiff_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + ABSDIFF_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ABSDIFF_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + ABSDIFF_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + diff --git a/kernels/tiling/tiling_addsub.c b/kernels/tiling/tiling_addsub.c new file mode 100644 index 0000000..4d48c52 --- /dev/null +++ b/kernels/tiling/tiling_addsub.c @@ -0,0 +1,439 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +void Addition_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + } + int32x4_t unscaled_unconverted_result1 = vaddq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vaddq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 int_typed_result; + if(i == 0) + int_typed_result = tmp0; + else if(i == 1) + int_typed_result = tmp1; + else if(i == 2) + int_typed_result = tmp2; + else if(i == 3) + int_typed_result = tmp3; + else if(i == 4) + int_typed_result = tmp4; + else if(i == 5) + int_typed_result = tmp5; + 
else if(i == 6) + int_typed_result = tmp6; + else if(i == 7) + int_typed_result = tmp7; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +#define ADD_SUB_FLEXIBLE(low_y, low_x, high_y, high_x, opmode, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 src0 = in_1->image.format == VX_DF_IMAGE_U8 ? *src0p : *src0p_16; \ + vx_int32 src1 = in_2->image.format == VX_DF_IMAGE_U8 ? *src1p : *src1p_16; \ + src0p++; \ + src1p++; \ + src0p_16++; \ + src1p_16++; \ + vx_int32 int_typed_result; \ + if(opmode == 0) \ + { \ + int_typed_result = src0 + src1; \ + } \ + else \ + { \ + int_typed_result = src0 - src1; \ + } \ + vx_int32 final_result_value; \ + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + if (int_typed_result > UINT8_MAX) \ + final_result_value = UINT8_MAX; \ + else if (int_typed_result < 0) \ + final_result_value = 0; \ + else \ + final_result_value = int_typed_result; \ + } \ + else \ + { \ + if (int_typed_result > INT16_MAX) \ + final_result_value = INT16_MAX; \ + else if (int_typed_result < INT16_MIN) \ + final_result_value = INT16_MIN; \ + else \ + final_result_value = int_typed_result; \ + } \ + } \ + else \ + { \ + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? 
\ + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; \ + } \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + *dstp = (vx_uint8)final_result_value; \ + dstp++; \ + } \ + else \ + { \ + *dstp_16 = (vx_int16)final_result_value; \ + dstp_16++; \ + } \ + } \ + } \ + +void Addition_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 op_mode = 0; + if (ty == 0 && tx == 0) + { + ADD_SUB_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ADD_SUB_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + ADD_SUB_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, 0, 0, 0) + } +} + +void Subtraction_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + 
} + int32x4_t unscaled_unconverted_result1 = vsubq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vsubq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 int_typed_result; + if(i == 0) + int_typed_result = tmp0; + else if(i == 1) + int_typed_result = tmp1; + else if(i == 2) + int_typed_result = tmp2; + else if(i == 3) + int_typed_result = tmp3; + else if(i == 4) + int_typed_result = tmp4; + else if(i == 5) + int_typed_result = tmp5; + else if(i == 6) + int_typed_result = tmp6; + else if(i == 7) + int_typed_result = tmp7; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +void Subtraction_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_enum *overflow_policy = (vx_enum*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 op_mode = 1; + if (ty == 0 && tx == 0) + { + ADD_SUB_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + ADD_SUB_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), op_mode, in_1->tile_x, in_2->tile_x, out->tile_x) + ADD_SUB_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), op_mode, 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_bitwise.c b/kernels/tiling/tiling_bitwise.c new file mode 100644 index 0000000..5c85792 --- /dev/null +++ b/kernels/tiling/tiling_bitwise.c @@ -0,0 +1,377 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +void And_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vAnd = vandq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vAnd); + + src2R += 16; + src1R += 16; + dstR += 16; + } + } + +} +void And_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)&*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Or_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vOr = vorrq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vOr); + + src2R += 16; + src1R += 16; + dstR += 
16; + } + } + +} +void Or_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)|*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Xor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrc1R = vld1q_u8(src1R); + uint8x16_t vSrc2R = vld1q_u8(src2R); + uint8x16_t vXor = veorq_u8(vSrc1R, vSrc2R); + vst1q_u8(dstR, vXor); + + src2R += 16; + src1R += 16; + dstR += 16; + } + } + +} +void Xor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint8 *src_1 = in_1->base[0] + in_1->tile_x; + vx_uint8 *src_2 = in_2->base[0] + in_2->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* 
src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src_1 = in_1->base[0]; + src_2 = in_2->base[0]; + dst = out->base[0]; + const vx_uint8* src1R = src_1 + y * in_1->image.width; + const vx_uint8* src2R = src_2 + y * in_2->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = *(src1R + x)^*(src2R + x); + src2R ++; + src1R ++; + dstR ++; + } + } + } +} + +void Not_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < out->tile_block.width; x+=16) + { + uint8x16_t vSrcR = vld1q_u8(srcR); + uint8x16_t vNot = vmvnq_u8(vSrcR); + vst1q_u8(dstR, vNot); + + srcR += 16; + dstR += 16; + } + } + +} +void Not_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + for (y = 0; y < vxTileHeight(out, 0); y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + } + else + { + for (y = 0; y < ty; y++) + { + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = tx; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + for (y = ty; y < vxTileHeight(out, 0); y++) + { + src = in->base[0]; + dst = out->base[0]; + const vx_uint8* srcR = src + y * in->image.width; + vx_uint8* dstR = dst + y * out->image.width; + for (x = 0; x < vxTileWidth(out, 0); x++) + { + *(dstR+x) = ~*(srcR + x); + srcR ++; + dstR ++; + } + } + } +} diff --git a/kernels/tiling/tiling_channel.c b/kernels/tiling/tiling_channel.c new file mode 100644 index 0000000..ff43294 --- /dev/null +++ b/kernels/tiling/tiling_channel.c @@ -0,0 +1,386 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void ChannelCombine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, p; + vx_tile_t *in[4]; + in[0] = (vx_tile_t *)parameters[0]; + in[1] = (vx_tile_t *)parameters[1]; + in[2] = (vx_tile_t *)parameters[2]; + in[3] = (vx_tile_t *)parameters[3]; + vx_tile_t *out = (vx_tile_t *)parameters[4]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + void *base_src_ptrs[4] = { NULL }; + void *base_dst_ptr[4] = { NULL }; + + base_src_ptrs[0] = in[0]->base[0]; + base_src_ptrs[1] = in[1]->base[0]; + base_src_ptrs[2] = in[2]->base[0]; + base_src_ptrs[3] = in[3]->base[0]; + + base_dst_ptr[0] = out->base[0]; + base_dst_ptr[1] = out->base[1]; + base_dst_ptr[2] = out->base[2]; + base_dst_ptr[3] = out->base[3]; + + vx_df_image format; + + format = out->image.format; + + vx_uint8 *planes[4]; + + if (format == VX_DF_IMAGE_RGB) + { + vx_uint8 *ptr0, *ptr1, *ptr2, *pout; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + ptr0 = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + ptr1 = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; + ptr2 = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; + pout = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16x3_t pixels = {{vld1q_u8(ptr0 + x * in[0]->addr->stride_x), + vld1q_u8(ptr1 + x * in[1]->addr->stride_x), + vld1q_u8(ptr2 + x * in[2]->addr->stride_x)}}; + + vst3q_u8(pout + x * out->addr->stride_x, pixels); + } + } + } + else if (format == VX_DF_IMAGE_RGBX) + { + vx_uint8 *ptr0, *ptr1, *ptr2, *ptr3, *pout; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + ptr0 = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + ptr1 = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; + ptr2 = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; + ptr3 = (vx_uint8 *)base_src_ptrs[3] + y * in[3]->addr->stride_y; + pout = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16x4_t pixels = {{vld1q_u8(ptr0 + x * in[0]->addr->stride_x), + vld1q_u8(ptr1 + x * in[1]->addr->stride_x), + vld1q_u8(ptr2 + x * in[2]->addr->stride_x), + vld1q_u8(ptr3 + x * in[3]->addr->stride_x)}}; + + vst4q_u8(pout + x * out->addr->stride_x, pixels); + } + } + } + else if ((format == VX_DF_IMAGE_YUV4) || (format == VX_DF_IMAGE_IYUV)) + { + vx_uint8 *ptr_in, *ptr_out; + vx_uint32 wCnt = ((high_x >> 1) >> 3) << 3; + for (p = 0; p < 3; p++) + { + if (1 == out->addr[p].step_y) + { + for (y = low_y; y < high_y; y += out->addr[p].step_y) + { + ptr_in = (vx_uint8 *)base_src_ptrs[p] + y * in[p]->addr->stride_y; + ptr_out = (vx_uint8 *)base_dst_ptr[p] + y * out->addr[p].stride_y; + + for (x = low_x; x < high_x; x += 16) + { + uint8x16_t pixels = vld1q_u8(ptr_in + x * in[p]->addr->stride_x); + vst1q_u8(ptr_out + x * out->addr[p].stride_x, pixels); + } + } + } + else + { + for (y = low_y; y < high_y; y += out->addr[p].step_y) + { + ptr_in = (vx_uint8 *)base_src_ptrs[p] + ((y * in[p]->addr->step_y / out->addr[p].step_y) * + in[p]->addr->scale_y / VX_SCALE_UNITY) * in[p]->addr->stride_y; + ptr_out = (vx_uint8 *)base_dst_ptr[p] + (y * out->addr[p].scale_y / VX_SCALE_UNITY) * out->addr[p].stride_y; + + for (x = low_x; x < wCnt; x += 8) + { + uint8x8_t pixels = vld1_u8(ptr_in + x * in[p]->addr->stride_x); + 
vst1_u8(ptr_out + x * out->addr[p].stride_x, pixels); + } + } + } + } + } + else if ((format == VX_DF_IMAGE_NV12) || (format == VX_DF_IMAGE_NV21)) + { + int vidx = (format == VX_DF_IMAGE_NV12) ? 1 : 0; + + //plane 0 + { + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr[0].stride_y; + for (x = low_x; x < high_x; x += 16) + { + uint8x16_t pixels = vld1q_u8(ptr_src + x * in[0]->addr->stride_x); + vst1q_u8(ptr_dst + x * out->addr[0].stride_x, pixels); + } + } + } + + // plane 1 + { + vx_uint32 wCnt = ((high_x >> 1) >> 3) << 3; + for (y = low_y; y < high_y; y += out->addr[1].step_y) + { + vx_uint8 *ptr_src0 = (vx_uint8 *)base_src_ptrs[1] + in[1]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr[1].step_y) * in[1]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src1 = (vx_uint8 *)base_src_ptrs[2] + in[2]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr[1].step_y) * in[2]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[1] + out->addr[1].stride_y * (y *out->addr[1].scale_y / VX_SCALE_UNITY); + for (x = low_x; x < wCnt; x += 8) + { + uint8x8x2_t pixels; + pixels.val[1-vidx] = vld1_u8(ptr_src0 + x * in[1]->addr->stride_x); + pixels.val[vidx] = vld1_u8(ptr_src1 + x * in[2]->addr->stride_x); + vst2_u8(ptr_dst + x * out->addr[1].stride_x, pixels); + } + } + } + } + else if ((format == VX_DF_IMAGE_YUYV) || (format == VX_DF_IMAGE_UYVY)) + { + int yidx = (format == VX_DF_IMAGE_UYVY) ? 1 : 0; + for (y = low_y; y < high_y; y += out->addr[0].step_y) + { + vx_uint8 *ptr_src0 = (vx_uint8 *)base_src_ptrs[0] + in[0]->addr->stride_y * + ((y * in[0]->addr->step_y / out->addr->step_y) * in[0]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src1 = (vx_uint8 *)base_src_ptrs[1] + in[1]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr->step_y) * in[1]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_src2 = (vx_uint8 *)base_src_ptrs[2] + in[2]->addr->stride_y * + ((y * in[1]->addr->step_y / out->addr->step_y) * in[2]->addr->scale_y / VX_SCALE_UNITY); + vx_uint8 *ptr_dst = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * y; + for (x = low_x; x < high_x; x += 16) + { + uint8x8x2_t pixels_y = vld2_u8(ptr_src0 + x * in[0]->addr->stride_x); + uint8x8x2_t pixels_uv = {{vld1_u8(ptr_src1 + (x >> 1) * in[1]->addr->stride_x), + vld1_u8(ptr_src2 + (x >> 1) * in[2]->addr->stride_x)}}; + uint8x8x4_t pixels; + pixels.val[0 + yidx] = pixels_y.val[0]; + pixels.val[1 - yidx] = pixels_uv.val[0]; + pixels.val[2 + yidx] = pixels_y.val[1]; + pixels.val[3 - yidx] = pixels_uv.val[1]; + + vst4_u8(ptr_dst + x * out->addr[0].stride_x, pixels); + } + } + } +} + +#define RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr->step_y) \ + { \ + planes[0] = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; \ + planes[1] = (vx_uint8 *)base_src_ptrs[1] + y * in[1]->addr->stride_y; \ + planes[2] = (vx_uint8 *)base_src_ptrs[2] + y * in[2]->addr->stride_y; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x += out->addr->step_x) \ + { \ + dst[0] = planes[0][0]; \ + dst[1] = planes[1][0]; \ + dst[2] = planes[2][0]; \ + if (format == VX_DF_IMAGE_RGBX) \ + { \ + planes[3] = (vx_uint8 *)base_src_ptrs[3] + y * in[3]->addr->stride_y + x * in[3]->addr->stride_x; \ + dst[3] = planes[3][0]; \ + } \ + planes[0] += out->addr->step_x * in[0]->addr->stride_x; \ + 
planes[1] += out->addr->step_x * in[1]->addr->stride_x; \ + planes[2] += out->addr->step_x * in[2]->addr->stride_x; \ + dst += out->addr->step_x * out->addr->stride_x; \ + } \ + } + + +#define YUV4(low_y, high_y, low_x) \ + for (p = 0; p < 3; p++) \ + { \ + for (y = low_y; y < high_y; y += out->addr[p].step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr[p].step_x) \ + { \ + vx_uint32 x1 = x * in[p]->addr->step_x / out->addr[p].step_x; \ + vx_uint32 y1 = y * in[p]->addr->step_y / out->addr[p].step_y; \ + vx_uint8 *src = (vx_uint8 *)base_src_ptrs[p] + y1 * in[p]->addr->stride_y + x1 * in[p]->addr->stride_x; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[p] + out->addr[p].stride_y * (out->addr[p].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[p].stride_x * (out->addr[p].scale_x * x) / VX_SCALE_UNITY; \ + *dst = *src; \ + } \ + } \ + } + + +#define NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr[0].step_y) \ + { \ + vx_uint8 *src = (vx_uint8 *)base_src_ptrs[0] + y * in[0]->addr->stride_y; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[0] + y * out->addr[0].stride_y; \ + for (x = low_x; x < high_x; x += out->addr[0].step_x) \ + { \ + *dst = *src; \ + \ + src += out->addr[0].step_x * in[0]->addr->stride_x; \ + dst += out->addr[0].step_x * out->addr[0].stride_x; \ + } \ + } \ + \ + for (y = low_y; y < high_y; y += out->addr[1].step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr[1].step_x) \ + { \ + vx_uint32 x1 = x * in[1]->addr->step_x / out->addr[1].step_x; \ + vx_uint32 y1 = y * in[1]->addr->step_y / out->addr[1].step_y; \ + vx_uint8 *src0 = (vx_uint8 *)base_src_ptrs[1] + y1 * in[1]->addr->stride_y + x1 * in[1]->addr->stride_x; \ + vx_uint8 *src1 = (vx_uint8 *)base_src_ptrs[2] + y1 * in[2]->addr->stride_y + x1 * in[2]->addr->stride_x; \ + vx_uint8 *dst = (vx_uint8 *)base_dst_ptr[1] + out->addr[1].stride_y * (out->addr[1].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[1].stride_x * (out->addr[1].scale_x * x) / VX_SCALE_UNITY; \ + dst[1 - vidx] = *src0; \ + dst[vidx] = *src1; \ + } \ + } + + +#define YUYV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += out->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += out->addr->step_x * 2) \ + { \ + vx_uint32 x1 = x * in[0]->addr->step_x / out->addr->step_x; \ + vx_uint32 y1 = y * in[0]->addr->step_y / out->addr->step_y; \ + vx_uint32 x2 = x * in[1]->addr->step_x / (out->addr->step_x * 2); \ + vx_uint32 y2 = y * in[1]->addr->step_y / out->addr->step_y; \ + vx_uint8 *srcy0 = (vx_uint8 *)base_src_ptrs[0] + y1 * in[0]->addr->stride_y + x1 * in[0]->addr->stride_x; \ + vx_uint8 *srcy1 = (vx_uint8 *)base_src_ptrs[0] + y1 * in[0]->addr->stride_y + \ + (x1 + in[0]->addr->step_x) * in[0]->addr->stride_x; \ + vx_uint8 *srcu = (vx_uint8 *)base_src_ptrs[1] + y2 * in[1]->addr->stride_y + x2 * in[1]->addr->stride_x; \ + vx_uint8 *srcv = (vx_uint8 *)base_src_ptrs[2] + y2 * in[2]->addr->stride_y + x2 * in[2]->addr->stride_x; \ + vx_uint8 *dst0 = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * (out->addr[0].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[0].stride_x * (out->addr[0].scale_x * x) / VX_SCALE_UNITY; \ + vx_uint8 *dst1 = (vx_uint8 *)base_dst_ptr[0] + out->addr[0].stride_y * (out->addr[0].scale_y * y) / VX_SCALE_UNITY + \ + out->addr[0].stride_x * (out->addr[0].scale_x * (x + out->addr[0].step_x)) / VX_SCALE_UNITY; \ + \ + dst0[yidx] = *srcy0; \ + dst1[yidx] = *srcy1; \ + dst0[1 - yidx] = *srcu; \ + dst1[1 - yidx] = *srcv; \ + } \ + } + + +void ChannelCombine_image_tiling_flexible(void * parameters[], 
void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, p; + vx_tile_t *in[4]; + in[0] = (vx_tile_t *)parameters[0]; + in[1] = (vx_tile_t *)parameters[1]; + in[2] = (vx_tile_t *)parameters[2]; + in[3] = (vx_tile_t *)parameters[3]; + vx_tile_t *out = (vx_tile_t *)parameters[4]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + void *base_src_ptrs[4] = { NULL }; + void *base_dst_ptr[4] = { NULL }; + + base_src_ptrs[0] = in[0]->base[0]; + base_src_ptrs[1] = in[1]->base[0]; + base_src_ptrs[2] = in[2]->base[0]; + base_src_ptrs[3] = in[3]->base[0]; + + base_dst_ptr[0] = out->base[0]; + base_dst_ptr[1] = out->base[1]; + base_dst_ptr[2] = out->base[2]; + base_dst_ptr[3] = out->base[3]; + + vx_df_image format; + + format = out->image.format; + + vx_uint8 *planes[4]; + + if ((format == VX_DF_IMAGE_RGB) || (format == VX_DF_IMAGE_RGBX)) + { + if (low_y == 0 && low_x == 0) + { + RGB(low_y, high_y, low_x) + } + else + { + RGB(0, low_y, low_x) + RGB(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_YUV4) || (format == VX_DF_IMAGE_IYUV)) + { + if (low_y == 0 && low_x == 0) + { + YUV4(low_y, high_y, low_x) + } + else + { + YUV4(0, low_y, low_x) + YUV4(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_NV12) || (format == VX_DF_IMAGE_NV21)) + { + int vidx = (format == VX_DF_IMAGE_NV12) ? 1 : 0; + if (low_y == 0 && low_x == 0) + { + NV12(low_y, high_y, low_x) + } + else + { + NV12(0, low_y, low_x) + NV12(low_y, high_y, 0) + } + } + else if ((format == VX_DF_IMAGE_YUYV) || (format == VX_DF_IMAGE_UYVY)) + { + int yidx = (format == VX_DF_IMAGE_UYVY) ? 1 : 0; + + if (low_y == 0 && low_x == 0) + { + YUYV(low_y, high_y, low_x) + } + else + { + YUYV(0, low_y, low_x) + YUYV(low_y, high_y, 0) + } + } +} diff --git a/kernels/tiling/tiling_convertcolor.c b/kernels/tiling/tiling_convertcolor.c new file mode 100644 index 0000000..a82b53e --- /dev/null +++ b/kernels/tiling/tiling_convertcolor.c @@ -0,0 +1,2088 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +static vx_uint8 usat8(vx_int32 a) +{ + if (a > 255) + a = 255; + if (a < 0) + a = 0; + return (vx_uint8)a; +} + +static void yuv2rgb_bt601(vx_uint8 y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 *r, vx_uint8 *g, vx_uint8 *b) +{ + /* + R'= Y' + 0.000*U' + 1.403*V' + G'= Y' - 0.344*U' - 0.714*V' + B'= Y' + 1.773*U' + 0.000*V' + */ + vx_float64 f_y = (vx_float64)y; + vx_float64 f_u = (vx_float64)cb - 128; + vx_float64 f_v = (vx_float64)cr - 128; + vx_float64 f_r = f_y + 0.000f*f_u + 1.403f*f_v; + vx_float64 f_g = f_y - 0.344f*f_u - 0.714f*f_v; + vx_float64 f_b = f_y + 1.773f*f_u + 0.000f*f_v; + vx_int32 i_r = (vx_int32)f_r; + vx_int32 i_g = (vx_int32)f_g; + vx_int32 i_b = (vx_int32)f_b; + *r = usat8(i_r); + *g = usat8(i_g); + *b = usat8(i_b); +} + +static void yuv2rgb_bt709(vx_uint8 y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 *r, vx_uint8 *g, vx_uint8 *b) +{ + /* + R'= Y' + 0.0000*U + 1.5748*V + G'= Y' - 0.1873*U - 0.4681*V + B'= Y' + 1.8556*U + 0.0000*V + */ + vx_float64 f_y = (vx_float64)y; + vx_float64 f_u = (vx_float64)cb - 128; + vx_float64 f_v = (vx_float64)cr - 128; + vx_float64 f_r = f_y + 0.0000f*f_u + 1.5748f*f_v; + vx_float64 f_g = f_y - 0.1873f*f_u - 0.4681f*f_v; + vx_float64 f_b = f_y + 1.8556f*f_u + 0.0000f*f_v; + vx_int32 i_r = (vx_int32)f_r; + vx_int32 i_g = (vx_int32)f_g; + vx_int32 i_b = (vx_int32)f_b; + *r = usat8(i_r); + *g = usat8(i_g); + *b = usat8(i_b); +} + + +static void rgb2yuv_bt709(vx_uint8 r, vx_uint8 g, vx_uint8 b, + vx_uint8 *y, vx_uint8 *cb, vx_uint8 *cr) +{ + /* + Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' + U'=-0.1146*R' - 0.3854*G' + 0.5000*B' + V'= 0.5000*R' - 0.4542*G' - 0.0458*B' + */ + vx_float64 f_r = (vx_float64)r; + vx_float64 f_g = (vx_float64)g; + vx_float64 f_b = (vx_float64)b; + vx_float64 f_y = 0 + 0.2126f*f_r + 0.7152f*f_g + 0.0722f*f_b; + vx_float64 f_u = 0 - 0.1146f*f_r - 0.3854f*f_g + 0.5000f*f_b; + vx_float64 f_v = 0 + 0.5000f*f_r - 0.4542f*f_g - 0.0458f*f_b; + vx_int32 i_y = (vx_int32)f_y; + vx_int32 i_u = (vx_int32)f_u + 128; + vx_int32 i_v = (vx_int32)f_v + 128; + *y = usat8(i_y); + *cb = usat8(i_u); + *cr = usat8(i_v); +} + +static void yuv2yuv_601to709(vx_uint8 y0, vx_uint8 cb0, vx_uint8 cr0, + vx_uint8 *y1, vx_uint8 *cb1, vx_uint8 *cr1) +{ + /* + Y' = 1.0090*Y - 0.11826430*Cb - 0.2000311*Cr + Cb'= 0.0000*Y + 1.01911200*Cb + 0.1146035*Cr + Cr'= 0.0001*Y + 0.07534570*Cb + 1.0290932*Cr + */ + vx_float64 f_y0 = (vx_float64)y0; + vx_float64 f_cb0 = (vx_float64)cb0; + vx_float64 f_cr0 = (vx_float64)cr0; + vx_float64 f_y1 = 1.0090*f_y0 - 0.11826430*f_cb0 - 0.2000311*f_cr0; + vx_float64 f_cb1 = 0.0000*f_y0 + 1.01911200*f_cb0 + 0.1146035*f_cr0; + vx_float64 f_cr1 = 0.0001*f_y0 + 0.07534570*f_cb0 + 1.0290932*f_cr0; + vx_int32 i_y = (vx_int32)f_y1; + vx_int32 i_cb = (vx_int32)f_cb1; + vx_int32 i_cr = (vx_int32)f_cr1; + *y1 = usat8(i_y); + *cb1 = usat8(i_cb); + *cr1 = usat8(i_cr); +} + +static void rgb2yuv_bt709_neon(vx_float32 *arrfr, vx_float32 *arrfg, vx_float32 *arrfb, + vx_uint8 **y, vx_uint8 *cb, vx_uint8 *cr) +{ + /* + Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' + U'=-0.1146*R' - 0.3854*G' + 0.5000*B' + V'= 0.5000*R' - 0.4542*G' - 0.0458*B' + */ + + float32x4_t fr32x4 = vld1q_f32(arrfr); + float32x4_t fg32x4 = vld1q_f32(arrfg); + float32x4_t fb32x4 = vld1q_f32(arrfb); + + float32x4_t fy32x4 = vdupq_n_f32(0.0f); + fy32x4 = vmlaq_n_f32(fy32x4, fr32x4, 0.2126f); + fy32x4 = vmlaq_n_f32(fy32x4, fg32x4, 0.7152f); + fy32x4 = vmlaq_n_f32(fy32x4, fb32x4, 0.0722f); + + float32x4_t fu32x4 = vdupq_n_f32(0.0f); + fu32x4 = vmlaq_n_f32(fu32x4, 
fr32x4, -0.1146f); + fu32x4 = vmlaq_n_f32(fu32x4, fg32x4, -0.3854f); + fu32x4 = vmlaq_n_f32(fu32x4, fb32x4, 0.5000f); + + float32x4_t fv32x4 = vdupq_n_f32(0.0f); + fv32x4 = vmlaq_n_f32(fv32x4, fr32x4, 0.5000f); + fv32x4 = vmlaq_n_f32(fv32x4, fg32x4, -0.4542f); + fv32x4 = vmlaq_n_f32(fv32x4, fb32x4, -0.0458f); + + int32x4_t iy32x4 = vcvtq_s32_f32(fy32x4); + + int32x4_t icoeff32x4 = vdupq_n_s32(128); + int32x4_t iu32x4 = vcvtq_s32_f32(fu32x4); + iu32x4 = vaddq_s32(iu32x4, icoeff32x4); + + int32x4_t iv32x4 = vcvtq_s32_f32(fv32x4); + iv32x4 = vaddq_s32(iv32x4, icoeff32x4); + + int16x4_t vqmovn_s32 (int32x4_t __a); + uint16x4_t vreinterpret_u16_s16 (int16x4_t __a); + uint8x8_t vqmovn_u16 (uint16x8_t __a); + + y[0][0] = usat8(vgetq_lane_s32(iy32x4, 0)); + y[1][0] = usat8(vgetq_lane_s32(iy32x4, 1)); + y[2][0] = usat8(vgetq_lane_s32(iy32x4, 2)); + y[3][0] = usat8(vgetq_lane_s32(iy32x4, 3)); + + + cb[0] = usat8(vgetq_lane_s32(iu32x4, 0)); + cb[1] = usat8(vgetq_lane_s32(iu32x4, 1)); + cb[2] = usat8(vgetq_lane_s32(iu32x4, 2)); + cb[3] = usat8(vgetq_lane_s32(iu32x4, 3)); + + cr[0] = usat8(vgetq_lane_s32(iv32x4, 0)); + cr[1] = usat8(vgetq_lane_s32(iv32x4, 1)); + cr[2] = usat8(vgetq_lane_s32(iv32x4, 2)); + cr[3] = usat8(vgetq_lane_s32(iv32x4, 3)); +} + +static void yuv2rgb_bt601_neon(vx_uint8 **y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 **r, vx_uint8 **g, vx_uint8 **b) +{ + /* + R'= Y' + 0.000*U' + 1.403*V' + G'= Y' - 0.344*U' - 0.714*V' + B'= Y' + 1.773*U' + 0.000*V' + */ + vx_float32 fy[4] = { (vx_float32)y[0][0], (vx_float32)y[1][0], (vx_float32)y[2][0], (vx_float32)y[3][0] }; + vx_float32 fu[4] = { (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128 }; + vx_float32 fv[4] = { (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128 }; + + float32x4_t fy32x4 = vld1q_f32(fy); + float32x4_t fu32x4 = vld1q_f32(fu); + float32x4_t fv32x4 = vld1q_f32(fv); + + float32x4_t fr32x4 = vdupq_n_f32(0.0f); + fr32x4 = vaddq_f32(fr32x4, fy32x4); + fr32x4 = vmlaq_n_f32(fr32x4, fu32x4, 0.000f); + fr32x4 = vmlaq_n_f32(fr32x4, fv32x4, 1.403f); + + float32x4_t fg32x4 = vdupq_n_f32(0.0f); + fg32x4 = vaddq_f32(fg32x4, fy32x4); + fg32x4 = vmlaq_n_f32(fg32x4, fu32x4, -0.344f); + fg32x4 = vmlaq_n_f32(fg32x4, fv32x4, -0.714f); + + float32x4_t fb32x4 = vdupq_n_f32(0.0f); + fb32x4 = vaddq_f32(fb32x4, fy32x4); + fb32x4 = vmlaq_n_f32(fb32x4, fu32x4, 1.773f); + fb32x4 = vmlaq_n_f32(fb32x4, fv32x4, 0.000f); + + int32x4_t ir32x4 = vcvtq_s32_f32(fr32x4); + int32x4_t ig32x4 = vcvtq_s32_f32(fg32x4); + int32x4_t ib32x4 = vcvtq_s32_f32(fb32x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, ir32x4); + vst1q_s32(arr32+4, ig32x4); + vst1q_s32(arr32+8, ib32x4); + + for (vx_uint8 i = 0; i < 4; i++) + { + r[i][0] = usat8(arr32[i]); + g[i][1] = usat8(arr32[4 + i]); + b[i][2] = usat8(arr32[8 + i]); + } +} + +static void yuv2rgb_bt709_neon(vx_uint8 **y, vx_uint8 cb, vx_uint8 cr, + vx_uint8 **r, vx_uint8 **g, vx_uint8 **b) +{ + /* + R'= Y' + 0.0000*U + 1.5748*V + G'= Y' - 0.1873*U - 0.4681*V + B'= Y' + 1.8556*U + 0.0000*V + */ + vx_float32 fy[4] = { (vx_float32)y[0][0], (vx_float32)y[1][0], (vx_float32)y[2][0], (vx_float32)y[3][0] }; + vx_float32 fu[4] = { (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128, (vx_float32)cb - 128 }; + vx_float32 fv[4] = { (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128, (vx_float32)cr - 128 }; + + float32x4_t fy32x4 = vld1q_f32(fy); + float32x4_t fu32x4 = vld1q_f32(fu); + float32x4_t fv32x4 = vld1q_f32(fv); + + float32x4_t fr32x4 
= vdupq_n_f32(0.0f); + fr32x4 = vaddq_f32(fr32x4, fy32x4); + fr32x4 = vmlaq_n_f32(fr32x4, fu32x4, 0.000f); + fr32x4 = vmlaq_n_f32(fr32x4, fv32x4, 1.5748f); + + float32x4_t fg32x4 = vdupq_n_f32(0.0f); + fg32x4 = vaddq_f32(fg32x4, fy32x4); + fg32x4 = vmlaq_n_f32(fg32x4, fu32x4, -0.1873f); + fg32x4 = vmlaq_n_f32(fg32x4, fv32x4, -0.4681f); + + float32x4_t fb32x4 = vdupq_n_f32(0.0f); + fb32x4 = vaddq_f32(fb32x4, fy32x4); + fb32x4 = vmlaq_n_f32(fb32x4, fu32x4, 1.8556f); + fb32x4 = vmlaq_n_f32(fb32x4, fv32x4, 0.000f); + + int32x4_t ir32x4 = vcvtq_s32_f32(fr32x4); + int32x4_t ig32x4 = vcvtq_s32_f32(fg32x4); + int32x4_t ib32x4 = vcvtq_s32_f32(fb32x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, ir32x4); + vst1q_s32(arr32 + 4, ig32x4); + vst1q_s32(arr32+8, ib32x4); + + for (vx_uint8 i = 0; i < 4; i++) + { + r[i][0] = usat8(arr32[i]); + g[i][1] = usat8(arr32[4 + i]); + b[i][2] = usat8(arr32[8 + i]); + } +} + +static void yuv2yuv_601to709_neon(vx_uint8 *y0, vx_uint8 *cb0, vx_uint8 *cr0, + vx_uint8 *y1, vx_uint8 *cb1, vx_uint8 *cr1) +{ + /* + Y' = 1.0090*Y - 0.11826430*Cb - 0.2000311*Cr + Cb'= 0.0000*Y + 1.01911200*Cb + 0.1146035*Cr + Cr'= 0.0001*Y + 0.07534570*Cb + 1.0290932*Cr + */ + vx_float32 fy0[4] = { (vx_float32)y0[0], (vx_float32)y0[1], (vx_float32)y0[2], (vx_float32)y0[3] }; + vx_float32 fcb0[4] = { (vx_float32)cb0[0], (vx_float32)cb0[1], (vx_float32)cb0[2], (vx_float32)cb0[3] }; + vx_float32 fcr0[4] = { (vx_float32)cr0[0], (vx_float32)cr0[1], (vx_float32)cr0[2], (vx_float32)cr0[3] };; + + float32x4_t fy032x4 = vld1q_f32(fy0); + float32x4_t fcb032x4 = vld1q_f32(fcb0); + float32x4_t fcr032x4 = vld1q_f32(fcr0); + + float32x4_t fy132x4 = vdupq_n_f32(0.0f); + fy132x4 = vmlaq_n_f32(fy132x4, fy032x4, 1.0090); + fy132x4 = vmlaq_n_f32(fy132x4, fcb032x4, -0.11826430); + fy132x4 = vmlaq_n_f32(fy132x4, fcr032x4, -0.2000311); + + float32x4_t fcb132x4 = vdupq_n_f32(0.0f); + fcb132x4 = vmlaq_n_f32(fcb132x4, fy032x4, 0.0000); + fcb132x4 = vmlaq_n_f32(fcb132x4, fcb032x4, 1.01911200); + fcb132x4 = vmlaq_n_f32(fcb132x4, fcr032x4, 0.1146035); + + float32x4_t fcr132x4 = vdupq_n_f32(0.0f); + fcr132x4 = vmlaq_n_f32(fcr132x4, fy032x4, 0.0001); + fcr132x4 = vmlaq_n_f32(fcr132x4, fcb032x4, 0.07534570); + fcr132x4 = vmlaq_n_f32(fcr132x4, fcr032x4, 1.0290932); + + int32x4_t iy32x4 = vcvtq_s32_f32(fy132x4); + int32x4_t icb32x4 = vcvtq_s32_f32(fcb132x4); + int32x4_t icr32x4 = vcvtq_s32_f32(fcr132x4); + + vx_int32 arr32[12]; + vst1q_s32(arr32, iy32x4); + vst1q_s32(arr32+4, icb32x4); + vst1q_s32(arr32+8, icr32x4); + + for(vx_uint8 i = 0; i < 4; i++) + { + y1[i] = usat8(arr32[i]); + cb1[2*i] = usat8(arr32[4 + i]); + cr1[2*i+1] = usat8(arr32[8 + i]); + } +} + + +static void yuv2rgb_bt601V(vx_float32* y, vx_float32* cb, vx_float32* cr, + vx_uint8 *rUint8, vx_uint8 *gUint8, vx_uint8 *bUint8) +{ + float32x4_t y32X4Value = vld1q_f32(y); + float32x4_t cb32X4Value = vld1q_f32(cb); + float32x4_t cr32X4Value = vld1q_f32(cr); + float32x4_t All128 = vdupq_n_f32(128.0f); + float32x4_t AllZero = vdupq_n_f32(0.0f); + float32x4_t rFloatValue, gFloatValue, bFloatValue; + int32x4_t rIntValue, gIntValue, bIntValue; + cb32X4Value = vsubq_f32(cb32X4Value,All128); + cr32X4Value = vsubq_f32(cr32X4Value,All128); + + // R'= Y' + 0.000*U' + 1.403*V' + // G'= Y' - 0.344*U' - 0.714*V' + // B'= Y' + 1.773*U' + 0.000*V' + rFloatValue = vmlaq_n_f32(y32X4Value, cr32X4Value, 1.403f); + + gFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, -0.344f); + gFloatValue = vmlaq_n_f32(gFloatValue, cr32X4Value, -0.714f); + + bFloatValue = vmlaq_n_f32(y32X4Value, 
cb32X4Value, 1.773f); + + rIntValue = vcvtq_s32_f32(rFloatValue); + gIntValue = vcvtq_s32_f32(gFloatValue); + bIntValue = vcvtq_s32_f32(bFloatValue); + + rUint8[0] = usat8(vgetq_lane_s32(rIntValue, 0)); + gUint8[0] = usat8(vgetq_lane_s32(gIntValue, 0)); + bUint8[0] = usat8(vgetq_lane_s32(bIntValue, 0)); + + rUint8[1] = usat8(vgetq_lane_s32(rIntValue, 1)); + gUint8[1] = usat8(vgetq_lane_s32(gIntValue, 1)); + bUint8[1] = usat8(vgetq_lane_s32(bIntValue, 1)); + + rUint8[2] = usat8(vgetq_lane_s32(rIntValue, 2)); + gUint8[2] = usat8(vgetq_lane_s32(gIntValue, 2)); + bUint8[2] = usat8(vgetq_lane_s32(bIntValue, 2)); + + rUint8[3] = usat8(vgetq_lane_s32(rIntValue, 3)); + gUint8[3] = usat8(vgetq_lane_s32(gIntValue, 3)); + bUint8[3] = usat8(vgetq_lane_s32(bIntValue, 3)); +} + +static void yuv2rgb_bt709V(vx_float32* y, vx_float32* cb, vx_float32* cr, + vx_uint8 *rUint8, vx_uint8 *gUint8, vx_uint8 *bUint8) +{ + float32x4_t y32X4Value = vld1q_f32(y); + float32x4_t cb32X4Value = vld1q_f32(cb); + float32x4_t cr32X4Value = vld1q_f32(cr); + float32x4_t All128 = vdupq_n_f32(128.0f); + float32x4_t AllZero = vdupq_n_f32(0.0f); + float32x4_t rFloatValue, gFloatValue, bFloatValue; + int32x4_t rIntValue, gIntValue, bIntValue; + cb32X4Value = vsubq_f32(cb32X4Value,All128); + cr32X4Value = vsubq_f32(cr32X4Value,All128); + + // R'= Y' + 0.000*U' + 1.403*V' + // G'= Y' - 0.344*U' - 0.714*V' + // B'= Y' + 1.773*U' + 0.000*V' + rFloatValue = vmlaq_n_f32(y32X4Value, cr32X4Value, 1.5748f); + + gFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, -0.1873f); + gFloatValue = vmlaq_n_f32(gFloatValue, cr32X4Value, -0.4681f); + + bFloatValue = vmlaq_n_f32(y32X4Value, cb32X4Value, 1.8556f); + + rIntValue = vcvtq_s32_f32(rFloatValue); + gIntValue = vcvtq_s32_f32(gFloatValue); + bIntValue = vcvtq_s32_f32(bFloatValue); + + rUint8[0] = usat8(vgetq_lane_s32(rIntValue, 0)); + gUint8[0] = usat8(vgetq_lane_s32(gIntValue, 0)); + bUint8[0] = usat8(vgetq_lane_s32(bIntValue, 0)); + + rUint8[1] = usat8(vgetq_lane_s32(rIntValue, 1)); + gUint8[1] = usat8(vgetq_lane_s32(gIntValue, 1)); + bUint8[1] = usat8(vgetq_lane_s32(bIntValue, 1)); + + rUint8[2] = usat8(vgetq_lane_s32(rIntValue, 2)); + gUint8[2] = usat8(vgetq_lane_s32(gIntValue, 2)); + bUint8[2] = usat8(vgetq_lane_s32(bIntValue, 2)); + + rUint8[3] = usat8(vgetq_lane_s32(rIntValue, 3)); + gUint8[3] = usat8(vgetq_lane_s32(gIntValue, 3)); + bUint8[3] = usat8(vgetq_lane_s32(bIntValue, 3)); +} + + +void ConvertColor_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + void *src_base[4] = {NULL}; + void *dst_base[4] = {NULL}; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 width = vxTileWidth(out, 0); + vx_uint32 height = vxTileHeight(out, 0); + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + src_base[0] = in->base[0]; + dst_base[0] = out->base[0]; + + src_base[1] = in->base[1]; + dst_base[1] = out->base[1]; + + src_base[2] = in->base[2]; + dst_base[2] = out->base[2]; + + vx_uint32 srcP0StrideX = in->addr[0].stride_x; + vx_uint32 srcP0StrideY = in->addr[0].stride_y; + vx_uint32 dstP0StrideX = out->addr[0].stride_x; + vx_uint32 dstP0StrideY = out->addr[0].stride_y; + + vx_uint32 srcP1StrideX = in->addr[1].stride_x; + vx_uint32 srcP1StrideY = in->addr[1].stride_y; + vx_uint32 dstP1StrideX = out->addr[1].stride_x; + vx_uint32 dstP1StrideY 
= out->addr[1].stride_y; + + vx_uint32 srcP2StrideX = in->addr[2].stride_x; + vx_uint32 srcP2StrideY = in->addr[2].stride_y; + vx_uint32 dstP2StrideX = out->addr[2].stride_x; + vx_uint32 dstP2StrideY = out->addr[2].stride_y; + + vx_df_image src_format, dst_format; + + src_format = in->image.format; + dst_format = out->image.format; + + vx_enum src_space = in->image.space; + + if ((src_format == VX_DF_IMAGE_RGB) || (src_format == VX_DF_IMAGE_RGBX)) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8x4_t s = vld4_u8(srcP0); + + uint8x8x3_t d; + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + + vst3_u8(dstP0, d); + } + } + } + else + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8x3_t s = vld3_u8(srcP0); + + uint8x8x4_t d; + d.val[0] = s.val[0]; + d.val[1] = s.val[1]; + d.val[2] = s.val[2]; + d.val[3] = vdup_n_u8(255); + + vst4_u8(dstP0, d); + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbcr; + + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + cbcr = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + cbcr[0] = (cb[0] + cb[1] + cb[2] + cb[3]) / 4; + cbcr[1] = (cr[0] + cr[1] + cr[2] + cr[3]) / 4; + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *u[4]; + vx_uint8 *v[4]; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+3) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = 
(vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + u[0] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + u[1] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+1) * dstP1StrideX; + u[2] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+2) * dstP1StrideX; + u[3] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+3) * dstP1StrideX; + + v[0] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + v[1] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+1) * dstP2StrideX; + v[2] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+2) * dstP2StrideX; + v[3] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+3) * dstP2StrideX; + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + *u[0] = cb[0]; + *u[1] = cb[1]; + *u[2] = cb[2]; + *u[3] = cb[3]; + + *v[0] = cr[0]; + *v[1] = cr[1]; + *v[2] = cr[2]; + *v[3] = cr[3]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbp; + vx_uint8 *crp; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + rgb[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + rgb[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + luma[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + luma[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + cbp = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + crp = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); + + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; + + rgb2yuv_bt709_neon(arrfr, arrfg, arrfb, luma, &cb[0], &cr[0]); + + cbp[0] = (vx_uint8)(((vx_uint16)cb[0] + (vx_uint16)cb[1] + (vx_uint16)cb[2] + (vx_uint16)cb[3]) >> 2); + crp[0] = (vx_uint8)(((vx_uint16)cr[0] + (vx_uint16)cr[1] + (vx_uint16)cr[2] + (vx_uint16)cr[3]) >> 2); + } + } + } + } + else if (src_format == VX_DF_IMAGE_NV21 || src_format == VX_DF_IMAGE_NV12) + { + int u_pix = src_format == VX_DF_IMAGE_NV12 ? 0 : 1; + int v_pix = src_format == VX_DF_IMAGE_NV12 ? 
1 : 0; + if ((dst_format == VX_DF_IMAGE_RGB) || (dst_format == VX_DF_IMAGE_RGBX)) + { + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *crcb; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + luma[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + luma[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+1) * srcP0StrideX; + luma[2] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + x * srcP0StrideX; + luma[3] = (vx_uint8 *)src_base[0] + (y+1) * srcP0StrideY + (x+1) * srcP0StrideX; + + crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + + rgb[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + rgb[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + rgb[2] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + x * dstP0StrideX; + rgb[3] = (vx_uint8 *)dst_base[0] + (y+1) * dstP0StrideY + (x+1) * dstP0StrideX; + + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb[0][3] = 255; + rgb[1][3] = 255; + rgb[2][3] = 255; + rgb[3][3] = 255; + + } + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601_neon(luma, crcb[u_pix], crcb[v_pix], rgb, rgb, rgb); + } + else + { + yuv2rgb_bt709_neon(luma, crcb[u_pix], crcb[v_pix], rgb, rgb, rgb); + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12 || dst_format == VX_DF_IMAGE_NV21) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x++) + { + vx_uint8 *luma[2] = {(vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX}; + + vx_uint8 *cbcr = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + vx_uint8 *crcb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); + + yuv2yuv_601to709(luma[0][0],cbcr[0],cbcr[1],&luma[1][0],&crcb[1],&crcb[0]); + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8_t lumaV8 = vld1_u8(srcP0); + vst1_u8(dstP0, lumaV8); + } + } + + vx_uint8 *crcb = NULL; + vx_uint8 *cb[4] = { NULL }; + vx_uint8 *cr[4] = { NULL }; + for (y = low_y; y < high_y; y += 2) + { + for (x = low_x; x < high_x; x += 2) + { + crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + + cb[0] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + cb[1] = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + (x+1) * dstP1StrideX; + cb[2] = (vx_uint8 *)dst_base[1] + (y+1) * dstP1StrideY + x * dstP1StrideX; + cb[3] = (vx_uint8 *)dst_base[1] + (y+1) * dstP1StrideY + (x+1) * dstP1StrideX; + + cr[0] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + cr[1] = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + (x+1) * dstP2StrideX; + cr[2] = (vx_uint8 *)dst_base[2] + (y+1) * dstP2StrideY + x * dstP2StrideX; + cr[3] = (vx_uint8 *)dst_base[2] + (y+1) * dstP2StrideY + (x+1) * dstP2StrideX; + + cb[0][0] = crcb[u_pix]; + cb[1][0] = crcb[u_pix]; + cb[2][0] = crcb[u_pix]; + cb[3][0] = crcb[u_pix]; + + cr[0][0] = crcb[v_pix]; + cr[1][0] = crcb[v_pix]; + cr[2][0] = crcb[v_pix]; + cr[3][0] = crcb[v_pix]; + + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 8) + { + vx_uint8 *srcP0 = 
(vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + + uint8x8_t lumaV8 = vld1_u8(srcP0); + vst1_u8(dstP0, lumaV8); + } + } + + vx_uint8 *crcb[4]; + vx_uint8 *cb[4]; + vx_uint8 *cr[4]; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + crcb[0] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 0) / 2); + crcb[1] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 1) / 2); + crcb[2] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 2) / 2); + crcb[3] = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 3) / 2); + + cb[0] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 0) / 2); + cb[1] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 1) / 2); + cb[2] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 2) / 2); + cb[3] = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 3) / 2); + + cr[0] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 0) / 2); + cr[1] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 1) / 2); + cr[2] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 2) / 2); + cr[3] = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * ((x + 3) / 2); + + + cb[0][0] = crcb[0][u_pix]; + cb[1][0] = crcb[1][u_pix]; + cb[2][0] = crcb[2][u_pix]; + cb[3][0] = crcb[3][u_pix]; + + cr[0][0] = crcb[0][v_pix]; + cr[1][0] = crcb[1][v_pix]; + cr[2][0] = crcb[2][v_pix]; + cr[3][0] = crcb[3][v_pix]; + } + } + } + } + else if (src_format == VX_DF_IMAGE_YUYV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *yuyv1 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + + vx_float32 yValue[4] = {(vx_float32)yuyv[0],(vx_float32)yuyv[2],(vx_float32)yuyv1[0],(vx_float32)yuyv1[2]}; + vx_float32 cbValue[4] = {(vx_float32)yuyv[1],(vx_float32)yuyv[1],(vx_float32)yuyv1[1],(vx_float32)yuyv1[1]}; + vx_float32 crValue[4] = {(vx_float32)yuyv[3],(vx_float32)yuyv[3],(vx_float32)yuyv1[3],(vx_float32)yuyv1[3]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if(src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + 
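Since the YUYV and UYVY branches lean on the BT.601/BT.709 conversions defined at the top of this file, one worked BT.601 data point helps when eyeballing output: the nominal "red" triple (Y, Cb, Cr) = (81, 90, 240) should come out as roughly (238, 14, 13) after the float math and usat8() clamping. A hypothetical spot check against the scalar yuv2rgb_bt601() defined earlier (the check function itself is not part of the patch):

/* Worked BT.601 example using the formulas above:
 *   R = 81 + 1.403 * (240 - 128)                       = 238.1 -> 238
 *   G = 81 - 0.344 * ( 90 - 128) - 0.714 * (240 - 128) =  14.1 ->  14
 *   B = 81 + 1.773 * ( 90 - 128)                       =  13.6 ->  13 */
static void check_bt601_red(void)
{
    vx_uint8 r, g, b;
    yuv2rgb_bt601(81, 90, 240, &r, &g, &b);
    /* expect r == 238, g == 14, b == 13 */
}
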
rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + vx_uint8 *yuyv[2]; + vx_uint8 *luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstCbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP0StrideY; + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t srcValue00 = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t srcValue01 = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue0 = vuzp_u8(srcValue00, srcValue01); + vst1_u8((dstLuma + x * dstP0StrideX),dstValue0.val[0]); + + uint8x8_t srcValue10 = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t srcValue11 = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue1 = vuzp_u8(srcValue10, srcValue11); + vst1_u8((dstLuma1 + x * dstP0StrideX),dstValue1.val[0]); + + uint16x8_t cbcrValue = vaddl_u8(dstValue0.val[1],dstValue1.val[1]); + + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX + 1) = cbcrValuek[kx + 1] / 2; + } + + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 2) + { + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + + luma[0] = yuyv[0]; + luma[1] = yuyv[2]; + cb[0] = yuyv[1]; + cr[0] = yuyv[3]; + cb[1] = yuyv[1]; + cr[1] = yuyv[3]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint32 x, y; + vx_uint8 *yuyv[2]; + vx_uint8 *_luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + (y >> 1) * dstP2StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t src00Value = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t src01Value = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst0Value = vuzp_u8(src00Value,src01Value); + vst1_u8((luma + x * dstP0StrideX),dst0Value.val[0]); + + uint8x8_t src10Value = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t src11Value = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst1Value = vuzp_u8(src10Value,src11Value); + vst1_u8((luma1 + x * dstP0StrideX),dst1Value.val[0]); + + uint16x8_t cbcrValue = vaddl_u8(dst0Value.val[1],dst1Value.val[1]); + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 
2) + { + *(cb + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(cr + ((x + kx) >> 1) * dstP2StrideX) = cbcrValuek[kx + 1] / 2; + } + } + } + } + } + else if (src_format == VX_DF_IMAGE_UYVY) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *uyvy1 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x+2) * srcP0StrideX; + + vx_float32 yValue[4] = {(vx_float32)uyvy[1],(vx_float32)uyvy[3],(vx_float32)uyvy1[1],(vx_float32)uyvy1[3]}; + vx_float32 cbValue[4] = {(vx_float32)uyvy[0],(vx_float32)uyvy[0],(vx_float32)uyvy1[0],(vx_float32)uyvy1[0]}; + vx_float32 crValue[4] = {(vx_float32)uyvy[2],(vx_float32)uyvy[2],(vx_float32)uyvy1[2],(vx_float32)uyvy1[2]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if(src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + vx_uint8 *uyvy[2]; + vx_uint8 *luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstCbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP0StrideY; + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t srcValue00 = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t srcValue01 = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue0 = vuzp_u8(srcValue00, srcValue01); + vst1_u8((dstLuma + x * dstP0StrideX),dstValue0.val[1]); + + uint8x8_t srcValue10 = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t srcValue11 = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dstValue1 = vuzp_u8(srcValue10, srcValue11); + vst1_u8((dstLuma1 + x * dstP0StrideX),dstValue1.val[1]); + + uint16x8_t cbcrValue = vaddl_u8(dstValue0.val[0],dstValue1.val[0]); + + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek,cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(dstCbCr 
+ ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(dstCbCr + ((x + kx) >> 1) * dstP1StrideX + 1) = cbcrValuek[kx + 1] / 2; + } + + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 2) + { + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; + + luma[0] = uyvy[1]; + luma[1] = uyvy[3]; + cb[0] = uyvy[0]; + cr[0] = uyvy[2]; + cb[1] = uyvy[0]; + cr[1] = uyvy[2]; + } + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint32 x, y; + vx_uint8 *uyvy[2]; + vx_uint8 *_luma[4]; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *src0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *src1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + (y >> 1) * dstP2StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t src00Value = vld1_u8(src0 + x * srcP0StrideX); + uint8x8_t src01Value = vld1_u8(src0 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst0Value = vuzp_u8(src00Value,src01Value); + vst1_u8((luma + x * dstP0StrideX),dst0Value.val[1]); + + uint8x8_t src10Value = vld1_u8(src1 + x * srcP0StrideX); + uint8x8_t src11Value = vld1_u8(src1 + (x + 4) * srcP0StrideX); + uint8x8x2_t dst1Value = vuzp_u8(src10Value,src11Value); + vst1_u8((luma1 + x * dstP0StrideX),dst1Value.val[1]); + + uint16x8_t cbcrValue = vaddl_u8(dst0Value.val[0], dst1Value.val[0]); + vx_uint16 cbcrValuek[8]; + vst1q_u16(cbcrValuek, cbcrValue); + for (vx_uint32 kx = 0; kx < 8; kx += 2) + { + *(cb + ((x + kx) >> 1) * dstP1StrideX) = cbcrValuek[kx] / 2; + *(cr + ((x + kx) >> 1) * dstP2StrideX) = cbcrValuek[kx + 1] / 2; + } + } + } + } + } + else if (src_format == VX_DF_IMAGE_IYUV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y++) + { + for (x = low_x; x < high_x; x += 4) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); + + vx_float32 yValue[4] = {(vx_float32)luma[0],(vx_float32)luma[1],(vx_float32)luma[2],(vx_float32)luma[3]}; + vx_float32 cbValue[4] = {(vx_float32)cb[0],(vx_float32)cb[0],(vx_float32)cb[1],(vx_float32)cb[1]}; + vx_float32 crValue[4] = {(vx_float32)cr[0],(vx_float32)cr[0],(vx_float32)cr[1],(vx_float32)cr[1]}; + + vx_uint8 *rgb0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; + vx_uint8 *rgb1 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+1) * dstP0StrideX; + vx_uint8 *rgb2 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+2) * dstP0StrideX; + vx_uint8 *rgb3 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x+3) * dstP0StrideX; + + vx_uint8 bUint8[4]; + vx_uint8 gUint8[4]; + vx_uint8 rUint8[4]; + + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) + { + yuv2rgb_bt601V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = 
rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + else + { + yuv2rgb_bt709V(yValue, cbValue, crValue, rUint8, gUint8, bUint8); + + rgb0[0] = rUint8[0]; + rgb1[0] = rUint8[1]; + rgb2[0] = rUint8[2]; + rgb3[0] = rUint8[3]; + + rgb0[1] = gUint8[0]; + rgb1[1] = gUint8[1]; + rgb2[1] = gUint8[2]; + rgb3[1] = gUint8[3]; + + rgb0[2] = bUint8[0]; + rgb1[2] = bUint8[1]; + rgb2[2] = bUint8[2]; + rgb3[2] = bUint8[3]; + if (dst_format == VX_DF_IMAGE_RGBX) + { + rgb0[3] = 255; + rgb1[3] = 255; + rgb2[3] = 255; + rgb3[3] = 255; + } + } + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + (y >> 1) * srcP1StrideY; + vx_uint8 *cr = (vx_uint8 *)src_base[2] + (y >> 1) * srcP2StrideY; + vx_uint8 *nv12Y = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *nv12Y1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *nv12CbCr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t lumaValue = vld1_u8(luma + x * srcP0StrideX); + vst1_u8((nv12Y + x * dstP0StrideX), lumaValue); + + uint8x8_t luma1Value = vld1_u8(luma1 + x * srcP0StrideX); + vst1_u8((nv12Y1 + x * dstP0StrideX), luma1Value); + + uint8x8_t cbValue = vld1_u8(cb + (x >> 1) * srcP1StrideX); + uint8x8_t crValue = vld1_u8(cr + (x >> 1) * srcP2StrideX); + + uint8x8x2_t cbcrValue = vzip_u8(cbValue, crValue); + + vst1_u8((nv12CbCr + (x >> 1) * dstP1StrideX), cbcrValue.val[0]); + } + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint32 x, y; + for (y = low_y; y < high_y; y += 2) + { + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY; + vx_uint8 *luma1 = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY; + vx_uint8 *cb = (vx_uint8 *)src_base[1] + (y >> 1) * srcP1StrideY; + vx_uint8 *cr = (vx_uint8 *)src_base[2] + (y >> 1) * srcP2StrideY; + vx_uint8 *dstLuma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY; + vx_uint8 *dstLuma1 = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY; + vx_uint8 *dstcb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY; + vx_uint8 *dstcb1 = (vx_uint8 *)dst_base[1] + (y + 1) * dstP1StrideY; + vx_uint8 *dstcr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY; + vx_uint8 *dstcr1 = (vx_uint8 *)dst_base[2] + (y + 1) * dstP1StrideY; + + for (x = low_x; x < high_x; x += 8) + { + uint8x8_t lumaValue = vld1_u8(luma + x * srcP0StrideX); + vst1_u8((dstLuma + x * dstP0StrideX), lumaValue); + + uint8x8_t luma1Value = vld1_u8(luma1 + x * srcP0StrideX); + vst1_u8((dstLuma1 + x * dstP0StrideX), luma1Value); + + uint8x8_t cbValue = vld1_u8(cb + (x >> 1) * srcP1StrideX); + uint8x8x2_t dstCbValue = vzip_u8(cbValue, cbValue); + vst1_u8((dstcb + x * dstP1StrideX), dstCbValue.val[0]); + vst1_u8((dstcb1 + x * dstP1StrideX), dstCbValue.val[0]); + + uint8x8_t crValue = vld1_u8(cr + (x >> 1) * srcP2StrideX); + uint8x8x2_t dstCrValue = vzip_u8(crValue, crValue); + vst1_u8((dstcr + x * dstP2StrideX), dstCrValue.val[0]); + vst1_u8((dstcr1 + x * dstP2StrideX), dstCrValue.val[0]); + } + } + } + } +} + + +#define RGBX_RGB(low_y, high_y, low_x) \ + 
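The IYUV to YUV4 branch that closes the fast path above upsamples chroma by zipping each half-resolution vector with itself (vzip_u8) and storing the result to two consecutive output rows. A scalar sketch of that nearest-neighbour upsample, with hypothetical names (not taken from the patch):

/* Scalar view of the vzip_u8 chroma upsample: each half-resolution chroma
 * sample is repeated across a 2x2 block of full-resolution pixels. */
static void upsample_chroma_rows_ref(const vx_uint8 *half, vx_uint8 *full_row0,
                                     vx_uint8 *full_row1, vx_uint32 out_width)
{
    vx_uint32 x;
    for (x = 0; x < out_width; x++)
    {
        full_row0[x] = half[x >> 1];
        full_row1[x] = half[x >> 1];
    }
}
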
for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + dstP0[0] = srcP0[0]; \ + dstP0[1] = srcP0[1]; \ + dstP0[2] = srcP0[2]; \ + } \ + } + +#define RGB_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *srcP0 = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *dstP0 = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + dstP0[0] = srcP0[0]; \ + dstP0[1] = srcP0[1]; \ + dstP0[2] = srcP0[2]; \ + dstP0[3] = 255; \ + } \ + } + +#define RGB_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x + 1) * srcP0StrideX; \ + rgb[2] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX; \ + rgb[3] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + (x + 1) * srcP0StrideX; \ + \ + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + luma[1] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX; \ + luma[2] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX; \ + luma[3] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX; \ + \ + cbcr = (vx_uint8 *)dst_base[1] + y * dstP1StrideY / 2 + x * dstP1StrideX / 2; \ + \ + vx_float32 arrfr[4] = { (vx_float32)rgb[0][0], (vx_float32)rgb[1][0], (vx_float32)rgb[2][0], (vx_float32)rgb[3][0] }; \ + vx_float32 arrfg[4] = { (vx_float32)rgb[0][1], (vx_float32)rgb[1][1], (vx_float32)rgb[2][1], (vx_float32)rgb[3][1] }; \ + vx_float32 arrfb[4] = { (vx_float32)rgb[0][2], (vx_float32)rgb[1][2], (vx_float32)rgb[2][2], (vx_float32)rgb[3][2] }; \ + \ + rgb2yuv_bt709(rgb[0][0], rgb[0][1], rgb[0][2], &luma[0][0], &cb[0], &cr[0]); \ + rgb2yuv_bt709(rgb[1][0], rgb[1][1], rgb[1][2], &luma[1][0], &cb[1], &cr[1]); \ + rgb2yuv_bt709(rgb[2][0], rgb[2][1], rgb[2][2], &luma[2][0], &cb[2], &cr[2]); \ + rgb2yuv_bt709(rgb[3][0], rgb[3][1], rgb[3][2], &luma[3][0], &cb[3], &cr[3]); \ + \ + cbcr[0] = (cb[0] + cb[1] + cb[2] + cb[3]) / 4; \ + cbcr[1] = (cr[0] + cr[1] + cr[2] + cr[3]) / 4; \ + } \ + } + +#define RGB_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *rgb = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + rgb2yuv_bt709(rgb[0], rgb[1], rgb[2], luma, cb, cr); \ + } \ + } + +#define RGB_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + rgb[0] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + rgb[1] = (vx_uint8 *)src_base[0] + y * srcP0StrideY + (x + 1) * srcP0StrideX; \ + rgb[2] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX; \ + rgb[3] = (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + (x + 1) * srcP0StrideX; \ + \ + luma[0] = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + luma[1] = (vx_uint8 
*)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX; \ + luma[2] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX; \ + luma[3] = (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX; \ + \ + cbp = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + crp = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + rgb2yuv_bt709(rgb[0][0], rgb[0][1], rgb[0][2], &luma[0][0], &cb[0], &cr[0]); \ + rgb2yuv_bt709(rgb[1][0], rgb[1][1], rgb[1][2], &luma[1][0], &cb[1], &cr[1]); \ + rgb2yuv_bt709(rgb[2][0], rgb[2][1], rgb[2][2], &luma[2][0], &cb[2], &cr[2]); \ + rgb2yuv_bt709(rgb[3][0], rgb[3][1], rgb[3][2], &luma[3][0], &cb[3], &cr[3]); \ + \ + cbp[0] = (uint8_t)(((vx_uint16)cb[0] + (vx_uint16)cb[1] + (vx_uint16)cb[2] + (vx_uint16)cb[3]) >> 2); \ + crp[0] = (uint8_t)(((vx_uint16)cr[0] + (vx_uint16)cr[1] + (vx_uint16)cr[2] + (vx_uint16)cr[3]) >> 2); \ + } \ + } + +#define NV12_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (dst_format == VX_DF_IMAGE_RGBX) \ + rgb[3] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + yuv2rgb_bt601(luma[0], crcb[u_pix], crcb[v_pix], &rgb[0], &rgb[1], &rgb[2]); \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + yuv2rgb_bt709(luma[0], crcb[u_pix], crcb[v_pix], &rgb[0], &rgb[1], &rgb[2]); \ + } \ + } + +#define NV12_NV21(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX }; \ + \ + vx_uint8 *cbcr = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *crcb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + \ + yuv2yuv_601to709(luma[0][0], cbcr[0], cbcr[1], &luma[1][0], &crcb[1], &crcb[0]); \ + } \ + } + +#define NV12_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *yout = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + yout[0] = luma[0]; \ + cb[0] = crcb[u_pix]; \ + cr[0] = crcb[v_pix]; \ + } \ + } + +#define NV12_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *yout = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *crcb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * ((x + 0) / 2); \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * ((x + 0) / 2); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + 
dstP2StrideX * ((x + 0) / 2); \ + \ + yout[0] = luma[0]; \ + cb[0] = crcb[u_pix]; \ + cr[0] = crcb[v_pix]; \ + } \ + } + +#define YUYV_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(yuyv[2], yuyv[1], yuyv[3], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(yuyv[2], yuyv[1], yuyv[3], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + } \ + } + + +#define YUYV_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + rgb[3] = rgb[7] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(yuyv[2], yuyv[1], yuyv[3], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(yuyv[0], yuyv[1], yuyv[3], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(yuyv[2], yuyv[1], yuyv[3], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + } \ + } + + +#define YUYV_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cbcr = (vx_uint8 *)dst_base[1] + (y >> 1) * dstP1StrideY + (x >> 1) * dstP1StrideX; \ + \ + luma[0][0] = yuyv[0][0]; \ + luma[1][0] = yuyv[0][2]; \ + luma[2][0] = yuyv[1][0]; \ + luma[3][0] = yuyv[1][2]; \ + cbcr[0] = (yuyv[0][1] + yuyv[1][1]) / 2; \ + cbcr[1] = (yuyv[0][3] + yuyv[1][3]) / 2; \ + } \ + } + + +#define YUYV_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + luma[0] = yuyv[0]; \ + luma[1] = yuyv[2]; \ + cb[0] = yuyv[1]; \ + cr[0] = yuyv[3]; \ + cb[1] = yuyv[1]; \ + cr[1] = yuyv[3]; \ + } \ + } + + +#define YUYV_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *yuyv[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 
*luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + luma[0][0] = yuyv[0][0]; \ + luma[1][0] = yuyv[0][2]; \ + luma[2][0] = yuyv[1][0]; \ + luma[3][0] = yuyv[1][2]; \ + cb[0] = (yuyv[0][1] + yuyv[1][1]) / 2; \ + cr[0] = (yuyv[0][3] + yuyv[1][3]) / 2; \ + } \ + } + + +#define UYVY_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(uyvy[3], uyvy[0], uyvy[2], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(uyvy[3], uyvy[0], uyvy[2], &rgb[3], &rgb[4], &rgb[5]); \ + } \ + } \ + } + + +#define UYVY_RGBX(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + rgb[3] = rgb[7] = 255; \ + \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + { \ + yuv2rgb_bt601(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt601(uyvy[3], uyvy[0], uyvy[2], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + { \ + yuv2rgb_bt709(uyvy[1], uyvy[0], uyvy[2], &rgb[0], &rgb[1], &rgb[2]); \ + yuv2rgb_bt709(uyvy[3], uyvy[0], uyvy[2], &rgb[4], &rgb[5], &rgb[6]); \ + } \ + } \ + } + + +#define UYVY_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cbcr = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + \ + luma[0][0] = uyvy[0][1]; \ + luma[1][0] = uyvy[0][3]; \ + luma[2][0] = uyvy[1][1]; \ + luma[3][0] = uyvy[1][3]; \ + cbcr[0] = (uyvy[0][0] + uyvy[1][0]) / 2; \ + cbcr[1] = (uyvy[0][2] + uyvy[1][2]) / 2; \ + } \ + } + + +#define UYVY_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *luma = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + vx_uint8 *cb = (vx_uint8 
*)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX; \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX; \ + \ + luma[0] = uyvy[1]; \ + luma[1] = uyvy[3]; \ + cb[0] = uyvy[0]; \ + cr[0] = uyvy[2]; \ + cb[1] = uyvy[0]; \ + cr[1] = uyvy[2]; \ + } \ + } + + +#define UYVY_IYUV(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y += 2) \ + { \ + for (x = low_x; x < high_x; x += 2) \ + { \ + vx_uint8 *uyvy[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)src_base[0] + (y + 1) * srcP0StrideY + x * srcP0StrideX }; \ + vx_uint8 *luma[4] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + (x + 1) * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[0] + (y + 1) * dstP0StrideY + (x + 1) * dstP0StrideX }; \ + vx_uint8 *cb = (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)dst_base[2] + dstP2StrideY * (y >> 1) + dstP2StrideX * (x >> 1); \ + \ + luma[0][0] = uyvy[0][1]; \ + luma[1][0] = uyvy[0][3]; \ + luma[2][0] = uyvy[1][1]; \ + luma[3][0] = uyvy[1][3]; \ + cb[0] = (uyvy[0][0] + uyvy[1][0]) / 2; \ + cr[0] = (uyvy[0][2] + uyvy[1][2]) / 2; \ + } \ + } + + +#define IYUV_RGB(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); \ + vx_uint8 *rgb = (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX; \ + \ + if (dst_format == VX_DF_IMAGE_RGBX) \ + rgb[3] = 255; \ + \ + /*! \todo restricted range 601 ? 
*/ \ + if (src_space == VX_COLOR_SPACE_BT601_525 || src_space == VX_COLOR_SPACE_BT601_625) \ + yuv2rgb_bt601(luma[0], cb[0], cr[0], &rgb[0], &rgb[1], &rgb[2]); \ + else /*if (src_space == VX_COLOR_SPACE_BT709)*/ \ + yuv2rgb_bt709(luma[0], cb[0], cr[0], &rgb[0], &rgb[1], &rgb[2]); \ + } \ + } + + +#define IYUV_NV12(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma = (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX; \ + vx_uint8 *cb = (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1); \ + vx_uint8 *cr = (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1); \ + vx_uint8 *nv12[2] = { (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX, \ + (vx_uint8 *)dst_base[1] + dstP1StrideY * (y >> 1) + dstP1StrideX * (x >> 1) }; \ + nv12[0][0] = luma[0]; \ + nv12[1][0] = cb[0]; \ + nv12[1][1] = cr[0]; \ + } \ + } + + +#define IYUV_YUV4(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *luma[2] = { (vx_uint8 *)src_base[0] + y * srcP0StrideY + x * srcP0StrideX, \ + (vx_uint8 *)dst_base[0] + y * dstP0StrideY + x * dstP0StrideX }; \ + vx_uint8 *cb[2] = { (vx_uint8 *)src_base[1] + srcP1StrideY * (y >> 1) + srcP1StrideX * (x >> 1), \ + (vx_uint8 *)dst_base[1] + y * dstP1StrideY + x * dstP1StrideX }; \ + vx_uint8 *cr[2] = { (vx_uint8 *)src_base[2] + srcP2StrideY * (y >> 1) + srcP2StrideX * (x >> 1), \ + (vx_uint8 *)dst_base[2] + y * dstP2StrideY + x * dstP2StrideX }; \ + \ + luma[1][0] = luma[0][0]; \ + cb[1][0] = cb[0][0]; \ + cr[1][0] = cr[0][0]; \ + } \ + } + +void ConvertColor_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + void *src_base[4] = { NULL }; + void *dst_base[4] = { NULL }; + + src_base[0] = in->base[0]; + dst_base[0] = out->base[0]; + + src_base[1] = in->base[1]; + dst_base[1] = out->base[1]; + + src_base[2] = in->base[2]; + dst_base[2] = out->base[2]; + + vx_uint32 srcP0StrideX = in->addr[0].stride_x; + vx_uint32 srcP0StrideY = in->addr[0].stride_y; + vx_uint32 dstP0StrideX = out->addr[0].stride_x; + vx_uint32 dstP0StrideY = out->addr[0].stride_y; + + vx_uint32 srcP1StrideX = in->addr[1].stride_x; + vx_uint32 srcP1StrideY = in->addr[1].stride_y; + vx_uint32 dstP1StrideX = out->addr[1].stride_x; + vx_uint32 dstP1StrideY = out->addr[1].stride_y; + + vx_uint32 srcP2StrideX = in->addr[2].stride_x; + vx_uint32 srcP2StrideY = in->addr[2].stride_y; + vx_uint32 dstP2StrideX = out->addr[2].stride_x; + vx_uint32 dstP2StrideY = out->addr[2].stride_y; + + vx_df_image src_format, dst_format; + + src_format = in->image.format; + dst_format = out->image.format; + + vx_enum src_space = in->image.space; + + if ((src_format == VX_DF_IMAGE_RGB) || (src_format == VX_DF_IMAGE_RGBX)) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + RGBX_RGB(low_y, high_y, low_x) + } + else + { + RGBX_RGB(0, low_y, low_x) + RGBX_RGB(low_y, high_y, 0) + } + } + else + { + if (low_y == 0 && low_x == 0) + { + RGB_RGBX(low_y, high_y, low_x) + } + else + { + RGB_RGBX(0, low_y, low_x) + RGB_RGBX(low_y, 
high_y, 0) + } + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbcr; + + if (low_y == 0 && low_x == 0) + { + RGB_NV12(low_y, high_y, low_x) + } + else + { + RGB_NV12(0, low_y, low_x) + RGB_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *u[4]; + vx_uint8 *v[4]; + + if (low_y == 0 && low_x == 0) + { + RGB_YUV4(low_y, high_y, low_x) + } + else + { + RGB_YUV4(0, low_y, low_x) + RGB_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + vx_uint8 cb[4]; + vx_uint8 cr[4]; + vx_uint8 *rgb[4]; + vx_uint8 *luma[4]; + vx_uint8 *cbp; + vx_uint8 *crp; + + if (low_y == 0 && low_x == 0) + { + RGB_IYUV(low_y, high_y, low_x) + } + else + { + RGB_IYUV(0, low_y, low_x) + RGB_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_NV21 || src_format == VX_DF_IMAGE_NV12) + { + int u_pix = src_format == VX_DF_IMAGE_NV12 ? 0 : 1; + int v_pix = src_format == VX_DF_IMAGE_NV12 ? 1 : 0; + if ((dst_format == VX_DF_IMAGE_RGB) || (dst_format == VX_DF_IMAGE_RGBX)) + { + if (low_y == 0 && low_x == 0) + { + NV12_RGB(low_y, high_y, low_x) + } + else + { + NV12_RGB(0, low_y, low_x) + NV12_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12 || dst_format == VX_DF_IMAGE_NV21) + { + if (low_y == 0 && low_x == 0) + { + NV12_NV21(low_y, high_y, low_x) + } + else + { + NV12_NV21(0, low_y, low_x) + NV12_NV21(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + NV12_YUV4(low_y, high_y, low_x) + } + else + { + NV12_YUV4(0, low_y, low_x) + NV12_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + NV12_IYUV(low_y, high_y, low_x) + } + else + { + NV12_IYUV(0, low_y, low_x) + NV12_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_YUYV) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + YUYV_RGB(low_y, high_y, low_x) + } + else + { + YUYV_RGB(0, low_y, low_x) + YUYV_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + YUYV_RGBX(low_y, high_y, low_x) + } + else + { + YUYV_RGBX(0, low_y, low_x) + YUYV_RGBX(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + YUYV_NV12(low_y, high_y, low_x) + } + else + { + YUYV_NV12(0, low_y, low_x) + YUYV_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + YUYV_YUV4(low_y, high_y, low_x) + } + else + { + YUYV_YUV4(0, low_y, low_x) + YUYV_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + YUYV_IYUV(low_y, high_y, low_x) + } + else + { + YUYV_IYUV(0, low_y, low_x) + YUYV_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_UYVY) + { + if (dst_format == VX_DF_IMAGE_RGB) + { + if (low_y == 0 && low_x == 0) + { + UYVY_RGB(low_y, high_y, low_x) + } + else + { + UYVY_RGB(0, low_y, low_x) + UYVY_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + UYVY_RGBX(low_y, high_y, low_x) + } + else + { + UYVY_RGBX(0, low_y, low_x) + UYVY_RGBX(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + 
UYVY_NV12(low_y, high_y, low_x) + } + else + { + UYVY_NV12(0, low_y, low_x) + UYVY_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + UYVY_YUV4(low_y, high_y, low_x) + } + else + { + UYVY_YUV4(0, low_y, low_x) + UYVY_YUV4(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_IYUV) + { + if (low_y == 0 && low_x == 0) + { + UYVY_IYUV(low_y, high_y, low_x) + } + else + { + UYVY_IYUV(0, low_y, low_x) + UYVY_IYUV(low_y, high_y, 0) + } + } + } + else if (src_format == VX_DF_IMAGE_IYUV) + { + if (dst_format == VX_DF_IMAGE_RGB || dst_format == VX_DF_IMAGE_RGBX) + { + if (low_y == 0 && low_x == 0) + { + IYUV_RGB(low_y, high_y, low_x) + } + else + { + IYUV_RGB(0, low_y, low_x) + IYUV_RGB(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_NV12) + { + if (low_y == 0 && low_x == 0) + { + IYUV_NV12(low_y, high_y, low_x) + } + else + { + IYUV_NV12(0, low_y, low_x) + IYUV_NV12(low_y, high_y, 0) + } + } + else if (dst_format == VX_DF_IMAGE_YUV4) + { + if (low_y == 0 && low_x == 0) + { + IYUV_YUV4(low_y, high_y, low_x) + } + else + { + IYUV_YUV4(0, low_y, low_x) + IYUV_YUV4(low_y, high_y, 0) + } + } + } +} diff --git a/kernels/tiling/tiling_convertdepth.c b/kernels/tiling/tiling_convertdepth.c new file mode 100644 index 0000000..1eea83e --- /dev/null +++ b/kernels/tiling/tiling_convertdepth.c @@ -0,0 +1,173 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void ConvertDepth_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *policy = (vx_enum *)parameters[2]; + vx_int32 *shift = (vx_int32 *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (in->image.format == VX_DF_IMAGE_U8 && out->image.format == VX_DF_IMAGE_S16) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + + int16x8_t sh=vdupq_n_s16(*shift); + + for (y = low_y; y < high_y; y++) + { + vx_uint8* srcp = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_int16* dstp = (vx_int16 *)dst_base + y * out->addr->stride_y / 2; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t v_src = vld1q_u8(srcp); + int16x8_t v_dst0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v_src))); + int16x8_t v_dst1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(v_src))); + + vst1q_s16(dstp, vshlq_s16(v_dst0, sh)); + vst1q_s16(dstp+8, vshlq_s16(v_dst1, sh)); + + srcp+=16; + dstp+=16; + } + } + } + else if (in->image.format == VX_DF_IMAGE_S16 && out->image.format == VX_DF_IMAGE_U8) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + int16x8_t sh=vdupq_n_s16(-(*shift)); + + for (y = low_y; y < high_y; y++) + { + vx_int16* srcp = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8* dstp = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 16) + { + int16x8_t v_src0 = vld1q_s16(srcp); + int16x8_t v_src1 = vld1q_s16(srcp+8); + + if (*policy == VX_CONVERT_POLICY_SATURATE) + { + int16x8_t v_dst0= vqshlq_s16(v_src0,sh); + int16x8_t v_dst1= vqshlq_s16(v_src1,sh); + uint8x8_t v_dst00 = vqmovun_s16(v_dst0); + uint8x8_t v_dst01 = vqmovun_s16(v_dst1); + uint8x16_t v_dst = vcombine_u8(v_dst00,v_dst01); + + vst1q_u8(dstp, v_dst); + } + else if (*policy == VX_CONVERT_POLICY_WRAP) + { + int16x8_t v_dst0= vshlq_s16(v_src0,sh); + int16x8_t v_dst1= vshlq_s16(v_src1,sh); + uint8x16_t v_dst = vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(v_dst0)),vmovn_u16(vreinterpretq_u16_s16(v_dst1))); + + vst1q_u8(dstp, v_dst); + } + srcp+=16; + dstp+=16; + } + } + } +} + + +#define CONVERT_DEPTH(low_y, high_y, low_x, in_tile_x, out_tile_x) \ + if (in->image.format == VX_DF_IMAGE_U8 && out->image.format == VX_DF_IMAGE_S16) \ + { \ + vx_uint8 *src_base = in->base[0] + in_tile_x; \ + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out_tile_x; \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src = (vx_uint8 *)src_base + y * in->addr->stride_y; \ + vx_int16 *dst = (vx_int16 *)dst_base + y * out->addr->stride_y / 2; \ + for (x = low_x; x < high_x; x++) \ + { \ + *dst = ((vx_int16)(*src)) << (*shift); \ + \ + src++; \ + dst++; \ + } \ + } \ + } \ + else if (in->image.format == VX_DF_IMAGE_S16 && out->image.format == VX_DF_IMAGE_U8) \ + { \ + vx_int16 *src_base = (vx_int16 *)in->base[0] + in_tile_x; \ + vx_uint8 *dst_base = out->base[0] + out_tile_x; \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_int16 *src = (vx_int16 *)src_base + y * in->addr->stride_y / 2; \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*policy == VX_CONVERT_POLICY_WRAP) \ + { \ + *dst = (vx_uint8)((*src) >> (*shift)); \ + \ + src++; \ + dst++; \ + } \ + 
else if (*policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + vx_int16 value = (*src) >> (*shift); \ + value = (value < 0 ? 0 : value); \ + value = (value > UINT8_MAX ? UINT8_MAX : value); \ + *dst = (vx_uint8)value; \ + \ + src++; \ + dst++; \ + } \ + } \ + } \ + } + +void ConvertDepth_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *policy = (vx_enum *)parameters[2]; + vx_int32 *shift = (vx_int32 *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + CONVERT_DEPTH(low_y, high_y, low_x, in->tile_x, out->tile_x) + } + else + { + CONVERT_DEPTH(0, low_y, low_x, in->tile_x, out->tile_x) + CONVERT_DEPTH(low_y, high_y, 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_convolve.c b/kernels/tiling/tiling_convolve.c new file mode 100644 index 0000000..7f11277 --- /dev/null +++ b/kernels/tiling/tiling_convolve.c @@ -0,0 +1,1001 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include + +#include + +#include + +static vx_uint8 u32Tou8(vx_uint32 x) +{ + vx_uint8 ret = 0; + if (x == 0) + { + return 32; + } + if ((x & 0x0000FFFF) == 0) + { + ret = ret + 16; + x = x >> 16; + } + if ((x & 0x000000FF) == 0) + { + ret = ret + 8; + x = x >> 8; + } + if ((x & 0x0000000F) == 0) + { + ret = ret + 4; + x = x >> 4; + } + if ((x & 0x00000003) == 0) + { + ret = ret + 2; + x = x >> 2; + } + if ((x & 0x00000001) == 0) + { + ret = ret + 1; + } + return ret; +} + +static void s32ShiftR(int32x4_t *pv32x4, vx_int32 shift) +{ + switch(shift) + { + case 0: + break; + case 1: + *pv32x4 = vshrq_n_s32(*pv32x4, 1); + break; + case 2: + *pv32x4 = vshrq_n_s32(*pv32x4, 2); + break; + case 3: + *pv32x4 = vshrq_n_s32(*pv32x4, 3); + break; + case 4: + *pv32x4 = vshrq_n_s32(*pv32x4, 4); + break; + case 5: + *pv32x4 = vshrq_n_s32(*pv32x4, 5); + break; + case 6: + *pv32x4 = vshrq_n_s32(*pv32x4, 6); + break; + case 7: + *pv32x4 = vshrq_n_s32(*pv32x4, 7); + break; + case 8: + *pv32x4 = vshrq_n_s32(*pv32x4, 8); + break; + case 9: + *pv32x4 = vshrq_n_s32(*pv32x4, 9); + break; + case 10: + *pv32x4 = vshrq_n_s32(*pv32x4, 10); + break; + case 11: + *pv32x4 = vshrq_n_s32(*pv32x4, 11); + break; + case 12: + *pv32x4 = vshrq_n_s32(*pv32x4, 12); + break; + case 13: + *pv32x4 = vshrq_n_s32(*pv32x4, 13); + break; + case 14: + *pv32x4 = vshrq_n_s32(*pv32x4, 14); + break; + case 15: + *pv32x4 = vshrq_n_s32(*pv32x4, 15); + break; + case 16: + *pv32x4 = vshrq_n_s32(*pv32x4, 16); + break; + case 17: + *pv32x4 = vshrq_n_s32(*pv32x4, 17); + break; + case 18: + *pv32x4 = vshrq_n_s32(*pv32x4, 18); + break; + case 19: + *pv32x4 = vshrq_n_s32(*pv32x4, 19); + break; + case 20: + *pv32x4 = vshrq_n_s32(*pv32x4, 20); + break; + case 21: + *pv32x4 = vshrq_n_s32(*pv32x4, 21); + 
break; + case 22: + *pv32x4 = vshrq_n_s32(*pv32x4, 22); + break; + case 23: + *pv32x4 = vshrq_n_s32(*pv32x4, 23); + break; + case 24: + *pv32x4 = vshrq_n_s32(*pv32x4, 24); + break; + case 25: + *pv32x4 = vshrq_n_s32(*pv32x4, 25); + break; + case 26: + *pv32x4 = vshrq_n_s32(*pv32x4, 26); + break; + case 27: + *pv32x4 = vshrq_n_s32(*pv32x4, 27); + break; + case 28: + *pv32x4 = vshrq_n_s32(*pv32x4, 28); + break; + case 29: + *pv32x4 = vshrq_n_s32(*pv32x4, 29); + break; + case 30: + *pv32x4 = vshrq_n_s32(*pv32x4, 30); + break; + case 31: + *pv32x4 = vshrq_n_s32(*pv32x4, 31); + break; + case 32: + *pv32x4 = vshrq_n_s32(*pv32x4, 32); + break; + default: + break; + } + return; +} + +static void convStru8u8(int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3, + vx_uint8 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t out2 = *pvOut2; + int32x4_t out3 = *pvOut3; + int32x4_t vMaxu8 = vdupq_n_s32(UINT8_MAX); + int32x4_t vZero = vdupq_n_s32(0); + uint16x8_t vRetLow, vRetHigh; + vx_uint8 szTmp[16]; + + out0 = vminq_s32(out0, vMaxu8); + out1 = vminq_s32(out1, vMaxu8); + out2 = vminq_s32(out2, vMaxu8); + out3 = vminq_s32(out3, vMaxu8); + + out0 = vmaxq_s32(out0, vZero); + out1 = vmaxq_s32(out1, vZero); + out2 = vmaxq_s32(out2, vZero); + out3 = vmaxq_s32(out3, vZero); + + vRetLow = vreinterpretq_u16_s16(vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vRetHigh = vreinterpretq_u16_s16(vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + + if (16 == fillCnt) + { + vst1q_u8(dst, vcombine_u8(vqmovn_u16(vRetLow), vqmovn_u16(vRetHigh))); + } + else + { + vst1q_u8(szTmp, vcombine_u8(vqmovn_u16(vRetLow), vqmovn_u16(vRetHigh))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStru8s16(int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3, + vx_int16 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t out2 = *pvOut2; + int32x4_t out3 = *pvOut3; + int32x4_t vMaxs16 = vdupq_n_s32(INT16_MAX); + int32x4_t vMins16 = vdupq_n_s32(INT16_MIN); + vx_int16 szTmp[16]; + + out0 = vminq_s32(out0, vMaxs16); + out1 = vminq_s32(out1, vMaxs16); + out2 = vminq_s32(out2, vMaxs16); + out3 = vminq_s32(out3, vMaxs16); + + out0 = vmaxq_s32(out0, vMins16); + out1 = vmaxq_s32(out1, vMins16); + out2 = vmaxq_s32(out2, vMins16); + out3 = vmaxq_s32(out3, vMins16); + + if (16 == fillCnt) + { + vst1q_s16(dst, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vst1q_s16(dst + 8, vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + } + else + { + vst1q_s16(szTmp, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + vst1q_s16(szTmp + 8, vcombine_s16(vqmovn_s32(out2), vqmovn_s32(out3))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStrs16u8(int32x4_t *pvOut0, int32x4_t *pvOut1, vx_uint8 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t vMaxu8 = vdupq_n_s32(UINT8_MAX); + int32x4_t vZero = vdupq_n_s32(0); + int16x8_t vRet; + vx_uint8 szTmp[8]; + + out0 = vminq_s32(out0, vMaxu8); + out1 = vminq_s32(out1, vMaxu8); + + out0 = vmaxq_s32(out0, vZero); + out1 = vmaxq_s32(out1, vZero); + + vRet = vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1)); + if (8 == fillCnt) + { + vst1_u8(dst, vqmovn_u16(vreinterpretq_u16_s16(vRet))); + } + else + { + vst1_u8(szTmp, vqmovn_u16(vreinterpretq_u16_s16(vRet))); + for (vx_uint8 idx = 0; idx < fillCnt; 
idx++) + { + dst[idx] = szTmp[idx]; + } + } + + return; +} + +static void convStrs16s16(int32x4_t *pvOut0, int32x4_t *pvOut1, vx_int16 *dst, vx_uint8 fillCnt) +{ + int32x4_t out0 = *pvOut0; + int32x4_t out1 = *pvOut1; + int32x4_t vMaxs16 = vdupq_n_s32(INT16_MAX); + int32x4_t vMins16 = vdupq_n_s32(INT16_MIN); + vx_int16 szTmp[8]; + + out0 = vminq_s32(out0, vMaxs16); + out1 = vminq_s32(out1, vMaxs16); + + out0 = vmaxq_s32(out0, vMins16); + out1 = vmaxq_s32(out1, vMins16); + + if (8 == fillCnt) + { + vst1q_s16(dst, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + } + else + { + vst1q_s16(szTmp, vcombine_s16(vqmovn_s32(out0), vqmovn_s32(out1))); + for (vx_uint8 idx = 0; idx < fillCnt; idx++) + { + dst[idx] = szTmp[idx]; + } + } +} + +static void convRow3x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 15); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + return; +} + +static void convRow5x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 14); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = 
vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + return; +} + +static void convRow7x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 13); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[6]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[6]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[6]); + + vData = vextq_u8(vCur, vNxt, 3); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vextq_u8(vPrv, vCur, 14); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[5]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[5]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[5]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = 
vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + return; +} + +static void convRow9x1u8(uint8x16_t *pvPrv, uint8x16_t *pvCur, uint8x16_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1, int32x4_t *pvOut2, int32x4_t *pvOut3) +{ + uint8x16_t vPrv = *pvPrv; + uint8x16_t vCur = *pvCur; + uint8x16_t vNxt = *pvNxt; + + uint8x16_t vData = vextq_u8(vPrv, vCur, 12); + int16x8_t s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + int16x8_t s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[8]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[8]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[8]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[8]); + + vData = vextq_u8(vCur, vNxt, 4); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[0]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[0]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[0]); + + vData = vextq_u8(vPrv, vCur, 13); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[7]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[7]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[7]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[7]); + + vData = vextq_u8(vCur, vNxt, 3); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[1]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[1]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[1]); + + vData = vextq_u8(vPrv, vCur, 14); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[6]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[6]); 
+ *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[6]); + + vData = vextq_u8(vCur, vNxt, 2); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[2]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[2]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[2]); + + vData = vextq_u8(vPrv, vCur, 15); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[5]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[5]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[5]); + + vData = vextq_u8(vCur, vNxt, 1); + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[3]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[3]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[3]); + + vData = vCur; + s16Tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vData))); + s16Tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(vData))); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(s16Tmp0), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(s16Tmp0), coeff[4]); + *pvOut2 = vmlal_n_s16(*pvOut2, vget_low_s16(s16Tmp1), coeff[4]); + *pvOut3 = vmlal_n_s16(*pvOut3, vget_high_s16(s16Tmp1), coeff[4]); + + return; +} + + +static void convRow3x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + return; +} + +static void convRow5x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + 
*pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + return; +} + +static void convRow7x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 5); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[6]); + + vData = vextq_s16(vCur, vNxt, 3); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + + vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[5]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + return; +} + +static void convRow9x1s16(int16x8_t *pvPrv, int16x8_t *pvCur, int16x8_t *pvNxt, vx_int16 *coeff, + int32x4_t *pvOut0, int32x4_t *pvOut1) +{ + int16x8_t vPrv = *pvPrv; + int16x8_t vCur = *pvCur; + int16x8_t vNxt = *pvNxt; + + int16x8_t vData = vextq_s16(vPrv, vCur, 4); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[8]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[8]); + + vData = vextq_s16(vCur, vNxt, 4); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[0]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[0]); + + vData = vextq_s16(vPrv, vCur, 5); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[7]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[7]); + + vData = vextq_s16(vCur, vNxt, 3); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[1]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[1]); + + + vData = vextq_s16(vPrv, vCur, 6); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[6]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[6]); + + vData = vextq_s16(vCur, vNxt, 2); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[2]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[2]); + + vData = vextq_s16(vPrv, vCur, 7); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[5]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[5]); + + vData = vextq_s16(vCur, vNxt, 1); + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[3]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[3]); + + vData = vCur; + *pvOut0 = vmlal_n_s16(*pvOut0, vget_low_s16(vData), coeff[4]); + *pvOut1 = vmlal_n_s16(*pvOut1, vget_high_s16(vData), coeff[4]); + + return; +} + +void Convolve_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_convolution_t *conv = (vx_tile_convolution_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + 
vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + vx_size conv_width = (*conv).conv_width; + vx_size conv_height = (*conv).conv_height; + + vx_int32 conv_radius_x, conv_radius_y; + + conv_radius_x = (vx_int32)conv_width / 2; + conv_radius_y = (vx_int32)conv_height / 2; + + vx_uint32 src_format = in->image.format; + vx_uint32 dst_format = out->image.format; + + vx_int32 sum = 0, value = 0; + + vx_uint32 scale = (*conv).scale; + + vx_int16 conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + + memcpy(conv_mat, ((*conv).conv_mat), conv_width * conv_height * sizeof(vx_int16)); + + vx_int32 shift = (vx_int32)u32Tou8(scale); + + if ( high_y == vxTileHeight(out, 0) ) + { + uint8x16_t vPrv[C_MAX_CONVOLUTION_DIM]; + uint8x16_t vCur[C_MAX_CONVOLUTION_DIM]; + uint8x16_t vNxt[C_MAX_CONVOLUTION_DIM]; + int32x4_t out0 = vdupq_n_s32(0); + int32x4_t out1 = out0; + int32x4_t out2 = out0; + int32x4_t out3 = out0; + + vx_uint32 dstY = conv_radius_y; + vx_uint8 *dstTmp; + for (x = low_x; x < high_x; x += 16) + { + dstTmp = (vx_uint8 *)dst_base + x * out->addr->stride_x; + dstY = conv_radius_y; + if (0 == x) + { + for (y = 0; y < conv_height; y++) + { + vPrv[y] = vdupq_n_u8(0); + vCur[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + } + else + { + for (y = 0; y < conv_height; y++) + { + vPrv[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x - 16) * in->addr->stride_x); + vCur[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[y] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + } + + for (y = conv_height; y < high_y; (++y, dstY++)) + { + out0 = vdupq_n_s32(0); + out1 = out0; + out2 = out0; + out3 = out0; + for (vx_uint8 convY = 0; convY < conv_height; convY++) + { + if (3 == conv_width) + { + convRow3x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (5 == conv_width) + { + convRow5x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (7 == conv_width) + { + convRow7x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (9 == conv_width) + { + convRow9x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + } + + s32ShiftR(&out0, shift); + s32ShiftR(&out1, shift); + s32ShiftR(&out2, shift); + s32ShiftR(&out3, shift); + + if (dst_format == VX_DF_IMAGE_U8) + { + convStru8u8(&out0, &out1, &out2, &out3, dstTmp + dstY * out->addr->stride_y, 16); + } + else if (dst_format == VX_DF_IMAGE_S16) + { + convStru8s16(&out0, &out1, &out2, &out3, (vx_int16 *)(dstTmp + dstY * out->addr->stride_y), 16); + } + + //swap data and acquire next data + for (vx_uint8 convY = 0; convY < (conv_height - 1); convY++) + { + vPrv[convY] = vPrv[convY + 1]; + vCur[convY] = vCur[convY + 1]; + vNxt[convY] = vNxt[convY + 1]; + } + + if (0 == x) + { + vPrv[conv_height - 1] = vdupq_n_u8(0); + } + else + { + vPrv[conv_height 
- 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x - 16) * in->addr->stride_x); + } + vCur[conv_height - 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + x * in->addr->stride_x); + vNxt[conv_height - 1] = vld1q_u8((vx_uint8 *)src_base + y * in->addr->stride_y + (x + 16) * in->addr->stride_x); + } + + //process the last one + out0 = vdupq_n_s32(0); + out1 = out0; + out2 = out0; + out3 = out0; + for (vx_uint8 convY = 0; convY < conv_height; convY++) + { + if (3 == conv_width) + { + convRow3x1u8(&(vPrv[convY]), &(vCur[convY]), &(vNxt[convY]), + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (5 == conv_width) + { + convRow5x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (7 == conv_width) + { + convRow7x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + else if (9 == conv_width) + { + convRow9x1u8(&vPrv[convY], &vCur[convY], &vNxt[convY], + conv_mat + (conv_height - (convY + 1)) * conv_width, + &out0, &out1, &out2, &out3); + } + } + + s32ShiftR(&out0, shift); + s32ShiftR(&out1, shift); + s32ShiftR(&out2, shift); + s32ShiftR(&out3, shift); + + if (dst_format == VX_DF_IMAGE_U8) + { + convStru8u8(&out0, &out1, &out2, &out3, dstTmp + dstY * out->addr->stride_y, 16); + } + else if (dst_format == VX_DF_IMAGE_S16) + { + convStru8s16(&out0, &out1, &out2, &out3, (vx_int16 *)(dstTmp + dstY * out->addr->stride_y), 16); + } + } + } +} + + +static void vxReadRectangle_flexible(const void *base, const vx_imagepatch_addressing_t *addr, + vx_df_image type, vx_uint32 center_x, vx_uint32 center_y, + vx_uint32 radius_x, vx_uint32 radius_y, void *destination) +{ + vx_int32 width = (vx_int32)addr->dim_x, height = (vx_int32)addr->dim_y; + vx_int32 stride_y = addr->stride_y; + vx_int32 stride_x = addr->stride_x; + const vx_uint8 *ptr = (const vx_uint8 *)base; + vx_int32 ky, kx; + vx_uint32 dest_index = 0; + // kx, ky - kernel x and y + for (ky = -(int32_t)radius_y; ky <= (int32_t)radius_y; ++ky) + { + vx_int32 y = (vx_int32)(center_y + ky); + y = y < 0 ? 0 : y >= height ? height - 1 : y; + + for (kx = -(int32_t)radius_x; kx <= (int32_t)radius_x; ++kx, ++dest_index) + { + vx_int32 x = (int32_t)(center_x + kx); + x = x < 0 ? 0 : x >= width ? 
width - 1 : x; + + switch (type) + { + case VX_DF_IMAGE_U8: + ((vx_uint8*)destination)[dest_index] = *(vx_uint8*)(ptr + y*stride_y + x*stride_x); + break; + case VX_DF_IMAGE_S16: + case VX_DF_IMAGE_U16: + ((vx_uint16*)destination)[dest_index] = *(vx_uint16*)(ptr + y*stride_y + x*stride_x); + break; + } + } + } +} + +#define CONVOLVE(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + sum = 0; \ + if (src_format == VX_DF_IMAGE_U8) \ + { \ + vx_uint8 slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + \ + vxReadRectangle_flexible(src_base, in->addr, src_format, x, y, conv_radius_x, conv_radius_y, slice); \ + \ + for (i = 0; i < (vx_int32)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + } \ + else if (src_format == VX_DF_IMAGE_S16) \ + { \ + vx_int16 slice[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; \ + \ + vxReadRectangle_flexible(src_base, in->addr, src_format, x, y, conv_radius_x, conv_radius_y, slice); \ + \ + for (i = 0; i < (vx_int32)(conv_width * conv_height); ++i) \ + sum += conv_mat[conv_width * conv_height - 1 - i] * slice[i]; \ + } \ + \ + value = sum / (vx_int32)scale; \ + \ + if (dst_format == VX_DF_IMAGE_U8) \ + { \ + vx_uint8 *dstp = (vx_uint8 *)dst_base + y * out->addr->stride_y + x * out->addr->stride_x; \ + if (value < 0) *dstp = 0; \ + else if (value > UINT8_MAX) *dstp = UINT8_MAX; \ + else *dstp = value; \ + } \ + else if (dst_format == VX_DF_IMAGE_S16) \ + { \ + vx_int16 *dstp = (vx_int16 *)dst_base + y * out->addr->stride_y / 2 + x * out->addr->stride_x / 2; \ + if (value < INT16_MIN) *dstp = INT16_MIN; \ + else if (value > INT16_MAX) *dstp = INT16_MAX; \ + else *dstp = value; \ + } \ + } \ + } + +void Convolve_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0, i; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_convolution_t *conv = (vx_tile_convolution_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_size conv_width = (*conv).conv_width; + vx_size conv_height = (*conv).conv_height; + + vx_int32 conv_radius_x, conv_radius_y; + + conv_radius_x = (vx_int32)conv_width / 2; + conv_radius_y = (vx_int32)conv_height / 2; + + vx_uint32 src_format = in->image.format; + vx_uint32 dst_format = out->image.format; + + vx_int32 sum = 0, value = 0; + + vx_uint32 scale = (*conv).scale; + + vx_int16 conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + + memcpy(conv_mat, ((*conv).conv_mat), conv_width * conv_height * sizeof(vx_int16)); + + if (low_y == 0 && low_x == 0) + { + CONVOLVE(low_y + conv_radius_y, high_y - conv_radius_y, low_x + conv_radius_x, high_x - conv_radius_x) + } + else + { + CONVOLVE(conv_radius_y, low_y, low_x, high_x - conv_radius_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + CONVOLVE(low_y, high_y, conv_radius_x, high_x - conv_radius_x) + } +} diff --git a/kernels/tiling/tiling_fast9.c b/kernels/tiling/tiling_fast9.c new file mode 100644 index 0000000..0f38824 --- /dev/null +++ b/kernels/tiling/tiling_fast9.c @@ -0,0 +1,860 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. 
+* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include +#include + +#define PERMUTATIONS 16 +#define APERTURE 3 +#define PERM_SIZE 16 + +static const vx_uint8 permutations_table[PERMUTATIONS][PERM_SIZE] = + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 }, + { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 }, + { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 }, + { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 }, + { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 }, + { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 }, + { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 }, + { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 }, + { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 }, + { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 }, + { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 }, + { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 }, + { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 }, + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 }, + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 } + }; + +/* The following creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . + Where . is an irrelevant texel value + We want to retrieve all texels [0,F] + The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels() + The first table holds the top 4 rows of texels + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + The second table the bottom 3 rows of texels + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . +*/ +static const vx_uint8 top_right[8] = +{ + /* The register r.val[0] will be used to retrieve these texels: + . . . 0 1 . . . + . . . . . 2 . . + . . . . . . 3 . + . . . . . . 4 . + */ + 3 /* top table, first row, elem 4, value 0 in the diagram above */, + 4 /* top table, first row, elem 5, value 1 in the diagram above */, + 13 /* top table, second row, elem 6, value 2 in the diagram above */, + 22 /* top table, third row, elem 7, value 3 in the diagram above*/, + 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/, + 255, + 255, + 255 +}; + +static const vx_uint8 bottom_right[8] = +{ + /* The register r.val[1] will be used to retrieve these texels: + . . . . . . 5 . + . . . . . 6 . . + . . . . 7 . . . 
+ */ + 255, + 255, + 255, + 255, + 255, + 6 /* low table, first row, elem 7, value 5 in the diagram above*/, + 13 /* low table, second row, elem 6, value 6 in the diagram above*/, + 20 /* low table, third row, elem 5, value 7 in the diagram above*/ +}; + +static const vx_uint8 top_left[8] = +{ + /* The register r.val[2] will be used to retrieve these texels: + . . F . . . . . + . E . . . . . . + D . . . . . . . + C . . . . . . . + */ + 255, + 255, + 255, + 255, + 24 /* top table, fourth row, elem 1, value C in the diagram above */, + 16 /* top table, third row, elem 1, value D in the diagram above*/, + 9 /* top table, second row, elem 2, value E in the diagram above*/, + 2 /* top table, first row, elem 3, value F in the diagram above*/ +}; + +static const vx_uint8 bottom_left[8] = +{ + /* The register r.val[3] will be used to retrieve these texels: + B . . . . . . . + . A . . . . . . + . . 9 8 . . . . + */ + 19 /* low table, third row, elem 4, value 8 in the diagram above */, + 18 /* low table, third row, elem 3, value 9 in the diagram above */, + 9 /* low table, second row, elem 2, value A in the diagram above */, + 0 /* low table, first row, elem 1, value B in the diagram above */, + 255, + 255, + 255, + 255 +}; + +static void vxAddArrayItems_tiling(vx_tile_array_t *arr, vx_size count, const void *ptr, vx_size stride) +{ + if ((count > 0) && (ptr != NULL) && (stride >= arr->item_size)) + { + if (arr->num_items + count <= arr->capacity) + { + vx_size offset = arr->num_items * arr->item_size; + vx_uint8 *dst_ptr = (vx_uint8 *)arr->ptr + offset; + + vx_size i; + for (i = 0; i < count; ++i) + { + vx_uint8 *tmp = (vx_uint8 *)ptr; + memcpy(&dst_ptr[i * arr->item_size], &tmp[i * stride], arr->item_size); + } + + arr->num_items += count; + } + } +} + +static void addCorner(vx_int32 y, int16x8_t *pvX, uint8x8_t *pvPred, vx_uint8 *pStrength, vx_size dst_capacity, + vx_size *num_corners, vx_tile_array_t *points) +{ + uint8x8_t vPred = *pvPred; + int16x8_t vX = *pvX; + vx_keypoint_t kp; + if (0 != vget_lane_u8(vPred, 0) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 0); + kp.y = y; + kp.strength = pStrength[0]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 1) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 1); + kp.y = y; + kp.strength = pStrength[1]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 2) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 2); + kp.y = y; + kp.strength = pStrength[2]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 3) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 3); + kp.y = y; + kp.strength = pStrength[3]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 4) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 4); + kp.y = y; + kp.strength = pStrength[4]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + 
(void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 5) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 5); + kp.y = y; + kp.strength = pStrength[5]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 6) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 6); + kp.y = y; + kp.strength = pStrength[6]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } + if (0 != vget_lane_u8(vPred, 7) && (*num_corners) < dst_capacity) + { + kp.x = vgetq_lane_s16(vX, 7); + kp.y = y; + kp.strength = pStrength[7]; + kp.scale = 0.0f; + kp.orientation = 0.0f; + kp.tracking_status = 1; + kp.error = 0.0f; + (void)vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); + *num_corners += 1; + } +} + +static void getPermIdx(vx_uint32 idx, uint8x8x2_t *pvPermIdx) +{ + uint8x8x2_t vPermIdx = { + {vld1_u8(permutations_table[idx]), vld1_u8(permutations_table[idx] + 8)}}; + *pvPermIdx = vPermIdx; +} + +static void getElemIdx(uint8x8x4_t *pvIdx) +{ + uint8x8x4_t reg = { + { + vld1_u8(top_right), + vld1_u8(bottom_right), + vld1_u8(top_left), + vld1_u8(bottom_left) + }}; + *pvIdx = reg; +} + +static vx_uint8 isFastCorner(uint8x8_t *pvVal, vx_uint8 p, vx_uint8 tolerance) +{ + uint8x8x4_t vIdx; + uint8x8x2_t vPermIdx; + uint8x8x4_t vTbl_hi = {{pvVal[0], pvVal[1], pvVal[2], pvVal[3]}}; + uint8x8x3_t vTbl_lo = {{pvVal[4], pvVal[5], pvVal[6]}}; + uint8x16_t vPG = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(tolerance)); + uint8x16_t vPL = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(tolerance)); + + getElemIdx(&vIdx); + + uint8x16_t vPixel = vcombine_u8(vtbx3_u8(vtbl4_u8(vTbl_hi, vIdx.val[0]), vTbl_lo, vIdx.val[1]), + vtbx3_u8(vtbl4_u8(vTbl_hi, vIdx.val[2]), vTbl_lo, vIdx.val[3])); + uint8x8x2_t vTmp = {{vget_low_u8(vPixel), vget_high_u8(vPixel)}}; + uint8x8_t vPermR = vdup_n_u8(0xFF); + vx_uint8 bPG = 0; + vx_uint8 bPL = 0; + + for (vx_uint8 idx = 0; idx < PERMUTATIONS; idx++) + { + getPermIdx(idx, &vPermIdx); + uint8x16_t vVal = vcombine_u8(vtbl2_u8(vTmp, vPermIdx.val[0]), + vtbx2_u8(vPermR, vTmp, vPermIdx.val[1])); + uint8x16_t vPred = vcgtq_u8(vVal, vPG); + uint64x1_t vRet = vreinterpret_u64_u8(vand_u8(vget_high_u8(vPred), vget_low_u8(vPred))); + bPG |= (vget_lane_u64(vRet, 0) == UINT64_MAX); + + vPred = vcltq_u8(vVal, vPL); + uint64x2_t vRet2 = vreinterpretq_u64_u8(vPred); + bPL |= ((vgetq_lane_u64(vRet2, 0) == UINT64_MAX) && (vgetq_lane_u64(vRet2, 1) == 0xFF)); + } + + return (bPG | bPL); +} + +static vx_uint8 getStrength(vx_uint8 bCorner, uint8x8_t *pvVal, vx_uint8 p, vx_uint8 tolerance) +{ + vx_uint8 a = 0, b = 255; + + if (bCorner) + { + a = tolerance; + while (b - a > 1) + { + vx_uint8 c = (a + b)/2; + if (isFastCorner(pvVal, p, c)) + a = c; + else + b = c; + } + } + + return a; +} + +static void fast9CornersPerRow(uint8x8_t *pvPrv, uint8x8_t *pvCur, uint8x8_t *pvNxt, int16x8_t *pvXStep, + vx_imagepatch_addressing_t *src_addr, vx_uint8 tolerance, vx_uint8 *pStrength) +{ + vx_uint8 bCorner; + int16x8_t vX = *pvXStep; + uint8x8_t vPrv[7], vCur[7], vNxt[7]; + uint8x8_t vTmp[7]; + + vx_int32 x; + for (x = 0; x < 7; x++) + { + vPrv[x] = pvPrv[x]; + vCur[x] = pvCur[x]; + vNxt[x] = pvNxt[x]; + } + + if (vgetq_lane_s16(vX, 0) >= APERTURE && vgetq_lane_s16(vX, 0) < (src_addr->dim_x - 
APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 5); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 0), tolerance); + pStrength[0] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 0), tolerance); + } + + if (vgetq_lane_s16(vX, 1) >= APERTURE && vgetq_lane_s16(vX, 1) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 6); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 1), tolerance); + pStrength[1] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 1), tolerance); + } + + if (vgetq_lane_s16(vX, 2) >= APERTURE && vgetq_lane_s16(vX, 2) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vPrv[idx], vCur[idx], 7); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 2), tolerance); + pStrength[2] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 2), tolerance); + } + if (vgetq_lane_s16(vX, 3) >= APERTURE && vgetq_lane_s16(vX, 3) < (src_addr->dim_x - APERTURE)) + { + bCorner = isFastCorner(vCur, vget_lane_u8(vCur[3], 3), tolerance); + pStrength[3] = getStrength(bCorner, vCur, vget_lane_u8(vCur[3], 3), tolerance); + } + if (vgetq_lane_s16(vX, 4) >= APERTURE && vgetq_lane_s16(vX, 4) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 1); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 4), tolerance); + pStrength[4] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 4), tolerance); + } + if (vgetq_lane_s16(vX, 5) >= APERTURE && vgetq_lane_s16(vX, 5) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 2); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 5), tolerance); + pStrength[5] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 5), tolerance); + } + if (vgetq_lane_s16(vX, 6) >= APERTURE && vgetq_lane_s16(vX, 6) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 3); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 6), tolerance); + pStrength[6] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 6), tolerance); + } + if (vgetq_lane_s16(vX, 7) >= APERTURE && vgetq_lane_s16(vX, 7) < (src_addr->dim_x - APERTURE)) + { + for (vx_uint32 idx = 0; idx < 7; idx++) + { + vTmp[idx] = vext_u8(vCur[idx], vNxt[idx], 4); + } + + bCorner = isFastCorner(vTmp, vget_lane_u8(vCur[3], 7), tolerance); + pStrength[7] = getStrength(bCorner, vTmp, vget_lane_u8(vCur[3], 7), tolerance); + } +} + +static vx_uint8 indexes[PERMUTATIONS][9] = +{ + { 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 15, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 14,15, 0, 1, 2, 3, 4, 5, 6 }, + { 13,14,15, 0, 1, 2, 3, 4, 5 }, + { 12,13,14,15, 0, 1, 2, 3, 4 }, + { 11,12,13,14,15, 0, 1, 2, 3 }, + { 10,11,12,13,14,15, 0, 1, 2 }, + { 9,10,11,12,13,14,15, 0, 1 }, + { 8, 9,10,11,12,13,14,15, 0 }, + { 7, 8, 9,10,11,12,13,14,15 }, + { 6, 7, 8, 9,10,11,12,13,14 }, + { 5, 6, 7, 8, 9,10,11,12,13 }, + { 4, 5, 6, 7, 8, 9,10,11,12 }, + { 3, 4, 5, 6, 7, 8, 9,10,11 }, + { 2, 3, 4, 5, 6, 7, 8, 9,10 }, + { 1, 2, 3, 4, 5, 6, 7, 8, 9 }, +}; + +/* offsets from "p" */ +static vx_int32 offsets[16][2] = +{ + { 0, -3 }, + { 1, -3 }, + { 2, -2 }, + { 3, -1 }, + { 3, 0 }, + { 3, 1 }, + { 2, 2 }, + { 1, 3 }, + { 0, 3 }, + { -1, 3 }, + { -2, 2 }, + { -3, 1 }, + { -3, 0 }, + { -3, -1 }, + { -2, -2 }, + { -1, -3 }, +}; + + +static vx_bool 
vxIsFastCorner(const vx_uint8* buf, vx_uint8 p, vx_uint8 tolerance) +{ + vx_int32 i, a; + for (a = 0; a < PERMUTATIONS; a++) + { + vx_bool isacorner = vx_true_e; + for (i = 0; i < dimof(indexes[a]); i++) + { + vx_uint8 j = indexes[a][i]; + vx_uint8 v = buf[j]; + if (v <= (p + tolerance)) + { + isacorner = vx_false_e; + } + } + if (isacorner == vx_true_e) + return isacorner; + isacorner = vx_true_e; + for (i = 0; i < dimof(indexes[a]); i++) + { + vx_uint8 j = indexes[a][i]; + vx_uint8 v = buf[j]; + if (v >= (p - tolerance)) + { + isacorner = vx_false_e; + } + } + if (isacorner == vx_true_e) + return isacorner; + } + return vx_false_e; +} + + +static vx_uint8 vxGetFastCornerStrength(vx_int32 x, vx_int32 y, void* src_base, + vx_imagepatch_addressing_t* src_addr, vx_uint8 tolerance) +{ + if (x < APERTURE || y < APERTURE || x >= (vx_int32)src_addr->dim_x - APERTURE || y >= (vx_int32)src_addr->dim_y - APERTURE) + return 0; + { + vx_uint8 p = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, src_addr); + vx_uint8 buf[16]; + vx_int32 j; + vx_uint8 a, b = 255; + + for (j = 0; j < 16; j++) + { + buf[j] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + offsets[j][0], y + offsets[j][1], src_addr); + } + + if (!vxIsFastCorner(buf, p, tolerance)) + return 0; + + a = tolerance; + while (b - a > 1) + { + vx_uint8 c = (a + b) / 2; + if (vxIsFastCorner(buf, p, c)) + a = c; + else + b = c; + } + return a; + } +} + +void Fast9Corners_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_float32 *sens = (vx_float32*)parameters[1]; + vx_bool *nonm = (vx_bool*)parameters[2]; + vx_tile_array_t *points = (vx_tile_array_t *)parameters[3]; + vx_scalar s_num_corners = (vx_scalar)parameters[4]; + + vx_keypoint_t kp; + + vx_size num_corners = 0; + + vx_uint8 *src_base = in->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_uint8 tolerance = (vx_uint8)(*sens); + vx_bool do_nonmax = *nonm; + vx_size dst_capacity = points->capacity; + + memset(&kp, 0, sizeof(kp)); + + vx_int32 w8 = ((in->image.width - 2 * APERTURE) >> 3) << 3; + vx_int16 szXStep[8] = {0, 1, 2, 3, 4, 5, 6, 7}; + vx_uint8 szStrength[8]; + int16x8_t vXStep = vld1q_s16(szXStep); + uint8x8_t vZero = vdup_n_u8(0); + uint8x8_t vPrv[7], vCur[7], vNxt[7]; + uint8x8_t vNMPrv[7], vNMCur[7], vNMNxt[7]; + + if (high_y == in->image.height && high_x == in->image.width) + { + for (y = APERTURE; y < in->image.height - APERTURE; y++) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vPrv[idx] = vdup_n_u8(0); + vCur[idx] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE + idx) * in->addr->stride_y); + } + for (x = 0; x < in->image.width - APERTURE; x += 8) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vNxt[idx] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE + idx) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + } + int16x8_t vX = vaddq_s16(vdupq_n_s16(x), vXStep); + + memset(szStrength, 0, 8); + fast9CornersPerRow(vPrv, vCur, vNxt, &vX, in->addr, tolerance, szStrength); + uint8x8_t vStrength = vld1_u8(szStrength); + uint8x8_t vPred = vcgt_u8(vStrength, vZero); + uint64x1_t vRetBit = vreinterpret_u64_u8(vPred); + + if (do_nonmax && (0 != vget_lane_u64(vRetBit, 0))) + { + vx_uint8 szNMStrength[8]; + uint8x8_t vTmpPrv[7], vTmpCur[7], vTmpNxt[7]; + uint8x8_t vNMStrength; + uint8x8_t vTmpPred; + 
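+                    /* Non-maximum suppression over the 3x3 neighbourhood: a candidate lane
+                       survives only if its strength is >= the strengths recomputed at the
+                       three neighbours in the row above and at the left neighbour, and
+                       strictly > the strengths at the right neighbour and the three
+                       neighbours in the row below (the same tie-break as the scalar
+                       FAST9CORNERS macro); failing lanes are cleared from vPred. */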
int16x8_t vNMX; + if ((y - 1) >= APERTURE) + { + if (x != 0) + { + vNMPrv[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + (x - 8) * in->addr->stride_x); + } + else + { + vNMPrv[0] = vdup_n_u8(0); + } + vNMCur[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + x * in->addr->stride_x); + vNMNxt[0] = vld1_u8((vx_uint8 *)src_base + (y - APERTURE - 1) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + for (vx_uint8 idx = 1; idx < 7; idx++) + { + vNMPrv[idx] = vPrv[idx - 1]; + vNMCur[idx] = vCur[idx - 1]; + vNMNxt[idx] = vNxt[idx - 1]; + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vNMPrv[idx], 7); + vTmpCur[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 7); + vTmpNxt[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + + if (0 != vget_lane_u64(vRetBit, 0)) + { + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vNMPrv, vNMCur, vNMNxt, &vX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 1); + vTmpCur[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNMNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vPrv[idx], 7); + vTmpCur[idx] = vext_u8(vPrv[idx], vCur[idx], 7); + vTmpNxt[idx] = vext_u8(vCur[idx], vNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcge_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vPrv[idx], vCur[idx], 1); + vTmpCur[idx] = vext_u8(vCur[idx], vNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if ((y + 1) < (in->image.height - APERTURE)) + { + if (0 != vget_lane_u64(vRetBit, 0)) + { + if (x != 0) + { + vNMPrv[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + (x - 8) * in->addr->stride_x); + } + else + { + vNMPrv[6] = vdup_n_u8(0); + } + vNMCur[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + x * 
in->addr->stride_x); + vNMNxt[6] = vld1_u8((vx_uint8 *)src_base + (y + APERTURE + 1) * in->addr->stride_y + (x + 8) * in->addr->stride_x); + for (vx_uint8 idx = 0; idx < 6; idx++) + { + vNMPrv[idx] = vPrv[idx + 1]; + vNMCur[idx] = vCur[idx + 1]; + vNMNxt[idx] = vNxt[idx + 1]; + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vZero, vNMPrv[idx], 7); + vTmpCur[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 7); + vTmpNxt[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 7); + } + vNMX = vsubq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vNMPrv, vNMCur, vNMNxt, &vX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + vRetBit = vreinterpret_u64_u8(vPred); + } + + if (0 != vget_lane_u64(vRetBit, 0)) + { + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vTmpPrv[idx] = vext_u8(vNMPrv[idx], vNMCur[idx], 1); + vTmpCur[idx] = vext_u8(vNMCur[idx], vNMNxt[idx], 1); + vTmpNxt[idx] = vext_u8(vNMNxt[idx], vZero, 1); + } + vNMX = vaddq_s16(vX, vdupq_n_s16(1)); + memset(szNMStrength, 0, 8); + fast9CornersPerRow(vTmpPrv, vTmpCur, vTmpNxt, &vNMX, in->addr, tolerance, szNMStrength); + vNMStrength = vld1_u8(szNMStrength); + vTmpPred = vcgt_u8(vStrength, vNMStrength); + vPred = vand_u8(vPred, vTmpPred); + } + } + } + + vRetBit = vreinterpret_u64_u8(vPred); + if (0 != vget_lane_u64(vRetBit, 0)) + { + addCorner(y, &vX, &vPred, szStrength, dst_capacity, &num_corners, points); + } + + for (vx_uint8 idx = 0; idx < 7; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + } + } + } + } +} + + +#define FAST9CORNERS(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 strength = vxGetFastCornerStrength(x, y, src_base, in->addr, tolerance); \ + if (strength > 0) \ + { \ + if (do_nonmax) \ + { \ + if (strength >= vxGetFastCornerStrength(x - 1, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x + 1, y - 1, src_base, in->addr, tolerance) && \ + strength >= vxGetFastCornerStrength(x - 1, y, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x + 1, y, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x - 1, y + 1, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x, y + 1, src_base, in->addr, tolerance) && \ + strength > vxGetFastCornerStrength(x + 1, y + 1, src_base, in->addr, tolerance)) \ + ; \ + else \ + continue; \ + } \ + if (num_corners < dst_capacity) \ + { \ + kp.x = x; \ + kp.y = y; \ + kp.strength = strength; \ + kp.scale = 0.0f; \ + kp.orientation = 0.0f; \ + kp.tracking_status = 1; \ + kp.error = 0.0f; \ + vxAddArrayItems_tiling(points, 1, &kp, sizeof(kp)); \ + } \ + num_corners++; \ + } \ + } \ + } + + +void Fast9Corners_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_float32 *sens = (vx_float32*)parameters[1]; + vx_bool *nonm = (vx_bool*)parameters[2]; + 
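+    /* FAST-9 segment test: a pixel p is a corner when at least 9 contiguous pixels on the
+       16-pixel Bresenham circle of radius 3 are all brighter than p + t or all darker than
+       p - t, where t is the strength threshold.  The reported corner strength is the
+       largest t for which the test still passes, found by bisection in getStrength() /
+       vxGetFastCornerStrength(). */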
vx_tile_array_t *points = (vx_tile_array_t *)parameters[3]; + vx_scalar s_num_corners = (vx_scalar)parameters[4]; + + vx_keypoint_t kp; + + vx_size num_corners = 0; + + vx_uint8 *src_base = in->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 tolerance = (vx_uint8)(*sens); + vx_bool do_nonmax = *nonm; + vx_size dst_capacity = points->capacity; + + memset(&kp, 0, sizeof(kp)); + + if (low_y == 0 && low_x == 0) + { + FAST9CORNERS(low_y + APERTURE, high_y - APERTURE, low_x + APERTURE, high_x - APERTURE) + } + else + { + FAST9CORNERS(APERTURE, low_y, low_x, high_x - APERTURE) + FAST9CORNERS(low_y, high_y, APERTURE, high_x - APERTURE) + } + + if (s_num_corners) + vxCopyScalar(s_num_corners, &num_corners, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); +} diff --git a/kernels/tiling/tiling_filter.c b/kernels/tiling/tiling_filter.c new file mode 100644 index 0000000..7b3e780 --- /dev/null +++ b/kernels/tiling/tiling_filter.c @@ -0,0 +1,481 @@ +/* +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +#include + +void box3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f); + vx_uint8 *src = in->base[0] + in->tile_x; + vx_uint8 *dst = out->base[0] + out->tile_x; + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst_u8 = (vx_uint8 *)dst + 1 + y * out->image.width; + vx_uint8* top_src = (vx_uint8 *)src + (y - 1) * in->image.width; + vx_uint8* mid_src = (vx_uint8 *)src + (y)* in->image.width; + vx_uint8* bot_src = (vx_uint8 *)src + (y + 1)* in->image.width; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t vOut = top_s16.val[0]; + //top mid + vOut = vaddq_s16(vOut, vextq_s16(top_s16.val[0], top_s16.val[1], 1)); + //top right + vOut = vaddq_s16(vOut, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + vOut = vaddq_s16(vOut, mid_s16.val[0]); + //mid mid + vOut = 
vaddq_s16(vOut, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1)); + //mid right + vOut = vaddq_s16(vOut, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2)); + //bot left + vOut = vaddq_s16(vOut, bot_s16.val[0]); + //bot mid + vOut = vaddq_s16(vOut, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1)); + //bot right + vOut = vaddq_s16(vOut, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + float32x4_t outfloathigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vOut))); + float32x4_t outfloatlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vOut))); + + outfloathigh = vmulq_f32(outfloathigh, oneovernine); + outfloatlow = vmulq_f32(outfloatlow, oneovernine); + + vOut = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(outfloatlow)), + vqmovn_s32(vcvtq_s32_f32(outfloathigh))); + + vst1_u8(dst_u8, vqmovun_s16(vOut)); + + top_src += 8; + mid_src += 8; + bot_src += 8; + dst_u8 += 8; + } + } +} + + +void box3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + + if (ty == 0 && tx == 0) + { + for (y = 1; y < vxTileHeight(out, 0); y++) + { + for (x = 1; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + } + else + { + for (y = 1; y < ty; y++) + { + for (x = tx; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + + for (y = ty; y < vxTileHeight(out, 0); y++) + { + for (x = 1; x < vxTileWidth(out, 0); x++) + { + vx_int32 j, i; + vx_uint32 sum = 0; + vx_uint32 count = 0; + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) + { + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) + { + sum += vxImagePixel(vx_uint8, in, 0, x, y, i, j); + count++; + } + } + sum /= count; + if (sum > 255) + sum = 255; + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; + } + } + } +} + +static inline void sort(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + const uint8x8_t max = vmax_u8(*a, *b); + *a = min; + *b = max; +} + +void Median3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + 
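+        /* top/mid/bot point at the three source rows of the 3x3 window.  The chain of
+           sort() min/max exchanges below is the classic 19-exchange median-of-9 network:
+           after the final exchange p4 holds the median of the nine neighbourhood samples,
+           with no full sort required. */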
vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + sort(&p1, &p2); + sort(&p4, &p5); + sort(&p7, &p8); + + sort(&p0, &p1); + sort(&p3, &p4); + sort(&p6, &p7); + + sort(&p1, &p2); + sort(&p4, &p5); + sort(&p7, &p8); + + sort(&p0, &p3); + sort(&p5, &p8); + sort(&p4, &p7); + + sort(&p3, &p6); + sort(&p1, &p4); + sort(&p2, &p5); + + sort(&p4, &p7); + sort(&p4, &p2); + sort(&p6, &p4); + + sort(&p4, &p2); + + vst1_u8(dst, p4); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + + +static int vx_uint8_compare(const void *p1, const void *p2) +{ + vx_uint8 a = *(vx_uint8 *)p1; + vx_uint8 b = *(vx_uint8 *)p2; + if (a > b) + return 1; + else if (a == b) + return 0; + else + return -1; +} + + +#define Median3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 values[9]; \ + vx_uint32 count = 0; \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + values[count++] = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + qsort(values, dimof(values), sizeof(vx_uint8), vx_uint8_compare); \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = values[4]; \ + } \ + } + + +void Median3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Median3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Median3x3(1, low_y, low_x, high_x - 1) + Median3x3(low_y, high_y, 1, high_x - 1) + } +} + + +void Gaussian3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + int16x8_t two = vdupq_n_s16(2); + int16x8_t four = vdupq_n_s16(4); + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = 
(vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t out = top_s16.val[0]; + //top mid + out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two); + //top right + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out = vmlaq_s16(out, mid_s16.val[0], two); + //mid mid + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four); + //mid right + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out = vaddq_s16(out, bot_s16.val[0]); + //bot mid + out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1_u8(dst, vqshrun_n_s16(out, 4)); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + +#define Gaussian3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint32 sum = 0; \ + \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, -1) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, 0) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, 0) << 2; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, 0) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, 0, +1) << 1; \ + sum += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + sum >>= 4; \ + if (sum > 255) \ + sum = 255; \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = (vx_uint8)sum; \ + } \ + } + + +void Gaussian3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Gaussian3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Gaussian3x3(1, low_y, low_x, high_x - 1) + Gaussian3x3(low_y, high_y, 1, high_x - 1) + } +} diff --git a/kernels/tiling/tiling_hog.c b/kernels/tiling/tiling_hog.c new file mode 100644 index 0000000..a6289a6 --- /dev/null +++ b/kernels/tiling/tiling_hog.c @@ -0,0 +1,403 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <VX/vx_khr_tiling.h>
+#include <arm_neon.h>
+#include <math.h>
+
+#define min(a,b) (a<b?a:b)
+
+void HogCells_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size)
+{
+    vx_tile_t *in = (vx_tile_t *)parameters[0];
+    vx_int32 *cell_w = (vx_int32 *)parameters[1];
+    vx_int32 *cell_h = (vx_int32 *)parameters[2];
+    vx_int32 *num_orientations = (vx_int32 *)parameters[3];
+    void* magnitudes_data = parameters[4];
+    void* bins_data = parameters[5];
+
+    float num_div_360 = (float)(*num_orientations) / 360.0f;
+    vx_int32 num_cellw = (vx_int32)floor(((vx_float64)in->image.width) / ((vx_float64)(*cell_w)));
+    vx_uint32 low_height = in->tile_y;
+    vx_uint32 height = in->tile_y + in->tile_block.height;
+    vx_uint32 low_width = in->tile_x;
+    vx_uint32 width = in->tile_x + in->tile_block.width;
+
+    vx_float32 gx_0, gx_1, gx_2, gx_3;
+    vx_float32 gy_0, gy_1, gy_2, gy_3;
+    float32x4_t magnitude_f32x4;
+    float32x4_t orientation_f32x4;
+    float32x4_t fv_0_5_32x4 = vdupq_n_f32(0.5f);
+    float32x4_t num_div_360_f32x4 = vdupq_n_f32(num_div_360);
+    int32x4_t bin_s32x4;
+    vx_int32 cell_wxh = (*cell_w)*(*cell_h);
+    int32x4_t num_orientations_s32x4 = vdupq_n_s32((*num_orientations));
+    int32x4_t num_cellw_s32x4 = vdupq_n_s32(num_cellw);
+    float32_t pi_3_14 = 180 / 3.14159265;
+
+    for (vx_int32 j = low_height; j < height; j++)
+    {
+        int32x4_t celly_s32x4 = vdupq_n_s32(j/(*cell_h));
+        vx_int32 y1 = j - 1 < 0 ? 0 : j - 1;
+        vx_int32 y2 = j + 1 >= in->image.height ? in->image.height - 1 : j + 1;
+        vx_uint8 *src_base_y = (vx_uint8 *)in->base[0] + j*in->addr[0].stride_y;
+        vx_uint8 *src_base_y_y1 = (vx_uint8 *)in->base[0] + y1*in->addr[0].stride_y;
+        vx_uint8 *src_base_y_y2 = (vx_uint8 *)in->base[0] + y2*in->addr[0].stride_y;
+        for (int i = low_width; i < width; i+=4)
+        {
+            vx_int32 x1 = i - 1 < 0 ? 0 : i - 1;
+            vx_int32 x2 = i + 1 >= in->image.width ? in->image.width - 1 : i + 1;
+            gx_0 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i < 0 ? 0 : i;
+            x2 = i + 2 >= in->image.width ? in->image.width - 1 : i+2;
+            gx_1 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i + 1 < 0 ? 0 : i + 1;
+            x2 = i+3 >= in->image.width ? in->image.width - 1 : i+3;
+            gx_2 = *(src_base_y + x2) - *(src_base_y + x1);
+            x1 = i+2 < 0 ? 0 : i+2;
+            x2 = i+4 >= in->image.width ?
in->image.width - 1 : i+4; + gx_3 = *(src_base_y + x2) - *(src_base_y + x1); + gy_0 = *(src_base_y_y2 + i) - *(src_base_y_y1 + i); + gy_1 = *(src_base_y_y2 + i + 1) - *(src_base_y_y1 + i + 1); + gy_2 = *(src_base_y_y2 + i + 2) - *(src_base_y_y1 + i + 2); + gy_3 = *(src_base_y_y2 + i + 3) - *(src_base_y_y1 + i + 3); + + //calculating mag and orientation + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_0*gx_0 + gy_0*gy_0) / cell_wxh, magnitude_f32x4, 0); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_1*gx_1 + gy_1*gy_1) / cell_wxh, magnitude_f32x4, 1); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_2*gx_2 + gy_2*gy_2) / cell_wxh, magnitude_f32x4, 2); + magnitude_f32x4 = vsetq_lane_f32(sqrtf(gx_3*gx_3 + gy_3*gy_3) / cell_wxh, magnitude_f32x4, 3); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_0, gx_0) * pi_3_14, 360), orientation_f32x4, 0); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_1, gx_1) * pi_3_14, 360), orientation_f32x4, 1); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_2, gx_2) * pi_3_14, 360), orientation_f32x4, 2); + orientation_f32x4 = vsetq_lane_f32(fmod(atan2f(gy_3, gx_3) * pi_3_14, 360), orientation_f32x4, 3); + uint32x4_t lt0 = vcltq_f32(orientation_f32x4, vdupq_n_f32(0.0)); + float32x4_t orientation_f32x4_360 = vaddq_f32(orientation_f32x4, vdupq_n_f32(360.0)); + orientation_f32x4 = vbslq_f32(lt0, orientation_f32x4_360, orientation_f32x4); + + //calculating bin. + int32x4_t bin_s32x4 = vcvtq_s32_f32(vmulq_f32(orientation_f32x4, num_div_360_f32x4)); + + int32x4_t cellx_s32x4 = vsetq_lane_s32(i/(*cell_w), cellx_s32x4, 0); + cellx_s32x4 = vsetq_lane_s32((i+1)/(*cell_w), cellx_s32x4, 1); + cellx_s32x4 = vsetq_lane_s32((i+2)/(*cell_w), cellx_s32x4, 2); + cellx_s32x4 = vsetq_lane_s32((i+3)/(*cell_w), cellx_s32x4, 3); + int32x4_t magnitudes_index_s32x4 = vaddq_s32(vmulq_s32(celly_s32x4, num_cellw_s32x4), cellx_s32x4); + int32x4_t bins_index_s32x4 = vaddq_s32(vmulq_s32(magnitudes_index_s32x4, num_orientations_s32x4), bin_s32x4); + + void *mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 0)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 0); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 1)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 1); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 2)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 2); + mag_ptr = (vx_int8 *)magnitudes_data + vgetq_lane_s32(magnitudes_index_s32x4, 3)*2; + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + vgetq_lane_f32(magnitude_f32x4, 3); + vx_int8 *bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 0); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 0); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 1); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 1); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 2); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 2); + bins_ptr = (vx_int8 *)bins_data + vgetq_lane_s32(bins_index_s32x4, 3); + *bins_ptr = *bins_ptr + vgetq_lane_f32(magnitude_f32x4, 3); + } + } +} + +#define HOGCELLS_SCALING(low_y, low_x, high_y, high_x, in_tile_x)\ + for (int j = low_y; j < high_y; j++) {\ + for (int i = low_x; i < high_x; i++) {\ + int x1 = i - 1 < 0 ? 0 : i - 1;\ + int x2 = i + 1 >= high_x ? 
high_x - 1 : i + 1;\ + vx_uint8 *gx1 = (vx_uint8 *)in->base[0] + in_tile_x + j * in->addr[0].stride_y + x1 * in->addr[0].stride_x;\ + vx_uint8 *gx2 = (vx_uint8 *)in->base[0] + in_tile_x + j * in->addr[0].stride_y + x2 * in->addr[0].stride_x;\ + gx = *gx2 - *gx1;\ + int y1 = j - 1 < 0 ? 0 : j - 1;\ + int y2 = j + 1 >= high_y ? high_y - 1 : j + 1;\ + vx_uint8 *gy1 = (vx_uint8 *)in->base[0] + in_tile_x + y1 * in->addr[0].stride_y + i * in->addr[0].stride_x;\ + vx_uint8 *gy2 = (vx_uint8 *)in->base[0] + in_tile_x + y2 * in->addr[0].stride_y + i * in->addr[0].stride_x;\ + gy = *gy2 - *gy1;\ + magnitude = sqrtf(powf(gx, 2) + powf(gy, 2));\ + orientation = fmod(atan2f(gy, gx + 0.00000000000001)\ + * (180 / 3.14159265), 360);\ + if (orientation < 0) {\ + orientation += 360;\ + }\ + bin = (vx_int8)floor(orientation * num_div_360);\ + vx_int32 cellx = i / (*cell_w);\ + vx_int32 celly = j / (*cell_h);\ + vx_int32 magnitudes_index = celly * num_cellw + cellx;\ + vx_int32 bins_index = (celly * num_cellw + cellx) * (*num_orientations) + bin;\ + vx_size magnitudes_pos = 2 * magnitudes_index;\ + vx_size bins_pos = bins_index;\ + void *mag_ptr = (vx_int8 *)magnitudes_data + magnitudes_pos;\ + void *bins_ptr = (vx_int8 *)bins_data + bins_pos;\ + *(vx_int16 *)(mag_ptr) = *(vx_int16 *)(mag_ptr) + magnitude / ((*cell_w) * (*cell_h));\ + *(vx_int8 *)(bins_ptr) = *(vx_int8 *)(bins_ptr) + magnitude / ((*cell_w) * (*cell_h));\ + }\ + }\ + +void HogCells_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_int32 *cell_w = (vx_int32 *)parameters[1]; + vx_int32 *cell_h = (vx_int32 *)parameters[2]; + vx_int32 *num_orientations = (vx_int32 *)parameters[3]; + void* magnitudes_data = parameters[4]; + void* bins_data = parameters[5]; + vx_float32 gx; + vx_float32 gy; + vx_float32 orientation; + vx_float32 magnitude; + vx_int8 bin; + + float num_div_360 = (float)(*num_orientations) / 360.0f; + vx_int32 num_cellw = (vx_int32)floor(((vx_float64)in->image.width) / ((vx_float64)(*cell_w))); + vx_uint32 ty = in->tile_y; + vx_uint32 tx = in->tile_x; + if (ty == 0 && tx == 0) + { + HOGCELLS_SCALING(0, 0, vxTileHeight(in, 0), vxTileWidth(in, 0), in->tile_x) + } + else + { + HOGCELLS_SCALING(0, tx, ty, vxTileWidth(in, 0), in->tile_x) + HOGCELLS_SCALING(ty, 0, vxTileHeight(in, 0), vxTileWidth(in, 0), 0) + } +} + +void HogFeatures_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_int32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + void *magnitudes_data = parameters[1]; + void * bins_data = parameters[2]; + vx_tile_array_t *hog_params = (vx_tile_array_t *)parameters[3]; + void * features_data = parameters[5]; + + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_int32 width, height; + + vx_hog_t *hog_params_t = (vx_hog_t *)hog_params->ptr; + + if (hog_params_t->num_bins > 0 && hog_params_t->num_bins < 360) + { + width = high_x; + height = high_y; + vx_int32 num_blockW = width / hog_params_t->cell_width - 1; + vx_int32 num_blockH = height / hog_params_t->cell_height - 1; + vx_int32 n_cellsx = width / hog_params_t->cell_width; + vx_int32 cells_per_block_w = hog_params_t->block_width / hog_params_t->cell_width; + vx_int32 cells_per_block_h = hog_params_t->block_height / hog_params_t->cell_height; + + vx_int16 *ptr_src = (vx_int16 *)magnitudes_data; + vx_int8 *ptr_bins = (vx_int8 *)bins_data; + vx_int16 
*ptr_dst = (vx_int16 *)features_data; + vx_int32 num_bins_s32 = hog_params_t->num_bins; + vx_int32 roiw4 = num_blockW * num_bins_s32 >= 3*num_bins_s32 ? num_blockW * num_bins_s32 : 0; + + for (y = 0; y < num_blockH; y++) + { + vx_int16 *src_r1 = ptr_src + (y + 0) * n_cellsx; + vx_int16 *src_r2 = ptr_src + (y + 1) * n_cellsx; + vx_int8 *bins_r1 = ptr_bins + (y + 0) * n_cellsx * hog_params_t->num_bins; + vx_int8 *bins_r2 = ptr_bins + (y + 1) * n_cellsx * hog_params_t->num_bins; + vx_int16 *dst_r1 = ptr_dst + y * (num_blockW + 1) * hog_params_t->num_bins; + for (x = 0; x < roiw4; x += 4*num_bins_s32) + { + int32x4_t bidx_s32x4; + vsetq_lane_s32(x / num_bins_s32, bidx_s32x4, 0); + vsetq_lane_s32((x + num_bins_s32) / num_bins_s32, bidx_s32x4, 1); + vsetq_lane_s32((x + 2 * num_bins_s32) / num_bins_s32, bidx_s32x4, 2); + vsetq_lane_s32((x + 3 * num_bins_s32) / num_bins_s32, bidx_s32x4, 3); + + float32x4_t sum_f32x4; + int16x4_t value1_s16x4; + int16x4_t value2_s16x4; + int16x4_t value3_s16x4; + int16x4_t value4_s16x4; + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 0)], value1_s16x4, 0); + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 1)], value1_s16x4, 1); + value1_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 2)], value1_s16x4, 2); + + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 0) + 1], value2_s16x4, 0); + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 1) + 1], value2_s16x4, 1); + value2_s16x4 = vset_lane_s16(src_r1[vgetq_lane_s32(bidx_s32x4, 2) + 1], value2_s16x4, 2); + + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 0)], value3_s16x4, 0); + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 1)], value3_s16x4, 1); + value3_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 2)], value3_s16x4, 2); + + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 0) + 1], value4_s16x4, 0); + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 1) + 1], value4_s16x4, 1); + value4_s16x4 = vset_lane_s16(src_r2[vgetq_lane_s32(bidx_s32x4, 2) + 1], value4_s16x4, 2); + + sum_f32x4 = vcvtq_f32_s32(vmovl_s16(vadd_s16(vadd_s16(vmul_s16(value1_s16x4, value1_s16x4), vmul_s16(value2_s16x4, value2_s16x4)), + vadd_s16(vmul_s16(value3_s16x4, value3_s16x4), vmul_s16(value4_s16x4, value4_s16x4))))); + + vx_float32 scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 0) + 0.00000000000001); + vx_int8 *bins1 = bins_r1 + (x + 0); + vx_int8 *bins2 = bins_r1 + (x + 1); + vx_int8 *bins3 = bins_r2 + (x + 0); + vx_int8 *bins4 = bins_r2 + (x + 1); + vx_int16 *dst = dst_r1 + x; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 1) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + num_bins_s32); + bins2 = bins_r1 + (x + 1 + num_bins_s32); + bins3 = bins_r2 + (x + 0 + num_bins_s32); + bins4 = bins_r2 + (x + 1 + num_bins_s32); + dst = dst_r1 + x + num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f 
/ sqrtf(vgetq_lane_f32(sum_f32x4, 2) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + 2*num_bins_s32); + bins2 = bins_r1 + (x + 1 + 2*num_bins_s32); + bins3 = bins_r2 + (x + 0 + 2*num_bins_s32); + bins4 = bins_r2 + (x + 1 + 2*num_bins_s32); + dst = dst_r1 + x + 2*num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + + scale = 1.f / sqrtf(vgetq_lane_f32(sum_f32x4, 3) + 0.00000000000001); + bins1 = bins_r1 + (x + 0 + 3*num_bins_s32); + bins2 = bins_r1 + (x + 1 + 3*num_bins_s32); + bins3 = bins_r2 + (x + 0 + 3*num_bins_s32); + bins4 = bins_r2 + (x + 1 + 3*num_bins_s32); + dst = dst_r1 + x + 3*num_bins_s32; + for (int k = 0; k < num_bins_s32; k++) + { + vx_float32 hist = 0.0; + hist += min(bins1[k] * scale, hog_params_t->threshold); + hist += min(bins2[k] * scale, hog_params_t->threshold); + hist += min(bins3[k] * scale, hog_params_t->threshold); + hist += min(bins4[k] * scale, hog_params_t->threshold); + dst[k] += hist; + } + } + } + } +} + + +#define HOGFEATURES(low_y, high_y, low_x) \ + for (vx_int32 blkH = 0; blkH < num_blockH; blkH++) \ + { \ + for (vx_int32 blkW = 0; blkW < num_blockW; blkW++) \ + { \ + vx_float32 sum = 0; \ + for (vx_int32 y = 0; y < cells_per_block_h; y++) \ + { \ + for (vx_int32 x = 0; x < cells_per_block_w; x++) \ + { \ + vx_int32 index = (blkH + y)*n_cellsx + (blkW + x); \ + void *mag_ptr = (vx_int8 *)magnitudes_data + index; \ + sum += (*(vx_int16 *)mag_ptr) * (*(vx_int16 *)mag_ptr); \ + } \ + } \ + sum = sqrtf(sum + 0.00000000000001); \ + for (vx_int32 y = 0; y < cells_per_block_h; y++) \ + { \ + for (vx_int32 x = 0; x < cells_per_block_w; x++) \ + { \ + for (vx_int32 k = 0; k < hog_params_t->num_bins; k++) \ + { \ + vx_int32 bins_index = (blkH + y)*n_cellsx * hog_params_t->num_bins + (blkW + x)*hog_params_t->num_bins + k; \ + vx_int32 block_index = blkH * num_blockW * hog_params_t->num_bins + blkW * hog_params_t->num_bins + k; \ + float hist = min((vx_int8)(*((vx_int8 *)bins_data + bins_index)) / sum, hog_params_t->threshold); \ + void *features_ptr = (vx_int8 *)features_data + block_index; \ + *(vx_int16 *)features_ptr = *(vx_int16 *)features_ptr + hist; \ + } \ + } \ + } \ + } \ + } + +void HogFeatures_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + void *magnitudes_data = parameters[1]; + void *bins_data = parameters[2]; + vx_tile_array_t *hog_params = (vx_tile_array_t *)parameters[3]; + void * features_data = parameters[5]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_int32 width = high_x, height = high_y; + + vx_hog_t *hog_params_t = (vx_hog_t *)hog_params->ptr; + + vx_int32 num_blockW = width / hog_params_t->cell_width - 1; + vx_int32 num_blockH = height / hog_params_t->cell_height - 1; + vx_int32 n_cellsx = width / hog_params_t->cell_width; + vx_int32 cells_per_block_w = hog_params_t->block_width / hog_params_t->cell_width; + vx_int32 cells_per_block_h = hog_params_t->block_height / hog_params_t->cell_height; + + if (hog_params_t->num_bins > 0 && hog_params_t->num_bins < 360) + { + if (low_y == 0 && low_x == 0) + { + HOGFEATURES(low_y, high_y, 
low_x) + } + else + { + HOGFEATURES(0, low_y, low_x) + HOGFEATURES(low_y, high_y, 0) + } + } +} diff --git a/kernels/tiling/tiling_integralimage.c b/kernels/tiling/tiling_integralimage.c new file mode 100644 index 0000000..62dfc96 --- /dev/null +++ b/kernels/tiling/tiling_integralimage.c @@ -0,0 +1,218 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +void IntegralImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint32 *dst_base = (vx_uint32 *)out->base[0] + out->tile_x; + + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *pixels_ptr = src_base + y * in->addr->stride_y; + vx_uint32 *sums = dst_base + y * out->addr->stride_y / 4; + + if (y == 0) + { + for (x = low_x; x < high_x; x += 16) + { + const uint8x16_t input_pixels = vld1q_u8(pixels_ptr); + + const uint16x8x2_t temp = + { + { + vmovl_u8(vget_low_u8(input_pixels)), + vmovl_u8(vget_high_u8(input_pixels)) + } + }; + + uint32x4x4_t pixels = + { + { + vmovl_u16(vget_low_u16(temp.val[0])), + vmovl_u16(vget_high_u16(temp.val[0])), + vmovl_u16(vget_low_u16(temp.val[1])), + vmovl_u16(vget_high_u16(temp.val[1])) + } + }; + + vst1q_u32(sums, pixels.val[0]); + + vst1q_u32(sums + 4, pixels.val[1]); + + vst1q_u32(sums + 8, pixels.val[2]); + + vst1q_u32(sums + 12, pixels.val[3]); + + if (x == 0) + { + sums[0] = pixels_ptr[0]; + + // Perform prefix summation + for (vx_int32 i = 1; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + else + { + // Perform prefix summation + for (vx_int32 i = 0; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + + pixels_ptr += 16; + sums += 16; + } + } + else + { + vx_uint32 *prev_sums_mid = dst_base + (y-1) * out->addr->stride_y / 4; //(0,-1) + vx_uint32 *prev_sums_left = dst_base + (y-1) * out->addr->stride_y / 4 - out->addr->stride_x / 4; //(-1,-1) + + for (x = low_x; x < high_x; x += 16) + { + const uint8x16_t input_pixels = vld1q_u8(pixels_ptr); + + const uint16x8x2_t temp = + { + { + vmovl_u8(vget_low_u8(input_pixels)), + vmovl_u8(vget_high_u8(input_pixels)) + } + }; + + uint32x4x4_t pixels = + { + { + vmovl_u16(vget_low_u16(temp.val[0])), + vmovl_u16(vget_high_u16(temp.val[0])), + vmovl_u16(vget_low_u16(temp.val[1])), + vmovl_u16(vget_high_u16(temp.val[1])) + } + }; + + // Add top mid pixel values + pixels.val[0] = vaddq_u32(vld1q_u32(prev_sums_mid), pixels.val[0]); + pixels.val[1] = vaddq_u32(vld1q_u32(prev_sums_mid + 4), pixels.val[1]); + pixels.val[2] = vaddq_u32(vld1q_u32(prev_sums_mid + 8), pixels.val[2]); + pixels.val[3] = vaddq_u32(vld1q_u32(prev_sums_mid + 12), pixels.val[3]); + + // Subtract top left diagonal 
values + pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(prev_sums_left)); + vst1q_u32(sums, pixels.val[0]); + + pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(prev_sums_left + 4)); + vst1q_u32(sums + 4, pixels.val[1]); + + pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(prev_sums_left + 8)); + vst1q_u32(sums + 8, pixels.val[2]); + + pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(prev_sums_left + 12)); + vst1q_u32(sums + 12, pixels.val[3]); + + if (x == 0) + { + sums[0] = prev_sums_mid[0] + pixels_ptr[0]; + // Perform prefix summation + for (vx_int32 i = 1; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + else + { + // Perform prefix summation + for (vx_int32 i = 0; i < 16; i++) + { + sums[i] += sums[i-1]; + } + } + + pixels_ptr += 16; + sums += 16; + prev_sums_mid += 16; + prev_sums_left += 16; + } + } + } +} + +#define INTEGRAL_IMAGE(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *pixels = (vx_uint8 *)src_base + y * in->addr->stride_y; \ + vx_uint32 *sums = (vx_uint32 *)dst_base + y * out->addr->stride_y / 4; \ + if (y == 0) \ + { \ + sums[0] = pixels[0]; \ + for (x = low_x; x < high_x; x++) \ + sums[x] = sums[x - 1] + pixels[x]; \ + } \ + else \ + { \ + vx_uint32 *prev_sums = (vx_uint32 *)dst_base + (y - 1) * out->addr->stride_y / 4; \ + sums[0] = prev_sums[0] + pixels[0]; \ + for (x = low_x; x < high_x; x++) \ + sums[x] = pixels[x] + sums[x - 1] + prev_sums[x] - prev_sums[x - 1]; \ + } \ + } + +void IntegralImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + INTEGRAL_IMAGE(low_y, high_y, low_x + 1) + } + else + { + INTEGRAL_IMAGE(0, low_y, low_x) + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + INTEGRAL_IMAGE(low_y, high_y, 1) + } +} diff --git a/kernels/tiling/tiling_lbp.c b/kernels/tiling/tiling_lbp.c new file mode 100644 index 0000000..0c1a53b --- /dev/null +++ b/kernels/tiling/tiling_lbp.c @@ -0,0 +1,804 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include +#include + +static void vxLBPStandard_tiling_fast(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + if(ksize == 3) + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == in->image.height) + { + high_y = high_y - 1; + } + if (high_x == in->image.width) + { + high_x = high_x - 1; + } + + uint8x16_t vPrv[3], vCur[3], vNxt[3]; + uint8x16_t vOne = vdupq_n_u8(1); + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint8x16_t vSum = vdupq_n_u8(0); + uint8x16_t vTmp = vextq_u8(vPrv[0], vCur[0], 15); + uint8x16_t vPred = vcgeq_u8(vTmp, vCur[1]); + uint8x16_t vVal = vandq_u8(vPred, vOne); + vSum = vVal; + + vPred = vcgeq_u8(vCur[0], vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 1); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[0], vNxt[0], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 2); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[1], vNxt[1], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 3); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[2], vNxt[2], 1); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 4); + vSum = vaddq_u8(vVal, vSum); + + vPred = vcgeq_u8(vCur[2], vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 5); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[2], vCur[2], 15); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 6); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[1], vCur[1], 15); + vPred = vcgeq_u8(vTmp, vCur[1]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 7); + vSum = vaddq_u8(vVal, vSum); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } + else if (ksize == 5) + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + uint8x16_t vPrv[5], vCur[5], vNxt[5]; + uint8x16_t vOne = vdupq_n_u8(1); + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint8x16_t vSum = vdupq_n_u8(0); + uint8x16_t vTmp = vextq_u8(vPrv[1], vCur[1], 15); + uint8x16_t vPred = vcgeq_u8(vTmp, 
vCur[2]); + uint8x16_t vVal = vandq_u8(vPred, vOne); + vSum = vVal; + + vPred = vcgeq_u8(vCur[0], vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 1); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[1], vNxt[1], 1); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 2); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[2], vNxt[2], 2); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 3); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vCur[3], vNxt[3], 1); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 4); + vSum = vaddq_u8(vVal, vSum); + + vPred = vcgeq_u8(vCur[4], vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 5); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[3], vCur[3], 15); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 6); + vSum = vaddq_u8(vVal, vSum); + + vTmp = vextq_u8(vPrv[2], vCur[2], 14); + vPred = vcgeq_u8(vTmp, vCur[2]); + vVal = vandq_u8(vPred, vOne); + vVal = vshlq_n_u8(vVal, 7); + vSum = vaddq_u8(vVal, vSum); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } +} + +static void vxLBPModified_tiling_fast(vx_tile_t *in, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + uint8x16_t vPrv[3], vCur[3], vNxt[3], vG[8]; + vx_uint32 w16; + uint8x16_t vOne = vdupq_n_u8(1); + vx_uint8 szCoeff[8] = { 1 << 0, 1 << 1, 1 << 2, 1 << 3, + 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + + if (low_y == 0) + { + low_y = 2; + } + if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0, idxY = 0; idxY < 5; (idx++, idxY += 2)) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + uint8x16_t vAvg, vPred, vSum; + + vG[0] = vextq_u8(vPrv[0], vCur[0], 14); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[0], vNxt[0], 2); + vG[3] = vextq_u8(vCur[1], vNxt[1], 2); + vG[4] = vextq_u8(vCur[2], vNxt[2], 2); + vG[5] = vCur[2]; + vG[6] = vextq_u8(vPrv[2], vCur[2], 14); + vG[7] = vextq_u8(vPrv[1], vCur[1], 14); + + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vSumu16_lo = vaddq_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vG[idx]))); + vSumu16_hi = vaddq_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vG[idx]))); + } + + vSumu16_lo = vaddq_u16(vSumu16_lo, vdupq_n_u16(1)); + vSumu16_hi = vaddq_u16(vSumu16_hi, vdupq_n_u16(1)); + vSumu16_lo = vshrq_n_u16(vSumu16_lo, 3); + vSumu16_hi = vshrq_n_u16(vSumu16_hi, 3); + vAvg = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + + vSumu16_lo = vdupq_n_u16(0); + vSumu16_hi = 
vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgtq_u8(vG[idx], vAvg); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0, idxY = 0; idxY < 5; (idx++, idxY += 2)) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idxY * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } +} + +static void vxLBPUniform_tiling_fast(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_uint8 szCoeff[8] = { 1 << 0, 1 << 1, 1 << 2, 1 << 3, + 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + + uint8x16_t vOne = vdupq_n_u8(1); + uint8x16_t vNine = vdupq_n_u8(9); + uint8x16_t vTwo = vdupq_n_u8(2); + + if(ksize == 3) + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == in->image.height) + { + high_y = high_y - 1; + } + if (high_x == in->image.width) + { + high_x = high_x - 1; + } + + uint8x16_t vPrv[3], vCur[3], vNxt[3], vG[8]; + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + vG[0] = vextq_u8(vPrv[0], vCur[0], 15); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[0], vNxt[0], 1); + vG[3] = vextq_u8(vCur[1], vNxt[1], 1); + vG[4] = vextq_u8(vCur[2], vNxt[2], 1); + vG[5] = vCur[2]; + vG[6] = vextq_u8(vPrv[2], vCur[2], 15); + vG[7] = vextq_u8(vPrv[1], vCur[1], 15); + + uint8x16_t vPred = vcgeq_u8(vG[7], vCur[1]); + uint8x16_t vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[0], vCur[1]); + uint8x16_t vU2 = vandq_u8(vPred, vOne); + uint8x16_t vAbs1 = vabdq_u8(vU1, vU2); + uint8x16_t vAbs2 = vdupq_n_u8(0); + + for (vx_uint8 idx = 1; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[1]); + vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[idx - 1], vCur[1]); + vU2 = vandq_u8(vPred, vOne); + vAbs2 = vaddq_u8(vAbs2, vabdq_u8(vU1, vU2)); + } + vAbs1 = vaddq_u8(vAbs1, vAbs2); + + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[1]); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + uint8x16_t vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vPred = vcleq_u8(vAbs1, vTwo); + vSum = vbslq_u8(vPred, vSum, vNine); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 3; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } + else if (ksize == 5) + { + if (low_y == 0) + { + low_y = 2; + } + 
if (high_y == in->image.height) + { + high_y = high_y - 2; + } + if (high_x == in->image.width) + { + high_x = high_x - 2; + } + + uint8x16_t vPrv[5], vCur[5], vNxt[5], vG[8]; + + for (y = low_y; y < high_y; y += in->addr->step_y) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + (y - 2) * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vdupq_n_u8(0); + vCur[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y); + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + 16 * in->addr->stride_x); + } + for (x = 0; x < high_x; x += 16) + { + vG[0] = vextq_u8(vPrv[1], vCur[1], 15); + vG[1] = vCur[0]; + vG[2] = vextq_u8(vCur[1], vNxt[1], 1); + vG[3] = vextq_u8(vCur[2], vNxt[2], 2); + vG[4] = vextq_u8(vCur[3], vNxt[3], 1); + vG[5] = vCur[4]; + vG[6] = vextq_u8(vPrv[3], vCur[3], 15); + vG[7] = vextq_u8(vPrv[2], vCur[2], 14); + + uint8x16_t vPred = vcgeq_u8(vG[7], vCur[2]); + uint8x16_t vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[0], vCur[2]); + uint8x16_t vU2 = vandq_u8(vPred, vOne); + uint8x16_t vAbs1 = vabdq_u8(vU1, vU2); + uint8x16_t vAbs2 = vdupq_n_u8(0); + + for (vx_uint8 idx = 1; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[2]); + vU1 = vandq_u8(vPred, vOne); + vPred = vcgeq_u8(vG[idx - 1], vCur[2]); + vU2 = vandq_u8(vPred, vOne); + vAbs2 = vaddq_u8(vAbs2, vabdq_u8(vU1, vU2)); + } + vAbs1 = vaddq_u8(vAbs1, vAbs2); + + uint16x8_t vSumu16_lo = vdupq_n_u16(0); + uint16x8_t vSumu16_hi = vdupq_n_u16(0); + for (vx_uint8 idx = 0; idx < 8; idx++) + { + vPred = vcgeq_u8(vG[idx], vCur[2]); + vPred = vandq_u8(vPred, vOne); + vSumu16_lo = vmlaq_n_u16(vSumu16_lo, vmovl_u8(vget_low_u8(vPred)), szCoeff[idx]); + vSumu16_hi = vmlaq_n_u16(vSumu16_hi, vmovl_u8(vget_high_u8(vPred)), szCoeff[idx]); + } + + uint8x16_t vSum = vcombine_u8(vmovn_u16(vSumu16_lo), vmovn_u16(vSumu16_hi)); + vPred = vcleq_u8(vAbs1, vTwo); + vSum = vbslq_u8(vPred, vSum, vNine); + + vst1q_u8(ptr_dst + x, vSum); + + for (vx_uint8 idx = 0; idx < 5; idx++) + { + vPrv[idx] = vCur[idx]; + vCur[idx] = vNxt[idx]; + vNxt[idx] = vld1q_u8(ptr_src + idx * in->addr->stride_y + (x + 32) * in->addr->stride_x); + } + } + } + } +} + +void LBP_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_enum *format = (vx_enum *)parameters[1]; + vx_int8 *size = (vx_int8 *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + switch (*format) + { + case VX_LBP: + vxLBPStandard_tiling_fast(in, *size, out); + break; + case VX_MLBP: + vxLBPModified_tiling_fast(in, out); + break; + case VX_ULBP: + vxLBPUniform_tiling_fast(in, *size, out); + break; + } +} + +vx_uint8 vx_lbp_s(vx_int16 x) +{ + if (x >= 0) + { + return 1; + } + else + { + return 0; + } +} + +vx_uint8 vx_lbp_u(vx_uint8 *g, vx_uint8 gc) +{ + vx_uint8 u1 = vx_lbp_s(g[7] - gc); + vx_uint8 u2 = vx_lbp_s(g[0] - gc); + + vx_uint8 abs1 = abs(u1 - u2); + + vx_uint8 abs2 = 0; + for (vx_int8 p = 1; p < 8; p++) + { + u1 = vx_lbp_s(g[p] - gc); + u2 = vx_lbp_s(g[p - 1] - gc); + abs2 += abs(u1 - u2); + } + + return abs1 + abs2; +} + +#define LBPSTANDARD_3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 1, in->addr); \ + g[2] = 
*(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 1, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc) * (1 << p); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +#define LBPSTANDARD_5x5(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc) * (1 << p); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + +static void vxLBPStandard_tiling_flexible(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 gc, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + if (ksize == 3) + LBPSTANDARD_3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + else if (ksize == 5) + LBPSTANDARD_5x5(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + if (ksize == 3) + { + LBPSTANDARD_3x3(1, low_y, low_x, high_x - 1) + LBPSTANDARD_3x3(low_y, high_y, 1, high_x - 1) + } + else if (ksize == 5) + { + LBPSTANDARD_5x5(2, low_y, low_x, high_x - 2) + LBPSTANDARD_5x5(low_y, high_y, 2, high_x - 2) + } + } +} + +#define LBPMODIFIED(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y - 2, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y - 2, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y + 2, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y + 2, 
in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + \ + avg = (g[0] + g[1] + g[2] + g[3] + g[4] + g[5] + g[6] + g[7] + 1) / 8; \ + \ + sum = 0; \ + for (vx_int8 p = 0; p < 8; p++) \ + { \ + sum += ((g[p] > avg) * (1 << p)); \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + +void vxLBPModified_tiling_flexible(vx_tile_t *in, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 avg, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + LBPMODIFIED(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + LBPMODIFIED(2, low_y, low_x, high_x - 2) + LBPMODIFIED(low_y, high_y, 2, high_x - 2) + } +} + +#define LBPUNIFORM_3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 1, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 1, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + if (vx_lbp_u(g, gc) <= 2) \ + { \ + for (vx_uint8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc)*(1 << p); \ + } \ + } \ + else \ + { \ + sum = 9; \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +#define LBPUNIFORM_5x5(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y += in->addr->step_y) \ + { \ + for (x = low_x; x < high_x; x += in->addr->step_x) \ + { \ + g[0] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y - 1, in->addr); \ + g[1] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y - 2, in->addr); \ + g[2] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y - 1, in->addr); \ + g[3] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 2, y, in->addr); \ + g[4] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x + 1, y + 1, in->addr); \ + g[5] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y + 2, in->addr); \ + g[6] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 1, y + 1, in->addr); \ + g[7] = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x - 2, y, in->addr); \ + gc = *(vx_uint8*)vxFormatImagePatchAddress2d(src_base, x, y, in->addr); \ + \ + sum = 0; \ + if (vx_lbp_u(g, gc) <= 2) \ + { \ + for (vx_uint8 p = 0; p < 8; p++) \ + { \ + sum += vx_lbp_s(g[p] - gc)*(1 << p); \ + } \ + } \ + else \ + { \ + sum = 9; \ + } \ + \ + vx_uint8 *dst_ptr = vxFormatImagePatchAddress2d(dst_base, x, y, out->addr); \ + *dst_ptr = sum; \ + } \ + } + + +void vxLBPUniform_tiling_flexible(vx_tile_t *in, vx_int8 ksize, vx_tile_t *out) +{ + vx_uint32 x = 0, y = 0; + + vx_uint8 *src_base = in->base[0]; 
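+    /* src_base/dst_base (declared here) address plane 0 of the input and output
+     * tiles. The uniform-LBP loops below compare the 8-neighbour ring against the
+     * centre pixel and write code 9 when the pattern has more than two 0/1
+     * transitions (see vx_lbp_u above). */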
+ vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 gc, g[8], sum; + + if (low_y == 0 && low_x == 0) + { + if (ksize == 3) + LBPUNIFORM_3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + else if (ksize == 5) + LBPUNIFORM_5x5(low_y + 2, high_y - 2, low_x + 2, high_x - 2) + } + else + { + if (ksize == 3) + { + LBPUNIFORM_3x3(1, low_y, low_x, high_x - 1) + LBPUNIFORM_3x3(low_y, high_y, 1, high_x - 1) + } + else if (ksize == 5) + { + LBPUNIFORM_5x5(2, low_y, low_x, high_x - 2) + LBPUNIFORM_5x5(low_y, high_y, 2, high_x - 2) + } + } +} + + +void LBP_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_enum *format = (vx_enum *)parameters[1]; + vx_int8 *size = (vx_int8 *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + switch (*format) + { + case VX_LBP: + vxLBPStandard_tiling_flexible(in, *size, out); + break; + case VX_MLBP: + vxLBPModified_tiling_flexible(in, out); + break; + case VX_ULBP: + vxLBPUniform_tiling_flexible(in, *size, out); + break; + } +} diff --git a/kernels/tiling/tiling_lut.c b/kernels/tiling/tiling_lut.c new file mode 100644 index 0000000..ac32f14 --- /dev/null +++ b/kernels/tiling/tiling_lut.c @@ -0,0 +1,230 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +void TableLookup_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_array_t *lut = (vx_tile_array_t*)parameters[1]; + vx_tile_t *out = (vx_tile_t*)parameters[2]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = in->tile_y + in->tile_block.height; + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = in->tile_x + in->tile_block.width; + + vx_enum type = lut->item_type; + vx_size count = lut->num_items; + vx_uint32 offset = lut->offset; + + void *lut_ptr = lut->ptr; + + if (type == VX_TYPE_UINT8) + { + int32x4_t vOffset = vdupq_n_s32((vx_int32)offset); + int32x4_t vCnt = vdupq_n_s32((vx_int32)count); + int32x4_t vZero = vdupq_n_s32(0); + + for (y = low_y; y < high_y; y++) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 16) + { + vx_uint8 *lut_tmp = (vx_uint8 *)lut_ptr; + uint8x16_t vSrc = vld1q_u8(ptr_src + x); + uint16x8_t vSrcs16_low = vmovl_u8(vget_low_u8(vSrc)); + uint16x8_t vSrcs16_high = vmovl_u8(vget_high_u8(vSrc)); + int32x4_t vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vSrcs16_low)))); + int32x4_t vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vSrcs16_low)))); + uint32x4_t vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + uint32x4_t vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + uint32x4_t vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint8x8_t vPredu8_low = vmovn_u16(vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high))); + + uint8x16_t vVal = vdupq_n_u8(0); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 0); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 1); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 2); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 3); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 4); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 5); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 6); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 7); + + vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_low_u16(vSrcs16_high)))); + vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vreinterpret_s16_u16(vget_high_u16(vSrcs16_high)))); + vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint8x8_t vPredu8_high = vmovn_u16(vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high))); + + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 8); + vVal = 
vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 9); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 10); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 11); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 12); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 13); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 14); + vVal = vsetq_lane_u8(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 15); + + uint8x16_t vPredu8 = vcombine_u8(vPredu8_low, vPredu8_high); + uint8x16_t vDstOrg = vld1q_u8(ptr_dst + x); + vVal = vbslq_u8(vPredu8, vVal, vDstOrg); + vst1q_u8(ptr_dst + x, vVal); + } + } + } + else if (type == VX_TYPE_INT16) + { + int32x4_t vOffset = vdupq_n_s32((vx_int32)offset); + int32x4_t vCnt = vdupq_n_s32((vx_int32)count); + int32x4_t vZero = vdupq_n_s32(0); + + vx_int16 *lut_tmp = (vx_int16 *)lut_ptr; + for (y = low_y; y < high_y; y++) + { + vx_uint8 *ptr_src = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *ptr_dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = low_x; x < high_x; x += 8) + { + int16x8_t vSrc = vld1q_s16((vx_int16 *)(ptr_src + x * in->addr->stride_x)); + int32x4_t vPoss32_low = vaddq_s32(vOffset, vmovl_s16(vget_low_s16(vSrc))); + int32x4_t vPoss32_high = vaddq_s32(vOffset, vmovl_s16(vget_high_s16(vSrc))); + uint32x4_t vPreds32_low = vcgeq_s32(vPoss32_low, vZero); + uint32x4_t vPreds32_tmp = vcltq_s32(vPoss32_low, vCnt); + vPreds32_low = vandq_u32(vPreds32_low, vPreds32_tmp); + vPoss32_low = vbslq_s32(vPreds32_low, vPoss32_low, vZero); + uint32x4_t vPreds32_high = vcgeq_s32(vPoss32_high, vZero); + vPreds32_tmp = vcltq_s32(vPoss32_high, vCnt); + vPreds32_high = vandq_u32(vPreds32_high, vPreds32_tmp); + vPoss32_high = vbslq_s32(vPreds32_high, vPoss32_high, vZero); + uint16x8_t vPredu16 = vcombine_u16(vmovn_u32(vPreds32_low), vmovn_u32(vPreds32_high)); + + int16x8_t vVal = vdupq_n_s16(0); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 0)], vVal, 0); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 1)], vVal, 1); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 2)], vVal, 2); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_low, 3)], vVal, 3); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 0)], vVal, 4); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 1)], vVal, 5); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 2)], vVal, 6); + vVal = vsetq_lane_s16(lut_tmp[vgetq_lane_s32(vPoss32_high, 3)], vVal, 7); + + int16x8_t vDstOrg = vld1q_s16((vx_int16 *)(ptr_dst + x * out->addr->stride_x)); + vVal = vbslq_s16(vPredu16, vVal, vDstOrg); + vst1q_s16((vx_int16 *)(ptr_dst + x * out->addr->stride_x), vVal); + } + } + } +} + +#define TABLELOOKUP(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + type *dst_ptr = (type *)dst_base + y * out->addr->stride_y / type_size; \ + for (x = low_x; x < high_x; x++) \ + { \ + type *lut_tmp = (type *)lut_ptr; \ + vx_int32 index = (vx_int32)offset + (vx_int32)(*src_ptr); \ + if (index >= 0 && index < (vx_int32)count) \ + { \ + *dst_ptr = lut_tmp[index]; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } + +void TableLookup_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_array_t *lut = 
(vx_tile_array_t*)parameters[1]; + vx_tile_t *out = (vx_tile_t*)parameters[2]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_enum type = lut->item_type; + vx_size count = lut->num_items; + vx_uint32 offset = lut->offset; + + void *lut_ptr = lut->ptr; + + if (low_y == 0 && low_x == 0) + { + if (type == VX_TYPE_UINT8) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + TABLELOOKUP(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else if (type == VX_TYPE_INT16) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + TABLELOOKUP(vx_int16, low_y, high_y, low_x, high_x, 2) + } + } + else + { + if (type == VX_TYPE_UINT8) + { + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + TABLELOOKUP(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + TABLELOOKUP(vx_uint8, low_y, high_y, 0, high_x, 1) + } + else if (type == VX_TYPE_INT16) + { + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_int16 *dst_base = (vx_int16 *)out->base[0] + out->tile_x; + TABLELOOKUP(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = (vx_int16 *)out->base[0]; + TABLELOOKUP(vx_int16, low_y, high_y, 0, high_x, 2) + } + } +} diff --git a/kernels/tiling/tiling_magnitude.c b/kernels/tiling/tiling_magnitude.c new file mode 100644 index 0000000..5a90266 --- /dev/null +++ b/kernels/tiling/tiling_magnitude.c @@ -0,0 +1,205 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +// nodeless version of the Magnitude kernel +void Magnitude_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x, value; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + vx_int16 *in_x = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *in_y = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t in_x16x8 = vld1q_s16(in_x); + int16x8_t in_y16x8 = vld1q_s16(in_y); + if (out->image.format == VX_DF_IMAGE_U8) + { + const int32x4x2_t low_grad = + { + { + vmovl_s16(vmul_s16(vget_low_s16(in_x16x8), vget_low_s16(in_x16x8))), + vmovl_s16(vmul_s16(vget_low_s16(in_y16x8), vget_low_s16(in_y16x8))) + } + }; + const int32x4x2_t top_grad = + { + { + vmovl_s16(vmul_s16(vget_high_s16(in_x16x8), vget_high_s16(in_x16x8))), + vmovl_s16(vmul_s16(vget_high_s16(in_y16x8), vget_high_s16(in_y16x8))) + } + }; + + vx_float64 sum1 = vgetq_lane_s32(low_grad.val[0], 0) + vgetq_lane_s32(low_grad.val[1], 0) ; + value = ((vx_int32)sqrt(sum1))/4; + *dstp = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum2 = vgetq_lane_s32(low_grad.val[0], 1) + vgetq_lane_s32(low_grad.val[1], 1) ; + value = ((vx_int32)sqrt(sum2))/4; + *(dstp+1) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum3 = vgetq_lane_s32(low_grad.val[0], 2) + vgetq_lane_s32(low_grad.val[1], 2) ; + value = ((vx_int32)sqrt(sum3))/4; + *(dstp+2) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum4 = vgetq_lane_s32(low_grad.val[0], 3) + vgetq_lane_s32(low_grad.val[1], 3) ; + value = ((vx_int32)sqrt(sum4))/4; + *(dstp+3) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum5 = vgetq_lane_s32(top_grad.val[0], 0) + vgetq_lane_s32(top_grad.val[1], 0) ; + value = ((vx_int32)sqrt(sum5))/4; + *(dstp+4) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum6 = vgetq_lane_s32(top_grad.val[0], 1) + vgetq_lane_s32(top_grad.val[1], 1) ; + value = ((vx_int32)sqrt(sum6))/4; + *(dstp+5) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum7 = vgetq_lane_s32(top_grad.val[0], 2) + vgetq_lane_s32(top_grad.val[1], 2) ; + value = ((vx_int32)sqrt(sum7))/4; + *(dstp+6) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + + vx_float64 sum8 = vgetq_lane_s32(top_grad.val[0], 3) + vgetq_lane_s32(top_grad.val[1], 3) ; + value = ((vx_int32)sqrt(sum8))/4; + *(dstp+7) = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); + dstp += 8; + } + else if (out->image.format == VX_DF_IMAGE_S16) + { + vx_int16 tmpx1 = vgetq_lane_s16(in_x16x8 ,0); + vx_int16 tmpy1 = vgetq_lane_s16(in_y16x8 ,0); + vx_float64 grad1[2] = {(vx_float64)tmpx1*tmpx1, (vx_float64)tmpy1*tmpy1}; + vx_float64 sum1 = grad1[0] + grad1[1]; + value = (vx_int32)(sqrt(sum1) + 0.5); + *dstp_16 = (vx_int16)(value > INT16_MAX ? 
INT16_MAX : value); + + vx_int16 tmpx2 = vgetq_lane_s16(in_x16x8 ,1); + vx_int16 tmpy2 = vgetq_lane_s16(in_y16x8 ,1); + vx_float64 grad2[2] = {(vx_float64)tmpx2*tmpx2, (vx_float64)tmpy2*tmpy2}; + vx_float64 sum2 = grad2[0] + grad2[1]; + value = (vx_int32)(sqrt(sum2) + 0.5); + *(dstp_16+1) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx3 = vgetq_lane_s16(in_x16x8 ,2); + vx_int16 tmpy3 = vgetq_lane_s16(in_y16x8 ,2); + vx_float64 grad3[2] = {(vx_float64)tmpx3*tmpx3, (vx_float64)tmpy3*tmpy3}; + vx_float64 sum3 = grad3[0] + grad3[1]; + value = (vx_int32)(sqrt(sum3) + 0.5); + *(dstp_16+2) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx4 = vgetq_lane_s16(in_x16x8 ,3); + vx_int16 tmpy4 = vgetq_lane_s16(in_y16x8 ,3); + vx_float64 grad4[2] = {(vx_float64)tmpx4*tmpx4, (vx_float64)tmpy4*tmpy4}; + vx_float64 sum4 = grad4[0] + grad4[1]; + value = (vx_int32)(sqrt(sum4) + 0.5); + *(dstp_16+3) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx5 = vgetq_lane_s16(in_x16x8 ,4); + vx_int16 tmpy5 = vgetq_lane_s16(in_y16x8 ,4); + vx_float64 grad5[2] = {(vx_float64)tmpx5*tmpx5, (vx_float64)tmpy5*tmpy5}; + vx_float64 sum5 = grad5[0] + grad5[1]; + value = (vx_int32)(sqrt(sum5) + 0.5); + *(dstp_16+4) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx6 = vgetq_lane_s16(in_x16x8 ,5); + vx_int16 tmpy6 = vgetq_lane_s16(in_y16x8 ,5); + vx_float64 grad6[2] = {(vx_float64)tmpx6*tmpx6, (vx_float64)tmpy6*tmpy6}; + vx_float64 sum6 = grad6[0] + grad6[1]; + value = (vx_int32)(sqrt(sum6) + 0.5); + *(dstp_16+5) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx7 = vgetq_lane_s16(in_x16x8 ,6); + vx_int16 tmpy7 = vgetq_lane_s16(in_y16x8 ,6); + vx_float64 grad7[2] = {(vx_float64)tmpx7*tmpx7, (vx_float64)tmpy7*tmpy7}; + vx_float64 sum7 = grad7[0] + grad7[1]; + value = (vx_int32)(sqrt(sum7) + 0.5); + *(dstp_16+6) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + + vx_int16 tmpx8 = vgetq_lane_s16(in_x16x8 ,7); + vx_int16 tmpy8 = vgetq_lane_s16(in_y16x8 ,7); + vx_float64 grad8[2] = {(vx_float64)tmpx8*tmpx8, (vx_float64)tmpy8*tmpy8}; + vx_float64 sum8 = grad8[0] + grad8[1]; + value = (vx_int32)(sqrt(sum8) + 0.5); + *(dstp_16+7) = (vx_int16)(value > INT16_MAX ? INT16_MAX : value); + dstp_16 += 8; + } + in_x += 8; + in_y += 8; + } + } +} + +#define MAGNITUDE_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_int16 *in_x = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *in_y = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + vx_int32 grad[2] = {in_x[0]*in_x[0], in_y[0]*in_y[0]}; \ + vx_float64 sum = grad[0] + grad[1]; \ + value = ((vx_int32)sqrt(sum))/4; \ + *dstp = (vx_uint8)(value > UINT8_MAX ? UINT8_MAX : value); \ + dstp += 1; \ + } \ + else if (out->image.format == VX_DF_IMAGE_S16) \ + { \ + vx_float64 grad[2] = {(vx_float64)in_x[0]*in_x[0], (vx_float64)in_y[0]*in_y[0]}; \ + vx_float64 sum = grad[0] + grad[1]; \ + value = (vx_int32)(sqrt(sum) + 0.5); \ + *dstp_16 = (vx_int16)(value > INT16_MAX ? 
INT16_MAX : value); \ + dstp_16 += 1; \ + } \ + in_x += 1; \ + in_y += 1; \ + } \ + } \ + +void Magnitude_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x, value; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + MAGNITUDE_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MAGNITUDE_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MAGNITUDE_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_minmax.c b/kernels/tiling/tiling_minmax.c new file mode 100644 index 0000000..f446afc --- /dev/null +++ b/kernels/tiling/tiling_minmax.c @@ -0,0 +1,199 @@ +/* + + * Copyright (c) 2017-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +void Max_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (out->image.format) + { + case VX_DF_IMAGE_U8: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t vsrc0 = vld1q_u8( src0p + x); + uint8x16_t vsrc1 = vld1q_u8( src1p + x); + vst1q_u8( dstp + x, vmaxq_u8( vsrc0, vsrc1 ) ); + } + } + break; + case VX_DF_IMAGE_S16: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + 2*in_1->tile_x + y * in_1->addr->stride_y; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + 2*in_2->tile_x + y * in_2->addr->stride_y; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + 2*out->tile_x + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t vsrc0 = vld1q_s16( (vx_int16 *)(src0p + x * in_1->addr[0].stride_x)); + int16x8_t vsrc1 = vld1q_s16( (vx_int16 *)(src1p + x * in_2->addr[0].stride_x)); + vst1q_s16( (vx_int16 *)(dstp + x * out->addr[0].stride_x), vmaxq_s16( vsrc0, vsrc1 ) ); + } + } + break; + } +} + +#define MAX_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + switch (out->image.format) \ + { \ + case VX_DF_IMAGE_U8: \ + src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width + x * in_1->addr[0].stride_x; \ + src1p = 
(vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width + x * in_2->addr[0].stride_x; \ + dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width + x * out->addr[0].stride_x; \ + val0 = *(src0p); \ + val1 = *(src1p); \ + *dstp = val0 > val1 ? val0 : val1; \ + break; \ + case VX_DF_IMAGE_S16: \ + src0p = (vx_uint8 *)in_1->base[0] + 2*in_1_tile_x + y * in_1->addr->stride_y + x * in_1->addr[0].stride_x;\ + src1p = (vx_uint8 *)in_2->base[0] + 2*in_2_tile_x + y * in_2->addr->stride_y + x * in_2->addr[0].stride_x;\ + dstp = (vx_uint8 *)out->base[0] + 2*out_tile_x + y * out->addr->stride_y + x * out->addr[0].stride_x; \ + val0_16 = *(vx_int16 *)(src0p); \ + val1_16 = *(vx_int16 *)(src1p); \ + *(vx_int16 *)dstp = val0_16 > val1_16 ? val0_16 : val1_16; \ + break; \ + } \ + } \ + } \ + +void Max_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 *src0p, *src1p, *dstp; + vx_uint8 val0, val1; + vx_int16 val0_16, val1_16; + if (ty == 0 && tx == 0) + { + MAX_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MAX_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MAX_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + +void Min_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + switch (out->image.format) + { + case VX_DF_IMAGE_U8: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 16) + { + uint8x16_t vsrc0 = vld1q_u8( src0p + x); + uint8x16_t vsrc1 = vld1q_u8( src1p + x); + vst1q_u8( dstp + x, vminq_u8( vsrc0, vsrc1 ) ); + } + } + break; + case VX_DF_IMAGE_S16: + for (y = low_height; y < height; y++) + { + vx_uint8* src0p = (vx_uint8 *)in_1->base[0] + 2*in_1->tile_x + y * in_1->addr->stride_y; + vx_uint8* src1p = (vx_uint8 *)in_2->base[0] + 2*in_2->tile_x + y * in_2->addr->stride_y; + vx_uint8* dstp = (vx_uint8 *)out->base[0] + 2*out->tile_x + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + int16x8_t vsrc0 = vld1q_s16( (vx_int16 *)(src0p + x * in_1->addr[0].stride_x)); + int16x8_t vsrc1 = vld1q_s16( (vx_int16 *)(src1p + x * in_2->addr[0].stride_x)); + vst1q_s16( (vx_int16 *)(dstp + x * out->addr[0].stride_x), vminq_s16( vsrc0, vsrc1 ) ); + } + } + break; + } +} + +#define MIN_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; ++y) \ + { \ + for (x = low_x; x < high_x; ++x) \ + { \ + switch (out->image.format) \ + { \ + case VX_DF_IMAGE_U8: \ + src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width + x * in_1->addr[0].stride_x; \ + src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width + x * 
in_2->addr[0].stride_x; \ + dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width + x * out->addr[0].stride_x; \ + val0 = *(src0p); \ + val1 = *(src1p); \ + *dstp = val0 < val1 ? val0 : val1; \ + break; \ + case VX_DF_IMAGE_S16: \ + src0p = (vx_uint8 *)in_1->base[0] + 2*in_1_tile_x + y * in_1->addr->stride_y + x * in_1->addr[0].stride_x;\ + src1p = (vx_uint8 *)in_2->base[0] + 2*in_2_tile_x + y * in_2->addr->stride_y + x * in_2->addr[0].stride_x;\ + dstp = (vx_uint8 *)out->base[0] + 2*out_tile_x + y * out->addr->stride_y + x * out->addr[0].stride_x; \ + val0_16 = *(vx_int16 *)(src0p); \ + val1_16 = *(vx_int16 *)(src1p); \ + *(vx_int16 *)dstp = val0_16 < val1_16 ? val0_16 : val1_16; \ + break; \ + } \ + } \ + } \ + +void Min_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_uint8 *src0p, *src1p, *dstp; + vx_uint8 val0, val1; + vx_int16 val0_16, val1_16; + if (ty == 0 && tx == 0) + { + MIN_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MIN_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MIN_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} diff --git a/kernels/tiling/tiling_morphology.c b/kernels/tiling/tiling_morphology.c new file mode 100644 index 0000000..678b6c2 --- /dev/null +++ b/kernels/tiling/tiling_morphology.c @@ -0,0 +1,249 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +static inline void opt_max(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t max = vmax_u8(*a, *b); + *a = max; +} +static inline void opt_min(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + *a = min; +} + +void Erode3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + opt_min(&p0, &p1); + opt_min(&p0, &p2); + opt_min(&p0, &p3); + opt_min(&p0, &p4); + opt_min(&p0, &p5); + opt_min(&p0, &p6); + opt_min(&p0, &p7); + opt_min(&p0, &p8); + + vst1_u8(dst, p0); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + +#define Erode3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 min_pixel = vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + if (min_pixel < vxImagePixel(vx_uint8, in, 0, x, y, i, j)) \ + min_pixel = min_pixel; \ + else \ + min_pixel = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = min_pixel; \ + } \ + } + +void Erode3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Erode3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Erode3x3(1, low_y, low_x, high_x - 1) + Erode3x3(low_y, high_y, 1, high_x - 1) + } +} + + +void Dilate3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = 
(vx_tile_t *)parameters[1]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * out->addr->stride_y; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const uint8x16_t top_data = vld1q_u8(top_src); + const uint8x16_t mid_data = vld1q_u8(mid_src); + const uint8x16_t bot_data = vld1q_u8(bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + opt_max(&p0, &p1); + opt_max(&p0, &p2); + opt_max(&p0, &p3); + opt_max(&p0, &p4); + opt_max(&p0, &p5); + opt_max(&p0, &p6); + opt_max(&p0, &p7); + opt_max(&p0, &p8); + + vst1_u8(dst, p0); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dst += 8; + } + } +} + + +#define Dilate3x3(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 j, i; \ + vx_uint8 max_pixel = vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + for (j = vxNeighborhoodTop(in); j <= vxNeighborhoodBottom(in); j++) \ + { \ + for (i = vxNeighborhoodLeft(in); i <= vxNeighborhoodRight(in); i++) \ + { \ + if (max_pixel > vxImagePixel(vx_uint8, in, 0, x, y, i, j)) \ + max_pixel = max_pixel; \ + else \ + max_pixel = vxImagePixel(vx_uint8, in, 0, x, y, i, j); \ + } \ + } \ + vxImagePixel(vx_uint8, out, 0, x, y, 0, 0) = max_pixel; \ + } \ + } + + +void Dilate3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + if (low_y == 0 && low_x == 0) + { + Dilate3x3(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + Dilate3x3(1, low_y, low_x, high_x - 1) + Dilate3x3(low_y, high_y, 1, high_x - 1) + } +} diff --git a/kernels/tiling/tiling_multiply.c b/kernels/tiling/tiling_multiply.c new file mode 100644 index 0000000..d0db034 --- /dev/null +++ b/kernels/tiling/tiling_multiply.c @@ -0,0 +1,267 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +// nodeless version of the Multiply kernel +void Multiply_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_float32 *scale = (vx_float32*)parameters[2]; + vx_enum *overflow_policy = (vx_enum*)parameters[3]; + vx_enum *rounding_policy = (vx_enum*)parameters[4]; + vx_tile_t *out = (vx_tile_t *)parameters[5]; + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + for (y = low_height; y < height; y++) + { + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out->tile_x + y * out->image.width; + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1->tile_x + y * in_1->image.width; + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2->tile_x + y * in_2->image.width; + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out->tile_x + y * out->image.width; + for (x = 0; x < out->tile_block.width; x += 8) + { + int32x4_t src01; + int32x4_t src02; + int32x4_t src11; + int32x4_t src12; + if(in_1->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src0p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32 (vmovl_u16 (vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32 (vmovl_u16 (vget_high_u16(tmp16x8))) + } + }; + src01 = tmp32x4_int_u8.val[0]; + src02 = tmp32x4_int_u8.val[1]; + src0p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src0p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16 (vget_low_s16(int02_16x8_data)), + vmovl_s16 (vget_high_s16(int02_16x8_data)) + } + }; + src01 = tmp32x4_int_s16.val[0]; + src02 = tmp32x4_int_s16.val[1]; + src0p_16 += 8; + } + if(in_2->image.format == VX_DF_IMAGE_U8) + { + uint8x8_t in01_8x8_data = vld1_u8((vx_uint8*)src1p); + uint16x8_t tmp16x8 = vmovl_u8 (in01_8x8_data); + int32x4x2_t tmp32x4_int_u8 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp16x8))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp16x8))) + } + }; + src11 = tmp32x4_int_u8.val[0]; + src12 = tmp32x4_int_u8.val[1]; + src1p += 8; + } + else + { + int16x8_t int02_16x8_data = vld1q_s16((vx_int16*)src1p_16); + int32x4x2_t tmp32x4_int_s16 = + { + { + vmovl_s16(vget_low_s16(int02_16x8_data)), + vmovl_s16(vget_high_s16(int02_16x8_data)) + } + }; + src11 = tmp32x4_int_s16.val[0]; + src12 = tmp32x4_int_s16.val[1]; + src1p_16 += 8; + } + int32x4_t unscaled_unconverted_result1 = vmulq_s32(src01, src11); + int32x4_t unscaled_unconverted_result2 = vmulq_s32(src02, src12); + vx_int32 tmp0 = vgetq_lane_s32(unscaled_unconverted_result1, 0); + vx_int32 tmp1 = vgetq_lane_s32(unscaled_unconverted_result1, 1); + vx_int32 tmp2 = vgetq_lane_s32(unscaled_unconverted_result1, 2); + vx_int32 tmp3 = vgetq_lane_s32(unscaled_unconverted_result1, 3); + vx_int32 tmp4 = 
vgetq_lane_s32(unscaled_unconverted_result2, 0); + vx_int32 tmp5 = vgetq_lane_s32(unscaled_unconverted_result2, 1); + vx_int32 tmp6 = vgetq_lane_s32(unscaled_unconverted_result2, 2); + vx_int32 tmp7 = vgetq_lane_s32(unscaled_unconverted_result2, 3); + + vx_int32 i; + for(i = 0; i < 8; i++) + { + vx_int32 tmp_int32; + if(i == 0) + tmp_int32 = tmp0; + else if(i == 1) + tmp_int32 = tmp1; + else if(i == 2) + tmp_int32 = tmp2; + else if(i == 3) + tmp_int32 = tmp3; + else if(i == 4) + tmp_int32 = tmp4; + else if(i == 5) + tmp_int32 = tmp5; + else if(i == 6) + tmp_int32 = tmp6; + else if(i == 7) + tmp_int32 = tmp7; + vx_float64 unscaled_result = (vx_float64)tmp_int32; + vx_float64 scaled_result = (*scale) * unscaled_result; + vx_int32 int_typed_result = (vx_int32)scaled_result; + vx_int32 final_result_value; + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) + { + if (out->image.format == VX_DF_IMAGE_U8) + { + if (int_typed_result > UINT8_MAX) + final_result_value = UINT8_MAX; + else if (int_typed_result < 0) + final_result_value = 0; + else + final_result_value = int_typed_result; + } + else + { + if (int_typed_result > INT16_MAX) + final_result_value = INT16_MAX; + else if (int_typed_result < INT16_MIN) + final_result_value = INT16_MIN; + else + final_result_value = int_typed_result; + } + } + else + { + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; + } + + if (out->image.format == VX_DF_IMAGE_U8) + { + *dstp = (vx_uint8)final_result_value; + dstp += 1; + } + else + { + *dstp_16 = (vx_int16)final_result_value; + dstp_16 += 1; + } + } + } + } +} + +#define MULTIPLY_FLEXIBLE(low_y, low_x, high_y, high_x, in_1_tile_x, in_2_tile_x, out_tile_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *src0p = (vx_uint8 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_uint8 *src1p = (vx_uint8 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_uint8 *dstp = (vx_uint8 *)out->base[0] + out_tile_x + y * out->image.width; \ + vx_int16 *src0p_16 = (vx_int16 *)in_1->base[0] + in_1_tile_x + y * in_1->image.width; \ + vx_int16 *src1p_16 = (vx_int16 *)in_2->base[0] + in_2_tile_x + y * in_2->image.width; \ + vx_int16 *dstp_16 = (vx_int16 *)out->base[0] + out_tile_x + y * out->image.width; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 src0 = in_1->image.format == VX_DF_IMAGE_U8 ? *src0p : *src0p_16; \ + vx_int32 src1 = in_2->image.format == VX_DF_IMAGE_U8 ? *src1p : *src1p_16; \ + src0p++; \ + src1p++; \ + src0p_16++; \ + src1p_16++; \ + vx_int32 unscaled_unconverted_result = src0 * src1; \ + vx_float64 unscaled_result = (vx_float64)unscaled_unconverted_result; \ + vx_float64 scaled_result = (*scale) * unscaled_result; \ + vx_int32 int_typed_result = (vx_int32)scaled_result; \ + vx_int32 final_result_value; \ + if (*overflow_policy == VX_CONVERT_POLICY_SATURATE) \ + { \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + if (int_typed_result > UINT8_MAX) \ + final_result_value = UINT8_MAX; \ + else if (int_typed_result < 0) \ + final_result_value = 0; \ + else \ + final_result_value = int_typed_result; \ + } \ + else \ + { \ + if (int_typed_result > INT16_MAX) \ + final_result_value = INT16_MAX; \ + else if (int_typed_result < INT16_MIN) \ + final_result_value = INT16_MIN; \ + else \ + final_result_value = int_typed_result; \ + } \ + } \ + else \ + { \ + final_result_value = (out->image.format == VX_DF_IMAGE_U8) ? 
\ + (vx_uint8)int_typed_result : (vx_int16)int_typed_result; \ + } \ + if (out->image.format == VX_DF_IMAGE_U8) \ + { \ + *dstp = (vx_uint8)final_result_value; \ + dstp++; \ + } \ + else \ + { \ + *dstp_16 = (vx_int16)final_result_value; \ + dstp_16++; \ + } \ + } \ + } + + +void Multiply_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in_1 = (vx_tile_t *)parameters[0]; + vx_tile_t *in_2 = (vx_tile_t *)parameters[1]; + vx_float32 *scale = (vx_float32*)parameters[2]; + vx_enum *overflow_policy = (vx_enum*)parameters[3]; + vx_enum *rounding_policy = (vx_enum*)parameters[4]; + vx_tile_t *out = (vx_tile_t *)parameters[5]; + + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (ty == 0 && tx == 0) + { + MULTIPLY_FLEXIBLE(0, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + } + else + { + MULTIPLY_FLEXIBLE(0, tx, ty, vxTileWidth(out, 0), in_1->tile_x, in_2->tile_x, out->tile_x) + MULTIPLY_FLEXIBLE(ty, 0, vxTileHeight(out, 0), vxTileWidth(out, 0), 0, 0, 0) + } +} + diff --git a/kernels/tiling/tiling_nonlinearfilter.c b/kernels/tiling/tiling_nonlinearfilter.c new file mode 100644 index 0000000..2b57318 --- /dev/null +++ b/kernels/tiling/tiling_nonlinearfilter.c @@ -0,0 +1,1217 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include + +#include + +#include + +struct src_ptr +{ + vx_uint8* top2_src; + vx_uint8* top_src; + vx_uint8* mid_src; + vx_uint8* bot_src; + vx_uint8* bot2_src; +}; + + +static void sort(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + const uint8x8_t max = vmax_u8(*a, *b); + *a = min; + *b = max; +} + +static void sort_min(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t min = vmin_u8(*a, *b); + *a = min; +} + +static void sort_max(uint8x8_t *a, uint8x8_t *b) +{ + const uint8x8_t max = vmax_u8(*a, *b); + *a = max; +} + +// Calculations that do not affect the median were removed. 
+static void sort5_mid(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort(p0, p1); + sort(p2, p3); + sort(p0, p2); + sort(p1, p3); + sort(p1, p2); + sort(p0, p4); + sort(p1, p4); + sort(p2, p4); +} + +static void sort5_min(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort_min(p0, p1); + sort_min(p0, p2); + sort_min(p0, p3); + sort_min(p0, p4); +} + +static void sort5_max(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, uint8x8_t *p3, uint8x8_t *p4) +{ + sort_max(p0, p1); + sort_max(p0, p2); + sort_max(p0, p3); + sort_max(p0, p4); +} + +static void sort9_mid(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p1); + sort(p3, p4); + sort(p6, p7); + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p3); + sort(p5, p8); + sort(p4, p7); + sort(p3, p6); + sort(p1, p4); + sort(p2, p5); + sort(p4, p7); + sort(p4, p2); + sort(p6, p4); + sort(p4, p2); +} + +static void sort9_min(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort_min(p0, p1); + sort_min(p0, p2); + sort_min(p0, p3); + sort_min(p0, p4); + sort_min(p0, p5); + sort_min(p0, p6); + sort_min(p0, p7); + sort_min(p0, p8); +} + +static void sort9_max(uint8x8_t *p0, uint8x8_t *p1, uint8x8_t *p2, + uint8x8_t *p3, uint8x8_t *p4, uint8x8_t *p5, + uint8x8_t *p6, uint8x8_t *p7, uint8x8_t *p8) +{ + sort_max(p0, p1); + sort_max(p0, p2); + sort_max(p0, p3); + sort_max(p0, p4); + sort_max(p0, p5); + sort_max(p0, p6); + sort_max(p0, p7); + sort_max(p0, p8); +} + +static void sort21_mid(uint8x8_t p[21]) +{ + sort(&p[0], &p[1]); + sort(&p[2], &p[3]); + sort(&p[4], &p[5]); + sort(&p[6], &p[7]); + sort(&p[8], &p[9]); + sort(&p[10], &p[11]); + sort(&p[12], &p[13]); + sort(&p[14], &p[15]); + sort(&p[16], &p[17]); + sort(&p[18], &p[19]); + sort(&p[0], &p[2]); + sort(&p[1], &p[3]); + sort(&p[4], &p[6]); + sort(&p[5], &p[7]); + sort(&p[8], &p[10]); + sort(&p[9], &p[11]); + sort(&p[12], &p[14]); + sort(&p[13], &p[15]); + sort(&p[16], &p[18]); + sort(&p[17], &p[19]); + sort(&p[1], &p[2]); + sort(&p[5], &p[6]); + sort(&p[0], &p[4]); + sort(&p[3], &p[7]); + sort(&p[9], &p[10]); + sort(&p[13], &p[14]); + sort(&p[8], &p[12]); + sort(&p[11], &p[15]); + sort(&p[17], &p[18]); + sort(&p[16], &p[20]); + sort(&p[1], &p[5]); + sort(&p[2], &p[6]); + sort(&p[9], &p[13]); + sort(&p[10], &p[14]); + sort(&p[0], &p[8]); + sort(&p[7], &p[15]); + sort(&p[17], &p[20]); + sort(&p[1], &p[4]); + sort(&p[3], &p[6]); + sort(&p[9], &p[12]); + sort(&p[11], &p[14]); + sort(&p[18], &p[20]); + sort(&p[0], &p[16]); + sort(&p[2], &p[4]); + sort(&p[3], &p[5]); + sort(&p[10], &p[12]); + sort(&p[11], &p[13]); + sort(&p[1], &p[9]); + sort(&p[6], &p[14]); + sort(&p[19], &p[20]); + sort(&p[3], &p[4]); + sort(&p[11], &p[12]); + sort(&p[1], &p[8]); + sort(&p[2], &p[10]); + sort(&p[5], &p[13]); + sort(&p[7], &p[14]); + sort(&p[3], &p[11]); + sort(&p[2], &p[8]); + sort(&p[4], &p[12]); + sort(&p[7], &p[13]); + sort(&p[1], &p[17]); + sort(&p[3], &p[10]); + sort(&p[5], &p[12]); + sort(&p[1], &p[16]); + sort(&p[2], &p[18]); + sort(&p[3], &p[9]); + sort(&p[6], &p[12]); + sort(&p[2], &p[16]); + sort(&p[3], &p[8]); + sort(&p[7], &p[12]); + sort(&p[5], &p[9]); + sort(&p[6], &p[10]); + sort(&p[4], &p[8]); + sort(&p[7], &p[11]); + sort(&p[3], &p[19]); + sort(&p[5], &p[8]); + sort(&p[7], &p[10]); + 
sort(&p[3], &p[18]); + sort(&p[4], &p[20]); + sort(&p[6], &p[8]); + sort(&p[7], &p[9]); + sort(&p[3], &p[17]); + sort(&p[5], &p[20]); + sort(&p[7], &p[8]); + sort(&p[3], &p[16]); + sort(&p[6], &p[20]); + sort(&p[5], &p[17]); + sort(&p[7], &p[20]); + sort(&p[4], &p[16]); + sort(&p[6], &p[18]); + sort(&p[5], &p[16]); + sort(&p[7], &p[19]); + sort(&p[7], &p[18]); + sort(&p[6], &p[16]); + sort(&p[7], &p[17]); + sort(&p[10], &p[18]); + sort(&p[7], &p[16]); + sort(&p[9], &p[17]); + sort(&p[8], &p[16]); + sort(&p[9], &p[16]); + sort(&p[10], &p[16]); +} + +static void sort21_min(uint8x8_t p[21]) +{ + sort_min(&p[0], &p[1]); + sort_min(&p[0], &p[2]); + sort_min(&p[0], &p[3]); + sort_min(&p[0], &p[4]); + sort_min(&p[0], &p[5]); + sort_min(&p[0], &p[6]); + sort_min(&p[0], &p[7]); + sort_min(&p[0], &p[8]); + sort_min(&p[0], &p[9]); + sort_min(&p[0], &p[10]); + sort_min(&p[0], &p[11]); + sort_min(&p[0], &p[12]); + sort_min(&p[0], &p[13]); + sort_min(&p[0], &p[14]); + sort_min(&p[0], &p[15]); + sort_min(&p[0], &p[16]); + sort_min(&p[0], &p[17]); + sort_min(&p[0], &p[18]); + sort_min(&p[0], &p[19]); + sort_min(&p[0], &p[20]); +} + + +static void sort21_max(uint8x8_t p[21]) +{ + sort_max(&p[0], &p[1]); + sort_max(&p[0], &p[2]); + sort_max(&p[0], &p[3]); + sort_max(&p[0], &p[4]); + sort_max(&p[0], &p[5]); + sort_max(&p[0], &p[6]); + sort_max(&p[0], &p[7]); + sort_max(&p[0], &p[8]); + sort_max(&p[0], &p[9]); + sort_max(&p[0], &p[10]); + sort_max(&p[0], &p[11]); + sort_max(&p[0], &p[12]); + sort_max(&p[0], &p[13]); + sort_max(&p[0], &p[14]); + sort_max(&p[0], &p[15]); + sort_max(&p[0], &p[16]); + sort_max(&p[0], &p[17]); + sort_max(&p[0], &p[18]); + sort_max(&p[0], &p[19]); + sort_max(&p[0], &p[20]); +} + +static void sort25_mid(uint8x8_t p[25]) +{ + sort(&p[1], &p[2]); + sort(&p[0], &p[1]); + sort(&p[1], &p[2]); + sort(&p[4], &p[5]); + sort(&p[3], &p[4]); + sort(&p[4], &p[5]); + sort(&p[0], &p[3]); + sort(&p[2], &p[5]); + sort(&p[2], &p[3]); + sort(&p[1], &p[4]); + sort(&p[1], &p[2]); + sort(&p[3], &p[4]); + sort(&p[7], &p[8]); + sort(&p[6], &p[7]); + sort(&p[7], &p[8]); + sort(&p[10], &p[11]); + sort(&p[9], &p[10]); + sort(&p[10], &p[11]); + sort(&p[6], &p[9]); + sort(&p[8], &p[11]); + sort(&p[8], &p[9]); + sort(&p[7], &p[10]); + sort(&p[7], &p[8]); + sort(&p[9], &p[10]); + sort(&p[0], &p[6]); + sort(&p[4], &p[10]); + sort(&p[4], &p[6]); + sort(&p[2], &p[8]); + sort(&p[2], &p[4]); + sort(&p[6], &p[8]); + sort(&p[1], &p[7]); + sort(&p[5], &p[11]); + sort(&p[5], &p[7]); + sort(&p[3], &p[9]); + sort(&p[3], &p[5]); + sort(&p[7], &p[9]); + sort(&p[1], &p[2]); + sort(&p[3], &p[4]); + sort(&p[5], &p[6]); + sort(&p[7], &p[8]); + sort(&p[9], &p[10]); + sort(&p[13], &p[14]); + sort(&p[12], &p[13]); + sort(&p[13], &p[14]); + sort(&p[16], &p[17]); + sort(&p[15], &p[16]); + sort(&p[16], &p[17]); + sort(&p[12], &p[15]); + sort(&p[14], &p[17]); + sort(&p[14], &p[15]); + sort(&p[13], &p[16]); + sort(&p[13], &p[14]); + sort(&p[15], &p[16]); + sort(&p[19], &p[20]); + sort(&p[18], &p[19]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[21], &p[23]); + sort(&p[22], &p[24]); + sort(&p[22], &p[23]); + sort(&p[18], &p[21]); + sort(&p[20], &p[23]); + sort(&p[20], &p[21]); + sort(&p[19], &p[22]); + sort(&p[22], &p[24]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[12], &p[18]); + sort(&p[16], &p[22]); + sort(&p[16], &p[18]); + sort(&p[14], &p[20]); + sort(&p[20], &p[24]); + sort(&p[14], &p[16]); + sort(&p[18], &p[20]); + sort(&p[22], &p[24]); + sort(&p[13], 
&p[19]); + sort(&p[17], &p[23]); + sort(&p[17], &p[19]); + sort(&p[15], &p[21]); + sort(&p[15], &p[17]); + sort(&p[19], &p[21]); + sort(&p[13], &p[14]); + sort(&p[15], &p[16]); + sort(&p[17], &p[18]); + sort(&p[19], &p[20]); + sort(&p[21], &p[22]); + sort(&p[23], &p[24]); + sort(&p[0], &p[12]); + sort(&p[8], &p[20]); + sort(&p[8], &p[12]); + sort(&p[4], &p[16]); + sort(&p[16], &p[24]); + sort(&p[12], &p[16]); + sort(&p[2], &p[14]); + sort(&p[10], &p[22]); + sort(&p[10], &p[14]); + sort(&p[6], &p[18]); + sort(&p[6], &p[10]); + sort(&p[10], &p[12]); + sort(&p[1], &p[13]); + sort(&p[9], &p[21]); + sort(&p[9], &p[13]); + sort(&p[5], &p[17]); + sort(&p[13], &p[17]); + sort(&p[3], &p[15]); + sort(&p[11], &p[23]); + sort(&p[11], &p[15]); + sort(&p[7], &p[19]); + sort(&p[7], &p[11]); + sort(&p[11], &p[13]); + sort(&p[11], &p[12]); +} + + +static void sort25_min(uint8x8_t p[25]) +{ + sort_min(&p[0], &p[1]); + sort_min(&p[0], &p[2]); + sort_min(&p[0], &p[3]); + sort_min(&p[0], &p[4]); + sort_min(&p[0], &p[5]); + sort_min(&p[0], &p[6]); + sort_min(&p[0], &p[7]); + sort_min(&p[0], &p[8]); + sort_min(&p[0], &p[9]); + sort_min(&p[0], &p[10]); + sort_min(&p[0], &p[11]); + sort_min(&p[0], &p[12]); + sort_min(&p[0], &p[13]); + sort_min(&p[0], &p[14]); + sort_min(&p[0], &p[15]); + sort_min(&p[0], &p[16]); + sort_min(&p[0], &p[17]); + sort_min(&p[0], &p[18]); + sort_min(&p[0], &p[19]); + sort_min(&p[0], &p[20]); + sort_min(&p[0], &p[21]); + sort_min(&p[0], &p[22]); + sort_min(&p[0], &p[23]); + sort_min(&p[0], &p[24]); +} + +static void sort25_max(uint8x8_t p[25]) +{ + sort_max(&p[0], &p[1]); + sort_max(&p[0], &p[2]); + sort_max(&p[0], &p[3]); + sort_max(&p[0], &p[4]); + sort_max(&p[0], &p[5]); + sort_max(&p[0], &p[6]); + sort_max(&p[0], &p[7]); + sort_max(&p[0], &p[8]); + sort_max(&p[0], &p[9]); + sort_max(&p[0], &p[10]); + sort_max(&p[0], &p[11]); + sort_max(&p[0], &p[12]); + sort_max(&p[0], &p[13]); + sort_max(&p[0], &p[14]); + sort_max(&p[0], &p[15]); + sort_max(&p[0], &p[16]); + sort_max(&p[0], &p[17]); + sort_max(&p[0], &p[18]); + sort_max(&p[0], &p[19]); + sort_max(&p[0], &p[20]); + sort_max(&p[0], &p[21]); + sort_max(&p[0], &p[22]); + sort_max(&p[0], &p[23]); + sort_max(&p[0], &p[24]); +} + +static void filter_cross_3x3_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x8_t top_data = vld1_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x8_t bot_data = vld1_u8(src.bot_src); + + uint8x8_t p0 = top_data; + uint8x8_t p1 = vget_low_u8(mid_data); + uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p4 = bot_data; + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort5_min(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort5_max(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort5_mid(&p0, &p1, &p2, &p3, &p4); + vst1_u8(dst, p2); + break; + } + } + + dst += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + } +} + + +static void filter_cross_3x3(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = 
out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 0) + { + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + 1 + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 1 + (y + 2) * src_stride_y; + + filter_cross_3x3_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + 1 + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 1 + (y + 1) * src_stride_y; + + filter_cross_3x3_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_box_3x3_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort9_min(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort9_max(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort9_mid(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p4); + break; + } + } + + dst += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + } +} + + +static void filter_box_3x3(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 0) + { + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + 
filter_box_3x3_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 1; + } + if (high_y == out->image.height) + { + high_y = high_y - 1; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + + filter_box_3x3_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_cross_5x5_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x8_t top2_data = vld1_u8(src.top2_src); + const uint8x8_t top_data = vld1_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x8_t bot_data = vld1_u8(src.bot_src); + const uint8x8_t bot2_data = vld1_u8(src.bot2_src); + + uint8x8_t p0 = top2_data; + uint8x8_t p1 = top_data; + uint8x8_t p2 = vget_low_u8(mid_data); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3); + uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4); + uint8x8_t p7 = bot_data; + uint8x8_t p8 = bot2_data; + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort9_min(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort9_max(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p0); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort9_mid(&p0, &p1, &p2, &p3, &p4, &p5, &p6, &p7, &p8); + vst1_u8(dst, p4); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_cross_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + 2 + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + 2 + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + 2 + (y + 2)* src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + 2 + (y + 3)* src_stride_y; + + filter_cross_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + 2 + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + 2 + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + 
src.bot_src = (vx_uint8 *)src_base + 2 + (y + 1)* src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + 2 + (y + 2)* src_stride_y; + + filter_cross_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_box_5x5_neon(struct src_ptr src, vx_uint8* dst, vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top2_data = vld1q_u8(src.top2_src); + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + const uint8x16_t bot2_data = vld1q_u8(src.bot2_src); + + const uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[25]; + for (vx_uint32 i = 0; i < 5; ++i) + { + const vx_uint32 idx_d = i * 2; + const vx_uint32 idx_p = i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort25_min(p); + vst1_u8(dst, p[0]); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort25_max(p); + vst1_u8(dst, p[0]); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort25_mid(p); + vst1_u8(dst, p[12]); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_box_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 3) * src_stride_y; + + filter_box_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + filter_box_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + + +static void filter_disk_5x5_neon(struct src_ptr src, vx_uint8* dst, 
vx_int32 low_x, vx_int32 high_x, vx_enum function) +{ + vx_uint32 x; + const uint8x16_t zero = vdupq_n_u8(0); + for (x = low_x; x < high_x; x += 8) + { + const uint8x16_t top2_data = vextq_u8(vld1q_u8(src.top2_src), zero, 1); + const uint8x16_t top_data = vld1q_u8(src.top_src); + const uint8x16_t mid_data = vld1q_u8(src.mid_src); + const uint8x16_t bot_data = vld1q_u8(src.bot_src); + const uint8x16_t bot2_data = vextq_u8(vld1q_u8(src.bot2_src), zero, 1); + + uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[21]; + p[0] = d[0]; + p[1] = vext_u8(d[0], d[1], 1); + p[2] = vext_u8(d[0], d[1], 2); + p[18] = d[8]; + p[19] = vext_u8(d[8], d[9], 1); + p[20] = vext_u8(d[8], d[9], 2); + + for (vx_uint32 i = 0; i < 3; ++i) + { + const vx_uint32 idx_d = 2 + i * 2; + const vx_uint32 idx_p = 3 + i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + switch (function) + { + /* minimal value */ + case VX_NONLINEAR_FILTER_MIN: + { + sort21_min(p); + vst1_u8(dst, p[0]); + break; + } + /* maximum value */ + case VX_NONLINEAR_FILTER_MAX: + { + sort21_max(p); + vst1_u8(dst, p[0]); + break; + } + /* pick the middle value */ + case VX_NONLINEAR_FILTER_MEDIAN: + { + sort21_mid(p); + vst1_u8(dst, p[10]); + break; + } + } + + dst += 8; + src.top2_src += 8; + src.top_src += 8; + src.mid_src += 8; + src.bot_src += 8; + src.bot2_src += 8; + } +} + + +static void* filter_disk_5x5(vx_tile_t *in, vx_tile_t *out, vx_enum function, vx_size ry0) +{ + vx_uint32 y; + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = 0; + vx_uint32 high_x = out->tile_block.width; + + vx_int32 src_stride_y = in->addr->stride_y; + vx_int32 dst_stride_y = out->addr->stride_y; + + struct src_ptr src; + + if (ry0 == 1) + { + if (high_y == out->image.height) + { + high_y = high_y - 3; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 1 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 1)* src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 3) * src_stride_y; + + filter_disk_5x5_neon(src, dst, low_x, high_x, function); + } + } + else + { + if (low_y == 0) + { + low_y = 2; + } + if (high_y == out->image.height) + { + high_y = high_y - 2; + } + for (y = low_y; y < high_y; y++) + { + vx_uint8* dst = (vx_uint8 *)dst_base + 2 + y * dst_stride_y; + src.top2_src = (vx_uint8 *)src_base + (y - 2) * src_stride_y; + src.top_src = (vx_uint8 *)src_base + (y - 1) * src_stride_y; + src.mid_src = (vx_uint8 *)src_base + (y)* src_stride_y; + src.bot_src = (vx_uint8 *)src_base + (y + 1) * src_stride_y; + src.bot2_src = (vx_uint8 *)src_base + (y + 2) * src_stride_y; + + filter_disk_5x5_neon(src, dst, low_x, high_x, function); + } + } +} + +void NonLinearFilter_image_tiling_fast(void * parameters[], void * tile_memory, vx_size 
tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_enum *func = (vx_enum *)parameters[0]; + vx_tile_t *in = (vx_tile_t *)parameters[1]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_size ry0 = mask->origin.y; + + vx_int32 count_mask = 0; + vx_int32 mask_index = 0; + + for (vx_uint32 r = 0; r < mask->rows; ++r) + { + for (vx_uint32 c = 0; c < mask->columns; ++c, ++mask_index) + { + if (mask->m[mask_index]) + { + ++count_mask; + } + } + } + + switch (mask->rows) + { + case 3: // mask = 3x3 + { + if (count_mask == 5) + filter_cross_3x3(in, out, *func, ry0); + else // count_mask = 9 + filter_box_3x3(in, out, *func, ry0); + + break; + } + + case 5: // mask = 5x5 + { + if (count_mask == 9) + filter_cross_5x5(in, out, *func, ry0); + else if (count_mask == 21) + filter_disk_5x5(in, out, *func, ry0); + else // count_mask = 25 + filter_box_5x5(in, out, *func, ry0); + break; + } + } +} + + +// helpers +static vx_int32 vx_uint8_compare(const void *p1, const void *p2) +{ + vx_uint8 a = *(vx_uint8 *)p1; + vx_uint8 b = *(vx_uint8 *)p2; + if (a > b) + return 1; + else if (a == b) + return 0; + else + return -1; +} + + +static vx_uint32 readMaskedRectangle_U8(const void *base, const vx_imagepatch_addressing_t *addr, vx_uint32 center_x, vx_uint32 center_y, + vx_uint32 left, vx_uint32 top, vx_uint32 right, vx_uint32 bottom, vx_uint8 *mask, vx_uint8 *destination) +{ + vx_int32 width = (vx_int32)addr->dim_x, height = (vx_int32)addr->dim_y; + vx_int32 stride_y = addr->stride_y; + vx_int32 stride_x = addr->stride_x; + const vx_uint8 *ptr = (const vx_uint8 *)base; + vx_int32 ky, kx; + vx_uint32 mask_index = 0; + vx_uint32 dest_index = 0; + + for (ky = -(vx_int32)top; ky <= (vx_int32)bottom; ++ky) + { + vx_int32 y = (vx_int32)(center_y + ky); + y = y < 0 ? 0 : y >= height ? height - 1 : y; + + for (kx = -(vx_int32)left; kx <= (vx_int32)right; ++kx, ++mask_index) + { + vx_int32 x = (vx_int32)(center_x + kx); + x = x < 0 ? 0 : x >= width ? 
width - 1 : x; + if (mask[mask_index]) + ((vx_uint8*)destination)[dest_index++] = *(vx_uint8*)(ptr + y*stride_y + x*stride_x); + } + } + + return dest_index; +} + + +#define NonLinearFilter(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y + x * out->addr->stride_x; \ + vx_uint32 count = (vx_uint32)readMaskedRectangle_U8(src_base, in->addr, x, y, (vx_uint32)rx0, (vx_uint32)ry0, (vx_uint32)rx1, (vx_uint32)ry1, mask->m, v); \ + \ + qsort(v, count, sizeof(vx_uint8), vx_uint8_compare); \ + \ + switch(*func) \ + { \ + case VX_NONLINEAR_FILTER_MIN : *dst = v[0]; break; \ + case VX_NONLINEAR_FILTER_MAX : *dst = v[count - 1]; break; \ + case VX_NONLINEAR_FILTER_MEDIAN : *dst = v[count / 2]; break; \ + } \ + } \ + } + + +void NonLinearFilter_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_enum *func = (vx_enum *)parameters[0]; + vx_tile_t *in = (vx_tile_t *)parameters[1]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 v[C_MAX_NONLINEAR_DIM * C_MAX_NONLINEAR_DIM]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_size rx0 = mask->origin.x; + vx_size ry0 = mask->origin.y; + vx_size rx1 = mask->columns - mask->origin.x - 1; + vx_size ry1 = mask->rows - mask->origin.y - 1; + + if (low_y == 0 && low_x == 0) + { + NonLinearFilter(low_y, high_y, low_x) + } + else + { + NonLinearFilter(0, low_y, low_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + NonLinearFilter(low_y, high_y, 0) + } + +} diff --git a/kernels/tiling/tiling_nonmaxsuppression.c b/kernels/tiling/tiling_nonmaxsuppression.c new file mode 100644 index 0000000..e758de9 --- /dev/null +++ b/kernels/tiling/tiling_nonmaxsuppression.c @@ -0,0 +1,445 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +#include +#include +#include +#include +#include +#include +#include + +static void nonMaxSuppression_U8(vx_uint8* src, + vx_uint32 srcWidth, + vx_uint32 srcHeight, + vx_uint32 srcStride, + vx_uint8* mask, + vx_uint32 maskStride, + vx_uint8* dst, + vx_uint32 dstStride, + vx_int32 windowSize, + vx_uint32 low_height, + vx_uint32 height, + vx_uint32 x_step) +{ + vx_uint32 radius = (vx_uint32)windowSize >> 1; + vx_uint8 *maskCurr = NULL; + vx_uint8 *maskLeftTop = NULL; + vx_uint32 x, y; + uint8x16_t vOne16 = vdupq_n_u8(1); + uint8x8_t vOne8 = vdup_n_u8(1); + for (y = low_height + radius; y < height - radius; y++) + { + vx_uint8 *srcCurr = src + y * srcStride + radius; + vx_uint8 *dstCurr = dst + y * dstStride + radius; + vx_uint8 *leftTop = src + (y - radius) * srcStride; + if (mask) + { + maskCurr = mask + y * maskStride + radius; + maskLeftTop = mask + (y - radius) * maskStride; + } + + for (x = 0; x < x_step; x += 16) + { + uint8x16_t vSrcCurr = vld1q_u8(srcCurr); + uint8x16_t vDstCurr = vld1q_u8(dstCurr); + uint8x16_t vMaskCurr = vdupq_n_u8(0); + if (maskCurr) + { + vMaskCurr = vld1q_u8(maskCurr); + } + uint8x16_t vNeighborCurr; + uint8x16_t vTempResult; + uint8x16_t vFlag = vdupq_n_u8(1); + for (vx_uint32 j = 0; j < windowSize; j++) + { + for (vx_uint32 i = 0; i < windowSize; i++) + { + if (j == radius && i == radius) + continue; + else + { + vNeighborCurr = vld1q_u8(leftTop + j * srcStride + i); + if (mask != NULL) + { + uint8x16_t vMaskNeighborCurr = vld1q_u8(maskLeftTop + j * maskStride + i); + vMaskNeighborCurr = vsubq_u8(vOne16, vorrq_u8(vMaskNeighborCurr, vMaskCurr)); + vNeighborCurr = vmulq_u8(vNeighborCurr, vMaskNeighborCurr); + } + vTempResult = (j < radius || (j == radius && i < radius)) ? vcgeq_u8(vSrcCurr, vNeighborCurr) : vcgtq_u8(vSrcCurr, vNeighborCurr); + vFlag = vmulq_u8(vFlag, vTempResult); + } + } + } + vDstCurr = vmulq_u8(vFlag, vSrcCurr); + vst1q_u8((vx_uint8 *)dstCurr, vDstCurr); + srcCurr += 16; + dstCurr += 16; + leftTop += 16; + if (mask) + { + maskCurr += 16; + maskLeftTop += 16; + } + } + } +} + +void NonMaxSuppression_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_uint8 mask_data = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *mask = (vx_tile_t *)parameters[1]; + vx_int32 *wsize = (vx_int32*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_df_image format = in->image.format; + vx_int32 border = *wsize / 2; + + vx_uint32 low_height = out->tile_y; + vx_uint32 height = out->tile_y + out->tile_block.height; + + vx_uint32 low_width = out->tile_x; + vx_uint32 width = out->tile_x + out->tile_block.width; + + if(low_height == 0) + { + low_height = low_height + border; + } + if(height == out->image.height) + { + height = height - border; + } + if (format == VX_DF_IMAGE_U8) + { + vx_uint8 *maskCurr = NULL; + vx_uint8 *maskLeftTop = NULL; + uint8x16_t vOne16 = vdupq_n_u8(1); + uint8x8_t vOne8 = vdup_n_u8(1); + for (y = low_height; y < height; y++) + { + vx_uint8 *srcCurr = (vx_uint8 *)in->base[0] + in->tile_x + y * in->addr[0].stride_y + border; + vx_uint8 *dstCurr = (vx_uint8 *)out->base[0] + out->tile_x + y * out->addr[0].stride_y + border; + vx_uint8 *leftTop = (vx_uint8 *)in->base[0] + in->tile_x + (y - border) * in->addr[0].stride_y; + if (mask->base[0] != NULL) + { + maskCurr = mask->base[0] + y * mask->addr[0].stride_y + border; + maskLeftTop = mask->base[0] + (y - border) * mask->addr[0].stride_y; + } 
+ if(low_width == 0) + { + low_width = low_width + border; + } + if(width == out->image.width) + { + width = width - border; + } + for (x = low_width; x < width; x += 16) + { + uint8x16_t vSrcCurr = vld1q_u8(srcCurr); + uint8x16_t vDstCurr = vld1q_u8(dstCurr); + uint8x16_t vMaskCurr = vdupq_n_u8(0); + if (maskCurr) + { + vMaskCurr = vld1q_u8(maskCurr); + } + uint8x16_t vNeighborCurr; + uint8x16_t vTempResult; + uint8x16_t vFlag = vdupq_n_u8(1); + for (vx_uint32 j = 0; j < *wsize; j++) + { + for (vx_uint32 i = 0; i < *wsize; i++) + { + if (j == border && i == border) + continue; + else + { + vNeighborCurr = vld1q_u8(leftTop + j * in->addr[0].stride_y + i); + if (mask->base[0] != NULL) + { + uint8x16_t vMaskNeighborCurr = vld1q_u8(maskLeftTop + j * mask->addr[0].stride_y + i); + vMaskNeighborCurr = vsubq_u8(vOne16, vorrq_u8(vMaskNeighborCurr, vMaskCurr)); + vNeighborCurr = vmulq_u8(vNeighborCurr, vMaskNeighborCurr); + } + vTempResult = (j < border || (j == border && i < border)) ? vcgeq_u8(vSrcCurr, vNeighborCurr) : vcgtq_u8(vSrcCurr, vNeighborCurr); + vFlag = vmulq_u8(vFlag, vTempResult); + } + } + } + vDstCurr = vmulq_u8(vFlag, vSrcCurr); + vst1q_u8((vx_uint8 *)dstCurr, vDstCurr); + srcCurr += 16; + dstCurr += 16; + leftTop += 16; + if (mask->base[0] != NULL) + { + maskCurr += 16; + maskLeftTop += 16; + } + } + } + } + else + { + vx_int32 border = *wsize / 2; + for (vx_int32 y = low_height; y < height; y++) + { + vx_int32 x = 0; + for (x = low_width; x < width; x+=8) + { + uint8x8_t _mask_8x8_o; + vx_uint8 *_maskp_o; + if (mask->base[0] != NULL) + { + _maskp_o = (vx_uint8 *)mask->base[0] + y*mask->addr[0].stride_y + x*mask->addr[0].stride_x; + _mask_8x8_o = vld1_u8(_maskp_o); + } + else + { + _mask_8x8_o = vdup_n_u8(0); + } + vx_int16 *val_p = (vx_int16 *)((vx_uint8 *)in->base[0] + y*in->addr[0].stride_y + x*in->addr[0].stride_x); + vx_int16 *dest = (vx_int16 *)((vx_uint8 *)out->base[0] + y*out->addr[0].stride_y + x*out->addr[0].stride_x); + int16x8_t src_val_16x8 = vld1q_s16(val_p); + + int16x8_t dst_16x8; + uint8x8_t t_8x8 = vdup_n_u8(0); + uint8x8_t maskequal0_8x8_o = vceq_u8(_mask_8x8_o, vdup_n_u8(0)); + dst_16x8 = vbslq_s16(vmovl_u8(maskequal0_8x8_o), dst_16x8, src_val_16x8); + t_8x8 = vbsl_u8(maskequal0_8x8_o, t_8x8, vdup_n_u8(1)); + + for (vx_int32 j = -border; j <= border; j++) + { + for (vx_int32 i = -border; i <= border; i++) + { + vx_int16 *neighbor = (vx_int16 *)((vx_uint8 *)in->base[0] + + (y + j)*in->addr[0].stride_y + + (x + i)*in->addr[0].stride_x); + int16x8_t neighbor_val_16x8 = vld1q_s16(neighbor); + uint8x8_t _mask_8x8_i; + vx_uint8 *_maskp_i; + if (mask->base[0] != NULL) + { + _maskp_i = (vx_uint8 *)mask->base[0] + (y + j)*mask->addr[0].stride_y + (x + i)*mask->addr[0].stride_x; + _mask_8x8_i = vld1_u8(_maskp_i); + } + else + { + _mask_8x8_i = vdup_n_u8(0); + } + + uint8x8_t maskequal0_8x8_i = vceq_u8(_mask_8x8_i, vdup_n_u8(0));//(*_mask == 0) + uint16x8_t j1 = vdupq_n_u16(0); + if(j < 0 || (j == 0 && i <= 0)) + { + j1 = vdupq_n_u16(1); + } + uint16x8_t j2 = vdupq_n_u16(0); + if(j > 0 || (j == 0 && i > 0)) + { + j2 = vdupq_n_u16(1); + } + uint16x8_t srcltval = vcltq_s16(src_val_16x8, neighbor_val_16x8);//< + uint16x8_t srclqval = vcleq_s16(src_val_16x8, neighbor_val_16x8);//<= + + uint16x8_t result_16x8 = vandq_u16(vmovl_u8(maskequal0_8x8_i), + vorrq_u16(vandq_u16(j1, srcltval),vandq_u16(j2,srclqval))); + if(vgetq_lane_u16(result_16x8, 0) != 0 && vget_lane_u8(t_8x8, 0) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 0); + t_8x8 = vset_lane_u8(1, t_8x8, 0); + 
} + if(vget_lane_u8(t_8x8, 0) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 0), dst_16x8, 0); + } + + if(vgetq_lane_u16(result_16x8, 1) != 0 && vget_lane_u8(t_8x8, 1) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 1); + t_8x8 = vset_lane_u8(1, t_8x8, 1); + } + if(vget_lane_u8(t_8x8, 1) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 1), dst_16x8, 1); + } + + if(vgetq_lane_u16(result_16x8, 2) != 0 && vget_lane_u8(t_8x8, 2) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 2); + t_8x8 = vset_lane_u8(1, t_8x8, 2); + } + if(vget_lane_u8(t_8x8, 2) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 2), dst_16x8, 2); + } + + if(vgetq_lane_u16(result_16x8, 3) != 0 && vget_lane_u8(t_8x8, 3) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 3); + t_8x8 = vset_lane_u8(1, t_8x8, 3); + } + if(vget_lane_u8(t_8x8, 3) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 3), dst_16x8, 3); + } + + if(vgetq_lane_u16(result_16x8, 4) != 0 && vget_lane_u8(t_8x8, 4) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 4); + t_8x8 = vset_lane_u8(1, t_8x8, 4); + } + if(vget_lane_u8(t_8x8, 4) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 4), dst_16x8, 4); + } + + if(vgetq_lane_u16(result_16x8, 5) != 0 && vget_lane_u8(t_8x8, 5) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 5); + t_8x8 = vset_lane_u8(1, t_8x8, 5); + } + if(vget_lane_u8(t_8x8, 5) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 5), dst_16x8, 5); + } + + if(vgetq_lane_u16(result_16x8, 6) != 0 && vget_lane_u8(t_8x8, 6) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 6); + t_8x8 = vset_lane_u8(1, t_8x8, 6); + } + if(vget_lane_u8(t_8x8, 6) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 6), dst_16x8, 6); + } + + if(vgetq_lane_u16(result_16x8, 7) != 0 && vget_lane_u8(t_8x8, 7) ==0) + { + dst_16x8 = vsetq_lane_s16(INT16_MIN, dst_16x8, 7); + t_8x8 = vset_lane_u8(1, t_8x8, 7); + } + if(vget_lane_u8(t_8x8, 7) ==0) + { + dst_16x8 = vsetq_lane_s16(vgetq_lane_s16(src_val_16x8, 7), dst_16x8, 7); + } + } + } + vst1q_s16(dest, dst_16x8); + } + } + } +} + +#define NONMAXSUPPRESSION_FLEXIBLE(low_y, low_x, high_y, high_x, in_tile_x, out_tile_x)\ +for (vx_int32 y = low_y; y < high_y; y++)\ +{\ + for (vx_int32 x = low_x; x < high_x; x++)\ + {\ + vx_uint8 *_mask;\ + if (mask->base[0] != NULL)\ + {\ + _mask = (vx_uint8 *)mask->base[0] + mask->tile_x + y * mask->addr[0].stride_y + x * mask->addr[0].stride_x;\ + }\ + else\ + {\ + _mask = &mask_data;\ + }\ + void *val_p = (vx_uint8 *)in->base[0] + in_tile_x + y * in->addr[0].stride_y + x * in->addr[0].stride_x;\ + void *dest = (vx_uint8 *)out->base[0] + out_tile_x + y * out->addr[0].stride_y + x * out->addr[0].stride_x;\ + vx_int32 src_val = format == VX_DF_IMAGE_U8 ? 
*(vx_uint8 *)val_p : *(vx_int16 *)val_p;\ + if (*_mask != 0)\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = (vx_uint8)src_val;\ + else\ + *(vx_int16 *)dest = (vx_int16)src_val;\ + }\ + else\ + {\ + vx_bool flag = 1;\ + for (vx_int32 i = -border; i <= border; i++)\ + {\ + for (vx_int32 j = -border; j <= border; j++)\ + {\ + void *neighbor = (vx_uint8 *)in->base[0] + in_tile_x + (y + j) * in->addr[0].stride_y + (x + i) * in->addr[0].stride_x;\ + if (mask->base[0] != NULL)\ + {\ + _mask = (vx_uint8 *)mask->base[0] + mask->tile_x + (y + j) * mask->addr[0].stride_y + (x + i) * mask->addr[0].stride_x;\ + }\ + else\ + {\ + _mask = &mask_data;\ + }\ + vx_int32 neighbor_val = format == VX_DF_IMAGE_U8 ? *(vx_uint8 *)neighbor : *(vx_int16 *)neighbor;\ + if ((*_mask == 0)\ + && (((j < 0 || (j == 0 && i <= 0)) && (src_val < neighbor_val))\ + || ((j > 0 || (j == 0 && i > 0)) && (src_val <= neighbor_val))))\ + {\ + flag = 0;\ + break;\ + }\ + }\ + if (flag == 0)\ + {\ + break;\ + }\ + }\ + if (flag)\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = (vx_uint8)src_val;\ + else\ + *(vx_int16 *)dest = (vx_int16)src_val;\ + }\ + else\ + {\ + if (format == VX_DF_IMAGE_U8)\ + *(vx_uint8 *)dest = 0;\ + else\ + *(vx_int16 *)dest = INT16_MIN;\ + }\ + }\ + }\ +}\ + + +void NonMaxSuppression_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_uint8 mask_data = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *mask = (vx_tile_t *)parameters[1]; + vx_int32 *wsize = (vx_int32*)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + vx_df_image format = in->image.format; + vx_int32 border = *wsize / 2; + + if (ty == 0 && tx == 0) + { + NONMAXSUPPRESSION_FLEXIBLE(border, border, (vxTileHeight(out, 0) - border), (vxTileWidth(out, 0) - border), in->tile_x, out->tile_x) + } + else + { + NONMAXSUPPRESSION_FLEXIBLE(border, tx, ty, (vxTileWidth(out, 0) - border), in->tile_x, out->tile_x) + NONMAXSUPPRESSION_FLEXIBLE(ty, border, (vxTileHeight(out, 0) - border), (vxTileWidth(out, 0) - border), 0, 0) + } +} diff --git a/kernels/tiling/tiling_phase.c b/kernels/tiling/tiling_phase.c new file mode 100644 index 0000000..e32dd6d --- /dev/null +++ b/kernels/tiling/tiling_phase.c @@ -0,0 +1,261 @@ +/* +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include +#include + +#include +#include + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +#define DBL_EPSILON 2.2204460492503131e-016 /* smallest such that 1.0+DBL_EPSILON != 1.0 */ + +static float32x4_t vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +#define FASTATAN2CONST(scale) \ + vx_float32 P1 = ((vx_float32)( 0.9997878412794807 * (180.0 / M_PI) * scale)), \ + P3 = ((vx_float32)(-0.3258083974640975 * (180.0 / M_PI) * scale)), \ + P5 = ((vx_float32)( 0.1555786518463281 * (180.0 / M_PI) * scale)), \ + P7 = ((vx_float32)(-0.04432655554792128 * (180.0 / M_PI) * scale)), \ + A_90 = ((vx_float32)(90.f * scale)), \ + A_180 = ((vx_float32)(180.f * scale)), \ + A_360 = ((vx_float32)(360.f * scale)); \ + float32x4_t eps = (vdupq_n_f32((vx_float32)DBL_EPSILON)), \ + _90 = (vdupq_n_f32(A_90)), \ + _180 = (vdupq_n_f32(A_180)), \ + _360 = (vdupq_n_f32(A_360)), \ + z = (vdupq_n_f32(0.0f)), \ + p1 = (vdupq_n_f32(P1)), \ + p3 = (vdupq_n_f32(P3)), \ + p5 = (vdupq_n_f32(P5)), \ + p7 = (vdupq_n_f32(P7)); + +#define FASTATAN2SCALAR(y, x, a) \ + { \ + vx_float32 ax = abs(x), ay = abs(y); \ + vx_float32 c, c2; \ + if (ax >= ay) \ + { \ + c = ay / (ax + (vx_float32)DBL_EPSILON); \ + c2 = c * c; \ + a = (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + else \ + { \ + c = ax / (ay + (vx_float32)DBL_EPSILON); \ + c2 = c * c; \ + a = A_90 - (((P7 * c2 + P5) * c2 + P3) * c2 + P1) * c; \ + } \ + if (x < 0) \ + a = A_180 - a; \ + if (y < 0) \ + a = A_360 - a; \ + } + +#define FASTATAN2VECTOR(v_y, v_x, a) \ + { \ + float32x4_t ax = vabsq_f32(v_x), ay = vabsq_f32(v_y); \ + float32x4_t tmin = vminq_f32(ax, ay), tmax = vmaxq_f32(ax, ay); \ + float32x4_t c = vmulq_f32(tmin, vrecpq_f32(vaddq_f32(tmax, eps))); \ + float32x4_t c2 = vmulq_f32(c, c); \ + a = vmulq_f32(c2, p7); \ + \ + a = vmulq_f32(vaddq_f32(a, p5), c2); \ + a = vmulq_f32(vaddq_f32(a, p3), c2); \ + a = vmulq_f32(vaddq_f32(a, p1), c); \ + \ + a = vbslq_f32(vcgeq_f32(ax, ay), a, vsubq_f32(_90, a)); \ + a = vbslq_f32(vcltq_f32(v_x, z), vsubq_f32(_180, a), a); \ + a = vbslq_f32(vcltq_f32(v_y, z), vsubq_f32(_360, a), a); \ + \ + } + + +void Phase_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *grad_x = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base_x = grad_x->base[0]; + vx_uint8 *src_base_y = grad_y->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = out->tile_x + out->tile_block.width; + + FASTATAN2CONST(256.0f / 360.0f); + vx_uint32 roiw16 = high_x >= 15 ? high_x - 15 : 0; + vx_uint32 roiw8 = high_x >= 7 ? 
high_x - 7 : 0; + + float32x4_t v_05 = vdupq_n_f32(0.5f); + + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + x = low_x; + + for (; x < roiw16; x += 16) + { + int16x8_t v_src00 = vld1q_s16(src0 + x), v_src01 = vld1q_s16(src0 + x + 8); + int16x8_t v_src10 = vld1q_s16(src1 + x), v_src11 = vld1q_s16(src1 + x + 8); + + // 0 + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + // 1 + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11))); + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1q_u8(dst + x, vcombine_u8(vmovn_u16(v_dst16s0), + vmovn_u16(v_dst16s1))); + } + + for (; x < roiw8; x += 8) + { + int16x8_t v_src0 = vld1q_s16(src0 + x); + int16x8_t v_src1 = vld1q_s16(src1 + x); + + float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0))); + float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))); + float32x4_t v_dst32f0; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0); + + v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0))); + v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))); + float32x4_t v_dst32f1; + FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1); + + uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))), + vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05)))); + + vst1_u8(dst + x, vmovn_u16(v_dst)); + } + + for (; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } +} + + +void Phase_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x, y; + vx_tile_t *grad_x = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base_x = grad_x->base[0]; + vx_uint8 *src_base_y = grad_y->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + FASTATAN2CONST(256.0f / 360.0f); + + if (low_y == 0 && low_x == 0) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for 
(x = low_x; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + } + else + { + for (y = 0; y < low_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = low_x; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + for (y = low_y; y < high_y; y++) + { + const vx_int16 * src0 = (vx_int16 *)src_base_x + y * grad_x->addr->stride_y / 2; + const vx_int16 * src1 = (vx_int16 *)src_base_y + y * grad_y->addr->stride_y / 2; + vx_uint8 * dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < high_x; x++) + { + vx_float32 val_x = src0[x], val_y = src1[x]; + vx_float32 a; + FASTATAN2SCALAR(val_y, val_x, a); + dst[x] = (vx_uint8)(vx_uint32)floor(a + 0.5f); + } + } + } +} diff --git a/kernels/tiling/tiling_scale.c b/kernels/tiling/tiling_scale.c new file mode 100644 index 0000000..37c3c07 --- /dev/null +++ b/kernels/tiling/tiling_scale.c @@ -0,0 +1,841 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +static vx_bool read_pixel(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_int32 x, vx_int32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + if (out_of_bounds) + { + return vx_false_e; + } + + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void read_pixel_v(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + int32x4_t x_32x4, int32x4_t y_32x4, vx_uint8 *dst) +{ + int32x4_t o_32x4 = vdupq_n_s32(0); + int32x4_t dimx_32x4 = vdupq_n_s32((vx_int32)src_width); + int32x4_t dimy_32x4 = vdupq_n_s32((vx_int32)src_height); + uint32x4_t out_of_bounds_32x4 = vorrq_u32(vorrq_u32(vcltq_s32(x_32x4, o_32x4), vcltq_s32(y_32x4, o_32x4)), + vorrq_u32(vcgeq_s32(x_32x4, dimx_32x4),vcgeq_s32(y_32x4, dimy_32x4))); + + char flag_1 = 0; + char flag_2 = 0; + char flag_3 = 0; + char flag_4 = 0; + if(vgetq_lane_u32(out_of_bounds_32x4, 0) == 0xFFFFFFFF) + { + flag_1 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 1) == 0xFFFFFFFF) + { + flag_2 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 2) == 0xFFFFFFFF) + { + flag_3 = 1; + } + if(vgetq_lane_u32(out_of_bounds_32x4, 3) == 0xFFFFFFFF) + { + flag_4 = 1; + } + + vx_uint8 *bpixel = NULL; + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = 0; + if(flag_1 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 0) + vgetq_lane_s32(x_32x4, 0)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *dst = *bpixel; + } + if(flag_2 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 1) + vgetq_lane_s32(x_32x4, 1)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+1) = *bpixel; + } + if(flag_3 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 2) + vgetq_lane_s32(x_32x4, 2)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+2) = *bpixel; + } + if(flag_4 == 0) + { + offset = (addr->stride_y * vgetq_lane_s32(y_32x4, 3) + vgetq_lane_s32(x_32x4, 3)); + new_ptr = (vx_uint8*)base; + bpixel = &new_ptr[offset]; + *(dst+3) = *bpixel; + } +} + +static vx_bool read_pixel_16s(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_int32 x, vx_int32 y, vx_int16 *pixel) +{ + vx_uint32 bx; + vx_uint32 by; + vx_int16* bpixel; + + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_int16 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_int16*)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void vxNearestScaling_fast(vx_tile_t *src_image, vx_tile_t *dst_image) +{ + vx_int32 x1,y1,x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + vx_df_image format = 0; + + w1 = src_image->image.width; + h1 = src_image->image.height; + format = src_image->image.format; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + vx_uint32 low_height = dst_image->tile_y; + vx_uint32 height = dst_image->tile_y + dst_image->tile_block.height; + + vx_uint32 low_width = dst_image->tile_x; + vx_uint32 width = dst_image->tile_x + dst_image->tile_block.width; + + float32x4_t fv_0_5_32x4 = vdupq_n_f32(0.5f); + float32x4_t fv_wr_32x4 = vdupq_n_f32(wr); + float32x4_t fv_hr_32x4 = vdupq_n_f32(hr); + for (y2 = low_height; y2 < height; y2++) + { + vx_uint8* dst_u8 = (vx_uint8 *)dst_image->base[0] + dst_image->tile_x + y2 * dst_image->addr[0].stride_y; + vx_int16* dst_base_s16 = (vx_int16*)dst_image->base[0] + dst_image->tile_x + y2 * dst_image->addr[0].stride_y/2; + float32x4_t y2_32x4 = vdupq_n_f32((float32_t)y2); + float32x4_t y_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(y2_32x4,fv_0_5_32x4), fv_hr_32x4), fv_0_5_32x4); + + int32x4_t y1_32x4 = vcvtq_s32_f32(vaddq_f32(y_src_32x4, fv_0_5_32x4)); + for (x2 = low_width; x2 < width; x2 += 8) + { + float32_t arr_int32[4]={(float32_t)x2, (float32_t)(x2+1), (float32_t)(x2+2), (float32_t)(x2+3)}; + float32x4_t x2_32x4 = vld1q_f32(arr_int32); + float32x4_t x_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4,fv_0_5_32x4), fv_wr_32x4), fv_0_5_32x4); + + arr_int32[0] = (float32_t)(x2+4); + arr_int32[1] = (float32_t)(x2+5); + arr_int32[2] = (float32_t)(x2+6); + arr_int32[3] = (float32_t)(x2+7); + float32x4_t x2_32x4_1 = vld1q_f32(arr_int32); + float32x4_t x_src_32x4_1 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4_1,fv_0_5_32x4), fv_wr_32x4), fv_0_5_32x4); + int32x4_t x1_32x4 = vcvtq_s32_f32(vaddq_f32(x_src_32x4, fv_0_5_32x4)); + int32x4_t x1_32x4_1 = vcvtq_s32_f32(vaddq_f32(x_src_32x4_1, fv_0_5_32x4)); + + if (VX_DF_IMAGE_U8 == format) + { + read_pixel_v((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1_32x4, y1_32x4, dst_u8); + read_pixel_v((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1_32x4_1, y1_32x4, (dst_u8+4)); + dst_u8 += 8; + } + else + { + vx_int16 v = 0; + vx_int16* dst = dst_base_s16 + dst_image->addr[0].stride_x/2*x2; + if (dst && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 0),vgetq_lane_s32(y1_32x4, 0),&v)) + *dst = v; + v=0; + if ((dst+1) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 1),vgetq_lane_s32(y1_32x4, 1),&v)) + *(dst+1) = v; + v=0; + if ((dst+2) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 2),vgetq_lane_s32(y1_32x4, 2),&v)) + *(dst+2) = v; + v=0; + if ((dst+3) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + 
src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4, 3),vgetq_lane_s32(y1_32x4, 3),&v)) + *(dst+3) = v; + + if ((dst+4) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 0),vgetq_lane_s32(y1_32x4, 0),&v)) + *(dst+4) = v; + v=0; + if ((dst+5) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 1),vgetq_lane_s32(y1_32x4, 1),&v)) + *(dst+5) = v; + v=0; + if ((dst+6) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 2),vgetq_lane_s32(y1_32x4, 2),&v)) + *(dst+6) = v; + v=0; + if ((dst+7) && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0] + src_image->tile_x, src_image->addr,h1, w1,vgetq_lane_s32(x1_32x4_1, 3),vgetq_lane_s32(y1_32x4, 3),&v)) + *(dst+7) = v; + } + } + } +} + +static void vxBilinearScaling_fast(vx_tile_t *src_image, vx_tile_t *dst_image) +{ + vx_int32 x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_imagepatch_addressing_t src_addr, dst_addr; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + + w1 = src_image->image.width; + h1 = src_image->image.height; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + vx_uint32 low_height = dst_image->tile_y; + vx_uint32 height = dst_image->tile_y + dst_image->tile_block.height; + + vx_uint32 low_width = dst_image->tile_x; + vx_uint32 width = dst_image->tile_x + dst_image->tile_block.width; + + for (y2 = low_height; y2 < height; y2++) + { + for (x2 = low_width; x2 < width; x2 += 8) + { + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0; + vx_uint8* dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + dst_image->addr[0].stride_x*x2; + float32x4_t x2_32x4; + x2_32x4 = vsetq_lane_f32((vx_float32)x2, x2_32x4, 0); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+1), x2_32x4, 1); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+2), x2_32x4, 2); + x2_32x4 = vsetq_lane_f32((vx_float32)(x2+3), x2_32x4, 3); + float32x4_t x_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4,vdupq_n_f32(0.5f)), vdupq_n_f32(wr)),vdupq_n_f32(0.5f)); + + float32x4_t x2_32x4_1; + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+4), x2_32x4_1, 0); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+5), x2_32x4_1, 1); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+6), x2_32x4_1, 2); + x2_32x4_1 = vsetq_lane_f32((vx_float32)(x2+7), x2_32x4_1, 3); + float32x4_t x_src_32x4_1 = vsubq_f32(vmulq_f32(vaddq_f32(x2_32x4_1,vdupq_n_f32(0.5f)), vdupq_n_f32(wr)),vdupq_n_f32(0.5f)); + + float32x4_t y2_32x4 = vdupq_n_f32((vx_float32)y2); + float32x4_t y_src_32x4 = vsubq_f32(vmulq_f32(vaddq_f32(y2_32x4,vdupq_n_f32(0.5f)), vdupq_n_f32(hr)),vdupq_n_f32(0.5f)); + + float32x4_t x_min_32x4; + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 0)), x_min_32x4, 0); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 1)), x_min_32x4, 1); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 2)), x_min_32x4, 2); + x_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4, 3)), x_min_32x4, 3); + + float32x4_t x_min_32x4_1; + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 0)), x_min_32x4_1, 0); + x_min_32x4_1 = 
vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 1)), x_min_32x4_1, 1); + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 2)), x_min_32x4_1, 2); + x_min_32x4_1 = vsetq_lane_f32(floorf(vgetq_lane_f32(x_src_32x4_1, 3)), x_min_32x4_1, 3); + + float32x4_t y_min_32x4; + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 0)), y_min_32x4, 0); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 1)), y_min_32x4, 1); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 2)), y_min_32x4, 2); + y_min_32x4 = vsetq_lane_f32(floorf(vgetq_lane_f32(y_src_32x4, 3)), y_min_32x4, 3); + + float32x4_t s_32x4 = vsubq_f32(x_src_32x4, x_min_32x4); + float32x4_t s_32x4_1 = vsubq_f32(x_src_32x4_1, x_min_32x4_1); + + float32_t s_0 = vgetq_lane_f32(s_32x4, 0); + float32_t s_1 = vgetq_lane_f32(s_32x4, 1); + float32_t s_2 = vgetq_lane_f32(s_32x4, 2); + float32_t s_3 = vgetq_lane_f32(s_32x4, 3); + + float32_t s_4 = vgetq_lane_f32(s_32x4_1, 0); + float32_t s_5 = vgetq_lane_f32(s_32x4_1, 1); + float32_t s_6 = vgetq_lane_f32(s_32x4_1, 2); + float32_t s_7 = vgetq_lane_f32(s_32x4_1, 3); + + float32x4_t t_32x4 = vsubq_f32(y_src_32x4, y_min_32x4); + + float32_t t_0 = vgetq_lane_f32(t_32x4, 0); + float32_t t_1 = vgetq_lane_f32(t_32x4, 1); + float32_t t_2 = vgetq_lane_f32(t_32x4, 2); + float32_t t_3 = vgetq_lane_f32(t_32x4, 3); + + // the first time + vx_bool defined_tl_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 0), &tl); + vx_bool defined_tr_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 0), &tr); + vx_bool defined_bl_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 0)+1, &bl); + vx_bool defined_br_0 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 0)+1, &br); + vx_bool defined_0 = defined_tl_0 & defined_tr_0 & defined_bl_0 & defined_br_0; + if (defined_0 == vx_false_e) + { + vx_bool defined_any = defined_tl_0 | defined_tr_0 | defined_bl_0 | defined_br_0; + if (defined_any) + { + if ((defined_tl_0 == vx_false_e || defined_tr_0 == vx_false_e) && fabs(t_0 - 1.0) <= 0.001) + defined_tl_0 = defined_tr_0 = vx_true_e; + else if ((defined_bl_0 == vx_false_e || defined_br_0 == vx_false_e) && fabs(t_0 - 0.0) <= 0.001) + defined_bl_0 = defined_br_0 = vx_true_e; + if ((defined_tl_0 == vx_false_e || defined_bl_0 == vx_false_e) && fabs(s_0 - 1.0) <= 0.001) + defined_tl_0 = defined_bl_0 = vx_true_e; + else if ((defined_tr_0 == vx_false_e || defined_br_0 == vx_false_e) && fabs(s_0 - 0.0) <= 0.001) + defined_tr_0 = defined_br_0 = vx_true_e; + defined_0 = defined_tl_0 & defined_tr_0 & defined_bl_0 & defined_br_0; + } + } + if (defined_0 == vx_true_e) + { + vx_float32 ref = + (1 - s_0) * (1 - t_0) * tl + + ( s_0) * (1 - t_0) * tr + + (1 - s_0) * ( t_0) * bl + + ( s_0) * ( t_0) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst) + *dst = ref_8u; + } + + // the second time + vx_bool defined_tl_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 1), &tl); + vx_bool defined_tr_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, 
(vx_int32)vgetq_lane_f32(x_min_32x4, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 1), &tr); + vx_bool defined_bl_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 1)+1, &bl); + vx_bool defined_br_1 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 1)+1, &br); + vx_bool defined_1 = defined_tl_1 & defined_tr_1 & defined_bl_1 & defined_br_1; + if (defined_1 == vx_false_e) + { + vx_bool defined_any = defined_tl_1 | defined_tr_1 | defined_bl_1 | defined_br_1; + if (defined_any) + { + if ((defined_tl_1 == vx_false_e || defined_tr_1 == vx_false_e) && fabs(t_1 - 1.0) <= 0.001) + defined_tl_1 = defined_tr_1 = vx_true_e; + else if ((defined_bl_1 == vx_false_e || defined_br_1 == vx_false_e) && fabs(t_1 - 0.0) <= 0.001) + defined_bl_1 = defined_br_1 = vx_true_e; + if ((defined_tl_1 == vx_false_e || defined_bl_1 == vx_false_e) && fabs(s_1 - 1.0) <= 0.001) + defined_tl_1 = defined_bl_1 = vx_true_e; + else if ((defined_tr_1 == vx_false_e || defined_br_1 == vx_false_e) && fabs(s_1 - 0.0) <= 0.001) + defined_tr_1 = defined_br_1 = vx_true_e; + defined_1 = defined_tl_1 & defined_tr_1 & defined_bl_1 & defined_br_1; + } + } + if (defined_1 == vx_true_e) + { + vx_float32 ref = + (1 - s_1) * (1 - t_1) * tl + + ( s_1) * (1 - t_1) * tr + + (1 - s_1) * ( t_1) * bl + + ( s_1) * ( t_1) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+1) + *(dst+1) = ref_8u; + } + + // the third time + vx_bool defined_tl_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 2), &tl); + vx_bool defined_tr_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 2), &tr); + vx_bool defined_bl_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 2)+1, &bl); + vx_bool defined_br_2 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 2)+1, &br); + vx_bool defined_2 = defined_tl_2 & defined_tr_2 & defined_bl_2 & defined_br_2; + if (defined_2 == vx_false_e) + { + vx_bool defined_any = defined_tl_2 | defined_tr_2 | defined_bl_2 | defined_br_2; + if (defined_any) + { + if ((defined_tl_2 == vx_false_e || defined_tr_2 == vx_false_e) && fabs(t_2 - 1.0) <= 0.001) + defined_tl_2 = defined_tr_2 = vx_true_e; + else if ((defined_bl_2 == vx_false_e || defined_br_2 == vx_false_e) && fabs(t_2 - 0.0) <= 0.001) + defined_bl_2 = defined_br_2 = vx_true_e; + if ((defined_tl_2 == vx_false_e || defined_bl_2 == vx_false_e) && fabs(s_2 - 1.0) <= 0.001) + defined_tl_2 = defined_bl_2 = vx_true_e; + else if ((defined_tr_2 == vx_false_e || defined_br_2 == vx_false_e) && fabs(s_2 - 0.0) <= 0.001) + defined_tr_2 = defined_br_2 = vx_true_e; + defined_2 = defined_tl_2 & defined_tr_2 & defined_bl_2 & defined_br_2; + } + } + if (defined_2 == vx_true_e) + { + vx_float32 ref = + (1 - s_2) * (1 - t_2) * tl + + ( s_2) * (1 - t_2) * tr + + (1 - s_2) * ( t_2) * bl + + ( s_2) * ( t_2) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+2) + *(dst+2) = ref_8u; + } + + // the fourth time + vx_bool defined_tl_3 
= read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_3 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_3 = defined_tl_3 & defined_tr_3 & defined_bl_3 & defined_br_3; + if (defined_3 == vx_false_e) + { + vx_bool defined_any = defined_tl_3 | defined_tr_3 | defined_bl_3 | defined_br_3; + if (defined_any) + { + if ((defined_tl_3 == vx_false_e || defined_tr_3 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_3 = defined_tr_3 = vx_true_e; + else if ((defined_bl_3 == vx_false_e || defined_br_3 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_3 = defined_br_3 = vx_true_e; + if ((defined_tl_3 == vx_false_e || defined_bl_3 == vx_false_e) && fabs(s_3 - 1.0) <= 0.001) + defined_tl_3 = defined_bl_3 = vx_true_e; + else if ((defined_tr_3 == vx_false_e || defined_br_3 == vx_false_e) && fabs(s_3 - 0.0) <= 0.001) + defined_tr_3 = defined_br_3 = vx_true_e; + defined_3 = defined_tl_3 & defined_tr_3 & defined_bl_3 & defined_br_3; + } + } + if (defined_3 == vx_true_e) + { + vx_float32 ref = + (1 - s_3) * (1 - t_3) * tl + + ( s_3) * (1 - t_3) * tr + + (1 - s_3) * ( t_3) * bl + + ( s_3) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+3) + *(dst+3) = ref_8u; + } + + // the fifth time + vx_bool defined_tl_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_4 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 0)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_4 = defined_tl_4 & defined_tr_4 & defined_bl_4 & defined_br_4; + if (defined_4 == vx_false_e) + { + vx_bool defined_any = defined_tl_4 | defined_tr_4 | defined_bl_4 | defined_br_4; + if (defined_any) + { + if ((defined_tl_4 == vx_false_e || defined_tr_4 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_4 = defined_tr_4 = vx_true_e; + else if ((defined_bl_4 == vx_false_e || defined_br_4 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_4 = defined_br_4 = vx_true_e; + if ((defined_tl_4 == vx_false_e || defined_bl_4 == vx_false_e) && fabs(s_4 - 1.0) <= 0.001) + defined_tl_4 = defined_bl_4 = vx_true_e; + else if ((defined_tr_4 == vx_false_e || defined_br_4 == vx_false_e) && fabs(s_4 - 0.0) <= 0.001) + defined_tr_4 = defined_br_4 = vx_true_e; + defined_4 = defined_tl_4 & defined_tr_4 & defined_bl_4 & defined_br_4; + } + } + if (defined_4 == vx_true_e) + { + vx_float32 ref = + (1 - s_4) * (1 - 
t_3) * tl + + ( s_4) * (1 - t_3) * tr + + (1 - s_4) * ( t_3) * bl + + ( s_4) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+4) + *(dst+4) = ref_8u; + } + + // the sixth time + vx_bool defined_tl_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_5 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 1)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_5 = defined_tl_5 & defined_tr_5 & defined_bl_5 & defined_br_5; + if (defined_5 == vx_false_e) + { + vx_bool defined_any = defined_tl_5 | defined_tr_5 | defined_bl_5 | defined_br_5; + if (defined_any) + { + if ((defined_tl_5 == vx_false_e || defined_tr_5 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_5 = defined_tr_5 = vx_true_e; + else if ((defined_bl_5 == vx_false_e || defined_br_5 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_5 = defined_br_5 = vx_true_e; + if ((defined_tl_5 == vx_false_e || defined_bl_5 == vx_false_e) && fabs(s_5 - 1.0) <= 0.001) + defined_tl_5 = defined_bl_5 = vx_true_e; + else if ((defined_tr_5 == vx_false_e || defined_br_5 == vx_false_e) && fabs(s_5 - 0.0) <= 0.001) + defined_tr_5 = defined_br_5 = vx_true_e; + defined_5 = defined_tl_5 & defined_tr_5 & defined_bl_5 & defined_br_5; + } + } + if (defined_5 == vx_true_e) + { + vx_float32 ref = + (1 - s_5) * (1 - t_3) * tl + + ( s_5) * (1 - t_3) * tr + + (1 - s_5) * ( t_3) * bl + + ( s_5) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+5) + *(dst+5) = ref_8u; + } + + // the seventh time + vx_bool defined_tl_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_6 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 2)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_6 = defined_tl_6 & defined_tr_6 & defined_bl_6 & defined_br_6; + if (defined_6 == vx_false_e) + { + vx_bool defined_any = defined_tl_6 | defined_tr_6 | defined_bl_6 | defined_br_6; + if (defined_any) + { + if ((defined_tl_6 == vx_false_e || defined_tr_6 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_6 = defined_tr_6 = vx_true_e; + else if ((defined_bl_6 == vx_false_e || defined_br_6 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_6 = defined_br_6 = vx_true_e; + if ((defined_tl_6 == vx_false_e || defined_bl_6 == vx_false_e) && fabs(s_6 - 1.0) <= 0.001) + defined_tl_6 = defined_bl_6 = vx_true_e; + else if ((defined_tr_6 == 
vx_false_e || defined_br_6 == vx_false_e) && fabs(s_6 - 0.0) <= 0.001) + defined_tr_6 = defined_br_6 = vx_true_e; + defined_6 = defined_tl_6 & defined_tr_6 & defined_bl_6 & defined_br_6; + } + } + if (defined_6 == vx_true_e) + { + vx_float32 ref = + (1 - s_6) * (1 - t_3) * tl + + ( s_6) * (1 - t_3) * tr + + (1 - s_6) * ( t_3) * bl + + ( s_6) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+6) + *(dst+6) = ref_8u; + } + + // the eighth time + vx_bool defined_tl_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tl); + vx_bool defined_tr_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3), &tr); + vx_bool defined_bl_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3), (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &bl); + vx_bool defined_br_7 = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, (vx_int32)vgetq_lane_f32(x_min_32x4_1, 3)+1, (vx_int32)vgetq_lane_f32(y_min_32x4, 3)+1, &br); + vx_bool defined_7 = defined_tl_7 & defined_tr_7 & defined_bl_7 & defined_br_7; + if (defined_7 == vx_false_e) + { + vx_bool defined_any = defined_tl_7 | defined_tr_7 | defined_bl_7 | defined_br_7; + if (defined_any) + { + if ((defined_tl_7 == vx_false_e || defined_tr_7 == vx_false_e) && fabs(t_3 - 1.0) <= 0.001) + defined_tl_7 = defined_tr_7 = vx_true_e; + else if ((defined_bl_7 == vx_false_e || defined_br_7 == vx_false_e) && fabs(t_3 - 0.0) <= 0.001) + defined_bl_7 = defined_br_7 = vx_true_e; + if ((defined_tl_7 == vx_false_e || defined_bl_7 == vx_false_e) && fabs(s_7 - 1.0) <= 0.001) + defined_tl_7 = defined_bl_7 = vx_true_e; + else if ((defined_tr_7 == vx_false_e || defined_br_7 == vx_false_e) && fabs(s_7 - 0.0) <= 0.001) + defined_tr_7 = defined_br_7 = vx_true_e; + defined_7 = defined_tl_7 & defined_tr_7 & defined_bl_7 & defined_br_7; + } + } + if (defined_7 == vx_true_e) + { + vx_float32 ref = + (1 - s_7) * (1 - t_3) * tl + + ( s_7) * (1 - t_3) * tr + + (1 - s_7) * ( t_3) * bl + + ( s_7) * ( t_3) * br; + vx_uint8 ref_8u; + if (ref > 255) + ref_8u = 255; + else + ref_8u = (vx_uint8)ref; + if (dst+7) + *(dst+7) = ref_8u; + } + } + } +} + +void ScaleImage_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *type = (vx_enum*)parameters[2]; + if (*type == VX_INTERPOLATION_BILINEAR) + { + vxBilinearScaling_fast(in, out); + } + else if (*type == VX_INTERPOLATION_AREA) + { + vxNearestScaling_fast(in, out); + } + else if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vxNearestScaling_fast(in, out); + } +} + + +#define NEAREST_SCALING(low_y, low_x, high_y, high_x, src_image_tile_x, dst_image_tile_x) \ + for (y2 = low_y; y2 < high_y; y2++)\ + {\ + for (x2 = low_x; x2 < high_x; x2++)\ + {\ + if (VX_DF_IMAGE_U8 == format)\ + {\ + vx_uint8 v = 0;\ + vx_uint8 *dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + x2 * dst_image->addr[0].stride_x;\ + vx_float32 x_src = ((vx_float32)x2 + 0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2 + 0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + x1 = (vx_int32)x_min;\ + y1 = (vx_int32)y_min;\ + if (x_src - 
x_min >= 0.5f)\ + x1++;\ + if (y_src - y_min >= 0.5f)\ + y1++;\ + if (dst && vx_true_e == read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1, y1, &v))\ + *dst = v;\ + }\ + else\ + {\ + vx_int16 v = 0;\ + vx_int16 *dst = (vx_int16 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y / 2+ x2 * dst_image->addr[0].stride_x /2;\ + vx_float32 x_src = ((vx_float32)x2 + 0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2 + 0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + x1 = (vx_int32)x_min;\ + y1 = (vx_int32)y_min;\ + if (x_src - x_min >= 0.5f)\ + x1++;\ + if (y_src - y_min >= 0.5f)\ + y1++;\ + if (dst && vx_true_e == read_pixel_16s((vx_int16 *)src_image->base[0], src_image->addr, h1, w1, x1, y1, &v))\ + *dst = v;\ + }\ + }\ + }\ + +static void vxNearestScaling(vx_tile_t *src_image, vx_tile_t *dst_image, vx_uint32 ty, vx_uint32 tx) +{ + vx_int32 x1,y1,x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + vx_df_image format = 0; + + w1 = src_image->image.width; + h1 = src_image->image.height; + format = src_image->image.format; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + if (ty == 0 && tx == 0) + { + NEAREST_SCALING(0, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + } + else + { + NEAREST_SCALING(0, tx, ty, vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + NEAREST_SCALING(ty, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), 0, 0) + } +} + +#define BILINEAR_SCALING(low_y, low_x, high_y, high_x, src_image_tile_x, dst_image_tile_x) \ + for (y2 = low_y; y2 < high_y; y2++)\ + {\ + for (x2 = low_x; x2 < high_x; x2++)\ + {\ + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0;\ + vx_uint8 *dst = (vx_uint8 *)dst_image->base[0] + y2 * dst_image->addr[0].stride_y + x2 * dst_image->addr[0].stride_x;\ + vx_float32 x_src = ((vx_float32)x2+0.5f)*wr - 0.5f;\ + vx_float32 y_src = ((vx_float32)y2+0.5f)*hr - 0.5f;\ + vx_float32 x_min = floorf(x_src);\ + vx_float32 y_min = floorf(y_src);\ + vx_int32 x1 = (vx_int32)x_min;\ + vx_int32 y1 = (vx_int32)y_min;\ + vx_float32 s = x_src - x_min;\ + vx_float32 t = y_src - y_min;\ + vx_bool defined_tl = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 0, y1 + 0, &tl);\ + vx_bool defined_tr = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 1, y1 + 0, &tr);\ + vx_bool defined_bl = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 0, y1 + 1, &bl);\ + vx_bool defined_br = read_pixel((vx_uint8 *)src_image->base[0], src_image->addr, h1, w1, x1 + 1, y1 + 1, &br);\ + vx_bool defined = defined_tl & defined_tr & defined_bl & defined_br;\ + if (defined == vx_false_e)\ + {\ + vx_bool defined_any = defined_tl | defined_tr | defined_bl | defined_br;\ + if (defined_any)\ + {\ + if ((defined_tl == vx_false_e || defined_tr == vx_false_e) && fabs(t - 1.0) <= 0.001)\ + defined_tl = defined_tr = vx_true_e;\ + else if ((defined_bl == vx_false_e || defined_br == vx_false_e) && fabs(t - 0.0) <= 0.001)\ + defined_bl = defined_br = vx_true_e;\ + if ((defined_tl == vx_false_e || defined_bl == vx_false_e) && fabs(s - 1.0) <= 0.001)\ + defined_tl = defined_bl = 
vx_true_e;\ + else if ((defined_tr == vx_false_e || defined_br == vx_false_e) && fabs(s - 0.0) <= 0.001)\ + defined_tr = defined_br = vx_true_e;\ + defined = defined_tl & defined_tr & defined_bl & defined_br;\ + }\ + }\ + if (defined == vx_true_e)\ + {\ + vx_float32 ref =\ + (1 - s) * (1 - t) * tl +\ + ( s) * (1 - t) * tr +\ + (1 - s) * ( t) * bl +\ + ( s) * ( t) * br;\ + vx_uint8 ref_8u;\ + if (ref > 255)\ + ref_8u = 255;\ + else\ + ref_8u = (vx_uint8)ref;\ + if (dst)\ + *dst = ref_8u;\ + }\ + }\ + }\ + + +static void vxBilinearScaling(vx_tile_t *src_image, vx_tile_t *dst_image, vx_uint32 ty, vx_uint32 tx) +{ + vx_int32 x2,y2; + vx_rectangle_t src_rect, dst_rect; + vx_imagepatch_addressing_t src_addr, dst_addr; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_float32 wr, hr; + + w1 = src_image->image.width; + h1 = src_image->image.height; + w2 = dst_image->image.width; + h2 = dst_image->image.height; + + src_rect.start_x = src_rect.start_y = 0; + src_rect.end_x = w1; + src_rect.end_y = h1; + + dst_rect.start_x = dst_rect.start_y = 0; + dst_rect.end_x = w2; + dst_rect.end_y = h2; + + wr = (vx_float32)w1/(vx_float32)w2; + hr = (vx_float32)h1/(vx_float32)h2; + + if (ty == 0 && tx == 0) + { + BILINEAR_SCALING(0, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + } + else + { + BILINEAR_SCALING(0, tx, ty, vxTileWidth(dst_image, 0), src_image->tile_x, dst_image->tile_x) + BILINEAR_SCALING(ty, 0, vxTileHeight(dst_image, 0), vxTileWidth(dst_image, 0), 0, 0) + } +} + +void ScaleImage_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 y, x; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *out = (vx_tile_t *)parameters[1]; + vx_enum *type = (vx_enum*)parameters[2]; + vx_uint32 ty = out->tile_y; + vx_uint32 tx = out->tile_x; + if (*type == VX_INTERPOLATION_BILINEAR) + { + vxBilinearScaling(in, out, ty, tx); + } + else if (*type == VX_INTERPOLATION_AREA) + { + vxNearestScaling(in, out, ty, tx); + } + else if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vxNearestScaling(in, out, ty, tx); + } +} + + diff --git a/kernels/tiling/tiling_sobel3x3.c b/kernels/tiling/tiling_sobel3x3.c new file mode 100644 index 0000000..c5eeca4 --- /dev/null +++ b/kernels/tiling/tiling_sobel3x3.c @@ -0,0 +1,239 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +#define SOBEL3x3_VALUE \ + const uint8x16_t top_data = vld1q_u8(top_src); \ + const uint8x16_t mid_data = vld1q_u8(mid_src); \ + const uint8x16_t bot_data = vld1q_u8(bot_src); \ + \ + const int16x8x2_t top_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) \ + } \ + }; \ + \ + const int16x8x2_t mid_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) \ + } \ + }; \ + const int16x8x2_t bot_s16 = \ + { \ + { \ + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), \ + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) \ + } \ + }; + +void Sobel3x3_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + int16x8_t two = vdupq_n_s16(2); + int16x8_t minustwo = vdupq_n_s16(-2); + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_x = (vx_tile_t *)parameters[1]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[2]; + + vx_uint8 *src_base = in->base[0] + in->tile_x; + + if (grad_x) + { + vx_int16 *grad_x_base = (vx_int16 *)grad_x->base[0] + grad_x->tile_x; + + vx_uint32 low_y = grad_x->tile_y; + vx_uint32 high_y = grad_x->tile_y + grad_x->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == grad_x->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_int16* dstp = (vx_int16 *)grad_x_base + 1 + y * grad_x->addr->stride_y / 2; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < grad_x->tile_block.width; x += 8) + { + SOBEL3x3_VALUE + //top left + int16x8_t out_x = vnegq_s16(top_s16.val[0]); + //top right + out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo); + //mid right + out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out_x = vsubq_s16(out_x, bot_s16.val[0]); + //bot right + out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(dstp, out_x); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dstp += 8; + } + } + } + if (grad_y) + { + vx_int16 *grad_y_base = (vx_int16 *)grad_y->base[0] + grad_y->tile_x; + + vx_uint32 low_y = grad_y->tile_y; + vx_uint32 high_y = grad_y->tile_y + grad_y->tile_block.height; + + if (low_y == 0) + { + low_y = 1; + } + if (high_y == grad_y->image.height) + { + high_y = high_y - 1; + } + + for (y = low_y; y < high_y; y++) + { + vx_int16* dstp = (vx_int16 *)grad_y_base + 1 + y * grad_y->addr->stride_y / 2; + vx_uint8* top_src = (vx_uint8 *)src_base + (y - 1) * in->addr->stride_y; + vx_uint8* mid_src = (vx_uint8 *)src_base + (y) * in->addr->stride_y; + vx_uint8* bot_src = (vx_uint8 *)src_base + (y + 1) * in->addr->stride_y; + + for (x = 0; x < grad_y->tile_block.width; x += 8) + { + SOBEL3x3_VALUE + //top left + int16x8_t out_y = vnegq_s16(top_s16.val[0]); + //top mid + out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo); + //top right + out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //bot left + out_y = vaddq_s16(out_y, bot_s16.val[0]); + //bot mid + out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + 
out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(dstp, out_y); + + top_src+=8; + mid_src+=8; + bot_src+=8; + dstp += 8; + } + } + } +} + + +#define SOBEL3x3_X(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 value = 0; \ + \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, 0) << 1; \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, 0) << 1; \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + \ + vxImagePixel(vx_int16, grad_x, 0, x, y, 0, 0) = (vx_int16)value; \ + } \ + } + +#define SOBEL3x3_Y(low_y, high_y, low_x, high_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_int32 value = 0; \ + \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, -1, -1); \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, 0, -1) << 1; \ + value -= vxImagePixel(vx_uint8, in, 0, x, y, +1, -1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, -1, +1); \ + value += vxImagePixel(vx_uint8, in, 0, x, y, 0, +1) << 1; \ + value += vxImagePixel(vx_uint8, in, 0, x, y, +1, +1); \ + \ + vxImagePixel(vx_int16, grad_y, 0, x, y, 0, 0) = (vx_int16)value; \ + } \ + } + + +void Sobel3x3_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_t *grad_x = (vx_tile_t *)parameters[1]; + vx_tile_t *grad_y = (vx_tile_t *)parameters[2]; + + if (grad_x) + { + vx_uint32 low_y = grad_x->tile_y; + vx_uint32 high_y = vxTileHeight(grad_x, 0); + vx_uint32 low_x = grad_x->tile_x; + vx_uint32 high_x = vxTileWidth(grad_x, 0); + if (low_y == 0 && low_x == 0) + { + SOBEL3x3_X(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + SOBEL3x3_X(1, low_y, low_x, high_x - 1) + SOBEL3x3_X(low_y, high_y, 1, high_x - 1) + } + } + if (grad_y) + { + vx_uint32 low_y = grad_y->tile_y; + vx_uint32 high_y = vxTileHeight(grad_y, 0); + vx_uint32 low_x = grad_y->tile_x; + vx_uint32 high_x = vxTileWidth(grad_y, 0); + if (low_y == 0 && low_x == 0) + { + SOBEL3x3_Y(low_y + 1, high_y - 1, low_x + 1, high_x - 1) + } + else + { + SOBEL3x3_Y(1, low_y, low_x, high_x - 1) + SOBEL3x3_Y(low_y, high_y, 1, high_x - 1) + } + } +} diff --git a/kernels/tiling/tiling_threshold.c b/kernels/tiling/tiling_threshold.c new file mode 100644 index 0000000..b8b8413 --- /dev/null +++ b/kernels/tiling/tiling_threshold.c @@ -0,0 +1,296 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include + +void Threshold_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_threshold_t *threshold = (vx_tile_threshold_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_uint8 true_value_u8 = threshold->true_value.U8; + vx_uint8 false_value_u8 = threshold->false_value.U8; + + vx_uint8 _threshold_u8 = threshold->value.U8; + vx_uint8 _lower_threshold_u8 = threshold->lower.U8; + vx_uint8 _upper_threshold_u8 = threshold->upper.U8; + + vx_int16 _threshold_s16 = threshold->value.S16; + vx_int16 _lower_threshold_s16 = threshold->lower.S16; + vx_int16 _upper_threshold_s16 = threshold->upper.S16; + + vx_int32 format = threshold->input_format; + vx_int32 type = threshold->thresh_type; + + if (format == VX_DF_IMAGE_S16) + {//case of input: VX_DF_IMAGE_S16 -> output: VX_DF_IMAGE_U8 + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + const uint8x8_t true_value = vdup_n_u8(true_value_u8); + const uint8x8_t false_value = vdup_n_u8(false_value_u8); + + const int16x8_t threshold = vdupq_n_s16(_threshold_s16); + const int16x8_t lower_threshold = vdupq_n_s16(_lower_threshold_s16); + const int16x8_t upper_threshold = vdupq_n_s16(_upper_threshold_s16); + + if (type == VX_THRESHOLD_TYPE_BINARY) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 *src_ptr = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 8) + { + const int16x8_t vSrc = vld1q_s16(src_ptr); + uint8x8_t mask = vmovn_u16(vcgtq_s16(vSrc, threshold)); + uint8x8_t dst_value = vbsl_u8(mask, true_value, false_value); + + vst1_u8(dst_ptr, dst_value); + + src_ptr += 8; + dst_ptr += 8; + } + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + for (y = low_y; y < high_y; y++) + { + const vx_int16 *src_ptr = (vx_int16 *)src_base + y * in->addr->stride_y / 2; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + for (x = 0; x < out->tile_block.width; x += 8) + { + const int16x8_t vSrc = vld1q_s16(src_ptr); + uint16x8_t _mask = vcleq_s16(vSrc, upper_threshold); + _mask = vandq_u16(vcgeq_s16(vSrc, lower_threshold), _mask); + uint8x8_t mask = vmovn_u16(_mask); + vst1_u8(dst_ptr, vbsl_u8(mask, true_value, false_value)); + + src_ptr += 8; + dst_ptr += 8; + } + } + } + } + else + {//case of input: VX_DF_IMAGE_U8 -> output: VX_DF_IMAGE_U8 + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + const uint8x16_t true_value = vdupq_n_u8(true_value_u8); + const uint8x16_t false_value = vdupq_n_u8(false_value_u8); + + const uint8x16_t threshold = vdupq_n_u8(_threshold_u8); + const uint8x16_t lower_threshold = vdupq_n_u8(_lower_threshold_u8); + const uint8x16_t upper_threshold = vdupq_n_u8(_upper_threshold_u8); + + if (type == VX_THRESHOLD_TYPE_BINARY) + { + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *src_ptr = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 16) + { + const uint8x16_t vSrc = vld1q_u8(src_ptr); + uint8x16_t mask = vcgtq_u8(vSrc, threshold); + vst1q_u8(dst_ptr, vbslq_u8(mask, true_value, false_value)); + + src_ptr += 16; + 
dst_ptr += 16; + } + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + for (y = low_y; y < high_y; y++) + { + const vx_uint8 *src_ptr = (vx_uint8 *)src_base + y * in->addr->stride_y; + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; + + for (x = 0; x < out->tile_block.width; x += 16) + { + const uint8x16_t vSrc = vld1q_u8(src_ptr); + uint8x16_t mask = vcleq_u8(vSrc, upper_threshold); + mask = vandq_u8(vcgeq_u8(vSrc, lower_threshold), mask); + vst1q_u8(dst_ptr, vbslq_u8(mask, true_value, false_value)); + + src_ptr += 16; + dst_ptr += 16; + } + } + } + } +} + + +#define vxThreshold_BINARY(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + const type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*src_ptr > _threshold_s16) \ + { \ + *dst_ptr = true_value_u8; \ + } \ + else \ + { \ + *dst_ptr = false_value_u8; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } \ + + +#define vxThreshold_RANGE(type, low_y, high_y, low_x, high_x, type_size) \ + for (y = low_y; y < high_y; y++) \ + { \ + const type *src_ptr = (type *)src_base + y * in->addr->stride_y / type_size; \ + vx_uint8 *dst_ptr = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + \ + for (x = low_x; x < high_x; x++) \ + { \ + if (*src_ptr > _upper_threshold_s16) \ + { \ + *dst_ptr = false_value_u8; \ + } \ + else if (*src_ptr < _lower_threshold_s16) \ + { \ + *dst_ptr = false_value_u8; \ + } \ + else \ + { \ + *dst_ptr = true_value_u8; \ + } \ + src_ptr++; \ + dst_ptr++; \ + } \ + } \ + + +void Threshold_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_threshold_t *threshold = (vx_tile_threshold_t *)parameters[1]; + vx_tile_t *out = (vx_tile_t *)parameters[2]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 true_value_u8 = threshold->true_value.U8; + vx_uint8 false_value_u8 = threshold->false_value.U8; + + vx_uint8 _threshold_u8 = threshold->value.U8; + vx_uint8 _lower_threshold_u8 = threshold->lower.U8; + vx_uint8 _upper_threshold_u8 = threshold->upper.U8; + + vx_int16 _threshold_s16 = threshold->value.S16; + vx_int16 _lower_threshold_s16 = threshold->lower.S16; + vx_int16 _upper_threshold_s16 = threshold->upper.S16; + + vx_int32 format = threshold->input_format; + vx_int32 type = threshold->thresh_type; + + if (format == VX_DF_IMAGE_S16) + {//case of input: VX_DF_IMAGE_S16 -> output: VX_DF_IMAGE_U8 + vx_int16 *src_base = (vx_int16 *)in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + if (type == VX_THRESHOLD_TYPE_BINARY) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_BINARY(vx_int16, low_y, high_y, low_x, high_x, 2) + } + else + { + vxThreshold_BINARY(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = out->base[0]; + vxThreshold_BINARY(vx_int16, low_y, high_y, 0, high_x, 2) + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_RANGE(vx_int16, low_y, high_y, low_x, high_x, 2) + } + else + { + vxThreshold_RANGE(vx_int16, 0, low_y, low_x, high_x, 2) + + src_base = (vx_int16 *)in->base[0]; + dst_base = out->base[0]; + vxThreshold_RANGE(vx_int16, low_y, high_y, 0, 
high_x, 2) + } + } + } + else + {//case of input: VX_DF_IMAGE_U8 -> output: VX_DF_IMAGE_U8 + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + if (type == VX_THRESHOLD_TYPE_BINARY) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_BINARY(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else + { + vxThreshold_BINARY(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + vxThreshold_BINARY(vx_uint8, low_y, high_y, 0, high_x, 1) + } + } + else if (type == VX_THRESHOLD_TYPE_RANGE) + { + if (low_y == 0 && low_x == 0) + { + vxThreshold_RANGE(vx_uint8, low_y, high_y, low_x, high_x, 1) + } + else + { + vxThreshold_RANGE(vx_uint8, 0, low_y, low_x, high_x, 1) + + src_base = in->base[0]; + dst_base = out->base[0]; + vxThreshold_RANGE(vx_uint8, low_y, high_y, 0, high_x, 1) + } + } + } +} diff --git a/kernels/tiling/tiling_warp.c b/kernels/tiling/tiling_warp.c new file mode 100644 index 0000000..95e6527 --- /dev/null +++ b/kernels/tiling/tiling_warp.c @@ -0,0 +1,619 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include +#include + +#include + +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +static vx_int32 * alignPtr(vx_int32* ptr, size_t n) +{ + return (vx_int32 *)(((size_t)ptr + n-1) & -n); +} + +static vx_float32 * alignPtr_f(vx_float32* ptr, size_t n) +{ + return (vx_float32 *)(((size_t)ptr + n-1) & -n); +} + +static void remapNearestNeighborConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32)); + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)dstBase + y * dstStride); + + for (size_t x = 0; x < width; ++x) + { + vx_int32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +static void remapLinearConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + const vx_float32 * coeffs, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32) * 4); + const vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * width * sizeof(vx_float32) * 2); + + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)(dstBase) + y * dstStride); + + size_t x = 0; + + for ( ; x + 8 < width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? 
srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? 
srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + for ( ; x < width; ++x) + { + int16_t src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + int16_t src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + int16_t src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + int16_t src11 = map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue; + + vx_float32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + vx_float32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + +//BLOCK_SIZE is the same as tile_size set in "vx_warp.c" +#define BLOCK_SIZE 16 + +void WarpAffine_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1), v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(mask->m_f32[0]); + float32x4_t v_m1 = vdupq_n_f32(mask->m_f32[1]); + float32x4_t v_m2 = vdupq_n_f32(mask->m_f32[2]); + float32x4_t v_m3 = vdupq_n_f32(mask->m_f32[3]); + float32x4_t v_m4 = vdupq_n_f32(mask->m_f32[4]); + float32x4_t v_m5 = vdupq_n_f32(mask->m_f32[5]); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + 
vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0], dstBase + j, dstStride, borderValue); + } + else if (*type == VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + int32x4_t v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_), v_4 = vdupq_n_f32(4.0f); + float32x4_t v_yx = vmlaq_f32(v_m4, v_m2, v_y), v_yy = vmlaq_f32(v_m5, v_m3, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + 
uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + +static inline float32x4_t vrecpq_f32(float32x4_t val) +{ + float32x4_t reciprocal = vrecpeq_f32(val); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(val, reciprocal), reciprocal); + return reciprocal; +} + +void WarpPerspective_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1); + int32x4_t v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride); + float32x4_t v_4 = vdupq_n_f32(4.0f); + + float32x4_t v_m0 = vdupq_n_f32(mask->m_f32[0]); + float32x4_t v_m1 = vdupq_n_f32(mask->m_f32[1]); + float32x4_t v_m2 = vdupq_n_f32(mask->m_f32[2]); + float32x4_t v_m3 = vdupq_n_f32(mask->m_f32[3]); + float32x4_t v_m4 = vdupq_n_f32(mask->m_f32[4]); + float32x4_t v_m5 = vdupq_n_f32(mask->m_f32[5]); + float32x4_t v_m6 = vdupq_n_f32(mask->m_f32[6]); + float32x4_t v_m7 = vdupq_n_f32(mask->m_f32[7]); + float32x4_t v_m8 = vdupq_n_f32(mask->m_f32[8]); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y); + float32x4_t v_yy = vmlaq_f32(v_m7, 
v_m4, v_y); + float32x4_t v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y = vcvtq_s32_f32(v_src_yf); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x, v_width4)), + vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y, v_height4))); + int32x4_t v_src_index = vbslq_s32(v_mask, vmlaq_s32(v_src_x, v_src_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_src_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0],dstBase + j, dstStride, borderValue); + } + else if (*type == VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + int32x4_t v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0, y_ = y + i; + vx_float32 indeces[4] = { j + 0.0f, j + 1.0f, j + 2.0f, j + 3.0f }; + float32x4_t v_x = vld1q_f32(indeces), v_y = vdupq_n_f32(y_); + float32x4_t v_yx = vmlaq_f32(v_m6, v_m3, v_y), v_yy = vmlaq_f32(v_m7, v_m4, v_y), + v_yw = vmlaq_f32(v_m8, v_m5, v_y); + + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4_t v_src_xf = vmlaq_f32(v_yx, v_m0, v_x); + float32x4_t v_src_yf = vmlaq_f32(v_yy, v_m1, v_x); + float32x4_t v_wf = vrecpq_f32(vmlaq_f32(v_yw, v_m2, v_x)); + v_src_xf = vmulq_f32(v_wf, v_src_xf); + v_src_yf = vmulq_f32(v_wf, v_src_yf); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_src_xf); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_src_yf); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_src_xf, vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_src_yf, vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_src_xf, v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_xf, v_one4f), 
v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_src_yf, v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_src_yf, v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + + v_x = vaddq_f32(v_x, v_4); + } + } + + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + +static vx_bool read_pixel_8u_C1(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_float32 x, vx_float32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8 *)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +static void transform_affine(vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 m[], vx_float32 *src_x, vx_float32 *src_y) +{ + *src_x = dst_x * m[0] + dst_y * m[2] + m[4]; + *src_y = dst_x * m[1] + dst_y * m[3] + m[5]; +} + +static void transform_perspective(vx_uint32 dst_x, vx_uint32 dst_y, vx_float32 m[], vx_float32 *src_x, vx_float32 *src_y) +{ + vx_float32 z = dst_x * m[2] + dst_y * m[5] + m[8]; + + *src_x = (dst_x * m[0] + dst_y * m[3] + m[6]) / z; + *src_y = (dst_x * m[1] + dst_y * m[4] + m[7]) / z; +} + +#define WARP(low_y, high_y, low_x, transform) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_float32 xf; \ + vx_float32 yf; \ + transform(x, y, mask->m_f32, &xf, &yf); \ + \ + if (*type == VX_INTERPOLATION_NEAREST_NEIGHBOR) \ + { \ + read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, xf, yf, dst); \ + dst++; \ + } \ + else if (*type == VX_INTERPOLATION_BILINEAR) \ + { \ + vx_uint8 tl = 0, tr = 0, bl = 0, br = 0; \ + vx_bool defined = vx_true_e; \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf), floorf(yf), &tl); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf) + 1, floorf(yf), &tr); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf), floorf(yf) + 1, &bl); \ + defined &= read_pixel_8u_C1(src_base, in->addr, in->image.height, in->image.width, floorf(xf) + 1, floorf(yf) + 1, &br); \ + if (defined) \ + { \ + vx_float32 ar = xf - floorf(xf); \ + vx_float32 ab = yf - floorf(yf); \ + vx_float32 al = 1.0f - ar; \ + vx_float32 at = 1.0f - ab; \ + *dst = (vx_uint8)(tl * al * at + tr * ar * at + bl * al * ab + br * ar * ab); \ + } \ + dst++; \ + } \ + } \ + } + + +void 
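/*
 * A scalar sketch of the per-pixel mapping that the NEON table builders above
 * vectorize four pixels at a time. It assumes the 3x3 matrix is stored
 * column-major, as implied by transform_perspective() earlier in this file
 * (row 0 is {m[0], m[3], m[6]}, row 1 is {m[1], m[4], m[7]}, row 2 is
 * {m[2], m[5], m[8]}), and a stride_x of 1 (U8 plane). The helper name
 * map_pixel_nearest() is illustrative only; it reproduces the -1 sentinel that
 * the remapNearestNeighborConst() stage resolves to the border value.
 */
static vx_int32 map_pixel_nearest(vx_uint32 dx, vx_uint32 dy, const vx_float32 m[9],
                                  vx_uint32 src_width, vx_uint32 src_height,
                                  vx_uint32 src_stride)
{
    vx_float32 w  = dx * m[2] + dy * m[5] + m[8];
    vx_float32 xf = (dx * m[0] + dy * m[3] + m[6]) / w;
    vx_float32 yf = (dx * m[1] + dy * m[4] + m[7]) / w;
    vx_int32 sx = (vx_int32)xf;
    vx_int32 sy = (vx_int32)yf;

    if (xf < 0.0f || yf < 0.0f ||
        sx > (vx_int32)(src_width - 1) || sy > (vx_int32)(src_height - 1))
        return -1;                               /* outside the source image */

    return sy * (vx_int32)src_stride + sx;       /* byte offset into the plane */
}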
WarpAffine_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + WARP(low_y, high_y, low_x, transform_affine) + } + else + { + WARP(0, low_y, low_x, transform_affine) + + src_base = in->base[0]; + dst_base = out->base[0]; + WARP(low_y, high_y, 0, transform_affine) + } +} + +void WarpPerspective_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_tile_matrix_t *mask = (vx_tile_matrix_t *)parameters[1]; + vx_enum *type = (vx_enum *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = vxTileHeight(out, 0); + + vx_uint32 low_x = out->tile_x; + vx_uint32 high_x = vxTileWidth(out, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + if (low_y == 0 && low_x == 0) + { + WARP(low_y, high_y, low_x, transform_perspective) + } + else + { + WARP(0, low_y, low_x, transform_perspective) + + src_base = in->base[0]; + dst_base = out->base[0]; + WARP(low_y, high_y, 0, transform_perspective) + } +} diff --git a/sample/framework/vx_context.c b/sample/framework/vx_context.c old mode 100644 new mode 100755 index 1050759..69dfbbe --- a/sample/framework/vx_context.c +++ b/sample/framework/vx_context.c @@ -22,21 +22,45 @@ const vx_char implementation[VX_MAX_IMPLEMENTATION_NAME] = "khronos.sample"; vx_char targetModules[][VX_MAX_TARGET_NAME] = { +#if defined(OPENVX_USE_TILING) + "openvx-tiling_chaining", +#endif "openvx-c_model", +#if defined(EXPERIMENTAL_USE_VENUM) + "openvx-venum", +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) + "openvx-opencl", +#endif +#if defined(EXPERIMENTAL_USE_OPENMP) + "openvx-openmp" +#endif }; const vx_char extensions[] = #if defined(OPENVX_USE_TILING) OPENVX_KHR_TILING" " #endif -#if defined(OPENVX_USE_XML) +#if defined(EXPERIMENTAL_USE_XML) OPENVX_KHR_XML" " #endif +#if defined(EXPERIMENTAL_USE_OPENCL) + OPENVX_KHR_OPENCL" " +#endif +#if defined(EXPERIMENTAL_USE_NODE_MEMORY) + OPENVX_KHR_NODE_MEMORY" " +#endif #if defined(OPENVX_USE_S16) "vx_khr_s16 " #endif #if defined(EXPERIMENTAL_USE_DOT) OPENVX_KHR_DOT" " +#endif +#if defined(EXPERIMENTAL_USE_TARGET) + OPENVX_EXT_TARGET" " +#endif +#if defined(EXPERIMENTAL_USE_VARIANTS) + OPENVX_KHR_VARIANTS" " #endif " "; @@ -779,6 +803,18 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at status = VX_ERROR_INVALID_PARAMETERS; } break; +#if defined(EXPERIMENTAL_USE_TARGET) + case VX_CONTEXT_TARGETS: + if (VX_CHECK_PARAM(ptr, size, vx_uint32, 0x3)) + { + *(vx_uint32 *)ptr = context->num_targets; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif case VX_CONTEXT_IMPLEMENTATION: if (size <= VX_MAX_IMPLEMENTATION_NAME && ptr) { @@ -912,7 +948,21 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at { VX_PRINT(VX_ZONE_INFO, "Kernel %s is unique\n", 
context->targets[t].kernels[k].name); table[numk].enumeration = context->targets[t].kernels[k].enumeration; +#if defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANT) + // get the central string out + { + vx_uint32 c = 0; + strncpy(table[numk].name, context->targets[t].kernels[k].name, VX_MAX_KERNEL_NAME); + for (c = 0; table[numk].name[c] != '\0'; c++) { + if (table[numk].name[c] == ';') { + table[numk].name[c] = '\0'; + break; + } + } + } +#else strncpy(table[numk].name, context->targets[t].kernels[k].name, VX_MAX_KERNEL_NAME); +#endif numk++; } } @@ -932,8 +982,6 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryContext(vx_context context, vx_enum at VX_API_ENTRY vx_status VX_API_CALL vxHint(vx_reference reference, vx_enum hint, const void* data, vx_size data_size) { vx_status status = VX_SUCCESS; - (void)data; - (void)data_size; /* reference param should be a valid OpenVX reference*/ if (ownIsValidContext((vx_context)reference) == vx_false_e && ownIsValidReference(reference) == vx_false_e) @@ -1064,7 +1112,7 @@ VX_API_ENTRY vx_enum VX_API_CALL vxRegisterUserStructWithName(vx_context context { context->user_structs[i].type = VX_TYPE_USER_STRUCT_START + i; context->user_structs[i].size = size; - strncpy(context->user_structs[i].name, name, VX_MAX_STRUCT_NAME - 1); + strncpy(context->user_structs[i].name, name, VX_MAX_STRUCT_NAME); type = context->user_structs[i].type; break; } diff --git a/sample/framework/vx_kernel.c b/sample/framework/vx_kernel.c index 0893bda..8299b21 100644 --- a/sample/framework/vx_kernel.c +++ b/sample/framework/vx_kernel.c @@ -55,7 +55,7 @@ vx_kernel_t *ownAllocateKernel(vx_context context, if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS && kernel->base.type == VX_TYPE_KERNEL) { /* setup the kernel meta-data */ - strncpy(kernel->name, name, VX_MAX_KERNEL_NAME - 1); + strncpy(kernel->name, name, VX_MAX_KERNEL_NAME); kernel->enumeration = kenum; kernel->function = function; kernel->signature.num_parameters = numParams; @@ -105,7 +105,7 @@ vx_status ownInitializeKernel(vx_context context, ownIncrementReference(&kernel->base, VX_INTERNAL); // setup the kernel meta-data - strncpy(kernel->name, name, VX_MAX_KERNEL_NAME - 1); + strncpy(kernel->name, name, VX_MAX_KERNEL_NAME); kernel->enumeration = kenum; kernel->function = function; kernel->signature.num_parameters = numParams; @@ -238,7 +238,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxLoadKernels(vx_context context, const vx_ch } else { - strncpy(context->modules[m].name, name, VX_INT_MAX_PATH - 1); + strncpy(context->modules[m].name, name, VX_INT_MAX_PATH); context->modules[m].ref_count = 1; context->num_modules++; } @@ -349,6 +349,18 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const v vx_size colons = strncount(string, VX_MAX_KERNEL_NAME, ':'); vx_char targetName[VX_MAX_TARGET_NAME] = "default"; vx_char kernelName[VX_MAX_KERNEL_NAME]; +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME] = "default"; +#if defined(EXPERIMENTAL_USE_TARGET) + vx_char defaultTargets[][VX_MAX_TARGET_NAME] = { + "default", + "power", + "performance", + "memory", + "bandwidth", + }; +#endif +#endif #if defined(_WIN32) vx_char *nameBuffer = _strdup(string); #else @@ -356,23 +368,100 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByName(vx_context context, const v #endif if (colons == 0) { - strncpy(kernelName, string, VX_MAX_KERNEL_NAME - 1); + strncpy(kernelName, string, VX_MAX_KERNEL_NAME); } - else + else if (colons == 1) { - /* There should be no colon */ +#if 
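/*
 * What the unique-kernel-table branch above does to each exported name when
 * the TARGET/VARIANT extensions are enabled: the name is cut at the first ';'
 * so only the portable kernel name is reported. A standalone sketch of the
 * same loop (the ";replicate" suffix is illustrative only):
 */
vx_char entry[VX_MAX_KERNEL_NAME] = "org.khronos.openvx.box_3x3;replicate";
vx_uint32 c;
for (c = 0; entry[c] != '\0'; c++)
{
    if (entry[c] == ';')
    {
        entry[c] = '\0';    /* entry now reads "org.khronos.openvx.box_3x3" */
        break;
    }
}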
defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) + /* could be either target:kernel or kernel:variant" */ + vx_char *front = strtok(nameBuffer, ":"); + vx_char *back = strtok(NULL, ":"); +#if defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) + vx_bool isTarget = vx_false_e; + /* does front match any targets? */ + for (t = 0u; t < context->num_targets; t++) + { + if (strncmp(front, context->targets[t].name, VX_MAX_TARGET_NAME) == 0) + { + isTarget = vx_true_e; + break; + } + } + if (isTarget == vx_false_e) + { + for (t = 0u; t < dimof(defaultTargets); t++) + { + if (strncmp(front, defaultTargets[t], VX_MAX_TARGET_NAME) == 0) + { + isTarget = vx_true_e; + break; + } + } + } + if (isTarget == vx_true_e) + { + strncpy(targetName, front, VX_MAX_TARGET_NAME); + strncpy(kernelName, back, VX_MAX_KERNEL_NAME); + } + else + { + strncpy(kernelName, front, VX_MAX_KERNEL_NAME); + strncpy(variantName, back, VX_MAX_VARIANT_NAME); + } +#elif defined(EXPERIMENTAL_USE_TARGET) + strncpy(targetName, front, VX_MAX_TARGET_NAME); + strncpy(kernelName, back, VX_MAX_KERNEL_NAME); +#elif defined(EXPERIMENTAL_USE_VARIANTS) + strncpy(kernelName, front, VX_MAX_KERNEL_NAME); + strncpy(variantName, back, VX_MAX_VARIANT_NAME); +#endif +#else /* defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) */ + /* If both TARGET and VARIANT extensions are disabled, there should be no colon */ /* Doing nothing will leave kern = NULL, causing error condition below */ VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain any ':' in this implementation\n"); +#endif /* defined(EXPERIMENTAL_USE_TARGET) || defined(EXPERIMENTAL_USE_VARIANTS) */ + } + else if (colons == 2) + { +#if defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) + /* target:kernel:variant */ + vx_char *target = strtok(nameBuffer, ":"); + vx_char *kernel = strtok(NULL, ":"); + vx_char *variant = strtok(NULL,":"); + strncpy(targetName, target, VX_MAX_TARGET_NAME); + strncpy(kernelName, kernel, VX_MAX_KERNEL_NAME); + strncpy(variantName, variant, VX_MAX_VARIANT_NAME); +#else /* defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) */ + /* If both TARGET and VARIANT extensions are disabled, there should be no colon */ + /* Doing nothing will leave kern = NULL, causing error condition below */ + VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain two ':' in this implementation\n"); +#endif /* defined(EXPERIMENTAL_USE_TARGET) && defined(EXPERIMENTAL_USE_VARIANTS) */ + } + else + { + /* no extension supports > 2 colons so far */ + /* Doing nothing will leave kern = NULL, causing error condition below */ + VX_PRINT(VX_ZONE_ERROR, "Kernel name should not contain more than two ':' in this implementation\n"); } free(nameBuffer); +#if defined(EXPERIMENTAL_USE_VARIANTS) + VX_PRINT(VX_ZONE_KERNEL, "Scanning in set of %u kernels on %u targets.\n" + "Target: %s\nKernel: %s\nVariant: %s\n", + context->num_kernels, context->num_targets, + targetName, kernelName, variantName); +#endif for (t = 0; t < context->num_targets && kern == NULL; t++) { vx_target_t *target = &context->targets[context->priority_targets[t]]; if (target == NULL || target->enabled == vx_false_e) continue; +#if defined(EXPERIMENTAL_USE_VARIANTS) + if (target->funcs.supports(target, targetName, kernelName, variantName, &k) == VX_SUCCESS) +#else if (target->funcs.supports(target, targetName, kernelName, &k) == VX_SUCCESS) +#endif { vx_kernel kernel = &target->kernels[k]; vxPrintKernel(kernel); @@ -433,6 +522,10 @@ 
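/*
 * A usage sketch of the name forms vxGetKernelByName() accepts with the
 * parsing above. The kernel string is the standard box filter name; the
 * "default" target and variant tokens come from the defaultTargets table and
 * the variantName fallback above, and 'context' is assumed to be a valid
 * vx_context. Which forms resolve depends on which of EXPERIMENTAL_USE_TARGET
 * and EXPERIMENTAL_USE_VARIANTS are defined.
 */
vx_kernel k0 = vxGetKernelByName(context, "org.khronos.openvx.box_3x3");                  /* kernel only           */
vx_kernel k1 = vxGetKernelByName(context, "default:org.khronos.openvx.box_3x3");          /* target:kernel         */
vx_kernel k2 = vxGetKernelByName(context, "org.khronos.openvx.box_3x3:default");          /* kernel:variant        */
vx_kernel k3 = vxGetKernelByName(context, "default:org.khronos.openvx.box_3x3:default");  /* target:kernel:variant */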
VX_API_ENTRY vx_kernel VX_API_CALL vxGetKernelByEnum(vx_context context, vx_enum break; } } + /* Acquire the highest priority target */ + if (kernel != NULL) { + break; + } } if (kernel == NULL) { VX_PRINT(VX_ZONE_KERNEL, "Kernel enum %x not found.\n", kernelenum); @@ -583,11 +676,15 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddUserKernel(vx_context c, VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, vx_char name[VX_MAX_KERNEL_NAME], vx_enum enumeration, + vx_kernel_f function, vx_tiling_kernel_f flexible_func_ptr, vx_tiling_kernel_f fast_func_ptr, vx_uint32 num_params, + vx_kernel_validate_f validate, vx_kernel_input_validate_f input, - vx_kernel_output_validate_f output) + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) { vx_context_t *context = (vx_context_t *)c; vx_kernel kernel = 0; @@ -601,9 +698,7 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, VX_PRINT(VX_ZONE_ERROR, "Invalid Context\n"); return (vx_kernel)NULL; } - if ((flexible_func_ptr == NULL && fast_func_ptr == NULL) || - input == NULL || - output == NULL || + if ( ((validate == NULL) && (input == NULL || output == NULL)) || num_params > VX_INT_MAX_PARAMS || num_params == 0 || name == NULL || strncmp(name, "", VX_MAX_KERNEL_NAME) == 0) @@ -618,7 +713,7 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, index = strnindex(name, ':', VX_MAX_TARGET_NAME); if (index == VX_MAX_TARGET_NAME) { - strcpy(targetName,"khronos.any"); + strcpy(targetName,"khronos.tiling"); } else { @@ -636,9 +731,9 @@ VX_API_ENTRY vx_kernel VX_API_CALL vxAddTilingKernel(vx_context c, } if (target && target->funcs.addtilingkernel) { - kernel = target->funcs.addtilingkernel(target, name, enumeration, - flexible_func_ptr, fast_func_ptr, num_params, - input, output); + kernel = target->funcs.addtilingkernel(target, name, enumeration, function, + flexible_func_ptr, fast_func_ptr, num_params, validate, + input, output, initialize, deinitialize); VX_PRINT(VX_ZONE_KERNEL,"Added Kernel %s to Target %s ("VX_FMT_REF")\n", name, target->name, kernel); } else @@ -744,6 +839,28 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryKernel(vx_kernel kern, vx_enum attribu status = VX_ERROR_INVALID_PARAMETERS; } break; +#ifdef OPENVX_KHR_NODE_MEMORY + case VX_KERNEL_GLOBAL_DATA_SIZE: + if (VX_CHECK_PARAM(ptr, size, vx_size, 0x3)) + { + *(vx_size *)ptr = kernel->attributes.globalDataSize; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; + case VX_KERNEL_GLOBAL_DATA_PTR: + if (VX_CHECK_PARAM(ptr, size, vx_ptr_t, 0x1)) + { + *(vx_ptr_t *)ptr = kernel->attributes.globalDataPtr; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif #ifdef OPENVX_KHR_TILING case VX_KERNEL_INPUT_NEIGHBORHOOD: if (VX_CHECK_PARAM(ptr, size, vx_neighborhood_size_t, 0x3)) @@ -795,10 +912,17 @@ VX_API_ENTRY vx_status VX_API_CALL vxAddParameterToKernel(vx_kernel kernel, if (index < kern->signature.num_parameters) { #ifdef OPENVX_KHR_TILING - if (kern->tiling_function) + if (kern->tilingfast_function) { if (((data_type != VX_TYPE_IMAGE) && - (data_type != VX_TYPE_SCALAR)) || + (data_type != VX_TYPE_SCALAR) && + (data_type != VX_TYPE_THRESHOLD) && + (data_type != VX_TYPE_REMAP) && + (data_type != VX_TYPE_CONVOLUTION) && + (data_type != VX_TYPE_TENSOR) && + (data_type != VX_TYPE_ARRAY) && + (data_type != VX_TYPE_LUT) && + (data_type != VX_TYPE_MATRIX)) || (ownIsValidDirection(dir) == vx_false_e) || (ownIsValidState(state) == vx_false_e)) { @@ -953,6 
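/*
 * A registration sketch for the extended vxAddTilingKernel() prototype above.
 * As the parameter check now reads, either the combined 'validate' callback or
 * the input/output validator pair must be non-NULL; the remaining callbacks
 * may be NULL. The kernel name, enumeration, and the box3x3_* callbacks are
 * placeholders for a user tiling kernel whose declarations are assumed to
 * exist.
 */
vx_kernel kernel = vxAddTilingKernel(context,
    "com.example.tiling_box_3x3",   /* placeholder kernel name                  */
    MY_TILING_BOX_ENUM,             /* placeholder kernel enumeration           */
    NULL,                           /* vx_kernel_f: no single-shot function     */
    box3x3_flexible,                /* flexible tile function                   */
    box3x3_fast,                    /* fast tile function                       */
    2,                              /* number of parameters                     */
    box3x3_validate,                /* combined validator                       */
    NULL, NULL,                     /* legacy input/output validators           */
    NULL, NULL);                    /* initialize / deinitialize                */
if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS)
{
    /* add parameters with vxAddParameterToKernel(), then finalize the kernel */
}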
+1077,29 @@ VX_API_ENTRY vx_status VX_API_CALL vxSetKernelAttribute(vx_kernel k, vx_enum att status = VX_ERROR_INVALID_PARAMETERS; } break; +#ifdef EXPERIMENTAL_USE_NODE_MEMORY + case VX_KERNEL_GLOBAL_DATA_SIZE: + if (VX_CHECK_PARAM(ptr, size, vx_size, 0x3)) + { + kernel->attributes.globalDataSize = *(vx_size *)ptr; + VX_PRINT(VX_ZONE_KERNEL, "Set Global Data Size to "VX_FMT_SIZE" bytes\n", kernel->attributes.globalDataSize); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; + case VX_KERNEL_GLOBAL_DATA_PTR: + if (VX_CHECK_PARAM(ptr, size, vx_ptr_t, 0x1)) + { + kernel->attributes.globalDataPtr = *(vx_ptr_t *)ptr; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + break; +#endif #ifdef OPENVX_KHR_TILING case VX_KERNEL_INPUT_NEIGHBORHOOD: if (VX_CHECK_PARAM(ptr, size, vx_neighborhood_size_t, 0x3)) diff --git a/sample/framework/vx_node_api.c b/sample/framework/vx_node_api.c index 5089e9b..9bae748 100644 --- a/sample/framework/vx_node_api.c +++ b/sample/framework/vx_node_api.c @@ -15,12 +15,6 @@ * limitations under the License. */ -/*! - * \file - * \brief The Graph Mode Interface for all Base Kernels. - * \author Erik Rainey - */ - #include "vx_internal.h" VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image input, vx_image output) @@ -29,7 +23,11 @@ VX_API_ENTRY vx_node VX_API_CALL vxColorConvertNode(vx_graph graph, vx_image inp (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, VX_KERNEL_COLOR_CONVERT_TILING, params, dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_COLOR_CONVERT, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxChannelExtractNode(vx_graph graph, @@ -66,10 +64,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxChannelCombineNode(vx_graph graph, (vx_reference)plane3, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_CHANNEL_COMBINE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_CHANNEL_COMBINE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, vx_image output_x, vx_image output_y) @@ -79,10 +84,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxSobel3x3Node(vx_graph graph, vx_image input, (vx_reference)output_x, (vx_reference)output_y, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_SOBEL_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_SOBEL_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image mag) @@ -92,10 +104,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxMagnitudeNode(vx_graph graph, vx_image grad_x (vx_reference)grad_y, (vx_reference)mag, }; + +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAGNITUDE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_MAGNITUDE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx_image grad_y, vx_image orientation) @@ -105,10 +125,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxPhaseNode(vx_graph graph, vx_image grad_x, vx (vx_reference)grad_y, (vx_reference)orientation, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_PHASE_TILING, + params, + dimof(params)); +#else return 
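/*
 * A minimal sketch of the node-memory attributes handled above when
 * EXPERIMENTAL_USE_NODE_MEMORY is defined: the kernel's global data size is
 * set with vxSetKernelAttribute() and read back with vxQueryKernel().
 * 'my_kernel' and the 4096-byte size are illustrative only.
 */
vx_size global_size = 4096;
if (vxSetKernelAttribute(my_kernel, VX_KERNEL_GLOBAL_DATA_SIZE,
                         &global_size, sizeof(global_size)) == VX_SUCCESS)
{
    vx_size reported = 0;
    vxQueryKernel(my_kernel, VX_KERNEL_GLOBAL_DATA_SIZE, &reported, sizeof(reported));
}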
vxCreateNodeByStructure(graph, VX_KERNEL_PHASE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, vx_image dst, vx_enum type) @@ -120,10 +147,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxScaleImageNode(vx_graph graph, vx_image src, (vx_reference)dst, (vx_reference)stype, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_SCALE_IMAGE, - params, - dimof(params)); + vx_node node; + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SCALE_IMAGE_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SCALE_IMAGE, + params, + dimof(params)); + #endif vxReleaseScalar(&stype); return node; } @@ -135,10 +170,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxTableLookupNode(vx_graph graph, vx_image inpu (vx_reference)lut, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_TABLE_LOOKUP_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_TABLE_LOOKUP, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxHistogramNode(vx_graph graph, vx_image input, vx_distribution distribution) @@ -172,10 +214,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxAbsDiffNode(vx_graph graph, vx_image in1, vx_ (vx_reference)in2, (vx_reference)out, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_ABSDIFF_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_ABSDIFF, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxMeanStdDevNode(vx_graph graph, vx_image input, vx_scalar mean, vx_scalar stddev) @@ -198,10 +247,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxThresholdNode(vx_graph graph, vx_image input, (vx_reference)thesh, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_THRESHOLD_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_THRESHOLD, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image input, vx_image output) @@ -210,10 +266,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxIntegralImageNode(vx_graph graph, vx_image in (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_INTEGRAL_IMAGE_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_INTEGRAL_IMAGE, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -222,10 +285,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxErode3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_ERODE_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_ERODE_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -234,10 +304,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxDilate3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_DILATE_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_DILATE_3x3, params, dimof(params)); +#endif } VX_API_ENTRY 
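/*
 * The same build-time selection repeats in every node constructor in this
 * file: with OPENVX_USE_TILING defined the node is created against the
 * *_TILING kernel enum, otherwise against the regular one. A hypothetical
 * helper that captures the pattern in one place (VX_TILING_OR_DEFAULT is not
 * part of the API; it only illustrates the repetition):
 */
#if defined(OPENVX_USE_TILING)
#define VX_TILING_OR_DEFAULT(kenum) (kenum##_TILING)
#else
#define VX_TILING_OR_DEFAULT(kenum) (kenum)
#endif
/* e.g.: vxCreateNodeByStructure(graph, VX_TILING_OR_DEFAULT(VX_KERNEL_MEDIAN_3x3), params, dimof(params)); */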
vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -246,10 +323,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMedian3x3Node(vx_graph graph, vx_image input, (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MEDIAN_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_MEDIAN_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -258,10 +342,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxBox3x3Node(vx_graph graph, vx_image input, vx (vx_reference)input, (vx_reference)output, }; + +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_BOX_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_BOX_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image input, vx_image output) @@ -270,10 +362,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxGaussian3x3Node(vx_graph graph, vx_image inpu (vx_reference)input, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_GAUSSIAN_3x3_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_GAUSSIAN_3x3, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxNonLinearFilterNode(vx_graph graph, vx_enum function, vx_image input, vx_matrix mask, vx_image output) @@ -286,11 +385,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxNonLinearFilterNode(vx_graph graph, vx_enum f (vx_reference)mask, (vx_reference)output, }; - +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_LINEAR_FILTER_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_NON_LINEAR_FILTER, params, dimof(params)); +#endif vxReleaseScalar(&func); return node; @@ -303,10 +408,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvolveNode(vx_graph graph, vx_image input, (vx_reference)conv, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_CUSTOM_CONVOLUTION_TILING, + params, + dimof(params)); +#else return vxCreateNodeByStructure(graph, VX_KERNEL_CUSTOM_CONVOLUTION, params, dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxGaussianPyramidNode(vx_graph graph, vx_image input, vx_pyramid gaussian) @@ -407,6 +519,31 @@ VX_API_ENTRY vx_node VX_API_CALL vxMinMaxLocNode(vx_graph graph, dimof(params)); } +VX_API_ENTRY vx_node VX_API_CALL vxWeightedAverageImageNode(vx_graph graph, + vx_image img1, + vx_scalar alpha, + vx_image img2, + vx_image output) +{ + vx_reference params[] = { + (vx_reference)img1, + (vx_reference)alpha, + (vx_reference)img2, + (vx_reference)output, + }; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_WEIGHTED_AVERAGE_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_WEIGHTED_AVERAGE, + params, + dimof(params)); +#endif +} + VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image input, vx_image output, vx_enum policy, vx_scalar shift) { vx_scalar pol = vxCreateScalar(vxGetContext((vx_reference)graph), VX_TYPE_ENUM, &policy); @@ -416,10 +553,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxConvertDepthNode(vx_graph graph, vx_image inp (vx_reference)pol, (vx_reference)shift, }; +#if defined(OPENVX_USE_TILING) + 
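/*
 * A usage sketch for the vxWeightedAverageImageNode() constructor added above.
 * The graph, context, and the U8 images img1, img2, out_img are assumed to
 * exist already; the alpha scalar is assumed to be VX_TYPE_FLOAT32, as for the
 * standard weighted-average kernel, and 0.25f is illustrative only.
 */
vx_float32 a = 0.25f;
vx_scalar alpha = vxCreateScalar(context, VX_TYPE_FLOAT32, &a);
vx_node wavg = vxWeightedAverageImageNode(graph, img1, alpha, img2, out_img);
vxReleaseScalar(&alpha);   /* the node keeps its own reference, as in the other constructors */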
vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_CONVERTDEPTH_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_CONVERTDEPTH, params, dimof(params)); +#endif vxReleaseScalar(&pol); return node; } @@ -453,10 +597,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxAndNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_AND_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_AND, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) @@ -466,10 +617,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxOrNode(vx_graph graph, vx_image in1, vx_image (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_OR_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_OR, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_image in2, vx_image out) @@ -479,10 +637,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxXorNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)in2, (vx_reference)out, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_XOR_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_XOR, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_image output) @@ -491,10 +656,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxNotNode(vx_graph graph, vx_image input, vx_im (vx_reference)input, (vx_reference)output, }; - return vxCreateNodeByStructure(graph, + #if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_NOT_TILING, + params, + dimof(params)); + #else + return vxCreateNodeByStructure(graph, VX_KERNEL_NOT, params, dimof(params)); + #endif } VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, vx_image in1, vx_image in2, vx_scalar scale, vx_enum overflow_policy, vx_enum rounding_policy, vx_image out) @@ -510,10 +682,19 @@ VX_API_ENTRY vx_node VX_API_CALL vxMultiplyNode(vx_graph graph, vx_image in1, vx (vx_reference)rpolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MULTIPLY, - params, - dimof(params)); + vx_node node; + + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_MULTIPLY_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_MULTIPLY, + params, + dimof(params)); + #endif vxReleaseScalar(&spolicy); vxReleaseScalar(&rpolicy); return node; @@ -529,10 +710,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxAddNode(vx_graph graph, vx_image in1, vx_imag (vx_reference)spolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_ADD, - params, - dimof(params)); + vx_node node; +#if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_ADD_TILING, + params, + dimof(params)); +#else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_ADD, + params, + dimof(params)); +#endif vxReleaseScalar(&spolicy); return node; } @@ -547,10 +736,18 @@ VX_API_ENTRY vx_node VX_API_CALL vxSubtractNode(vx_graph graph, vx_image in1, vx 
(vx_reference)spolicy, (vx_reference)out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_SUBTRACT, - params, - dimof(params)); + vx_node node; +#if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SUBTRACT_TILING, + params, + dimof(params)); +#else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_SUBTRACT, + params, + dimof(params)); +#endif vxReleaseScalar(&spolicy); return node; } @@ -565,10 +762,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxWarpAffineNode(vx_graph graph, vx_image input (vx_reference)stype, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_AFFINE_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_WARP_AFFINE, params, dimof(params)); +#endif vxReleaseScalar(&stype); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -592,10 +796,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxWarpPerspectiveNode(vx_graph graph, vx_image (vx_reference)stype, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_WARP_PERSPECTIVE_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_WARP_PERSPECTIVE, params, dimof(params)); +#endif vxReleaseScalar(&stype); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -650,10 +861,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxFastCornersNode(vx_graph graph, vx_image inpu (vx_reference)corners, (vx_reference)num_corners, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_FAST_CORNERS_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_FAST_CORNERS, params, dimof(params)); +#endif vxReleaseScalar(&nonmax); return node; } @@ -667,10 +885,19 @@ VX_API_ENTRY vx_node VX_API_CALL vxNonMaxSuppressionNode(vx_graph graph, vx_imag (vx_reference)wsize, (vx_reference)output, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_NON_MAX_SUPPRESSION, - params, - dimof(params)); +vx_node node; + #if defined(OPENVX_USE_TILING) + node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_MAX_SUPPRESSION_TILING, + params, + dimof(params)); + #else + node = vxCreateNodeByStructure(graph, + VX_KERNEL_NON_MAX_SUPPRESSION, + params, + dimof(params)); + #endif + vxReleaseScalar(&wsize); return node; } @@ -723,10 +950,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxRemapNode(vx_graph graph, (vx_reference)spolicy, (vx_reference)output, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_REMAP_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_REMAP, params, dimof(params)); +#endif vxReleaseScalar(&spolicy); if (vxGetStatus((vx_reference)node) == VX_SUCCESS) @@ -748,10 +982,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxHalfScaleGaussianNode(vx_graph graph, vx_imag (vx_reference)output, (vx_reference)ksize, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_HALFSCALE_GAUSSIAN_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HALFSCALE_GAUSSIAN, params, dimof(params)); +#endif vxReleaseScalar(&ksize); return node; } @@ -928,11 +1169,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMinNode(vx_graph graph, vx_image in1, vx_imag (vx_reference) in2, (vx_reference) out, }; - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MIN, - params, - dimof(params)); - 
return node; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MIN_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_MIN, + params, + dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxCopyNode(vx_graph graph, vx_reference input, vx_reference output) @@ -952,12 +1199,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxMaxNode(vx_graph graph, vx_image in1, vx_imag (vx_reference) in2, (vx_reference) out, }; - - vx_node node = vxCreateNodeByStructure(graph, - VX_KERNEL_MAX, - params, - dimof(params)); - return node; +#if defined(OPENVX_USE_TILING) + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAX_TILING, + params, + dimof(params)); +#else + return vxCreateNodeByStructure(graph, + VX_KERNEL_MAX, + params, + dimof(params)); +#endif } VX_API_ENTRY vx_node VX_API_CALL vxLBPNode(vx_graph graph, @@ -972,10 +1224,17 @@ VX_API_ENTRY vx_node VX_API_CALL vxLBPNode(vx_graph graph, (vx_reference)out, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, + VX_KERNEL_LBP_TILING, + params, + dimof(params)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_LBP, params, dimof(params)); +#endif vxReleaseScalar(&sformat); vxReleaseScalar(&ksize); @@ -1071,9 +1330,12 @@ vx_node VX_API_CALL vxHOGCellsNode(vx_graph graph, vx_image input, vx_int32 cell (vx_reference)magnitudes, (vx_reference)bins, }; - vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS, params, dimof(params)); - - vxReleaseScalar(&cell_width_scalar); + #if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS_TILING, params, dimof(params)); + #else + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_CELLS, params, dimof(params)); + #endif + vxReleaseScalar(&cell_width_scalar); vxReleaseScalar(&cell_height_scalar); vxReleaseScalar(&num_bins_scalar); @@ -1095,7 +1357,11 @@ vx_node VX_API_CALL vxHOGFeaturesNode(vx_graph graph, vx_image input, vx_tensor (vx_reference)hog_param_size_scalar, (vx_reference)features, }; +#if defined(OPENVX_USE_TILING) + vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_FEATURES_TILING, param, dimof(param)); +#else vx_node node = vxCreateNodeByStructure(graph, VX_KERNEL_HOG_FEATURES, param, dimof(param)); +#endif vxReleaseScalar(&hog_param_size_scalar); vxReleaseArray(&hog_param); diff --git a/sample/include/vx_internal.h b/sample/include/vx_internal.h old mode 100644 new mode 100755 index ba900ca..aab7726 --- a/sample/include/vx_internal.h +++ b/sample/include/vx_internal.h @@ -69,12 +69,24 @@ #define OPENVX_TILING_1_0 #include #endif +#if defined(EXPERIMENTAL_USE_NODE_MEMORY) +#include +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) +#include +#endif #if defined(EXPERIMENTAL_USE_DOT) #include #endif -#if defined(OPENVX_USE_XML) +#if defined(EXPERIMENTAL_USE_XML) #include #endif +#if defined(EXPERIMENTAL_USE_TARGET) +#include +#endif +#if defined(EXPERIMENTAL_USE_VARIANTS) +#include +#endif #include #if defined(OPENVX_USE_IX) #include @@ -108,10 +120,12 @@ */ #define VX_INT_MAX_PATH (256) +#ifndef EXPERIMENTAL_USE_TARGET /* Otherwise already defined in */ /*! \brief Defines the maximum number of characters in a target string. * \ingroup group_target */ #define VX_MAX_TARGET_NAME (64) +#endif #ifndef VX_MAX_STRUCT_NAME #define VX_MAX_STRUCT_NAME (64) @@ -125,7 +139,7 @@ /*! \brief Maximum number of references in the context. * \ingroup group_int_defines */ -#define VX_INT_MAX_REF (4096) +#define VX_INT_MAX_REF (1024) /*! 
\brief Maximum number of user defined structs/ * \ingroup group_int_defines @@ -195,7 +209,7 @@ /*! \brief Used to determine if a type is a struct. * \ingroup group_int_macros */ -#define VX_TYPE_IS_STRUCT(type) ((type) >= VX_TYPE_RECTANGLE && (type) < VX_TYPE_VENDOR_STRUCT_END) +#define VX_TYPE_IS_STRUCT(type) ((type) >= VX_TYPE_RECTANGLE && (type) < VX_TYPE_KHRONOS_STRUCT_MAX) /*! \brief Used to determine if a type is a data object. * \ingroup group_int_macros @@ -206,7 +220,7 @@ /*! \brief Used to determine if a type is an object. * \ingroup group_int_macros */ -#define VX_TYPE_IS_OBJECT(type) ((type) >= VX_TYPE_REFERENCE && (type) < VX_TYPE_VENDOR_OBJECT_END) +#define VX_TYPE_IS_OBJECT(type) ((type) >= VX_TYPE_REFERENCE && (type) < VX_TYPE_KHRONOS_OBJECT_END) /*! A parameter checker for size and alignment. * \ingroup group_int_macros @@ -388,7 +402,9 @@ typedef struct _vx_processor_t { // forward declarations struct _vx_threadpool_t; struct _vx_threadpool_worker_t; +#if !defined(EXPERIMENTAL_USE_TARGET) typedef struct _vx_target *vx_target; +#endif /*! \brief The function pointer to the worker function. * \param [in] worker The per-thread worker data structure. @@ -518,6 +534,10 @@ typedef struct _vx_reference { vx_int32 delay_slot_index; /*! \brief This indicates that if the object is virtual whether it is accessible at the moment or not */ vx_bool is_accessible; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief An OpenCL event that the framework can block upon for this object */ + cl_event event; +#endif /*! \brief The reference name */ char name[VX_MAX_REFERENCE_NAME]; } vx_reference_t; @@ -694,7 +714,8 @@ typedef struct _vx_kernel { vx_uint32 affinity; #ifdef OPENVX_KHR_TILING /*! \brief The tiling function pointer interface */ - vx_tiling_kernel_f tiling_function; + vx_tiling_kernel_f tilingfast_function; + vx_tiling_kernel_f tilingflexible_function; #endif } vx_kernel_t; @@ -730,6 +751,9 @@ typedef vx_status (*vx_target_deinit_f)(vx_target target); typedef vx_status (*vx_target_supports_f)(vx_target target, vx_char targetName[VX_MAX_TARGET_NAME], vx_char kernelName[VX_MAX_TARGET_NAME], +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME], +#endif vx_uint32 *pIndex); /*! \brief Processes the array of nodes supplied. @@ -784,6 +808,7 @@ typedef vx_kernel (*vx_target_addtilingkernel_f)(vx_target target, vx_tiling_kernel_f flexible_func_ptr, vx_tiling_kernel_f fast_func_ptr, vx_uint32 num_parameters, + vx_kernel_validate_f validate, vx_kernel_input_validate_f input, vx_kernel_output_validate_f output); #endif @@ -810,14 +835,32 @@ typedef struct _vx_target_funcs_t { #endif } vx_target_funcs_t; +#ifndef EXPERIMENTAL_USE_TARGET /* Otherwise already defined in */ enum vx_ext_target_type_e { VX_TYPE_TARGET = 0x816,/*!< \brief A \ref vx_target */ }; +#endif /*! \brief The priority list of targets. * \ingroup group_int_target */ enum vx_target_priority_e { +#if defined(OPENVX_USE_TILING) + /*! \brief Defines the priority of the TILING Target */ + VX_TARGET_PRIORITY_TILING, +#endif +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief Defines the priority of the OpenCL Target */ + VX_TARGET_PRIORITY_OPENCL, +#endif +#if defined(EXPERIMENTAL_USE_OPENMP) + /*! \brief Defines the priority of the OpenMP targets */ + VX_TARGET_PRIORITY_OPENMP, +#endif + /*! \brief Defines the priority of the VENUM targets */ +#if defined(EXPERIMENTAL_USE_VENUM) + VX_TARGET_PRIORITY_VENUM, +#endif /*! \brief Defines the priority of the C model target */ VX_TARGET_PRIORITY_C_MODEL, /*! 
\brief Defines the maximum priority */ @@ -976,6 +1019,19 @@ typedef struct _vx_context { } user_structs[VX_INT_MAX_USER_STRUCTS]; /*! \brief The worker pool used to parallelize the graph*/ vx_threadpool_t *workers; +#if defined(EXPERIMENTAL_USE_OPENCL) +#define CL_MAX_PLATFORMS (1) +#define CL_MAX_DEVICES (2) +#define CL_MAX_KERNELS (50) + /*! \brief The array of platform ids */ + cl_platform_id platforms[CL_MAX_PLATFORMS]; + /*! \brief The number of platform ids */ + cl_uint num_platforms; + cl_device_id devices[CL_MAX_PLATFORMS][CL_MAX_DEVICES]; + cl_uint num_devices[CL_MAX_PLATFORMS]; + cl_context global[CL_MAX_PLATFORMS]; + cl_command_queue queues[CL_MAX_PLATFORMS][CL_MAX_DEVICES]; +#endif /*! \brief The immediate mode border */ vx_border_t imm_border; /*! \brief The unsupported border mode policy for immediate mode functions */ @@ -1129,15 +1185,23 @@ typedef struct _vx_memory_t { /*! \brief The array of pointers (one per plane for images) */ vx_uint8* ptrs[VX_PLANE_MAX]; /*! \brief The number of dimensions per ptr */ - vx_uint32 ndims; + vx_int32 ndims; /*! \brief The dimensional values per ptr */ - vx_uint32 dims[VX_PLANE_MAX][VX_DIM_MAX]; + vx_int32 dims[VX_PLANE_MAX][VX_DIM_MAX]; /*! \brief The per ptr stride values per dimension */ vx_int32 strides[VX_PLANE_MAX][VX_DIM_MAX]; /*! \brief The write locks. Used by Access/Commit pairs on usages which have * VX_WRITE_ONLY or VX_READ_AND_WRITE flag parts. Only single writers are permitted. */ vx_sem_t locks[VX_PLANE_MAX]; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief This contains the OpenCL memory references */ + cl_mem hdls[VX_PLANE_MAX]; + /*! \brief This describes the type of memory allocated with OpenCL */ + cl_mem_object_type cl_type; + /*! \brief This describes the image format (if it is an image) */ + cl_image_format cl_format; +#endif } vx_memory_t; /*! \brief The internal representation of a \ref vx_image @@ -1174,6 +1238,10 @@ typedef struct _vx_image { vx_rectangle_t region; /*! \brief The memory type */ vx_enum memory_type; +#if defined(EXPERIMENTAL_USE_OPENCL) + /*! \brief This describes the type of OpenCL Image that maps to this image (if applicable). */ + cl_image_format cl_format; +#endif } vx_image_t; /*! \brief The internal representation of a \ref vx_array diff --git a/sample/targets/CMakeLists.txt b/sample/targets/CMakeLists.txt index 0816dc0..2d4a98b 100644 --- a/sample/targets/CMakeLists.txt +++ b/sample/targets/CMakeLists.txt @@ -16,3 +16,7 @@ add_subdirectory( c_model ) +if (OPENVX_USE_TILING) + add_subdirectory( tiling ) +endif (OPENVX_USE_TILING) + diff --git a/sample/targets/c_model/vx_interface.c b/sample/targets/c_model/vx_interface.c index 2c19a7f..98dbc55 100644 --- a/sample/targets/c_model/vx_interface.c +++ b/sample/targets/c_model/vx_interface.c @@ -15,12 +15,6 @@ * limitations under the License. */ -/*! 
- * \file - * \brief The C-Model Target Interface - * \author Erik Rainey - */ - #include #include @@ -358,7 +352,7 @@ vx_kernel vxTargetAddTilingKernel(vx_target target, kernel = &(target->kernels[k]); if (kernel->enabled == vx_false_e) { - kernel->tiling_function = fast_func_ptr; + kernel->tilingfast_function = fast_func_ptr; ownInitializeKernel(target->base.context, kernel, enumeration, vxTilingKernel, name, @@ -511,7 +505,7 @@ vx_status VX_CALLBACK vxTilingKernel(vx_node node, const vx_reference parameters { //printf("Calling Tile{%u,%u} with %s\n", tx, ty, ((vx_node_t *)node)->kernel->name); tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; - ((vx_node_t *)node)->kernel->tiling_function(params, tile_memory, size); + ((vx_node_t *)node)->kernel->tilingfast_function(params, tile_memory, size); } else { diff --git a/sample/targets/opencl/CMakeLists.txt b/sample/targets/opencl/CMakeLists.txt new file mode 100644 index 0000000..b558883 --- /dev/null +++ b/sample/targets/opencl/CMakeLists.txt @@ -0,0 +1,55 @@ +# + +# Copyright (c) 2011-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +if ( EXPERIMENTAL_USE_OPENCL ) + # set target name + set( TARGET_NAME openvx-opencl ) + + include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/debug + ${OPENCL_INCLUDE_PATH} ) + + set( INVERTED_COMMA "\"" ) + set( CL_SOURCE_DIR ${INVERTED_COMMA}${CMAKE_SOURCE_DIR}/kernels/opencl${INVERTED_COMMA} ) + + add_definitions( -DVX_CL_SOURCE_DIR=${CL_SOURCE_DIR} ) + + FIND_SOURCES() + + if ((WIN32) OR (CYGWIN)) + set( DEF_FILE openvx-target.def ) + endif ((WIN32) OR (CYGWIN)) + + # add a target named ${TARGET_NAME} + add_library (${TARGET_NAME} SHARED ${SOURCE_FILES} ${DEF_FILE}) + + if (CYGWIN) + set_target_properties( ${TARGET_NAME} PROPERTIES LINK_FLAGS ${CMAKE_CURRENT_SOURCE_DIR}/${DEF_FILE} ) + endif (CYGWIN) + + target_link_libraries( ${TARGET_NAME} openvx ) + + install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION bin ) + +endif ( EXPERIMENTAL_USE_OPENCL ) \ No newline at end of file diff --git a/sample/targets/opencl/concerto.mak b/sample/targets/opencl/concerto.mak new file mode 100644 index 0000000..43cfbeb --- /dev/null +++ b/sample/targets/opencl/concerto.mak @@ -0,0 +1,38 @@ +# + +# Copyright (c) 2012-2017 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + + +include $(PRELUDE) +TARGET := openvx-opencl +TARGETTYPE := dsmo +DEFFILE := openvx-target.def +CSOURCES = $(call all-c-files) +IDIRS += $(HOST_ROOT)/$(OPENVX_SRC)/include $(HOST_ROOT)/debug +SHARED_LIBS += openvx +DEFS += VX_CL_SOURCE_DIR="\"$(HOST_ROOT)/kernels/opencl\"" +ifeq ($(TARGET_BUILD),debug) +# This is to use the local headers instead of system defined ones it's temporary +DEFS += VX_INCLUDE_DIR="\"$(HOST_ROOT)/include\"" +endif +ifneq (,$(findstring EXPERIMENTAL_USE_OPENCL,$(SYSDEFS))) +USE_OPENCL:=true +else +SKIPBUILD:=1 +endif +include $(FINALE) + diff --git a/sample/targets/opencl/openvx-target.def b/sample/targets/opencl/openvx-target.def new file mode 100644 index 0000000..19bee4f --- /dev/null +++ b/sample/targets/opencl/openvx-target.def @@ -0,0 +1,9 @@ +LIBRARY "openvx-opencl.dll" +VERSION 1.0 +EXPORTS + vxTargetInit + vxTargetDeinit + vxTargetVerify + vxTargetProcess + vxTargetSupports + vxTargetAddKernel diff --git a/sample/targets/opencl/vx_bitwise.c b/sample/targets/opencl/vx_bitwise.c new file mode 100644 index 0000000..c900f48 --- /dev/null +++ b/sample/targets/opencl/vx_bitwise.c @@ -0,0 +1,409 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +/* + * The three bitwise kernels with binary parameters have the same parameter domain so + * let's just have one set of validators. 
+ */ + +static vx_status VX_CALLBACK vxBinaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + status = VX_SUCCESS; + vxReleaseImage(&images[1]); + vxReleaseImage(&images[0]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxBinaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + if (param0) + { + vx_image image0 = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &image0, sizeof(image0)); + /* + * When passing on the geometry to the output image, we only look at image 0, as + * both input images are verified to match, at input validation. 
+ */ + if (image0) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(image0, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image0, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&image0); + } + vxReleaseParameter(¶m0); + } + } + return status; +} + +static vx_param_description_t binary_bitwise_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = {0,0,0}; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + } + } + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + vx_reference ref; + /* enqueue a read on all output data */ + if (num == 3) + ref = node->parameters[2]; + else // Not kernel + ref = node->parameters[1]; + + vx_memory_t *memory = NULL; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == 
VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++],&ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxAndKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t and_kernel = { + { + VX_KERNEL_AND, + "org.khronos.openvx.and", + vxAndKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_and.cl", + "vx_and", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxOrKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t orr_kernel = { + { + VX_KERNEL_OR, + "org.khronos.openvx.or", + vxOrKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_orr.cl", + "vx_orr", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxXorKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t xor_kernel = { + { + VX_KERNEL_XOR, + "org.khronos.openvx.xor", + vxXorKernel, + binary_bitwise_kernel_params, dimof(binary_bitwise_kernel_params), + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_xor.cl", + "vx_xor", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +/* The Not kernel is an unary operator, requiring separate validators. 
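+   They only accept a single VX_DF_IMAGE_U8 input and propagate its geometry to the output meta-format.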
*/ + +static vx_status VX_CALLBACK vxUnaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxUnaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image inimage = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &inimage, sizeof(inimage)); + if (inimage) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(inimage, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(inimage, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&inimage); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t unary_bitwise_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxNotKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t not_kernel = { + { + VX_KERNEL_NOT, + "org.khronos.openvx.not", + vxNotKernel, + unary_bitwise_kernel_params, dimof(unary_bitwise_kernel_params), + NULL, + vxUnaryBitwiseInputValidator, + vxUnaryBitwiseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_not.cl", + "vx_not", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_convolution.c b/sample/targets/opencl/vx_convolution.c new file mode 100644 index 0000000..33a3308 --- /dev/null +++ b/sample/targets/opencl/vx_convolution.c @@ -0,0 +1,340 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "vx_interface.h" + +#define C_MAX_CONVOLUTION_DIM (15) + +#if (C_MAX_CONVOLUTION_DIM != VX_INT_MAX_CONVOLUTION_DIM) +#if defined(_WIN32) +#pragma error("C Model does not support VX required Convolution Size") +#elif defined(__GNUC__) +#error "C Model does not support VX required Convolution Size" +#endif +#endif + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set conv_mat + vx_size conv_width, conv_height; + vx_int16 _conv_mat[C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM] = { 0 }; + vx_uint32 scale = 1; + + vx_convolution conv = (vx_convolution)parameters[1]; + + status |= vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &conv_width, sizeof(conv_width)); + status |= vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &conv_height, sizeof(conv_height)); + status |= vxQueryConvolution(conv, VX_CONVOLUTION_SCALE, &scale, sizeof(scale)); + + status |= vxCopyConvolutionCoefficients(conv, _conv_mat, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &conv_width); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &conv_height); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_uint32), &scale); + + short matrix_size = C_MAX_CONVOLUTION_DIM * C_MAX_CONVOLUTION_DIM * sizeof(short); + + cl_mem conv_mat = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, matrix_size, _conv_mat, &err); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + conv_mat, + CL_TRUE, + 0, + matrix_size, + _conv_mat, + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &conv_mat); + + //Set Output + ref = node->parameters[2]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + 
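+    /* the NDRange is sized from the output image, so one work-item is launched per destination pixel */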
work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + ref = node->parameters[0]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + /* enqueue a read on all output data */ + ref = node->parameters[2]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + ref = node->parameters[2]; + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + + clReleaseMemObject(conv_mat); + + return status; +} + +static vx_status VX_CALLBACK vxConvolveInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + +#if defined(EXPERIMENTAL_USE_S16) + if( (format == VX_DF_IMAGE_U8) || (format == VX_DF_IMAGE_S16) ) +#else + if (format == VX_DF_IMAGE_U8) +#endif + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + if (index == 1) + { + vx_image input = 0; + vx_convolution conv = 0; + + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, index); + + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param1, VX_PARAMETER_REF, &conv, sizeof(conv)); + if (input && conv) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_size dims[2] = { 0, 0 }; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &dims[0], sizeof(dims[0])); + vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &dims[1], sizeof(dims[1])); + + if ((dims[0] <= VX_INT_MAX_CONVOLUTION_DIM) && + (dims[1] <= VX_INT_MAX_CONVOLUTION_DIM) && + (width >= dims[0]) && + (height >= dims[1])) + { + status = VX_SUCCESS; + } + + vxReleaseImage(&input); + vxReleaseConvolution(&conv); + } + + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + + return status; +} + +static vx_status VX_CALLBACK vxConvolveOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter 
params[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)params[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)params[1]) == VX_SUCCESS)) + { + vx_image input = 0; + vx_image output = 0; + vxQueryParameter(params[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(params[1], VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + vx_df_image output_format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryImage(output, VX_IMAGE_FORMAT, &output_format, sizeof(output_format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = output_format == VX_DF_IMAGE_U8 ? VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶ms[0]); + vxReleaseParameter(¶ms[1]); + } + } + return status; +} + +static vx_status VX_CALLBACK vxConvolveKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL Convolve\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_param_description_t convolution_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_CONVOLUTION, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_cl_kernel_description_t convolution_kernel = { + { + VX_KERNEL_CUSTOM_CONVOLUTION, + "org.khronos.openvx.custom_convolution", + vxConvolveKernel, + convolution_kernel_params, dimof(convolution_kernel_params), + NULL, + vxConvolveInputValidator, + vxConvolveOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_convolve.cl", + "vx_Convolve", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + diff --git a/sample/targets/opencl/vx_filter.c b/sample/targets/opencl/vx_filter.c new file mode 100644 index 0000000..02f320e --- /dev/null +++ b/sample/targets/opencl/vx_filter.c @@ -0,0 +1,304 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set Output + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + 
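+    /* gather the events attached to the output/bidirectional references so they can be waited on below */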
re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + + +static vx_status VX_CALLBACK vxFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_param_description_t filter_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxBox3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t box3x3_clkernel = { + { + VX_KERNEL_BOX_3x3, + "org.khronos.openvx.box3x3", + vxBox3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_box3x3.cl", + "vx_box3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxGaussian3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t gaussian3x3_clkernel = { + { + VX_KERNEL_GAUSSIAN_3x3, + "org.khronos.openvx.gaussian3x3", + vxGaussian3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_gaussian3x3.cl", + "vx_gaussian3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +static vx_status VX_CALLBACK vxMedian3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + 
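The median descriptor below points at vx_median3x3.cl, which is not included in this excerpt. A minimal sketch of the entry point it would need, assuming the same argument order the launcher above pushes for every filter kernel (input strides and buffer, border mode, constant border value, output strides and buffer, matching vx_box3x3.cl), with border handling reduced to replicate-style clamping for brevity:

__kernel void vx_median3x3(int ssx, int ssy, __global uchar *src,
                           int bordermode, uchar const_value,
                           int dsx, int dsy, __global uchar *dst)
{
    /* sketch only: bordermode/const_value are accepted but ignored; coordinates are clamped instead */
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    const int w = (int)get_global_size(0);
    const int h = (int)get_global_size(1);
    uchar v[9];
    int i, j, idx = 0;

    /* collect the 3x3 neighbourhood, clamping at the image edges */
    for (j = -1; j <= 1; ++j)
        for (i = -1; i <= 1; ++i)
        {
            int xx = clamp(x + i, 0, w - 1);
            int yy = clamp(y + j, 0, h - 1);
            v[idx++] = src[yy * ssy + xx * ssx];
        }

    /* insertion sort of the nine samples; the median is the middle element */
    for (i = 1; i < 9; ++i)
    {
        uchar key = v[i];
        for (j = i - 1; j >= 0 && v[j] > key; --j)
            v[j + 1] = v[j];
        v[j + 1] = key;
    }

    dst[y * dsy + x * dsx] = v[4];
}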
+vx_cl_kernel_description_t median3x3_kernel = { + { + VX_KERNEL_MEDIAN_3x3, + "org.khronos.openvx.median_3x3", + vxMedian3x3Kernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_median3x3.cl", + "vx_median3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_gradients.c b/sample/targets/opencl/vx_gradients.c new file mode 100644 index 0000000..8ccba30 --- /dev/null +++ b/sample/targets/opencl/vx_gradients.c @@ -0,0 +1,313 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set grad_x + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + int stride_x = memory->strides[pln][VX_DIM_X] / 2; + int stride_y 
= memory->strides[pln][VX_DIM_Y] / 2; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_y); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + + //Set grad_y + ref = node->parameters[2]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + int stride_x1 = memory->strides[pln][VX_DIM_X] / 2; + int stride_y1 = memory->strides[pln][VX_DIM_Y] / 2; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_x1); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &stride_y1); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + ref = node->parameters[2]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_param_description_t sobel3x3_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +static vx_status VX_CALLBACK ownSobel3x3Kernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} /* ownSobel3x3Kernel() */ + +static +vx_status VX_CALLBACK own_sobel3x3_validator(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) +{ + vx_status status = 
VX_ERROR_INVALID_PARAMETERS; + + if (NULL != node && NULL != parameters && num == dimof(sobel3x3_kernel_params) && NULL != metas) + { + vx_parameter param1 = vxGetParameterByIndex(node, 0); + vx_parameter param2 = vxGetParameterByIndex(node, 1); + vx_parameter param3 = vxGetParameterByIndex(node, 2); + + if (VX_SUCCESS == vxGetStatus((vx_reference)param1) && + ( (VX_SUCCESS == vxGetStatus((vx_reference)param2)) || (VX_SUCCESS == vxGetStatus((vx_reference)param3)) )) + { + vx_uint32 src_width = 0; + vx_uint32 src_height = 0; + vx_df_image src_format = 0; + vx_image input = 0; + + status = vxQueryParameter(param1, VX_PARAMETER_REF, &input, sizeof(input)); + + status |= vxQueryImage(input, VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxQueryImage(input, VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxQueryImage(input, VX_IMAGE_FORMAT, &src_format, sizeof(src_format)); + + /* validate input image */ + if (VX_SUCCESS == status) + { + if (src_width >= 3 && src_height >= 3 && src_format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + else + status = VX_ERROR_INVALID_PARAMETERS; + } + + /* validate output images */ + if (VX_SUCCESS == status) + { + vx_enum dst_format = VX_DF_IMAGE_S16; + + if (NULL == metas[1] && NULL == metas[2]) + status = VX_ERROR_INVALID_PARAMETERS; + + if (VX_SUCCESS == status && NULL != metas[1]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + + if (VX_SUCCESS == status && NULL != metas[2]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + } + + if (NULL != input) + vxReleaseImage(&input); + + if (NULL != param1) + vxReleaseParameter(¶m1); + + if (NULL != param2) + vxReleaseParameter(¶m2); + + if (NULL != param3) + vxReleaseParameter(¶m3); + } + } /* if ptrs non NULL */ + + return status; +} /* own_sobel3x3_validator() */ + + +vx_cl_kernel_description_t sobel3x3_clkernel = { + { + VX_KERNEL_SOBEL_3x3, + "org.khronos.openvx.sobel_3x3", + ownSobel3x3Kernel, + sobel3x3_kernel_params, dimof(sobel3x3_kernel_params), + own_sobel3x3_validator, + NULL, + NULL, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_sobel3x3.cl", + "vx_sobel3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + diff --git a/sample/targets/opencl/vx_interface.c b/sample/targets/opencl/vx_interface.c new file mode 100644 index 0000000..9bdfdc6 --- /dev/null +++ b/sample/targets/opencl/vx_interface.c @@ -0,0 +1,817 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include "vx_internal.h" +#include +#include +#include + +static const vx_char name[VX_MAX_TARGET_NAME] = "pc.opencl"; + +/*! \brief Prototype for assigning to kernel */ +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const const vx_reference *parameters, vx_uint32 num); + +static vx_cl_kernel_description_t *cl_kernels[] = +{ + &box3x3_clkernel, + &and_kernel, + &xor_kernel, + &orr_kernel, + ¬_kernel, + &gaussian3x3_clkernel, + &sobel3x3_clkernel, + &erode3x3_kernel, + &dilate3x3_kernel, + &median3x3_kernel, + &nonlinearfilter_kernel, + &phase_kernel, + &warp_affine_kernel, + &warp_perspective_kernel, + &convolution_kernel, +}; + +static vx_uint32 num_cl_kernels = dimof(cl_kernels); + +static void VX_CALLBACK vxcl_platform_notifier(const char *errinfo, + const void *private_info, + size_t cb, + void *user_data) +{ + //vx_target target = (vx_target)user_data; + VX_PRINT(VX_ZONE_ERROR, "%s\n", errinfo); +} + +vx_status vxTargetInit(vx_target_t *target) +{ + vx_status status = VX_ERROR_NO_RESOURCES; + cl_int err = 0; + vx_context context = target->base.context; + cl_uint p, d, k; + char *vx_incs = getenv("VX_CL_INCLUDE_DIR"); + //char *vx_incs = "/usr/include -I/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/include -I/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/include/VX"; + char *cl_dirs = getenv("VX_CL_SOURCE_DIR"); + //char *cl_dirs = "/home/pi/khronos-openvx-1.2-on-raspberrypi-3b/openvx_sample/kernels/opencl"; + char cl_args[1024]; + + if(NULL == vx_incs) + return VX_FAILURE; + + snprintf(cl_args, sizeof(cl_args), "-D VX_CL_KERNEL -I %s -I %s %s %s", vx_incs, cl_dirs, +#if !defined(__APPLE__) + "-D CL_USE_LUMINANCE", +#else + "", +#endif +#if defined(VX_INCLUDE_DIR) + "-I "VX_INCLUDE_DIR" " +#else + " " +#endif + ); + printf("flags: %s\n", cl_args); + if (cl_dirs == NULL) { +#ifdef VX_CL_SOURCE_DIR + const char *sdir = VX_CL_SOURCE_DIR; + int len = strlen(sdir); + cl_dirs = malloc(len); + strncpy(cl_dirs, sdir, len); +#else + return status; +#endif + } + + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_OPENCL; + + context->num_platforms = CL_MAX_PLATFORMS; + err = clGetPlatformIDs(CL_MAX_PLATFORMS, context->platforms, NULL); + if (err != CL_SUCCESS) + goto exit; + + for (p = 0; p < context->num_platforms; p++) { + err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, + 0, NULL, &context->num_devices[p]); + err = clGetDeviceIDs(context->platforms[p], CL_DEVICE_TYPE_ALL, + context->num_devices[p] > CL_MAX_DEVICES ? 
CL_MAX_DEVICES : context->num_devices[p], + context->devices[p], NULL); + if (err == CL_SUCCESS) { + cl_context_properties props[] = { + (cl_context_properties)CL_CONTEXT_PLATFORM, + (cl_context_properties)context->platforms[p], + (cl_context_properties)0, + }; + for (d = 0; d < context->num_devices[p]; d++) { + char deviceName[64]; + cl_bool compiler = CL_FALSE; + cl_bool available = CL_FALSE; + cl_bool image_support = CL_FALSE; + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_COMPILER_AVAILABLE, sizeof(cl_bool), &compiler, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + err = clGetDeviceInfo(context->devices[p][d], CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL); + CL_ERROR_MSG(err, "clGetDeviceInfo"); + VX_PRINT(VX_ZONE_INFO, "Device %s (compiler=%s) (available=%s) (images=%s)\n", deviceName, (compiler?"TRUE":"FALSE"), (available?"TRUE":"FALSE"), (image_support?"TRUE":"FALSE")); + } + context->global[p] = clCreateContext(props, + context->num_devices[p], + context->devices[p], + vxcl_platform_notifier, + target, + &err); + if (err != CL_SUCCESS) + break; + + /* check for supported formats */ + if (err == CL_SUCCESS) { + cl_uint f,num_entries = 0u; + cl_image_format *formats = NULL; + cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR; + cl_mem_object_type type = CL_MEM_OBJECT_IMAGE2D; + + err = clGetSupportedImageFormats(context->global[p], flags, type, 0, NULL, &num_entries); + formats = (cl_image_format *)malloc(num_entries * sizeof(cl_image_format)); + err = clGetSupportedImageFormats(context->global[p], flags, type, num_entries, formats, NULL); + for (f = 0; f < num_entries; f++) { + char order[256]; + char datat[256]; + #define CASE_STRINGERIZE2(value, string) case value: strcpy(string, #value); break + switch(formats[f].image_channel_order) { + CASE_STRINGERIZE2(CL_R, order); + CASE_STRINGERIZE2(CL_A, order); + CASE_STRINGERIZE2(CL_RG, order); + CASE_STRINGERIZE2(CL_RA, order); + CASE_STRINGERIZE2(CL_RGB, order); + CASE_STRINGERIZE2(CL_RGBA, order); + CASE_STRINGERIZE2(CL_BGRA, order); + CASE_STRINGERIZE2(CL_ARGB, order); + CASE_STRINGERIZE2(CL_INTENSITY, order); + CASE_STRINGERIZE2(CL_LUMINANCE, order); + CASE_STRINGERIZE2(CL_Rx, order); + CASE_STRINGERIZE2(CL_RGx, order); + CASE_STRINGERIZE2(CL_RGBx, order); + #if defined(CL_VERSION_1_2) && defined(cl_khr_gl_depth_images) + CASE_STRINGERIZE2(CL_DEPTH, order); + CASE_STRINGERIZE2(CL_DEPTH_STENCIL, order); + #if defined(__APPLE__) + CASE_STRINGERIZE2(CL_1RGB_APPLE, order); + CASE_STRINGERIZE2(CL_BGR1_APPLE, order); + CASE_STRINGERIZE2(CL_SFIXED14_APPLE, order); + CASE_STRINGERIZE2(CL_BIASED_HALF_APPLE, order); + CASE_STRINGERIZE2(CL_YCbYCr_APPLE, order); + CASE_STRINGERIZE2(CL_CbYCrY_APPLE, order); + CASE_STRINGERIZE2(CL_ABGR_APPLE, order); + #endif + #endif + default: + sprintf(order, "%x", formats[f].image_channel_order); + break; + } + switch(formats[f].image_channel_data_type) { + CASE_STRINGERIZE2(CL_SNORM_INT8, datat); + CASE_STRINGERIZE2(CL_SNORM_INT16, datat); + CASE_STRINGERIZE2(CL_UNORM_INT8, datat); + CASE_STRINGERIZE2(CL_UNORM_INT16, datat); + CASE_STRINGERIZE2(CL_UNORM_SHORT_565, datat); + CASE_STRINGERIZE2(CL_UNORM_SHORT_555, datat); + CASE_STRINGERIZE2(CL_UNORM_INT_101010, datat); + 
CASE_STRINGERIZE2(CL_SIGNED_INT8, datat); + CASE_STRINGERIZE2(CL_SIGNED_INT16, datat); + CASE_STRINGERIZE2(CL_SIGNED_INT32, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT8, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT16, datat); + CASE_STRINGERIZE2(CL_UNSIGNED_INT32, datat); + CASE_STRINGERIZE2(CL_HALF_FLOAT, datat); + CASE_STRINGERIZE2(CL_FLOAT, datat); + #if defined(CL_VERSION_2_0) + CASE_STRINGERIZE2(CL_UNORM_INT24, datat); + #endif + default: + sprintf(order, "%x", formats[f].image_channel_data_type); + break; + } + VX_PRINT(VX_ZONE_INFO, "%s : %s\n", order, datat); + } + } + + /* create a queue for each device */ + for (d = 0; d < context->num_devices[p]; d++) + { + context->queues[p][d] = clCreateCommandQueue(context->global[p], + context->devices[p][d], + CL_QUEUE_PROFILING_ENABLE, + &err); + if (err == CL_SUCCESS) { + } + } + + /* for each kernel */ + for (k = 0; k < num_cl_kernels; k++) + { + char *sources = NULL; + size_t programSze = 0; + + /* load the source file */ + VX_PRINT(VX_ZONE_INFO, "Joiner: %s\n", FILE_JOINER); + VX_PRINT(VX_ZONE_INFO, "Path: %s\n", VX_CL_SOURCEPATH); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] File: %s\n", k, cl_kernels[k]->sourcepath); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] Name: %s\n", k, cl_kernels[k]->kernelname); + VX_PRINT(VX_ZONE_INFO, "Kernel[%u] ID: %s\n", k, cl_kernels[k]->description.name); + sources = clLoadSources(cl_kernels[k]->sourcepath, &programSze); + /* create a program with this source */ + cl_kernels[k]->program[p] = clCreateProgramWithSource(context->global[p], + 1, + (const char **)&sources, + &programSze, + &err); + if (err == CL_SUCCESS) + { + err = clBuildProgram((cl_program)cl_kernels[k]->program[p], + 1, + (const cl_device_id *)context->devices, + (const char *)cl_args, + NULL, + NULL); + if (err != CL_SUCCESS) + { + CL_BUILD_MSG(err, "Build Error"); + if (err == CL_BUILD_PROGRAM_FAILURE) + { + char log[10][1024]; + size_t logSize = 0; + clGetProgramBuildInfo((cl_program)cl_kernels[k]->program[p], + (cl_device_id)context->devices[p][0], + CL_PROGRAM_BUILD_LOG, + sizeof(log), + log, + &logSize); + printf("%s\n", log); + VX_PRINT(VX_ZONE_ERROR, "%s", log); + } + } + else + { + cl_int k2 = 0; + cl_build_status bstatus = 0; + size_t bs = 0; + err = clGetProgramBuildInfo(cl_kernels[k]->program[p], + context->devices[p][0], + CL_PROGRAM_BUILD_STATUS, + sizeof(cl_build_status), + &bstatus, + &bs); + VX_PRINT(VX_ZONE_INFO, "Status = %d (%d)\n", bstatus, err); + /* get the cl_kernels from the program */ + cl_kernels[k]->num_kernels[p] = 1; + err = clCreateKernelsInProgram(cl_kernels[k]->program[p], + 1, + &cl_kernels[k]->kernels[p], + NULL); + VX_PRINT(VX_ZONE_INFO, "Found %u cl_kernels in %s (%d)\n", cl_kernels[k]->num_kernels[p], cl_kernels[k]->sourcepath, err); + for (k2 = 0; (err == CL_SUCCESS) && (k2 < (cl_int)cl_kernels[k]->num_kernels[p]); k2++) + { + char kName[VX_MAX_KERNEL_NAME]; + size_t size = 0; + err = clGetKernelInfo(cl_kernels[k]->kernels[p], + CL_KERNEL_FUNCTION_NAME, + 0, + NULL, + &size); + err = clGetKernelInfo(cl_kernels[k]->kernels[p], + CL_KERNEL_FUNCTION_NAME, + size, + kName, + NULL); + VX_PRINT(VX_ZONE_INFO, "Kernel %s\n", kName); + if (strncmp(kName, cl_kernels[k]->kernelname, VX_MAX_KERNEL_NAME) == 0) + { + vx_kernel_f kfunc = cl_kernels[k]->description.function; + VX_PRINT(VX_ZONE_INFO, "Linked Kernel %s on target %s\n", cl_kernels[k]->kernelname, target->name); + target->num_kernels++; + target->base.context->num_kernels++; + status = ownInitializeKernel(target->base.context, + &target->kernels[k], + 
cl_kernels[k]->description.enumeration, + (kfunc == NULL ? vxclCallOpenCLKernel : kfunc), + cl_kernels[k]->description.name, + cl_kernels[k]->description.parameters, + cl_kernels[k]->description.numParams, + cl_kernels[k]->description.validate, + cl_kernels[k]->description.input_validate, + cl_kernels[k]->description.output_validate, + cl_kernels[k]->description.initialize, + cl_kernels[k]->description.deinitialize); + if (ownIsKernelUnique(&target->kernels[k]) == vx_true_e) { + target->base.context->num_unique_kernels++; + } else { + VX_PRINT(VX_ZONE_KERNEL, "Kernel %s is NOT unqiue\n", target->kernels[k].name); + } + } + } + } + } + else + { + CL_ERROR_MSG(err, "Program"); + } + free(sources); + } + } + } +exit: + if (err == CL_SUCCESS) { + status = VX_SUCCESS; + } else { + status = VX_ERROR_NO_RESOURCES; + } + return status; +} + +vx_status vxTargetDeinit(vx_target_t *target) +{ + vx_context context = target->base.context; + if (vxGetStatus((vx_reference)context) == VX_SUCCESS) + { + cl_uint p = 0, d = 0; + vx_uint32 k = 0; + for (p = 0; p < context->num_platforms; p++) + { + for (k = 0; k < num_cl_kernels; k++) + { + ownDecrementReference(&target->kernels[k].base, VX_INTERNAL); + clReleaseKernel(cl_kernels[k]->kernels[p]); + clReleaseProgram(cl_kernels[k]->program[p]); + + } + for (d = 0; d < context->num_devices[p]; d++) + { + clReleaseCommandQueue(context->queues[p][d]); + } + clReleaseContext(context->global[p]); + } + } + return VX_SUCCESS; +} + +vx_status vxTargetSupports(vx_target_t *target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], +#if defined(EXPERIMENTAL_USE_VARIANTS) + vx_char variantName[VX_MAX_VARIANT_NAME], +#endif + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "default", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "performance", VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + if (strncmp(kernelName, target->kernels[k].name, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) *pIndex = k; + break; + } + } + } + return status; +} + +vx_action vxTargetProcess(vx_target_t *target, vx_node_t *nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + VX_PRINT(VX_ZONE_GRAPH,"Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->base.context->targets[nodes[n]->affinity].name); + + ownStartCapture(&nodes[n]->perf); + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + ownStopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH,"kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH,"callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +vx_status vxTargetVerify(vx_target_t *target, vx_node_t *node) +{ + vx_status status = VX_SUCCESS; + 
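+    /* no target-specific node verification is required for the OpenCL target */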
return status; +} + +vx_kernel vxTargetAddKernel(vx_target_t *target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, func_ptr, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + return (vx_kernel)kernel; +} + +vx_cl_kernel_description_t *vxclFindKernel(vx_enum enumeration) +{ + vx_cl_kernel_description_t *vxclk = NULL; + vx_uint32 k; + for (k = 0; k < num_cl_kernels; k++) + { + if (enumeration == cl_kernels[k]->description.enumeration) + { + vxclk = cl_kernels[k]; + break; + } + } + return vxclk; +} + +/*! \brief Calls an OpenCL kernel from OpenVX Graph. + * Steps: + * \arg Find the target + * \arg Get the vxcl context + * \arg Find the kernel (to get cl kernel information) + * \arg for each input parameter that is an object, enqueue write + * \arg wait for finish + * \arg for each parameter, SetKernelArg + * \arg call kernel + * \arg wait for finish + * \arg for each output parameter that is an object, enqueue read + * \arg wait for finish + * \note This implementation will attempt to use the External API as much as possible, + * but will cast to internal representation when needed (due to lack of API or + * need for secret information). This is not an optimal OpenCL invocation. 
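+ * \note All buffer traffic below uses blocking clEnqueueWriteBuffer/clEnqueueReadBuffer
+ * (and their image variants), so every execution pays one host-to-device and one
+ * device-to-host copy per buffer parameter.
+ * \note A node is routed to this target by the name registered in vxTargetInit, e.g.:
+ * \code
+ * vx_node node = vxBox3x3Node(graph, src, dst);
+ * vxSetNodeTarget(node, VX_TARGET_STRING, "pc.opencl");
+ * \endcode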
+ */ +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + static struct timeval start, start1, end; + gettimeofday(&start, NULL); + + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + vx_target target = (vx_target_t *)&node->base.context->targets[node->affinity]; + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = {0,0,0}; + size_t work_dim[3]; + //size_t local_dim[3]; + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + /* for each input/bi data object, enqueue it and set the kernel parameters */ + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_enum type = node->kernel->signature.types[pidx]; + vx_memory_t *memory = NULL; + + switch (type) + { + case VX_TYPE_ARRAY: + memory = &((vx_array)ref)->memory; + break; + case VX_TYPE_CONVOLUTION: + memory = &((vx_convolution)ref)->base.memory; + break; + case VX_TYPE_DISTRIBUTION: + memory = &((vx_distribution)ref)->memory; + break; + case VX_TYPE_IMAGE: + memory = &((vx_image)ref)->memory; + break; + case VX_TYPE_LUT: + memory = &((vx_lut_t*)ref)->memory; + break; + case VX_TYPE_MATRIX: + memory = &((vx_matrix)ref)->memory; + break; + //case VX_TYPE_PYRAMID: + // break; + case VX_TYPE_REMAP: + memory = &((vx_remap)ref)->memory; + break; + //case VX_TYPE_SCALAR: + //case VX_TYPE_THRESHOLD: + // break; + } + if (memory) { + for (pln = 0; pln < memory->nptrs; pln++) { + if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { + if (type == VX_TYPE_IMAGE) { + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + // width, height, stride_x, stride_y + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->dims[pln][VX_DIM_Y]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 4 parameters\n"); + } else if (type == VX_TYPE_ARRAY || type == VX_TYPE_LUT) { + vx_array arr = (vx_array)ref; + // sizeof item, active count, capacity + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->item_size); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->num_items); // this is output? 
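+                        // capacity and the element stride follow, then the buffer handle itself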
+ err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&arr->capacity); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &arr->memory.strides[VX_DIM_X]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_buffer as Buffer with 4 parameters\n"); + } else if (type == VX_TYPE_MATRIX) { + vx_matrix mat = (vx_matrix)ref; + // columns, rows + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->columns); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&mat->rows); + VX_PRINT(VX_ZONE_INFO, "Setting vx_matrix as Buffer with 2 parameters\n"); + } else if (type == VX_TYPE_DISTRIBUTION) { + vx_distribution dist = (vx_distribution)ref; + // num, range, offset, num_bins + vx_uint32 num_bins = dist->memory.dims[0][VX_DIM_X]; + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->memory.dims[VX_DIM_X]); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->range_x); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&dist->offset_x); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&num_bins); + } else if (type == VX_TYPE_CONVOLUTION) { + vx_convolution conv = (vx_convolution)ref; + // columns, rows, scale + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.columns); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->base.rows); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint32), (vx_uint32 *)&conv->scale); + } + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + gettimeofday(&start1, NULL); + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + gettimeofday(&end, NULL); + + double costTime = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0) + - ((double)start1.tv_sec * 1000.0 + (double)start1.tv_usec / 1000.0); + + printf("opencl write DMA %f ms\n", costTime); + } + } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { + vx_rectangle_t rect = {0}; + vx_image image = (vx_image)ref; + vxGetValidRegionImage(image, &rect); + size_t origin[3] = {rect.start_x, rect.start_y, 0}; + size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; + /* set the work dimensions */ + work_dim[0] = rect.end_x-rect.start_x; + work_dim[1] = rect.end_y-rect.start_y; + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as image2d_t wd={%zu,%zu} arg:%u\n",work_dim[0], work_dim[1], argidx); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (err != CL_SUCCESS) { + VX_PRINT(VX_ZONE_ERROR, "Error Calling Kernel %s, parameter %u\n", node->kernel->name, pidx); + } + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteImage(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + origin, region, + memory->strides[pln][VX_DIM_Y], + 0, + memory->ptrs[pln], + 0, NULL, + NULL); + CL_ERROR_MSG(err, "clEnqueueWriteImage"); + } + } + } + } else { + if (type == VX_TYPE_SCALAR) { + vx_value_t value; // largest platform atomic + vx_size size = 0ul; + vx_scalar sc = 
(vx_scalar)ref; + vx_enum stype = VX_TYPE_INVALID; + vxCopyScalar(sc, &value, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(sc, VX_SCALAR_TYPE, &stype, sizeof(stype)); + size = ownSizeOfType(stype); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, size, &value); + } + else if (type == VX_TYPE_THRESHOLD) { + vx_enum ttype = 0; + vx_threshold th = (vx_threshold)ref; + vxQueryThreshold(th, VX_THRESHOLD_TYPE, &ttype, sizeof(ttype)); + if (ttype == VX_THRESHOLD_TYPE_BINARY) { + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->value); + } else if (ttype == VX_THRESHOLD_TYPE_RANGE) { + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->lower); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_uint8), &th->upper); + } + } + } + } + we = 0; + for (pidx = 0; pidx < num; pidx++) { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) { + memcpy(&writeEvents[we++],&ref->event, sizeof(cl_event)); + } + } + //local_dim[0] = 1; + //local_dim[1] = 1; + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + vxclk->kernels[plidx], + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + /* enqueue a read on all output data */ + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_enum type = node->kernel->signature.types[pidx]; + + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + vx_memory_t *memory = NULL; + + switch (type) + { + case VX_TYPE_ARRAY: + memory = &((vx_array)ref)->memory; + break; + case VX_TYPE_CONVOLUTION: + memory = &((vx_convolution)ref)->base.memory; + break; + case VX_TYPE_DISTRIBUTION: + memory = &((vx_distribution)ref)->memory; + break; + case VX_TYPE_IMAGE: + memory = &((vx_image)ref)->memory; + break; + case VX_TYPE_LUT: + memory = &((vx_lut_t*)ref)->memory; + break; + case VX_TYPE_MATRIX: + memory = &((vx_matrix)ref)->memory; + break; + //case VX_TYPE_PYRAMID: + // break; + case VX_TYPE_REMAP: + memory = &((vx_remap)ref)->memory; + break; + //case VX_TYPE_SCALAR: + //case VX_TYPE_THRESHOLD: + // break; + } + if (memory) { + for (pln = 0; pln < memory->nptrs; pln++) { + if (memory->cl_type == CL_MEM_OBJECT_BUFFER) { + gettimeofday(&start1, NULL); + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + gettimeofday(&end, NULL); + + double costTime = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0) + - ((double)start1.tv_sec * 1000.0 + (double)start1.tv_usec / 1000.0); + + printf("opencl read DMA %f ms\n", costTime); + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + } else if (memory->cl_type == CL_MEM_OBJECT_IMAGE2D) { + vx_rectangle_t rect = {0}; + vx_image image = (vx_image)ref; + vxGetValidRegionImage(image, &rect); + size_t origin[3] = {rect.start_x, rect.start_y, 0}; + size_t region[3] = {rect.end_x-rect.start_x, rect.end_y-rect.start_y, 1}; + /* set the work dimensions */ + work_dim[0] = rect.end_x-rect.start_x; + work_dim[1] = rect.end_y-rect.start_y; + err = clEnqueueReadImage(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + origin, region, + memory->strides[pln][VX_DIM_Y], + 0, + memory->ptrs[pln], + 1, &node->base.event, + &ref->event); + CL_ERROR_MSG(err, "clEnqueueReadImage"); + 
+ VX_PRINT(VX_ZONE_INFO, "Reading Image wd={%zu,%zu}\n", work_dim[0], work_dim[1]);
+ }
+ }
+ }
+ }
+ }
+ re = 0;
+ for (pidx = 0; pidx < num; pidx++) {
+ vx_reference ref = node->parameters[pidx];
+ vx_enum dir = node->kernel->signature.directions[pidx];
+ if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) {
+ memcpy(&readEvents[re++],&ref->event, sizeof(cl_event));
+ }
+ }
+ err = clFlush(context->queues[plidx][didx]);
+ gettimeofday(&end, NULL);
+
+ double costTime1 = ((double)end.tv_sec * 1000.0 + (double)end.tv_usec / 1000.0)
+ - ((double)start.tv_sec * 1000.0 + (double)start.tv_usec / 1000.0);
+
+ printf("box3x3 core %f ms\n", costTime1);
+ CL_ERROR_MSG(err, "Flush");
+ VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n");
+ clWaitForEvents(re, readEvents);
+ if (err == CL_SUCCESS)
+ status = VX_SUCCESS;
+//exit:
+ VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status);
+ return status;
+}
+
diff --git a/sample/targets/opencl/vx_interface.h b/sample/targets/opencl/vx_interface.h
new file mode 100644
index 0000000..1c8aa69
--- /dev/null
+++ b/sample/targets/opencl/vx_interface.h
@@ -0,0 +1,107 @@
+/*
+
+ * Copyright (c) 2012-2017 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _VX_INTERFACE_H_
+#define _VX_INTERFACE_H_
+
+#include "vx_internal.h"
+
+#if defined(DARWIN)
+#include <OpenCL/opencl.h> /* assumed: the include target was lost in this copy of the patch; Apple's standard OpenCL header */
+#else
+#include <CL/cl.h> /* assumed: the include target was lost in this copy of the patch; the standard Khronos OpenCL header */
+#endif
+
+#include
+
+/*! \brief The maximum number of platforms */
+#define VX_CL_MAX_PLATFORMS (1)
+
+/*! \brief The maximum number of CL devices in the system */
+#define VX_CL_MAX_DEVICES (2)
+
+/*! \brief The maximum number of characters on a line of OpenCL source code */
+#define VX_CL_MAX_LINE_WIDTH (160)
+
+/*! \brief The maximum path name */
+#define VX_CL_MAX_PATH (256)
+
+#ifndef VX_CL_ARGS
+#define VX_CL_ARGS "-I."
+#endif + +#ifndef VX_CL_SOURCEPATH +#define VX_CL_SOURCEPATH "" +#endif + +typedef void (*cl_notifier_f)(cl_program program, void *args); + +typedef void (*cl_platform_notifier_f)(const char *errinfo, + const void *private_info, + size_t cb, + void *user_data); + +typedef struct _vx_cl_context_t { + cl_uint num_platforms; + cl_uint num_devices[VX_CL_MAX_PLATFORMS]; + cl_platform_id platform[VX_CL_MAX_PLATFORMS]; + cl_device_id devices[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + cl_context context[VX_CL_MAX_PLATFORMS]; + cl_context_properties context_props; + cl_command_queue queues[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + struct _vx_cl_kernel_description_t **kernels; + vx_uint32 num_kernels; +} vx_cl_context_t; + +#define INIT_PROGRAMS {0} +#define INIT_KERNELS {0} +#define INIT_NUMKERNELS {0} +#define INIT_RETURNS {{0,0}} + +typedef struct _vx_cl_kernel_description_t { + vx_kernel_description_t description; + char sourcepath[VX_CL_MAX_PATH]; + char kernelname[VX_MAX_KERNEL_NAME]; + cl_program program[VX_CL_MAX_PLATFORMS]; + cl_kernel kernels[VX_CL_MAX_PLATFORMS]; + cl_uint num_kernels[VX_CL_MAX_PLATFORMS]; + cl_int returns[VX_CL_MAX_PLATFORMS][VX_CL_MAX_DEVICES]; + void *reserved; /* for additional data */ +} vx_cl_kernel_description_t; + +vx_cl_kernel_description_t *vxclFindKernel(vx_enum enumeration); + +extern vx_cl_kernel_description_t box3x3_clkernel; +extern vx_cl_kernel_description_t add_clkernel; +extern vx_cl_kernel_description_t and_kernel; +extern vx_cl_kernel_description_t xor_kernel; +extern vx_cl_kernel_description_t orr_kernel; +extern vx_cl_kernel_description_t not_kernel; +extern vx_cl_kernel_description_t gaussian3x3_clkernel; +extern vx_cl_kernel_description_t sobel3x3_clkernel; +extern vx_cl_kernel_description_t erode3x3_kernel; +extern vx_cl_kernel_description_t dilate3x3_kernel; +extern vx_cl_kernel_description_t median3x3_kernel; +extern vx_cl_kernel_description_t nonlinearfilter_kernel; +extern vx_cl_kernel_description_t phase_kernel; +extern vx_cl_kernel_description_t warp_affine_kernel; +extern vx_cl_kernel_description_t warp_perspective_kernel; +extern vx_cl_kernel_description_t convolution_kernel; + +#endif + + diff --git a/sample/targets/opencl/vx_morphology.c b/sample/targets/opencl/vx_morphology.c new file mode 100644 index 0000000..2b1a2c2 --- /dev/null +++ b/sample/targets/opencl/vx_morphology.c @@ -0,0 +1,280 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + argidx = 0; + + //Set Input + vx_reference ref = node->parameters[0]; + vx_enum dir = node->kernel->signature.directions[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set Output + ref = node->parameters[1]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[1]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + 
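+ /* gather the per-parameter events recorded for each output so the host can block on them before returning */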
re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxErode3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + + +static vx_status VX_CALLBACK vxDilate3x3Kernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxMorphologyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMorphologyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t morphology_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_cl_kernel_description_t erode3x3_kernel = { + { + VX_KERNEL_ERODE_3x3, + "org.khronos.openvx.erode_3x3", + vxErode3x3Kernel, + morphology_kernel_params, dimof(morphology_kernel_params), + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_erode3x3.cl", + "vx_erode3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +vx_cl_kernel_description_t dilate3x3_kernel = { + { + VX_KERNEL_DILATE_3x3, + "org.khronos.openvx.dilate_3x3", + vxDilate3x3Kernel, + morphology_kernel_params, dimof(morphology_kernel_params), + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_dilate3x3.cl", + "vx_dilate3x3", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + + diff --git a/sample/targets/opencl/vx_nonlinearfilter.c b/sample/targets/opencl/vx_nonlinearfilter.c 
new file mode 100644 index 0000000..b20b19f --- /dev/null +++ b/sample/targets/opencl/vx_nonlinearfilter.c @@ -0,0 +1,366 @@ +/* + + * Copyright (c) 2016-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +#define C_MAX_NONLINEAR_DIM (9) + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + argidx = 0; + + // Input function + vx_reference ref = node->parameters[0]; + vx_value_t value; // largest platform atomic + vx_size size = 0ul; + vx_scalar sc = (vx_scalar)ref; + vx_enum stype = VX_TYPE_INVALID; + vxCopyScalar(sc, &value, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(sc, VX_SCALAR_TYPE, &stype, sizeof(stype)); + size = ownSizeOfType(stype); + err = clSetKernelArg(kernel, argidx++, size, &value); + + + // Input src + ref = node->parameters[1]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + + // Input mask + ref = node->parameters[2]; + memory = &((vx_matrix)ref)->memory; + + size = ownComputeMemorySize(memory, pln); + + memory->hdls[pln] = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, memory->ptrs[pln], &err); + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + size, + memory->ptrs[pln], + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + // Origin matrix + vx_matrix mask = (vx_matrix)parameters[2]; + vx_coordinates2d_t origin; + status |= vxQueryMatrix(mask, VX_MATRIX_ORIGIN, &origin, sizeof(origin)); + + vx_matrix mat = (vx_matrix)ref; + vx_size rx0 = origin.x; + vx_size ry0 = origin.y; + vx_size rx1 = 
mat->columns - origin.x - 1; + vx_size ry1 = mat->rows - origin.y - 1; + + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &rx0); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &ry0); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &rx1); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_size), &ry1); + + vx_uint8 m[C_MAX_NONLINEAR_DIM * C_MAX_NONLINEAR_DIM]; + status |= vxCopyMatrix(mask, m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + int mask_index = 0; + int count_mask = 0; + for (int r = 0; r < mat->rows; ++r) + { + for (int c = 0; c < mat->columns; ++c, ++mask_index) + { + if (m[mask_index]) + ++count_mask; + } + } + + err = clSetKernelArg(kernel, argidx++, sizeof(int), &mat->rows); + err = clSetKernelArg(kernel, argidx++, sizeof(int), &count_mask); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + + int border_mode = bordermode.mode; + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &border_mode); + + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + + //Set Output + ref = node->parameters[3]; + memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + + we = 0; + + // Input src + ref = node->parameters[1]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + /* enqueue a read on all output data */ + ref = node->parameters[3]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + ref = node->parameters[3]; + + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + 
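+ /* parameter 0 must be a VX_TYPE_ENUM scalar selecting the nonlinear filter function (median, min or max); anything else is rejected below */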
vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum function = 0; + vxCopyScalar(scalar, &function, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((function == VX_NONLINEAR_FILTER_MEDIAN) || + (function == VX_NONLINEAR_FILTER_MIN) || + (function == VX_NONLINEAR_FILTER_MAX)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size cols = 0, rows = 0; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &cols, sizeof(cols)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + if ((rows <= VX_INT_MAX_NONLINEAR_DIM) && + (cols <= VX_INT_MAX_NONLINEAR_DIM) && + (data_type == VX_TYPE_UINT8)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 1); /* we reference the input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t filter_kernel_params[] = { + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +vx_cl_kernel_description_t nonlinearfilter_kernel = { + { + VX_KERNEL_NON_LINEAR_FILTER, + "org.khronos.openvx.non_linear_filter", + vxNonLinearFilterKernel, + filter_kernel_params, dimof(filter_kernel_params), + NULL, + vxNonLinearFilterInputValidator, + vxNonLinearFilterOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_nonlinearfilter.cl", + "vx_nonlinearfilter", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/opencl/vx_phase.c b/sample/targets/opencl/vx_phase.c new file mode 100644 index 0000000..c38d30a --- /dev/null +++ b/sample/targets/opencl/vx_phase.c @@ -0,0 +1,271 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "vx_interface.h" + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pidx, pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + + for (argidx = 0, pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X]; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + //stride_x, stride_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 2 parameters\n"); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + CL_ERROR_MSG(err, "clSetKernelArg"); + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + } + } + + we = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_INPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + } + } + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + pln = 0; + + vx_reference ref; + /* enqueue a read on all output data */ + ref = node->parameters[2]; + + vx_memory_t *memory = NULL; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + for (pidx = 0; pidx < num; pidx++) + { + vx_reference ref = node->parameters[pidx]; + vx_enum dir = node->kernel->signature.directions[pidx]; + if (dir == VX_OUTPUT || dir == VX_BIDIRECTIONAL) + { + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + } + } + err = 
clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + return status; +} + +static +vx_param_description_t phase_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, +}; + +static +vx_status VX_CALLBACK vxPhaseKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} /* vxPhaseKernel() */ + +static vx_status VX_CALLBACK vxPhaseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16 || format == VX_DF_IMAGE_F32) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + + vxReleaseImage(&input0); + } + + vxReleaseParameter(¶m0); + } + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +static vx_status VX_CALLBACK vxPhaseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 2) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +vx_cl_kernel_description_t phase_kernel = +{ + { + VX_KERNEL_PHASE, + "org.khronos.openvx.phase", + vxPhaseKernel, + phase_kernel_params, dimof(phase_kernel_params), + NULL, + vxPhaseInputValidator, + vxPhaseOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_phase.cl", + "vx_phase", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + + diff --git a/sample/targets/opencl/vx_support.c b/sample/targets/opencl/vx_support.c new file mode 100644 index 0000000..aebbd44 --- /dev/null +++ b/sample/targets/opencl/vx_support.c @@ -0,0 +1,264 @@ +/* + + * Copyright (c) 2011-2017 The 
Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#define CASE_STRINGERIZE(err, label, function, file, line) \ + case err: \ + fprintf(stderr, "%s: OpenCL error "#err" at %s in %s:%d\n", label, function, file, line); \ + break + +static size_t flen(FILE *fp) +{ + size_t len = 0; + fseek(fp, 0, SEEK_END); + len = ftell(fp); + fseek(fp, 0, SEEK_SET); + return len; +} + +static size_t flines(FILE *fp) +{ + size_t numLines = 0; + if (fp) { + char line[CL_MAX_LINESIZE]; + fseek(fp, 0, SEEK_SET); + while (fgets(line, sizeof(line), fp) != NULL) { + numLines++; + } + //printf("%lu lines in file %p\n",numLines,fp); + fseek(fp, 0, SEEK_SET); + } + return numLines; +} + +cl_int clBuildError(cl_int build_status, const char *label, const char *function, const char *file, int line) +{ + switch (build_status) + { + case CL_BUILD_SUCCESS: + fprintf(stdout, "%s: Build Successful!\n", label); + break; + CASE_STRINGERIZE(CL_BUILD_NONE, label, function, file, line); + CASE_STRINGERIZE(CL_BUILD_ERROR, label, function, file, line); + CASE_STRINGERIZE(CL_BUILD_IN_PROGRESS, label, function, file, line); + default: + fprintf(stderr, "%s: Unknown build error %d at %s in %s:%d\n", label, build_status, function, file, line); + break; + } + return build_status; +} + +cl_int clPrintError(cl_int err, const char *label, const char *function, const char *file, int line) +{ + switch (err) + { + //CASE_STRINGERIZE(CL_SUCCESS, label, function, file, line); + case CL_SUCCESS: + break; + CASE_STRINGERIZE(CL_BUILD_PROGRAM_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_COMPILER_NOT_AVAILABLE, label, function, file, line); + CASE_STRINGERIZE(CL_DEVICE_NOT_AVAILABLE, label, function, file, line); + CASE_STRINGERIZE(CL_DEVICE_NOT_FOUND, label, function, file, line); + CASE_STRINGERIZE(CL_IMAGE_FORMAT_MISMATCH, label, function, file, line); + CASE_STRINGERIZE(CL_IMAGE_FORMAT_NOT_SUPPORTED, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_INDEX, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_ARG_VALUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BINARY, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BUFFER_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_BUILD_OPTIONS, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_COMMAND_QUEUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_CONTEXT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_DEVICE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_DEVICE_TYPE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_EVENT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_EVENT_WAIT_LIST, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_GL_OBJECT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_GLOBAL_OFFSET, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_HOST_PTR, label, function, file, line); + 
CASE_STRINGERIZE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_IMAGE_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_NAME, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_ARGS, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_KERNEL_DEFINITION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_MEM_OBJECT, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_OPERATION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PLATFORM, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PROGRAM, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_PROGRAM_EXECUTABLE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_QUEUE_PROPERTIES, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_SAMPLER, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_VALUE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_DIMENSION, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_GROUP_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_INVALID_WORK_ITEM_SIZE, label, function, file, line); + CASE_STRINGERIZE(CL_MAP_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_MEM_OBJECT_ALLOCATION_FAILURE, label, function, file, line); + CASE_STRINGERIZE(CL_MEM_COPY_OVERLAP, label, function, file, line); + CASE_STRINGERIZE(CL_OUT_OF_HOST_MEMORY, label, function, file, line); + CASE_STRINGERIZE(CL_OUT_OF_RESOURCES, label, function, file, line); + CASE_STRINGERIZE(CL_PROFILING_INFO_NOT_AVAILABLE, label, function, file, line); + default: + fprintf(stderr, "%s: Unknown error %d at %s in %s:%d\n", label, err, function, file, line); + break; + } + return err; +} + +char *clLoadSources(char *filename, size_t *programSize) +{ + FILE *pFile = NULL; + char *programSource = NULL; + VX_PRINT(VX_ZONE_INFO, "Reading source file %s\n", filename); + pFile = fopen((char *)filename, "rb"); + if (pFile != NULL && programSize) + { + // obtain file size: + fseek(pFile, 0, SEEK_END); + *programSize = ftell(pFile); + rewind(pFile); + + int size = *programSize + 1; + programSource = (char*)malloc(sizeof(char)*(size)); + if (programSource == NULL) + { + fclose(pFile); + free(programSource); + return NULL; + } + + fread(programSource, sizeof(char), *programSize, pFile); + programSource[*programSize] = '\0'; + fclose(pFile); + } + return programSource; +} + +#if defined(EXPERIMENTAL_USE_FNMATCH) +static int vx_source_filter(const struct dirent *de) +{ + if (de && 0 == fnmatch("vx_*.cl", de->d_name, +#if defined(__QNX__) || defined(__APPLE__) + FNM_PERIOD|FNM_PATHNAME)) +#else + FNM_PERIOD|FNM_FILE_NAME)) +#endif + return 1; + else + return 0; +} + +#if defined(__QNX__) +typedef int (*sorting_f)(const void *, const void *); +#else +typedef int (*sorting_f)(const struct dirent **, const struct dirent **); +#endif + +static int name_sort(const struct dirent **a, const struct dirent **b) +{ + return strcmp((*a)->d_name, (*b)->d_name); +} + +cl_program vxLoadProgram(cl_context context, const char *src_dir, cl_int *perr) +{ + cl_program program; + struct dirent **names = NULL; + int i, f, num_lines = 0, cur_line = 0; + int num_files = scandir(src_dir, &names, &vx_source_filter, &name_sort); + size_t *lengths = NULL, lineSize = CL_MAX_LINESIZE; + char **source = NULL; + printf("Matched %d files\n", num_files); + for (f = 0; f < num_files; f++) { + if (names[f]->d_name) { + char 
pathname[CL_MAX_LINESIZE]; + sprintf(pathname, "%s%s", src_dir, names[f]->d_name); + FILE *fp = fopen(pathname, "r"); + if (fp) { + num_lines += flines(fp); + fclose(fp); + } + } + } + printf("Total Number Lines: %d\n", num_lines); + // allocate big array of lines + source = ALLOC(char *, num_lines); + lengths = ALLOC(size_t, num_lines); + for (i = 0; i < num_lines; i++) { + source[i] = ALLOC(char, lineSize); + lengths[i] = lineSize; + } + // load all source into a single array + for (f = 0; f < num_files; f++) { + if (names[f]->d_name) { + char pathname[CL_MAX_LINESIZE]; + sprintf(pathname, "%s%s", src_dir, names[f]->d_name); + FILE *fp = fopen(pathname, "r"); + if (fp) { + printf("Reading from file %s\n", pathname); + do { + if (fgets(source[cur_line], lengths[cur_line], fp) == NULL) + break; + // trim to exact lengths + lengths[cur_line] = strlen(source[cur_line]); + cur_line++; + } while (1); + printf("@ %u lines\n", cur_line); + fclose(fp); + } + } + } + if (num_lines != cur_line) { + fprintf(stderr, "Failed to read in all lines from source files!\n"); + return 0; + } +#if 1 + for (i = 0; i < num_lines; i++) { + printf("%4d [%4zu] %s", i, lengths[i], source[i]); + } +#endif + program = clCreateProgramWithSource(context, num_lines, (const char **)source, lengths, perr); + CL_ERROR_MSG(*perr, "clCreateProgramWithSource"); +#if 0 + if (perr != CL_SUCCESS) { + cl_int err = 0; + size_t src_size = 0; + char *src = NULL; + err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &src_size); + CL_ERROR_MSG(err, "clGetProgramInfo"); + printf("Source Code has %zu bytes\n", src_size); + src = (char *)malloc(src_size); + err = clGetProgramInfo(program, CL_PROGRAM_SOURCE, src_size, src, NULL); + CL_ERROR_MSG(err, "clGetProgramInfo"); + printf("%s", src); + free(src); + } +#endif + return program; +} + +#elif defined(_WIN32) + +cl_program vxLoadProgram(cl_context context, const char *src_dir, cl_int *perr) { + return 0; +} + +#endif + + + diff --git a/sample/targets/opencl/vx_support.h b/sample/targets/opencl/vx_support.h new file mode 100644 index 0000000..64aa1f5 --- /dev/null +++ b/sample/targets/opencl/vx_support.h @@ -0,0 +1,46 @@ +/* + + * Copyright (c) 2011-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include "vx_internal.h" + +#if defined(__ANDROID__) || defined(__linux__) || defined(__QNX__) || defined(__CYGWIN__) || defined(__APPLE__) +#if !defined(__QNX__) && !defined(__APPLE__) +#include +#else +#define __EXT_UNIX_MISC //Needed by QNX version of dirent.h to include scandir() +#endif +#include +#if defined(__APPLE__) +#include +#endif +#include +#include +#define EXPERIMENTAL_USE_FNMATCH +#elif defined(_WIN32) +#define snprintf _snprintf +#endif + +#define CL_MAX_LINESIZE (1024) + +#define ALLOC(type,count) (type *)calloc(count, sizeof(type)) +#define CL_ERROR_MSG(err, string) clPrintError(err, string, __FUNCTION__, __FILE__, __LINE__) +#define CL_BUILD_MSG(err, string) clBuildError(err, string, __FUNCTION__, __FILE__, __LINE__) + +char *clLoadSources(char *filename, size_t *programSize); +cl_int clBuildError(cl_int build_status, const char *label, const char *function, const char *file, int line); +cl_int clPrintError(cl_int err, const char *label, const char *function, const char *file, int line); diff --git a/sample/targets/opencl/vx_warp.c b/sample/targets/opencl/vx_warp.c new file mode 100644 index 0000000..243515e --- /dev/null +++ b/sample/targets/opencl/vx_warp.c @@ -0,0 +1,395 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "vx_interface.h" + + +static vx_status VX_CALLBACK vxclCallOpenCLKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + vx_context context = node->base.context; + + vx_cl_kernel_description_t *vxclk = vxclFindKernel(node->kernel->enumeration); + vx_uint32 pln, didx, plidx, argidx; + cl_int err = 0; + size_t off_dim[3] = { 0,0,0 }; + size_t work_dim[3]; + + cl_event writeEvents[VX_INT_MAX_PARAMS]; + cl_event readEvents[VX_INT_MAX_PARAMS]; + cl_int we = 0, re = 0; + + // determine which platform to use + plidx = 0; + + // determine which device to use + didx = 0; + + cl_kernel kernel = vxclk->kernels[plidx]; + + pln = 0; + argidx = 0; + + vx_reference ref; + + // Input src + ref = node->parameters[0]; + vx_memory_t *memory = &((vx_image)ref)->memory; + + vx_size in_step_x = 1; + vx_size in_step_y = 1; + vx_size in_offset_first_element_in_bytes = 0; + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + + //stride_x, step_x, stride_y, step_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_step_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_step_y); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &in_offset_first_element_in_bytes); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 5 parameters\n"); + + vx_int32 src_width = memory->dims[pln][VX_DIM_X]; + vx_int32 src_height = memory->dims[pln][VX_DIM_Y]; + + CL_ERROR_MSG(err, "clSetKernelArg"); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, + 0, + ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, + NULL, + &ref->event); + + //Set Output + ref = node->parameters[3]; + memory = &((vx_image)ref)->memory; + + vx_size out_step_x = 4; + vx_size out_step_y = memory->strides[pln][VX_DIM_Y]; + vx_size out_offset_first_element_in_bytes = 0; + + /* set the work dimensions */ + work_dim[0] = memory->dims[pln][VX_DIM_X] / 4; + work_dim[1] = memory->dims[pln][VX_DIM_Y]; + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &memory->hdls[pln]); + + //stride_x, step_x, stride_y, step_y + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_X]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_step_x); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &memory->strides[pln][VX_DIM_Y]); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_step_y); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &out_offset_first_element_in_bytes); + VX_PRINT(VX_ZONE_INFO, "Setting vx_image as Buffer with 5 parameters\n"); + + int width = memory->dims[pln][VX_DIM_X]; + int height = memory->dims[pln][VX_DIM_Y]; + //width, height + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &src_width); + err = clSetKernelArg(vxclk->kernels[plidx], argidx++, sizeof(vx_int32), &src_height); + + CL_ERROR_MSG(err, "clSetKernelArg"); + + vx_matrix mask = (vx_matrix)parameters[1]; + + vx_size matrix_size = 9; + + //vx_float32 *m = (vx_float32 *)malloc(matrix_size * sizeof(vx_float32)); + vx_float32 m[9]; + + status |= vxCopyMatrix(mask, m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + cl_mem mat = clCreateBuffer(context->global[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 
matrix_size * sizeof(vx_float32), m, &err); + + err = clEnqueueWriteBuffer(context->queues[plidx][didx], + mat, + CL_TRUE, + 0, + matrix_size * sizeof(vx_float32), + m, + 0, + NULL, + NULL); + + err = clSetKernelArg(kernel, argidx++, sizeof(cl_mem), &mat); + + //Set bordermode + vx_border_t bordermode; + status = vxQueryNode(node, VX_NODE_BORDER, &bordermode, sizeof(bordermode)); + //Set const value for constant boder + uint8_t const_vaule = bordermode.constant_value.U8; + err = clSetKernelArg(kernel, argidx++, sizeof(uint8_t), &const_vaule); + + //Set type + vx_scalar stype = (vx_scalar)parameters[2]; + vx_int32 type = 0; + status |= vxCopyScalar(stype, &type, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + err = clSetKernelArg(kernel, argidx++, sizeof(vx_int32), &type); + + we = 0; + + // Input src + ref = node->parameters[0]; + memcpy(&writeEvents[we++], &ref->event, sizeof(cl_event)); + + err = clEnqueueNDRangeKernel(context->queues[plidx][didx], + kernel, + 2, + off_dim, + work_dim, + NULL, + we, writeEvents, &node->base.event); + + clFinish(context->queues[plidx][didx]); + + CL_ERROR_MSG(err, "clEnqueueNDRangeKernel"); + + /* enqueue a read on all output data */ + ref = node->parameters[3]; + + memory = &((vx_image)ref)->memory; + + err = clEnqueueReadBuffer(context->queues[plidx][didx], + memory->hdls[pln], + CL_TRUE, 0, ownComputeMemorySize(memory, pln), + memory->ptrs[pln], + 0, NULL, NULL); + + CL_ERROR_MSG(err, "clEnqueueReadBuffer"); + + clFinish(context->queues[plidx][didx]); + + re = 0; + + memcpy(&readEvents[re++], &ref->event, sizeof(cl_event)); + + err = clFlush(context->queues[plidx][didx]); + CL_ERROR_MSG(err, "Flush"); + VX_PRINT(VX_ZONE_TARGET, "Waiting for read events!\n"); + clWaitForEvents(re, readEvents); + if (err == CL_SUCCESS) + status = VX_SUCCESS; + + VX_PRINT(VX_ZONE_API, "%s exiting %d\n", __FUNCTION__, status); + + clReleaseMemObject(mat); + + return status; +} + +static vx_status vxWarpInputValidator(vx_node node, vx_uint32 index, vx_size mat_columns) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size rows = 0ul, columns = 0ul; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &columns, sizeof(columns)); + if ((data_type == VX_TYPE_FLOAT32) && (columns == mat_columns) && (rows == 3)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp 
= 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpAffineInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 2); +} + +static vx_status VX_CALLBACK vxWarpPerspectiveInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 3); +} + +static vx_status VX_CALLBACK vxWarpOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) + { + vx_image dst = 0; + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if (dst) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(dst, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + /* output can not be virtual */ + if ((w1 != 0) && (h1 != 0) && (f1 == VX_DF_IMAGE_U8)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w1; + ptr->dim.image.height = h1; + status = VX_SUCCESS; + } + vxReleaseImage(&dst); + } + vxReleaseParameter(&dst_param); + } + } + return status; +} + +static vx_param_description_t warp_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +static vx_status VX_CALLBACK vxWarpAffineKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL WarpAffine\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +static vx_status VX_CALLBACK vxWarpPerspectiveKernel(vx_node node, const vx_reference *parameters, vx_uint32 num) +{ + printf("OpenCL WarpPerspective\n"); + + vx_status status = vxclCallOpenCLKernel(node, parameters, num); + + return status; +} + +vx_cl_kernel_description_t warp_affine_kernel = { + { + VX_KERNEL_WARP_AFFINE, + "org.khronos.openvx.warp_affine", + vxWarpAffineKernel, + warp_kernel_params, dimof(warp_kernel_params), + NULL, + vxWarpAffineInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_warp_affine.cl", + "warp_affine", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; + +vx_cl_kernel_description_t warp_perspective_kernel = { + { + VX_KERNEL_WARP_PERSPECTIVE, + "org.khronos.openvx.warp_perspective", + vxWarpPerspectiveKernel, + warp_kernel_params, dimof(warp_kernel_params), + NULL, + vxWarpPerspectiveInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + }, + VX_CL_SOURCE_DIR""FILE_JOINER"vx_warp_perspective.cl", + "warp_perspective", + INIT_PROGRAMS, + INIT_KERNELS, + INIT_NUMKERNELS, + INIT_RETURNS, + NULL, +}; diff --git a/sample/targets/tiling/CMakeLists.txt b/sample/targets/tiling/CMakeLists.txt new file mode 100644 index 0000000..5dd957a --- /dev/null 
+++ b/sample/targets/tiling/CMakeLists.txt @@ -0,0 +1,50 @@ +# +# Copyright (c) 2011-2018 The Khronos Group Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# set target name +set( TARGET_NAME openvx-tiling_chaining ) + +include_directories( BEFORE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/../../include + ${VX_HEADER_DIR} + ${CMAKE_SOURCE_DIR}/kernels/tiling + ${CMAKE_SOURCE_DIR}/debug + ${CMAKE_SOURCE_DIR}/utils + ) + +FIND_SOURCES() + +if ((WIN32) OR (CYGWIN)) + set( DEF_FILE openvx-target.def ) +endif ((WIN32) OR (CYGWIN)) + +# add a target named ${TARGET_NAME} +add_library (${TARGET_NAME} SHARED ${SOURCE_FILES} ${DEF_FILE}) + +if (CYGWIN) + set_target_properties( ${TARGET_NAME} PROPERTIES LINK_FLAGS ${CMAKE_CURRENT_SOURCE_DIR}/${DEF_FILE} ) +endif (CYGWIN) + +target_link_libraries( ${TARGET_NAME} openvx-debug-lib openvx-extras-lib openvx-helper openvx-tiling_chaining-lib openvx vxu half) + +install ( TARGETS ${TARGET_NAME} + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION bin ) + +set_target_properties( ${TARGET_NAME} PROPERTIES FOLDER ${SAMPLE_TARGETS_FOLDER} ) diff --git a/sample/targets/tiling/openvx-target.def b/sample/targets/tiling/openvx-target.def new file mode 100644 index 0000000..ac029d2 --- /dev/null +++ b/sample/targets/tiling/openvx-target.def @@ -0,0 +1,12 @@ +LIBRARY "openvx-tiling_chaining.dll" +VERSION 1.0 +EXPORTS + vxTargetInit + vxTargetDeinit + vxTargetVerify + vxTargetProcess + vxTargetSupports + vxTargetAddKernel + vxTargetAddTilingKernel + vxPublishKernels + vxUnpublishKernels diff --git a/sample/targets/tiling/vx_absdiff.c b/sample/targets/tiling/vx_absdiff.c new file mode 100644 index 0000000..9ab8d44 --- /dev/null +++ b/sample/targets/tiling/vx_absdiff.c @@ -0,0 +1,146 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxAbsDiffInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0 ) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 + || format == VX_DF_IMAGE_S16 +#if defined(OPENVX_USE_S16) + || format == VX_DF_IMAGE_U16 +#endif + ) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + { + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxAbsDiffOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS)) + { + vx_image images[2]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format = 0; + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + if (width[0] == width[1] && height[0] == height[1] && + (format == VX_DF_IMAGE_U8 + || format == VX_DF_IMAGE_S16 +#if defined(OPENVX_USE_S16) + || format == VX_DF_IMAGE_U16 +#endif + )) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width[0]; + ptr->dim.image.height = height[1]; + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + } + return status; +} +vx_tiling_kernel_t absdiff_kernel = +{ + "org.khronos.openvx.tiling_absdiff", + VX_KERNEL_ABSDIFF_TILING, + NULL, + AbsDiff_image_tiling_flexible, + AbsDiff_image_tiling_fast, + 3, + { { 
VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAbsDiffInputValidator, + vxAbsDiffOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_addsub.c b/sample/targets/tiling/vx_addsub.c new file mode 100644 index 0000000..5c99f8e --- /dev/null +++ b/sample/targets/tiling/vx_addsub.c @@ -0,0 +1,213 @@ +/* + + * Copyright (c) 2013-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxAddSubtractInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format1; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format1, sizeof(format1)); + if (width[0] == width[1] && height[0] == height[1] && + (format1 == VX_DF_IMAGE_U8 || format1 == VX_DF_IMAGE_S16)) + status = VX_SUCCESS; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) /* overflow_policy: truncate or saturate. 
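+                 Roughly, for VX_DF_IMAGE_U8 data: WRAP keeps only the low 8 bits
+                 (200 + 100 -> 44), while SATURATE clamps to the type range
+                 (200 + 100 -> 255). The check below only verifies that the scalar
+                 is a VX_TYPE_ENUM holding one of the two VX_CONVERT_POLICY_*
+                 values; the policy itself is applied by the tiling functions at
+                 run time.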
*/ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxAddSubtractOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + /* + * We need to look at both input images, but only for the format: + * if either is S16 or the output type is not U8, then it's S16. + * The geometry of the output image is copied from the first parameter: + * the input images are known to match from input parameters validation. + */ + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + + /* + * When passing on the geometry to the output image, we only look at + * image 0, as both input images are verified to match, at input + * validation. 
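+     * For example: two U8 inputs with a U8 output image stay U8, while two U8
+     * inputs with an S16 output, or any S16 input, select S16; the code below
+     * accepts either case and only rewrites the output meta format accordingly.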
+ */ + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else + { + outformat = VX_DF_IMAGE_S16; + status = VX_SUCCESS; + } + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} + +vx_tiling_kernel_t add_kernel = { + "org.khronos.openvx.tiling_add", + VX_KERNEL_ADD_TILING, + NULL, + Addition_image_tiling_flexible, + Addition_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAddSubtractInputValidator, + vxAddSubtractOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t subtract_kernel = { + "org.khronos.openvx.tiling_subtract", + VX_KERNEL_SUBTRACT_TILING, + NULL, + Subtraction_image_tiling_flexible, + Subtraction_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxAddSubtractInputValidator, + vxAddSubtractOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_bitwise.c b/sample/targets/tiling/vx_bitwise.c new file mode 100644 index 0000000..69dc6fa --- /dev/null +++ b/sample/targets/tiling/vx_bitwise.c @@ -0,0 +1,236 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxBinaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format[2]; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (width[0] == width[1] && height[0] == height[1] && format[0] == format[1]) + status = VX_SUCCESS; + vxReleaseImage(&images[1]); + vxReleaseImage(&images[0]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxBinaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + if (param0) + { + vx_image image0 = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &image0, sizeof(image0)); + /* + * When passing on the geometry to the output image, we only look at image 0, as + * both input images are verified to match, at input validation. 
+ */ + if (image0) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(image0, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image0, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&image0); + } + vxReleaseParameter(¶m0); + } + } + return status; +} + +vx_tiling_kernel_t And_kernel = +{ + "org.khronos.openvx.tiling_and", + VX_KERNEL_AND_TILING, + NULL, + And_image_tiling_flexible, + And_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Or_kernel = +{ + "org.khronos.openvx.tiling_or", + VX_KERNEL_OR_TILING, + NULL, + Or_image_tiling_flexible, + Or_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Xor_kernel = +{ + "org.khronos.openvx.tiling_xor", + VX_KERNEL_XOR_TILING, + NULL, + Xor_image_tiling_flexible, + Xor_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxBinaryBitwiseInputValidator, + vxBinaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +/* The Not kernel is an unary operator, requiring separate validators. 
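+   Only parameter 0 (a U8 input image) and parameter 1 (the output image) exist,
+   so the cross-image width/height/format checks done at index 1 for the binary
+   operators above do not apply here.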
*/ +static vx_status VX_CALLBACK vxUnaryBitwiseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxUnaryBitwiseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image inimage = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &inimage, sizeof(inimage)); + if (inimage) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(inimage, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(inimage, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&inimage); + } + vxReleaseParameter(¶m); + } + } + return status; +} +vx_tiling_kernel_t Not_kernel = +{ + "org.khronos.openvx.tiling_not", + VX_KERNEL_NOT_TILING, + NULL, + Not_image_tiling_flexible, + Not_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxUnaryBitwiseInputValidator, + vxUnaryBitwiseOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_channelcombine.c b/sample/targets/tiling/vx_channelcombine.c new file mode 100644 index 0000000..1add231 --- /dev/null +++ b/sample/targets/tiling/vx_channelcombine.c @@ -0,0 +1,189 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxChannelCombineInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index < 4) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &image, sizeof(image)); + if (image) + { + vx_df_image format = 0; + vxQueryImage(image, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&image); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxChannelCombineOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 4) + { + vx_uint32 p, width = 0, height = 0; + vx_uint32 uv_x_scale = 0, uv_y_scale = 0; + vx_parameter params[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, 2), + vxGetParameterByIndex(node, 3), + vxGetParameterByIndex(node, index) + }; + vx_bool planes_present[4] = { vx_false_e, vx_false_e, vx_false_e, vx_false_e }; + /* check for equal plane sizes and determine plane presence */ + for (p = 0; p < index; p++) + { + if (vxGetStatus((vx_reference)params[p]) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(params[p], VX_PARAMETER_REF, &image, sizeof(image)); + planes_present[p] = image != 0; + + if (image) + { + uint32_t w = 0, h = 0; + vxQueryImage(image, VX_IMAGE_WIDTH, &w, sizeof(w)); + vxQueryImage(image, VX_IMAGE_HEIGHT, &h, sizeof(h)); + if (width == 0 && height == 0) + { + width = w; + height = h; + } + else if (uv_x_scale == 0 && uv_y_scale == 0) + { + uv_x_scale = width == w ? 1 : (width == 2*w ? 2 : 0); + uv_y_scale = height == h ? 1 : (height == 2*h ? 
2 : 0); + if (uv_x_scale == 0 || uv_y_scale == 0 || uv_y_scale > uv_x_scale) + { + status = VX_ERROR_INVALID_DIMENSION; + vxAddLogEntry((vx_reference)image, status, "Input image channel %u does not match in dimensions!\n", p); + goto exit; + } + } + else if (width != w * uv_x_scale || height != h * uv_y_scale) + { + status = VX_ERROR_INVALID_DIMENSION; + vxAddLogEntry((vx_reference)image, status, "Input image channel %u does not match in dimensions!\n", p); + goto exit; + } + vxReleaseImage(&image); + } + } + } + if (params[index]) + { + vx_image output = 0; + vxQueryParameter(params[index], VX_PARAMETER_REF, &output, sizeof(output)); + if (output) + { + vx_df_image format = VX_DF_IMAGE_VIRT; + vx_bool supported_format = vx_true_e; + vx_bool correct_planes = planes_present[0] && planes_present[1] && planes_present[2]; + + vxQueryImage(output, VX_IMAGE_FORMAT, &format, sizeof(format)); + switch (format) + { + case VX_DF_IMAGE_RGB: + case VX_DF_IMAGE_YUV4: + correct_planes = correct_planes && uv_y_scale == 1 && uv_x_scale == 1; + break; + case VX_DF_IMAGE_RGBX: + correct_planes = correct_planes && planes_present[3] && uv_y_scale == 1 && uv_x_scale == 1; + break; + case VX_DF_IMAGE_YUYV: + case VX_DF_IMAGE_UYVY: + correct_planes = correct_planes && uv_y_scale == 1 && uv_x_scale == 2; + break; + case VX_DF_IMAGE_NV12: + case VX_DF_IMAGE_NV21: + case VX_DF_IMAGE_IYUV: + correct_planes = correct_planes && uv_y_scale == 2 && uv_x_scale == 2; + break; + default: + supported_format = vx_false_e; + } + if (supported_format) + { + if (correct_planes) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + } + else + { + VX_PRINT(VX_ZONE_API, "Valid format but missing planes!\n"); + } + } + vxReleaseImage(&output); + } + } +exit: + for (p = 0; p < dimof(params); p++) + { + if (params[p]) + { + vxReleaseParameter(¶ms[p]); + } + } + } + VX_PRINT(VX_ZONE_API, "%s:%u returned %d\n", __FUNCTION__, index, status); + return status; +} + +vx_tiling_kernel_t channelcombine_kernel = +{ + "org.khronos.openvx.tiling_channel_combine", + VX_KERNEL_CHANNEL_COMBINE_TILING, + NULL, + ChannelCombine_image_tiling_flexible, + ChannelCombine_image_tiling_fast, + 5, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxChannelCombineInputValidator, + vxChannelCombineOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_colorconvert.c b/sample/targets/tiling/vx_colorconvert.c new file mode 100644 index 0000000..eefab0b --- /dev/null +++ b/sample/targets/tiling/vx_colorconvert.c @@ -0,0 +1,190 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxColorConvertInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_SUCCESS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image image = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &image, sizeof(image)); + if (image) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(image, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(image, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(image, VX_IMAGE_HEIGHT, &height, sizeof(height)); + // check to make sure the input format is supported. + switch (format) + { + case VX_DF_IMAGE_RGB: /* 8:8:8 interleaved */ + case VX_DF_IMAGE_RGBX: /* 8:8:8:8 interleaved */ + case VX_DF_IMAGE_NV12: /* 4:2:0 co-planar*/ + case VX_DF_IMAGE_NV21: /* 4:2:0 co-planar*/ + case VX_DF_IMAGE_IYUV: /* 4:2:0 planar */ + if (height & 1) + { + status = VX_ERROR_INVALID_DIMENSION; + break; + } + /* no break */ + case VX_DF_IMAGE_YUYV: /* 4:2:2 interleaved */ + case VX_DF_IMAGE_UYVY: /* 4:2:2 interleaved */ + if (width & 1) + { + status = VX_ERROR_INVALID_DIMENSION; + } + break; + default: + status = VX_ERROR_INVALID_FORMAT; + break; + } + vxReleaseImage(&image); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseParameter(¶m); + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + return status; +} + +static vx_df_image color_combos[][2] = { + /* {src, dst} */ + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_RGB, VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_RGBX,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_NV21}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_NV12,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_NV21,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_UYVY,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_YUV4}, + {VX_DF_IMAGE_YUYV,VX_DF_IMAGE_IYUV}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_RGB}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_RGBX}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_NV12}, + {VX_DF_IMAGE_IYUV,VX_DF_IMAGE_YUV4}, +}; + +static vx_status VX_CALLBACK vxColorConvertOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, 1); + if ((vxGetStatus((vx_reference)param0) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param1) == VX_SUCCESS)) + { + vx_image output = 0, input = 0; + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + 
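+            /* The (input, output) format pair is accepted only if it appears in
+             * the color_combos table above; for example NV12 -> RGB is listed and
+             * passes, while YUV4 only ever appears as a destination, so a YUV4
+             * source falls through and keeps VX_ERROR_INVALID_PARAMETERS. */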
vxQueryParameter(param1, VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_df_image src = VX_DF_IMAGE_VIRT; + vx_df_image dst = VX_DF_IMAGE_VIRT; + vxQueryImage(input, VX_IMAGE_FORMAT, &src, sizeof(src)); + vxQueryImage(output, VX_IMAGE_FORMAT, &dst, sizeof(dst)); + if (dst != VX_DF_IMAGE_VIRT) /* can't be a unspecified format */ + { + vx_uint32 i = 0; + for (i = 0; i < dimof(color_combos); i++) + { + if ((color_combos[i][0] == src) && + (color_combos[i][1] == dst)) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = dst; + vxQueryImage(input, VX_IMAGE_WIDTH, &ptr->dim.image.width, sizeof(ptr->dim.image.width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &ptr->dim.image.height, sizeof(ptr->dim.image.height)); + status = VX_SUCCESS; + break; + } + } + } + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + } + VX_PRINT(VX_ZONE_API, "%s:%u returned %d\n", __FUNCTION__, index, status); + return status; +} + +/*! \brief The exported kernel table entry */ +vx_tiling_kernel_t colorconvert_kernel = +{ + "org.khronos.openvx.tiling_color_convert", + VX_KERNEL_COLOR_CONVERT_TILING, + NULL, + ConvertColor_image_tiling_flexible, + ConvertColor_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxColorConvertInputValidator, + vxColorConvertOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_convertdepth.c b/sample/targets/tiling/vx_convertdepth.c new file mode 100644 index 0000000..619c57f --- /dev/null +++ b/sample/targets/tiling/vx_convertdepth.c @@ -0,0 +1,210 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxConvertDepthInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if ((status == VX_SUCCESS) && input) + { + vx_df_image format = 0; + status = vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((status != VX_SUCCESS) || + (format == VX_DF_IMAGE_U8) || +#if defined(EXPERIMENTAL_USE_S16) + (format == VX_DF_IMAGE_U16) || + (format == VX_DF_IMAGE_U32) || + (format == VX_DF_IMAGE_S32) || + (format == VX_DF_IMAGE_F32) || +#endif + (format == VX_DF_IMAGE_S16)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + printf("Overflow given as %08x\n", overflow_policy); + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (status == VX_SUCCESS) + { + vx_enum type = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + vx_int32 shift = 0; + status = vxCopyScalar(scalar, &shift, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (status == VX_SUCCESS) + { + /*! 
\internal Allowing \f$ 0 \le shift < 32 \f$ could + * produce weird results for smaller bit depths */ + if (shift < 0 || shift >= 32) + { + status = VX_ERROR_INVALID_VALUE; + } + /* status should be VX_SUCCESS from call */ + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxConvertDepthOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS)) + { + vx_image images[2] = {0,0}; + status = VX_SUCCESS; + status |= vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + status |= vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if ((status == VX_SUCCESS) && (images[0]) && (images[1])) + { + vx_uint32 width = 0, height = 0; + vx_df_image format[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + status |= vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + status |= vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + status |= vxQueryImage(images[0], VX_IMAGE_FORMAT, &format[0], sizeof(format[0])); + status |= vxQueryImage(images[1], VX_IMAGE_FORMAT, &format[1], sizeof(format[1])); + if (((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_S16)) || +#if defined(EXPERIMENTAL_USE_S16) + ((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_U16)) || + ((format[0] == VX_DF_IMAGE_U8) && (format[1] == VX_DF_IMAGE_U32)) || + ((format[0] == VX_DF_IMAGE_U16) && (format[1] == VX_DF_IMAGE_U8)) || + ((format[0] == VX_DF_IMAGE_U16) && (format[1] == VX_DF_IMAGE_U32)) || + ((format[0] == VX_DF_IMAGE_S16) && (format[1] == VX_DF_IMAGE_S32)) || + ((format[0] == VX_DF_IMAGE_U32) && (format[1] == VX_DF_IMAGE_U8)) || + ((format[0] == VX_DF_IMAGE_U32) && (format[1] == VX_DF_IMAGE_U16)) || + ((format[0] == VX_DF_IMAGE_S32) && (format[1] == VX_DF_IMAGE_S16)) || + ((format[0] == VX_DF_IMAGE_F32) && (format[1] == VX_DF_IMAGE_U8)) || /* non-specification */ +#endif + ((format[0] == VX_DF_IMAGE_S16) && (format[1] == VX_DF_IMAGE_U8))) + { + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format[1]; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + } + return status; +} + + +vx_tiling_kernel_t convertdepth_kernel = +{ + "org.khronos.openvx.tiling_convertdepth", + VX_KERNEL_CONVERTDEPTH_TILING, + NULL, + ConvertDepth_image_tiling_flexible, + ConvertDepth_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxConvertDepthInputValidator, + vxConvertDepthOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_convolution.c b/sample/targets/tiling/vx_convolution.c new file mode 100644 index 0000000..60f9d3e --- /dev/null +++ b/sample/targets/tiling/vx_convolution.c @@ -0,0 +1,154 @@ 
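The kernel tables in these files only register the tiling kernels with the sample target; nothing in this patch calls them directly. A minimal client-side sketch of how one of them could be exercised, assuming this target library is built and loaded so that vxGetKernelByName() resolves the published name (the helper function and variable names here are illustrative, not part of the patch):

    #include <VX/vx.h>

    /* Illustrative only: run the tiling absdiff kernel from a client graph. */
    static vx_status run_tiling_absdiff(vx_context context,
                                        vx_image in0, vx_image in1, vx_image out)
    {
        vx_status status = VX_FAILURE;
        vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.tiling_absdiff");
        if (vxGetStatus((vx_reference)kernel) == VX_SUCCESS)
        {
            vx_graph graph = vxCreateGraph(context);
            vx_node node = vxCreateGenericNode(graph, kernel);
            vxSetParameterByIndex(node, 0, (vx_reference)in0);
            vxSetParameterByIndex(node, 1, (vx_reference)in1);
            vxSetParameterByIndex(node, 2, (vx_reference)out);
            status = vxVerifyGraph(graph);      /* invokes the validators defined above */
            if (status == VX_SUCCESS)
                status = vxProcessGraph(graph);
            vxReleaseNode(&node);
            vxReleaseGraph(&graph);
            vxReleaseKernel(&kernel);
        }
        return status;
    }

vxVerifyGraph() is where the input/output validators above run, so an unsupported input format (for instance U16 without OPENVX_USE_S16) is reported at verification rather than at process time.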
+/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxConvolveInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + +#if defined(EXPERIMENTAL_USE_S16) + if( (format == VX_DF_IMAGE_U8) || (format == VX_DF_IMAGE_S16) ) +#else + if (format == VX_DF_IMAGE_U8) +#endif + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + if (index == 1) + { + vx_image input = 0; + vx_convolution conv = 0; + + vx_parameter param0 = vxGetParameterByIndex(node, 0); + vx_parameter param1 = vxGetParameterByIndex(node, index); + + vxQueryParameter(param0, VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param1, VX_PARAMETER_REF, &conv, sizeof(conv)); + if (input && conv) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_size dims[2] = { 0, 0 }; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryConvolution(conv, VX_CONVOLUTION_COLUMNS, &dims[0], sizeof(dims[0])); + vxQueryConvolution(conv, VX_CONVOLUTION_ROWS, &dims[1], sizeof(dims[1])); + + if ((dims[0] <= VX_INT_MAX_CONVOLUTION_DIM) && + (dims[1] <= VX_INT_MAX_CONVOLUTION_DIM) && + (width >= dims[0]) && + (height >= dims[1])) + { + status = VX_SUCCESS; + } + + vxReleaseImage(&input); + vxReleaseConvolution(&conv); + } + + vxReleaseParameter(¶m0); + vxReleaseParameter(¶m1); + } + + return status; +} + +static vx_status VX_CALLBACK vxConvolveOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter params[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)params[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)params[1]) == VX_SUCCESS)) + { + vx_image input = 0; + vx_image output = 0; + vxQueryParameter(params[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(params[1], VX_PARAMETER_REF, &output, sizeof(output)); + if (input && output) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + vx_df_image output_format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxQueryImage(output, VX_IMAGE_FORMAT, &output_format, sizeof(output_format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = output_format == VX_DF_IMAGE_U8 ? 
VX_DF_IMAGE_U8 : VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + + vxReleaseImage(&input); + vxReleaseImage(&output); + } + vxReleaseParameter(¶ms[0]); + vxReleaseParameter(¶ms[1]); + } + } + return status; +} + +vx_tiling_kernel_t convolution_kernel = +{ + "org.khronos.openvx.tiling_custom_convolution", + VX_KERNEL_CUSTOM_CONVOLUTION_TILING, + NULL, + Convolve_image_tiling_flexible, + Convolve_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_CONVOLUTION, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxConvolveInputValidator, + vxConvolveOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_fast9.c b/sample/targets/tiling/vx_fast9.c new file mode 100644 index 0000000..1cfff14 --- /dev/null +++ b/sample/targets/tiling/vx_fast9.c @@ -0,0 +1,156 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxFast9InputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if ((status == VX_SUCCESS) && (input)) + { + vx_df_image format = 0; + status = vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((status == VX_SUCCESS) && (format == VX_DF_IMAGE_U8)) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar sens = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &sens, sizeof(sens)); + if ((status == VX_SUCCESS) && (sens)) + { + vx_enum type = VX_TYPE_INVALID; + vxQueryScalar(sens, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_FLOAT32) + { + vx_float32 k = 0.0f; + status = vxCopyScalar(sens, &k, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((status == VX_SUCCESS) && (k > 0) && (k < 256)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&sens); + } + vxReleaseParameter(¶m); + } + } + if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar s_nonmax = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &s_nonmax, sizeof(s_nonmax)); + if ((status == VX_SUCCESS) && (s_nonmax)) + { + vx_enum type = VX_TYPE_INVALID; + vxQueryScalar(s_nonmax, VX_SCALAR_TYPE, &type, 
sizeof(type)); + if (type == VX_TYPE_BOOL) + { + vx_bool nonmax = vx_false_e; + status = vxCopyScalar(s_nonmax, &nonmax, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((status == VX_SUCCESS) && ((nonmax == vx_false_e) || + (nonmax == vx_true_e))) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&s_nonmax); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxFast9OutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + ptr->type = VX_TYPE_ARRAY; + ptr->dim.array.item_type = VX_TYPE_KEYPOINT; + ptr->dim.array.capacity = 0; /* no defined capacity requirement */ + status = VX_SUCCESS; + } + else if (index == 4) + { + ptr->dim.scalar.type = VX_TYPE_SIZE; + status = VX_SUCCESS; + } + return status; +} + +vx_tiling_kernel_t fast9_kernel = +{ + "org.khronos.openvx.tiling_fast_corners", + VX_KERNEL_FAST_CORNERS_TILING, + NULL, + Fast9Corners_image_tiling_flexible, + Fast9Corners_image_tiling_fast, + 5, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL } }, + NULL, + vxFast9InputValidator, + vxFast9OutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_filter.c b/sample/targets/tiling/vx_filter.c new file mode 100644 index 0000000..a6a1d3b --- /dev/null +++ b/sample/targets/tiling/vx_filter.c @@ -0,0 +1,139 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "vx_interface.h" + +#include + +static vx_status VX_CALLBACK vxFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format meta) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference an input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = VX_DF_IMAGE_U8; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + vxSetMetaFormatAttribute(meta, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxSetMetaFormatAttribute(meta, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxSetMetaFormatAttribute(meta, VX_IMAGE_FORMAT, &format, sizeof(format)); + + vxReleaseImage(&input); + + status = VX_SUCCESS; + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t box_3x3_kernels = +{ + "org.khronos.openvx.tiling_box_3x3", + VX_KERNEL_BOX_3x3_TILING, + NULL, + box3x3_image_tiling_flexible, + box3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t median3x3_kernel = +{ + "org.khronos.openvx.tiling_median_3x3", + VX_KERNEL_MEDIAN_3x3_TILING, + NULL, + Median3x3_image_tiling_flexible, + Median3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t gaussian3x3_kernel = +{ + "org.khronos.openvx.tiling_gaussian_3x3", + VX_KERNEL_GAUSSIAN_3x3_TILING, + NULL, + Gaussian3x3_image_tiling_flexible, + Gaussian3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxFilterInputValidator, + vxFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_gradients.c b/sample/targets/tiling/vx_gradients.c new file mode 100644 index 0000000..520e201 --- /dev/null +++ b/sample/targets/tiling/vx_gradients.c @@ -0,0 +1,124 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "vx_interface.h" + +#include + +static vx_param_description_t sobel3x3_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, +}; + +static vx_status VX_CALLBACK own_sobel3x3_validator(vx_node node, const vx_reference parameters[], vx_uint32 num, vx_meta_format metas[]) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (NULL != node && NULL != parameters && num == dimof(sobel3x3_kernel_params) && NULL != metas) + { + vx_parameter param1 = vxGetParameterByIndex(node, 0); + vx_parameter param2 = vxGetParameterByIndex(node, 1); + vx_parameter param3 = vxGetParameterByIndex(node, 2); + + if (VX_SUCCESS == vxGetStatus((vx_reference)param1) && + ((VX_SUCCESS == vxGetStatus((vx_reference)param2)) || (VX_SUCCESS == vxGetStatus((vx_reference)param3)))) + { + vx_uint32 src_width = 0; + vx_uint32 src_height = 0; + vx_df_image src_format = 0; + vx_image input = 0; + + status = vxQueryParameter(param1, VX_PARAMETER_REF, &input, sizeof(input)); + + status |= vxQueryImage(input, VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxQueryImage(input, VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxQueryImage(input, VX_IMAGE_FORMAT, &src_format, sizeof(src_format)); + + /* validate input image */ + if (VX_SUCCESS == status) + { + if (src_width >= 3 && src_height >= 3 && src_format == VX_DF_IMAGE_U8) + status = VX_SUCCESS; + else + status = VX_ERROR_INVALID_PARAMETERS; + } + + /* validate output images */ + if (VX_SUCCESS == status) + { + vx_enum dst_format = VX_DF_IMAGE_S16; + + if (NULL == metas[1] && NULL == metas[2]) + status = VX_ERROR_INVALID_PARAMETERS; + + if (VX_SUCCESS == status && NULL != metas[1]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[1], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + + if (VX_SUCCESS == status && NULL != metas[2]) + { + /* if optional parameter non NULL */ + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_WIDTH, &src_width, sizeof(src_width)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_HEIGHT, &src_height, sizeof(src_height)); + status |= vxSetMetaFormatAttribute(metas[2], VX_IMAGE_FORMAT, &dst_format, sizeof(dst_format)); + } + } + + if (NULL != input) + vxReleaseImage(&input); + + if (NULL != param1) + vxReleaseParameter(¶m1); + + if (NULL != param2) + vxReleaseParameter(¶m2); + + if (NULL != param3) + vxReleaseParameter(¶m3); + } + } /* if ptrs non NULL */ + + return status; +} /* own_sobel3x3_validator() */ + +vx_tiling_kernel_t sobel3x3_kernel = +{ + "org.khronos.openvx.tiling_sobel_3x3", + VX_KERNEL_SOBEL_3x3_TILING, + NULL, + Sobel3x3_image_tiling_flexible, + Sobel3x3_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, 
VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL } }, + own_sobel3x3_validator, + NULL, + NULL, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_hog.c b/sample/targets/tiling/vx_hog.c new file mode 100644 index 0000000..3872b5f --- /dev/null +++ b/sample/targets/tiling/vx_hog.c @@ -0,0 +1,318 @@ +/* +* Copyright (c) 2016-2017 The Khronos Group Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and/or associated documentation files (the +* "Materials"), to deal in the Materials without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Materials, and to +* permit persons to whom the Materials are furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Materials. +* +* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT +* https://www.khronos.org/registry/ +* +* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+*/ + +#include "vx_interface.h" +#include "vx_internal.h" +#include "tiling.h" + + +static vx_status VX_CALLBACK vxHogCellsInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1 || index == 2 || index == 3) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum type = -1; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + vx_int32 para = 0; + if ((vxCopyScalar(scalar, ¶, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) && + (para >= 0)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHogCellsOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + vx_enum format; + vx_tensor tensor; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &tensor, sizeof(tensor)); + if (tensor && index == 4) + { + format = VX_TYPE_INT16; + vx_uint8 fixed_point_pos1 = 8; + vx_size out_num_dims; + vx_size out_dims[2]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + else if (tensor && index == 5) + { + format = VX_TYPE_INT8; + vx_uint8 fixed_point_pos1 = 0; + vx_size out_num_dims; + vx_size out_dims[3]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + vxReleaseTensor(&tensor); + vxReleaseParameter(¶m); + return status; +} + +static vx_status VX_CALLBACK vxHogFeaturesInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, 
VX_PARAMETER_ATTRIBUTE_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_ATTRIBUTE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_tensor mag = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &mag, sizeof(mag)); + if (mag) + { + vx_enum format = -1; + vxQueryTensor(mag, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + if (format == VX_TYPE_INT16) + { + + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseTensor(&mag); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_tensor mag = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &mag, sizeof(mag)); + if (mag) + { + vx_enum format = -1; + vxQueryTensor(mag, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + if (format == VX_TYPE_INT8) + { + + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseTensor(&mag); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_array arr = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &arr, sizeof(arr)); + if (arr) + { + vx_enum item_type = 0; + vxQueryArray(arr, VX_ARRAY_ITEMTYPE, &item_type, sizeof(item_type)); + if (item_type == VX_TYPE_HOG_PARAMS) + { + status = VX_SUCCESS; + } + vxReleaseArray(&arr); + } + vxReleaseParameter(¶m); + } + } + else if (index == 4) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar hog_param_size = 0; + status = vxQueryParameter(param, VX_PARAMETER_REF, &hog_param_size, sizeof(hog_param_size)); + if ((status == VX_SUCCESS) && (hog_param_size)) + { + vx_enum type = 0; + vxQueryScalar(hog_param_size, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_INT32) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&hog_param_size); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHogFeaturesOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + vx_enum format; + vx_tensor tensor; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_ATTRIBUTE_REF, &tensor, sizeof(tensor)); + if (tensor && index == 5) + { + format = VX_TYPE_INT16; + vx_uint8 fixed_point_pos1 = 8; + vx_size out_num_dims; + vx_size out_dims[3]; + status = vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + status |= vxQueryTensor(tensor, VX_TENSOR_DIMS, out_dims, sizeof(out_dims)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DATA_TYPE, &format, sizeof(format)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_FIXED_POINT_POSITION, &fixed_point_pos1, sizeof(fixed_point_pos1)); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_DIMS, out_dims, sizeof(*out_dims) * out_num_dims); + status |= vxSetMetaFormatAttribute(ptr, VX_TENSOR_NUMBER_OF_DIMS, &out_num_dims, sizeof(out_num_dims)); + } + vxReleaseTensor(&tensor); + vxReleaseParameter(¶m); + return status; 
+} + +vx_tiling_kernel_t hogcells_kernel = +{ + "org.khronos.openvx.tiling_hogcells", + VX_KERNEL_HOG_CELLS_TILING, + NULL, + HogCells_image_tiling_flexible, + HogCells_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxHogCellsInputValidator, + vxHogCellsOutputValidator, + NULL, + NULL, + { 32, 32 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t hogfeatures_kernel = +{ + "org.khronos.openvx.tiling_hogfeatures", + VX_KERNEL_HOG_FEATURES_TILING, + NULL, + HogFeatures_image_tiling_flexible, + HogFeatures_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_ARRAY, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxHogFeaturesInputValidator, + vxHogFeaturesOutputValidator, + NULL, + NULL, + { 32, 32 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_integralimage.c b/sample/targets/tiling/vx_integralimage.c new file mode 100644 index 0000000..3e836e4 --- /dev/null +++ b/sample/targets/tiling/vx_integralimage.c @@ -0,0 +1,95 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxIntegralInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxIntegralOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U32; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t integral_image_kernel = +{ + "org.khronos.openvx.tiling_integral_image", + VX_KERNEL_INTEGRAL_IMAGE_TILING, + NULL, + IntegralImage_image_tiling_flexible, + IntegralImage_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxIntegralInputValidator, + vxIntegralOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_interface.c b/sample/targets/tiling/vx_interface.c new file mode 100644 index 0000000..05b64f7 --- /dev/null +++ b/sample/targets/tiling/vx_interface.c @@ -0,0 +1,706 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_internal.h" +#include + +vx_status VX_CALLBACK vxTilingKernel(vx_node node, vx_reference parameters[], vx_uint32 num); + +static const vx_char name[VX_MAX_TARGET_NAME] = "khronos.tiling"; + +vx_tiling_kernel_t *tiling_kernels[] = +{ + &box_3x3_kernels, + &phase_kernel, + &And_kernel, + &Or_kernel, + &Xor_kernel, + &Not_kernel, + &threshold_kernel, + &colorconvert_kernel, + &Multiply_kernel, + &nonlinearfilter_kernel, + &Magnitude_kernel, + &erode3x3_kernel, + &dilate3x3_kernel, + &median3x3_kernel, + &sobel3x3_kernel, + &Max_kernel, + &Min_kernel, + &gaussian3x3_kernel, + &add_kernel, + &subtract_kernel, + &convertdepth_kernel, + &warp_affine_kernel, + &warp_perspective_kernel, + &weightedaverage_kernel, + &absdiff_kernel, + &integral_image_kernel, + &remap_kernel, + &convolution_kernel, + &hogfeatures_kernel, + &fast9_kernel, + &lbp_kernel, + &scale_image_kernel, + &lut_kernel, + &channelcombine_kernel, + &halfscale_gaussian_kernel, + &nonmaxsuppression_kernel, + &hogcells_kernel, +}; + +/*! \brief The Entry point into a user defined kernel module */ +vx_status VX_API_CALL vxPublishKernels(vx_context context) +{ + // tag::publish_function[] + vx_status status = VX_SUCCESS; + vx_uint32 k = 0; + for (k = 0; k < dimof(tiling_kernels); k++) + { + vx_kernel kernel = vxAddTilingKernel(context, + tiling_kernels[k]->name, + tiling_kernels[k]->enumeration, + tiling_kernels[k]->function, + tiling_kernels[k]->flexible_function, + tiling_kernels[k]->fast_function, + tiling_kernels[k]->num_params, + tiling_kernels[k]->validate, + tiling_kernels[k]->input_validator, + tiling_kernels[k]->output_validator, + tiling_kernels[k]->initialize, + tiling_kernels[k]->deinitialize); + if (kernel) + { + vx_uint32 p = 0; + for (p = 0; p < tiling_kernels[k]->num_params; p++) + { + status |= vxAddParameterToKernel(kernel, p, + tiling_kernels[k]->parameters[p].direction, + tiling_kernels[k]->parameters[p].data_type, + tiling_kernels[k]->parameters[p].state); + } + status |= vxSetKernelAttribute(kernel, VX_KERNEL_INPUT_NEIGHBORHOOD, + &tiling_kernels[k]->nbhd, sizeof(vx_neighborhood_size_t)); + status |= vxSetKernelAttribute(kernel, VX_KERNEL_OUTPUT_TILE_BLOCK_SIZE, + &tiling_kernels[k]->block, sizeof(vx_tile_block_size_t)); + status |= vxSetKernelAttribute(kernel, VX_KERNEL_BORDER, + &tiling_kernels[k]->border, sizeof(vx_border_t)); + if (status != VX_SUCCESS) + { + vxRemoveKernel(kernel); + } + else + { + status = vxFinalizeKernel(kernel); + } + if (status != VX_SUCCESS) + { + printf("Failed to publish kernel %s\n", tiling_kernels[k]->name); + break; + } + } + } + // end::publish_function[] + return status; +} + + +/*VX_API_ENTRY*/ vx_status VX_API_CALL vxUnpublishKernels(vx_context context) +{ + vx_status status = VX_FAILURE; + + vx_uint32 k = 0; + for (k = 0; k < dimof(tiling_kernels); k++) + { + vx_kernel kernel = vxGetKernelByName(context, tiling_kernels[k]->name); + + if (kernel) + { + kernel->user_kernel = 1; + vx_kernel kernelcpy = kernel; + status = vxReleaseKernel(&kernelcpy); + if (status != VX_SUCCESS) + { + vxAddLogEntry((vx_reference)context, status, "Failed to release kernel[%u]=%s\n", k, tiling_kernels[k]->name); + } + else + { + kernelcpy = kernel; + status = vxRemoveKernel(kernelcpy); + if (status != VX_SUCCESS) + { + vxAddLogEntry((vx_reference)context, status, "Failed to remove kernel[%u]=%s\n", k, tiling_kernels[k]->name); + } + } + } + else + { + vxAddLogEntry((vx_reference)context, status, "Failed to get added kernel %s\n",
tiling_kernels[k]->name); + } + } + + return status; +} + +vx_status vxTargetInit(vx_target target) +{ + if (target) + { + strncpy(target->name, name, VX_MAX_TARGET_NAME); + target->priority = VX_TARGET_PRIORITY_TILING; + } + return vxPublishKernels(target->base.context); +} + +vx_status vxTargetDeinit(vx_target target) +{ + return vxUnpublishKernels(target->base.context); +} + +vx_status vxTargetSupports(vx_target target, + vx_char targetName[VX_MAX_TARGET_NAME], + vx_char kernelName[VX_MAX_KERNEL_NAME], + vx_uint32 *pIndex) +{ + vx_status status = VX_ERROR_NOT_SUPPORTED; + if (strncmp(targetName, name, VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "default", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "power", VX_MAX_TARGET_NAME) == 0 || + strncmp(targetName, "performance", VX_MAX_TARGET_NAME) == 0) + { + vx_uint32 k = 0u; + for (k = 0u; k < VX_INT_MAX_KERNELS; k++) + { + vx_char targetKernelName[VX_MAX_KERNEL_NAME]; + vx_char *kernel; + vx_char def[8] = "default"; + + strncpy(targetKernelName, target->kernels[k].name, VX_MAX_KERNEL_NAME); + kernel = strtok(targetKernelName, ":"); + if (kernel == NULL) + kernel = def; + + if (strncmp(kernelName, kernel, VX_MAX_KERNEL_NAME) == 0) + { + status = VX_SUCCESS; + if (pIndex) *pIndex = k; + break; + } + } + } + return status; +} + +vx_action vxTargetProcess(vx_target target, vx_node_t *nodes[], vx_size startIndex, vx_size numNodes) +{ + vx_action action = VX_ACTION_CONTINUE; + vx_status status = VX_SUCCESS; + vx_size n = 0; + for (n = startIndex; (n < (startIndex + numNodes)) && (action == VX_ACTION_CONTINUE); n++) + { + vx_context context = vxGetContext((vx_reference)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "Executing Kernel %s:%d in Nodes[%u] on target %s\n", + nodes[n]->kernel->name, + nodes[n]->kernel->enumeration, + n, + nodes[n]->base.context->targets[nodes[n]->affinity].name); + + if (context->perf_enabled) + ownStartCapture(&nodes[n]->perf); + + if (nodes[n]->is_replicated == vx_true_e) + { + vx_size num_replicas = 0; + vx_uint32 param; + vx_uint32 num_parameters = nodes[n]->kernel->signature.num_parameters; + vx_reference parameters[VX_INT_MAX_PARAMS] = { NULL }; + + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + vx_size numItems = 0; + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + numItems = pyr->numLevels; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + numItems = arr->num_items; + } + else + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + + if (num_replicas == 0) + num_replicas = numItems; + else if (numItems != num_replicas) + { + status = VX_ERROR_INVALID_PARAMETERS; + break; + } + } + else + { + parameters[param] = nodes[n]->parameters[param]; + } + } + + if (status == VX_SUCCESS) + { + vx_size replica; + for (replica = 0; replica < num_replicas; ++replica) + { + for (param = 0; param < num_parameters; ++param) + { + if (nodes[n]->replicated_flags[param] == vx_true_e) + { + if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_PYRAMID) + { + vx_pyramid pyr = (vx_pyramid)(nodes[n]->parameters[param])->scope; + parameters[param] = (vx_reference)pyr->levels[replica]; + } + else if ((nodes[n]->parameters[param])->scope->type == VX_TYPE_OBJECT_ARRAY) + { + vx_object_array arr = (vx_object_array)(nodes[n]->parameters[param])->scope; + 
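+                            /* Bind this replica's element from the parent object array before the kernel function is invoked below. */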
parameters[param] = (vx_reference)arr->items[replica]; + } + } + } + + status = nodes[n]->kernel->function((vx_node)nodes[n], + parameters, + num_parameters); + } + } + } + else + { + status = nodes[n]->kernel->function((vx_node)nodes[n], + (vx_reference *)nodes[n]->parameters, + nodes[n]->kernel->signature.num_parameters); + } + + nodes[n]->executed = vx_true_e; + nodes[n]->status = status; + + if (context->perf_enabled) + ownStopCapture(&nodes[n]->perf); + + VX_PRINT(VX_ZONE_GRAPH, "kernel %s returned %d\n", nodes[n]->kernel->name, status); + + if (status == VX_SUCCESS) + { + /* call the callback if it is attached */ + if (nodes[n]->callback) + { + action = nodes[n]->callback((vx_node)nodes[n]); + VX_PRINT(VX_ZONE_GRAPH, "callback returned action %d\n", action); + } + } + else + { + action = VX_ACTION_ABANDON; + VX_PRINT(VX_ZONE_ERROR, "Abandoning Graph due to error (%d)!\n", status); + } + } + return action; +} + +vx_status vxTargetVerify(vx_target target, vx_node_t *node) +{ + vx_status status = VX_SUCCESS; + return status; +} + +vx_kernel vxTargetAddKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + // ownSemWait(&target->base.lock); + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, func_ptr, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + // ownSemPost(&target->base.lock); + return (vx_kernel)kernel; +} + +#ifdef OPENVX_KHR_TILING +vx_kernel vxTargetAddTilingKernel(vx_target target, + vx_char name[VX_MAX_KERNEL_NAME], + vx_enum enumeration, + vx_kernel_f function, + vx_tiling_kernel_f flexible_func_ptr, + vx_tiling_kernel_f fast_func_ptr, + vx_uint32 numParams, + vx_kernel_validate_f validate, + vx_kernel_input_validate_f input, + vx_kernel_output_validate_f output, + vx_kernel_initialize_f initialize, + vx_kernel_deinitialize_f deinitialize) +{ + vx_uint32 k = 0u; + vx_kernel_t *kernel = NULL; + for (k = 0; k < VX_INT_MAX_KERNELS; k++) + { + kernel = &(target->kernels[k]); + if (kernel->enabled == vx_false_e) + { + kernel->tilingfast_function = fast_func_ptr; + kernel->tilingflexible_function = flexible_func_ptr; + + if (function == NULL) + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, vxTilingKernel, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + } + else //Kernel with more than one node like HalfScaleGaussian + { + ownInitializeKernel(target->base.context, + kernel, + enumeration, function, name, + NULL, numParams, + validate, input, output, initialize, deinitialize); + } + VX_PRINT(VX_ZONE_KERNEL, "Reserving %s Kernel[%u] for %s\n", target->name, k, kernel->name); + target->num_kernels++; + break; + } + kernel = NULL; + } + return (vx_kernel)kernel; +} + +static vx_status vxGetPatchToTile(vx_image image, vx_rectangle_t *rect, vx_tile_t *tile) +{ + vx_status status = VX_SUCCESS; + vx_uint32 p = 0; + vx_image_t *img = (vx_image_t *)image; + + for (p = 0; p < img->planes; p++) + { + 
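+        /* Map each plane of the image into the tile: vxAccessImagePatch fills tile->addr[p] and returns the plane base pointer that the tiling kernels index directly; constant images are mapped read-only, all others read-and-write. */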
tile->base[p] = NULL; + if(image->constant == 1) + status = vxAccessImagePatch(image, rect, p, &tile->addr[p], (void **)&tile->base[p], VX_READ_ONLY); + else + status = vxAccessImagePatch(image, rect, p, &tile->addr[p], (void **)&tile->base[p], VX_READ_AND_WRITE); + } + + return status; +} + +static vx_status vxSetTileToPatch(vx_image image, vx_rectangle_t *rect, vx_tile_t *tile) +{ + vx_image_t *img = (vx_image_t *)image; + vx_uint32 p = 0; + vx_status status = VX_SUCCESS; + + for (p = 0; p < img->planes; p++) + { + status = vxCommitImagePatch(image, rect, p, &tile->addr[p], tile->base[p]); + } + + return status; +} + +static void* ownAllocateTensorMemory_tiling(vx_tensor tensor) +{ + vx_size total_size = ownSizeOfType(tensor->data_type); + + if (tensor->addr == NULL) + { + for (vx_uint32 i = 0; i < tensor->number_of_dimensions; i++) + { + total_size *= tensor->dimensions[i]; + } + tensor->addr = calloc(total_size, 1); + } + return tensor->addr; +} + +vx_status VX_CALLBACK vxTilingKernel(vx_node node, vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + vx_image images[VX_INT_MAX_PARAMS]; + vx_uint32 ty = 0u, tx = 0u, p = 0u; + vx_rectangle_t rect; + vx_tile_t tiles[VX_INT_MAX_PARAMS]; + void *params[VX_INT_MAX_PARAMS] = {NULL}; + vx_enum dirs[VX_INT_MAX_PARAMS]; + vx_enum types[VX_INT_MAX_PARAMS]; + size_t scalars[VX_INT_MAX_PARAMS]; + vx_uint32 index = UINT32_MAX; + vx_uint32 tile_size_y = 0u, tile_size_x = 0u; + vx_uint32 block_multiple = 64; + vx_uint32 height = 0u, width = 0u; + vx_border_t borders = {VX_BORDER_UNDEFINED, 0}; + vx_neighborhood_size_t nbhd; + void *tile_memory = NULL; + vx_size size = 0; + + vx_tile_threshold_t threshold[VX_INT_MAX_PARAMS]; + vx_tile_matrix_t mask[VX_INT_MAX_PARAMS]; + vx_tile_convolution_t conv[VX_INT_MAX_PARAMS]; + vx_tensor tensor[VX_INT_MAX_PARAMS]; + vx_tile_array_t array_t[VX_INT_MAX_PARAMS]; + vx_array arrays[VX_INT_MAX_PARAMS]; + + /* Do the following: + * \arg find out each parameters direction + * \arg assign each image from the parameters + * \arg assign the block/neighborhood info + */ + for (p = 0u; p < num; p++) + { + vx_parameter param = vxGetParameterByIndex(node, p); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_DIRECTION, &dirs[p], sizeof(dirs[p])); + vxQueryParameter(param, VX_PARAMETER_TYPE, &types[p], sizeof(types[p])); + vxReleaseParameter(¶m); + } + if (types[p] == VX_TYPE_IMAGE) + { + vxQueryNode(node, VX_NODE_OUTPUT_TILE_BLOCK_SIZE, &tiles[p].tile_block, sizeof(vx_tile_block_size_t)); + vxQueryNode(node, VX_NODE_INPUT_NEIGHBORHOOD, &tiles[p].neighborhood, sizeof(vx_neighborhood_size_t)); + images[p] = (vx_image)parameters[p]; + vxQueryImage(images[p], VX_IMAGE_WIDTH, &tiles[p].image.width, sizeof(vx_uint32)); + vxQueryImage(images[p], VX_IMAGE_HEIGHT, &tiles[p].image.height, sizeof(vx_uint32)); + vxQueryImage(images[p], VX_IMAGE_FORMAT, &tiles[p].image.format, sizeof(vx_df_image)); + vxQueryImage(images[p], VX_IMAGE_SPACE, &tiles[p].image.space, sizeof(vx_enum)); + vxQueryImage(images[p], VX_IMAGE_RANGE, &tiles[p].image.range, sizeof(vx_enum)); + params[p] = &tiles[p]; + if ((dirs[p] == VX_OUTPUT) && (index == UINT32_MAX)) + { + index = p; + } + } + else if (types[p] == VX_TYPE_SCALAR) + { + vxCopyScalar((vx_scalar)parameters[p], (void *)&scalars[p], VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + params[p] = &scalars[p]; + } + else if (types[p] == VX_TYPE_THRESHOLD) + { + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_TYPE, 
&threshold[p].thresh_type, sizeof(threshold[p].thresh_type)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_VALUE, &threshold[p].value, sizeof(threshold[p].value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_LOWER, &threshold[p].lower, sizeof(threshold[p].lower)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_THRESHOLD_UPPER, &threshold[p].upper, sizeof(threshold[p].upper)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_TRUE_VALUE, &threshold[p].true_value, sizeof(threshold[p].true_value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_FALSE_VALUE, &threshold[p].false_value, sizeof(threshold[p].false_value)); + vxQueryThreshold((vx_threshold)parameters[p], VX_THRESHOLD_INPUT_FORMAT, &threshold[p].input_format, sizeof(threshold[p].input_format)); + + params[p] = &threshold[p]; + } + else if (types[p] == VX_TYPE_MATRIX) + { + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_ROWS, &mask[p].rows, sizeof(mask[p].rows)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_COLUMNS, &mask[p].columns, sizeof(mask[p].columns)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_TYPE, &mask[p].data_type, sizeof(mask[p].data_type)); + vxQueryMatrix((vx_matrix)parameters[p], VX_MATRIX_ORIGIN, &mask[p].origin, sizeof(mask[p].origin)); + + if ((mask[p].data_type != VX_TYPE_UINT8) || (sizeof(mask[p].m) < mask[p].rows * mask[p].columns)) + status = VX_ERROR_INVALID_PARAMETERS; + + vxCopyMatrix((vx_matrix)parameters[p], mask[p].m, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyMatrix((vx_matrix)parameters[p], mask[p].m_f32, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + params[p] = &mask[p]; + } + else if (types[p] == VX_TYPE_REMAP) + { + vx_remap map = (vx_remap)parameters[p]; + params[p] = &map; + } + else if (types[p] == VX_TYPE_CONVOLUTION) + { + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_COLUMNS, &conv[p].conv_width, sizeof(conv[p].conv_width)); + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_ROWS, &conv[p].conv_height, sizeof(conv[p].conv_height)); + vxQueryConvolution((vx_convolution)parameters[p], VX_CONVOLUTION_SCALE, &conv[p].scale, sizeof(conv[p].scale)); + + vxCopyConvolutionCoefficients((vx_convolution)parameters[p], conv[p].conv_mat, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + + params[p] = &conv[p]; + } + else if (types[p] == VX_TYPE_TENSOR) + { + tensor[p] = (vx_tensor)parameters[p]; + + if (tensor[p]->addr == NULL) + ownAllocateTensorMemory_tiling(tensor[p]); + + params[p] = tensor[p]->addr; + } + else if (types[p] == VX_TYPE_ARRAY || types[p] == VX_TYPE_LUT) + { + arrays[p] = (vx_array)parameters[p]; + + array_t[p].ptr = ((vx_array)parameters[p])->memory.ptrs[0]; + array_t[p].capacity = ((vx_array)parameters[p])->capacity; + array_t[p].item_size = ((vx_array)parameters[p])->item_size; + array_t[p].item_type = ((vx_array)parameters[p])->item_type; + array_t[p].num_items = ((vx_array)parameters[p])->num_items; + array_t[p].offset = ((vx_array)parameters[p])->offset; + + params[p] = &array_t[p]; + } + } + + if (index == UINT32_MAX) + index = 0; + + /* choose the index of the first output image to base the tiling on */ + status |= vxQueryImage(images[index], VX_IMAGE_WIDTH, &width, sizeof(width)); + status |= vxQueryImage(images[index], VX_IMAGE_HEIGHT, &height, sizeof(height)); + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + status |= vxQueryNode(node, VX_NODE_INPUT_NEIGHBORHOOD, &nbhd, sizeof(nbhd)); + status |= vxQueryNode(node,
VX_NODE_TILE_MEMORY_SIZE, &size, sizeof(size)); + + tile_size_y = tiles[index].tile_block.height; + tile_size_x = tiles[index].tile_block.width; + + if ((borders.mode != VX_BORDER_UNDEFINED) && + (borders.mode != VX_BORDER_MODE_SELF)) + { + return VX_ERROR_NOT_SUPPORTED; + } + + status = VX_SUCCESS; + + rect.start_x = 0; + rect.start_y = 0; + rect.end_x = width; + rect.end_y = height; + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE && images[p] != NULL) + { + tiles[p].tile_x = 0; + tiles[p].tile_y = 0; + status |= vxGetPatchToTile(images[p], &rect, &tiles[p]); + } + } + + vx_uint32 blkCntY = (height / tile_size_y) * tile_size_y; + vx_uint32 blkCntX = (width / tile_size_x) * tile_size_x; + + //tiling fast function + if (((vx_node_t *)node)->kernel->tilingfast_function) + { + for (ty = 0u; (ty < blkCntY) && (status == VX_SUCCESS); ty += tile_size_y) + { + for (tx = 0u; tx < blkCntX; tx += tile_size_x) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingfast_function(params, tile_memory, size); + } + } + + if (((vx_node_t *)node)->kernel->tilingflexible_function && ((blkCntY < height) || (blkCntX < width))) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingflexible_function(params, tile_memory, size); + } + } + //tiling flexible function + else if (((vx_node_t *)node)->kernel->tilingflexible_function) + { + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + tiles[p].tile_x = tx; + tiles[p].tile_y = ty; + } + } + tile_memory = ((vx_node_t *)node)->attributes.tileDataPtr; + ((vx_node_t *)node)->kernel->tilingflexible_function(params, tile_memory, size); + } + + for (p = 0u; p < num; p++) + { + if (types[p] == VX_TYPE_IMAGE) + { + if (dirs[p] == VX_INPUT && images[p] != NULL) + { + status |= vxSetTileToPatch(images[p], 0, &tiles[p]); + } + else if (dirs[p] == VX_OUTPUT) + { + status |= vxSetTileToPatch(images[p], &rect, &tiles[p]); + } + } + else if (types[p] == VX_TYPE_ARRAY && dirs[p] == VX_OUTPUT) + { + arrays[p]->memory.ptrs[0] = array_t[p].ptr; + arrays[p]->num_items = array_t[p].num_items; + } + } + + return status; +} +#endif diff --git a/sample/targets/tiling/vx_interface.h b/sample/targets/tiling/vx_interface.h new file mode 100644 index 0000000..db17e06 --- /dev/null +++ b/sample/targets/tiling/vx_interface.h @@ -0,0 +1,62 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef _OPENVX_INTERFACE_H_ +#define _OPENVX_INTERFACE_H_ + +#include + +extern vx_tiling_kernel_t box_3x3_kernels; +extern vx_tiling_kernel_t phase_kernel; +extern vx_tiling_kernel_t And_kernel; +extern vx_tiling_kernel_t Or_kernel; +extern vx_tiling_kernel_t Xor_kernel; +extern vx_tiling_kernel_t Not_kernel; +extern vx_tiling_kernel_t threshold_kernel; +extern vx_tiling_kernel_t colorconvert_kernel; +extern vx_tiling_kernel_t Multiply_kernel; +extern vx_tiling_kernel_t nonlinearfilter_kernel; +extern vx_tiling_kernel_t Magnitude_kernel; +extern vx_tiling_kernel_t erode3x3_kernel; +extern vx_tiling_kernel_t dilate3x3_kernel; +extern vx_tiling_kernel_t median3x3_kernel; +extern vx_tiling_kernel_t sobel3x3_kernel; +extern vx_tiling_kernel_t Max_kernel; +extern vx_tiling_kernel_t Min_kernel; +extern vx_tiling_kernel_t gaussian3x3_kernel; +extern vx_tiling_kernel_t add_kernel; +extern vx_tiling_kernel_t subtract_kernel; +extern vx_tiling_kernel_t convertdepth_kernel; +extern vx_tiling_kernel_t warp_affine_kernel; +extern vx_tiling_kernel_t warp_perspective_kernel; +extern vx_tiling_kernel_t weightedaverage_kernel; +extern vx_tiling_kernel_t absdiff_kernel; +extern vx_tiling_kernel_t integral_image_kernel; +extern vx_tiling_kernel_t remap_kernel; +extern vx_tiling_kernel_t convolution_kernel; +extern vx_tiling_kernel_t hogfeatures_kernel; +extern vx_tiling_kernel_t fast9_kernel; +extern vx_tiling_kernel_t lbp_kernel; +extern vx_tiling_kernel_t scale_image_kernel; +extern vx_tiling_kernel_t lut_kernel; +extern vx_tiling_kernel_t channelcombine_kernel; +extern vx_tiling_kernel_t halfscale_gaussian_kernel; +extern vx_tiling_kernel_t nonmaxsuppression_kernel; +extern vx_tiling_kernel_t hogcells_kernel; + +#endif + diff --git a/sample/targets/tiling/vx_lbp.c b/sample/targets/tiling/vx_lbp.c new file mode 100644 index 0000000..c1a5251 --- /dev/null +++ b/sample/targets/tiling/vx_lbp.c @@ -0,0 +1,195 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxLBPInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum format = 0; + vxCopyScalar(scalar, &format, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((format == VX_LBP) || + (format == VX_MLBP) || + (format == VX_ULBP)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } //end if(scalar) + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_enum format = 0; + vx_parameter param_format = vxGetParameterByIndex(node, 1); + if (vxGetStatus((vx_reference)param_format) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param_format, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vxCopyScalar(scalar, &format, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m_format); + } + + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar value = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &value, sizeof(value)); + if (value) + { + vx_enum stype = 0; + vxQueryScalar(value, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_INT8) + { + vx_int8 gs = 0; + vxCopyScalar(value, &gs, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ( (format == VX_LBP || format == VX_ULBP) && + (gs == 3 || gs == 5)) + { + status = VX_SUCCESS; + } + else if ( format == VX_MLBP && gs == 5 ) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&value); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxLBPOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + /* output is equal type and size */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + 
status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t lbp_kernel = +{ + "org.khronos.openvx.tiling_lbp", + VX_KERNEL_LBP_TILING, + NULL, + LBP_image_tiling_flexible, + LBP_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxLBPInputValidator, + vxLBPOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + + diff --git a/sample/targets/tiling/vx_lut.c b/sample/targets/tiling/vx_lut.c new file mode 100644 index 0000000..1f6aeed --- /dev/null +++ b/sample/targets/tiling/vx_lut.c @@ -0,0 +1,118 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxTableLookupInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + vx_lut lut = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &lut, sizeof(lut)); + if (lut) + { + vx_enum type = 0; + vxQueryLUT(lut, VX_LUT_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_UINT8 || type == VX_TYPE_INT16) + { + status = VX_SUCCESS; + } + vxReleaseLUT(&lut); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxTableLookupOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_df_image format = 0; + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_FORMAT, &format, sizeof(format)); + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + /* output is equal type and size */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t lut_kernel = +{ + 
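+    /* Initializer order follows vx_tiling_kernel_t as consumed by vxPublishKernels(): name, kernel enum, optional vx_kernel_f (NULL selects the generic vxTilingKernel wrapper), flexible and fast tiling functions, parameter count, parameter descriptions, validate callback, input/output validators, initialize/deinitialize callbacks, output tile block size, input neighborhood, and border mode. */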
"org.khronos.openvx.tiling_table_lookup", + VX_KERNEL_TABLE_LOOKUP_TILING, + NULL, + TableLookup_image_tiling_flexible, + TableLookup_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_LUT, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxTableLookupInputValidator, + vxTableLookupOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_magnitude.c b/sample/targets/tiling/vx_magnitude.c new file mode 100644 index 0000000..8cbce62 --- /dev/null +++ b/sample/targets/tiling/vx_magnitude.c @@ -0,0 +1,118 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxMagnitudeInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + vxReleaseImage(&input0); + } + vxReleaseParameter(¶m0); + } + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMagnitudeOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_S16; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t Magnitude_kernel = +{ + 
"org.khronos.openvx.tiling_magnitude", + VX_KERNEL_MAGNITUDE_TILING, + NULL, + Magnitude_image_tiling_flexible, + Magnitude_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMagnitudeInputValidator, + vxMagnitudeOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_minmax.c b/sample/targets/tiling/vx_minmax.c new file mode 100644 index 0000000..bf35f2a --- /dev/null +++ b/sample/targets/tiling/vx_minmax.c @@ -0,0 +1,153 @@ +/* + + * Copyright (c) 2017-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include "vx_interface.h" +#include "vx_internal.h" +#include + +static vx_status VX_CALLBACK vxMinMaxInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMinMaxOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], 
sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else if (informat[0] == VX_DF_IMAGE_S16 && informat[1] == VX_DF_IMAGE_S16 && outformat == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} +vx_tiling_kernel_t Max_kernel = +{ + "org.khronos.openvx.tiling_max", + VX_KERNEL_MAX_TILING, + NULL, + Max_image_tiling_flexible, + Max_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMinMaxInputValidator, + vxMinMaxOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; +vx_tiling_kernel_t Min_kernel = +{ + "org.khronos.openvx.tiling_min", + VX_KERNEL_MIN_TILING, + NULL, + Min_image_tiling_flexible, + Min_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMinMaxInputValidator, + vxMinMaxOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + diff --git a/sample/targets/tiling/vx_morphology.c b/sample/targets/tiling/vx_morphology.c new file mode 100644 index 0000000..adfecab --- /dev/null +++ b/sample/targets/tiling/vx_morphology.c @@ -0,0 +1,121 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxMorphologyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + return status; +} + +static vx_status VX_CALLBACK vxMorphologyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, 0); /* we reference the input image */ + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_param_description_t morphology_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_tiling_kernel_t erode3x3_kernel = +{ + "org.khronos.openvx.tiling_erode_3x3", + VX_KERNEL_ERODE_3x3_TILING, + NULL, + Erode3x3_image_tiling_flexible, + Erode3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +vx_tiling_kernel_t dilate3x3_kernel = +{ + "org.khronos.openvx.tiling_dilate_3x3", + VX_KERNEL_DILATE_3x3_TILING, + NULL, + Dilate3x3_image_tiling_flexible, + Dilate3x3_image_tiling_fast, + 2, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMorphologyInputValidator, + vxMorphologyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + diff --git a/sample/targets/tiling/vx_multiply.c b/sample/targets/tiling/vx_multiply.c new file mode 100644 index 0000000..b69db1b --- /dev/null +++ b/sample/targets/tiling/vx_multiply.c @@ -0,0 +1,263 @@ +/* + + * Copyright (c) 2013-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxMultiplyInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format1; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format1, sizeof(format1)); + if (width[0] == width[1] && height[0] == height[1] && + (format1 == VX_DF_IMAGE_U8 || format1 == VX_DF_IMAGE_S16)) + status = VX_SUCCESS; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) /* scale: must be non-negative. */ + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum type = -1; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type)); + if (type == VX_TYPE_FLOAT32) + { + vx_float32 scale = 0.0f; + if ((vxCopyScalar(scalar, &scale, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) && + (scale >= 0)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 3) /* overflow_policy: truncate or saturate. */ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum overflow_policy = 0; + vxCopyScalar(scalar, &overflow_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((overflow_policy == VX_CONVERT_POLICY_WRAP) || + (overflow_policy == VX_CONVERT_POLICY_SATURATE)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + else if (index == 4) /* rounding_policy: truncate or saturate. 
*/ + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum rouding_policy = 0; + vxCopyScalar(scalar, &rouding_policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((rouding_policy == VX_ROUND_POLICY_TO_ZERO) || + (rouding_policy == VX_ROUND_POLICY_TO_NEAREST_EVEN)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxMultiplyOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 5) + { + /* + * We need to look at both input images, but only for the format: + * if either is S16 or the output type is not U8, then it's S16. + * The geometry of the output image is copied from the first parameter: + * the input images are known to match from input parameters validation. + */ + vx_parameter param[] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + vxGetParameterByIndex(node, index), + }; + if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) && + (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS)) + { + vx_image images[3]; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2])); + if (images[0] && images[1] && images[2]) + { + vx_uint32 width = 0, height = 0; + vx_df_image informat[2] = {VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT}; + vx_df_image outformat = VX_DF_IMAGE_VIRT; + + /* + * When passing on the geometry to the output image, we only look at + * image 0, as both input images are verified to match, at input + * validation. 
+ */ + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1])); + vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat)); + + if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else + { + status = VX_SUCCESS; + outformat = VX_DF_IMAGE_S16; + } + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = outformat; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + vxReleaseImage(&images[2]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + vxReleaseParameter(¶m[2]); + } + } + return status; +} +vx_tiling_kernel_t Multiply_kernel = +{ + "org.khronos.openvx.tiling_multiply", + VX_KERNEL_MULTIPLY_TILING, + NULL, + Multiply_image_tiling_flexible, + Multiply_image_tiling_fast, + 6, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxMultiplyInputValidator, + vxMultiplyOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_nonlinearfilter.c b/sample/targets/tiling/vx_nonlinearfilter.c new file mode 100644 index 0000000..0f93f7b --- /dev/null +++ b/sample/targets/tiling/vx_nonlinearfilter.c @@ -0,0 +1,160 @@ +/* + + * Copyright (c) 2016-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_internal.h" + +#include "tiling.h" + +#if (C_MAX_NONLINEAR_DIM != VX_INT_MAX_NONLINEAR_DIM) +#if defined(_WIN32) +#pragma error("C Model does not support VX required nonlinear filter size") +#elif defined(__GNUC__) +#error "C Model does not support VX required nonlinear filter size" +#endif +#endif + +static vx_status VX_CALLBACK vxNonLinearFilterInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_scalar scalar = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum function = 0; + vxCopyScalar(scalar, &function, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((function == VX_NONLINEAR_FILTER_MEDIAN) || + (function == VX_NONLINEAR_FILTER_MIN) || + (function == VX_NONLINEAR_FILTER_MAX)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (param) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size cols = 0, rows = 0; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &cols, sizeof(cols)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + if ((rows <= VX_INT_MAX_NONLINEAR_DIM) && + (cols <= VX_INT_MAX_NONLINEAR_DIM) && + (data_type == VX_TYPE_UINT8)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxNonLinearFilterOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 1); /* we reference the input image */ + if (param) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0, height = 0; + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t nonlinearfilter_kernel = +{ + "org.khronos.openvx.tiling_non_linear_filter", + VX_KERNEL_NON_LINEAR_FILTER_TILING, + NULL, + NonLinearFilter_image_tiling_flexible, + NonLinearFilter_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, 
VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxNonLinearFilterInputValidator, + vxNonLinearFilterOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_nonmaxsuppression.c b/sample/targets/tiling/vx_nonmaxsuppression.c new file mode 100644 index 0000000..bf53f83 --- /dev/null +++ b/sample/targets/tiling/vx_nonmaxsuppression.c @@ -0,0 +1,168 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#include +#include +#include +#include + +static vx_status VX_CALLBACK vxNonMaxSuppressionInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_image images[2]; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 1), + }; + vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0])); + vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1])); + if (images[0] && images[1]) + { + vx_uint32 width[2], height[2]; + vx_df_image format; + + vxQueryImage(images[0], VX_IMAGE_WIDTH, &width[0], sizeof(width[0])); + vxQueryImage(images[1], VX_IMAGE_WIDTH, &width[1], sizeof(width[1])); + vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height[0], sizeof(height[0])); + vxQueryImage(images[1], VX_IMAGE_HEIGHT, &height[1], sizeof(height[1])); + vxQueryImage(images[1], VX_IMAGE_FORMAT, &format, sizeof(format)); + if (width[0] == width[1] && height[0] == height[1] && format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&images[0]); + vxReleaseImage(&images[1]); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + else if (index == 2) + { + vx_scalar win_size = 0; + vx_image input = 0; + vx_parameter param[2] = { + vxGetParameterByIndex(node, 0), + vxGetParameterByIndex(node, 2) + }; + + vxQueryParameter(param[0], VX_PARAMETER_REF, &input, sizeof(input)); + vxQueryParameter(param[1], VX_PARAMETER_REF, &win_size, sizeof(win_size)); + if (input && win_size) + { + vx_enum type = 0; + vx_uint32 width, height; + vx_int32 wsize = 0; + vxCopyScalar(win_size, &wsize, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + vxQueryScalar(win_size, VX_SCALAR_TYPE, &type, sizeof(type)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + if (type == VX_TYPE_INT32) + { + if ( (wsize <= width) && (wsize <= height) && (wsize % 2 == 1)) + { + status = VX_SUCCESS; + } + } + else + { + status = 
VX_ERROR_INVALID_TYPE; + } + + vxReleaseScalar(&win_size); + vxReleaseImage(&input); + } + vxReleaseParameter(¶m[0]); + vxReleaseParameter(¶m[1]); + } + return status; +} + +static vx_status VX_CALLBACK vxNonMaxSuppressionOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter param = vxGetParameterByIndex(node, 0); + if (param) + { + vx_image img = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &img, sizeof(img)); + if (img) + { + vx_uint32 width = 0, height = 0; + vx_df_image format = 0; + + vxQueryImage(img, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(img, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(img, VX_IMAGE_FORMAT, &format, sizeof(format)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = format; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + vxReleaseImage(&img); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +vx_tiling_kernel_t nonmaxsuppression_kernel = +{ + "org.khronos.openvx.tiling_nonmaxsuppression", + VX_KERNEL_NON_MAX_SUPPRESSION_TILING, + NULL, + NonMaxSuppression_image_tiling_flexible, + NonMaxSuppression_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_OPTIONAL }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },}, + NULL, + vxNonMaxSuppressionInputValidator, + vxNonMaxSuppressionOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_phase.c b/sample/targets/tiling/vx_phase.c new file mode 100644 index 0000000..d0c3a03 --- /dev/null +++ b/sample/targets/tiling/vx_phase.c @@ -0,0 +1,132 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +static vx_status VX_CALLBACK vxPhaseInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 0 || index == 1) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_S16 || format == VX_DF_IMAGE_F32) + { + if (index == 0) + { + status = VX_SUCCESS; + } + else + { + vx_parameter param0 = vxGetParameterByIndex(node, index); + vx_image input0 = 0; + + vxQueryParameter(param0, VX_PARAMETER_REF, &input0, sizeof(input0)); + if (input0) + { + vx_uint32 width0 = 0, height0 = 0, width1 = 0, height1 = 0; + vxQueryImage(input0, VX_IMAGE_WIDTH, &width0, sizeof(width0)); + vxQueryImage(input0, VX_IMAGE_HEIGHT, &height0, sizeof(height0)); + vxQueryImage(input, VX_IMAGE_WIDTH, &width1, sizeof(width1)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height1, sizeof(height1)); + + if (width0 == width1 && height0 == height1) + status = VX_SUCCESS; + + vxReleaseImage(&input0); + } + + vxReleaseParameter(¶m0); + } + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +static vx_status VX_CALLBACK vxPhaseOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (index == 2) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, 0); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_uint32 width = 0; + vx_uint32 height = 0; + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_WIDTH, &width, sizeof(width)); + vxQueryImage(input, VX_IMAGE_HEIGHT, &height, sizeof(height)); + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + + status = VX_SUCCESS; + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + + return status; +} + +vx_tiling_kernel_t phase_kernel = +{ + "org.khronos.openvx.tiling_phase", + VX_KERNEL_PHASE_TILING, + NULL, + Phase_image_tiling_flexible, + Phase_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxPhaseInputValidator, + vxPhaseOutputValidator, + NULL, + NULL, + { 8, 8 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_remap.c b/sample/targets/tiling/vx_remap.c new file mode 100644 index 0000000..78c6578 --- /dev/null +++ b/sample/targets/tiling/vx_remap.c @@ -0,0 +1,553 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include + +#include "tiling.h" + +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) + +static vx_int32 * alignPtr(vx_int32* ptr, size_t n) +{ + return (vx_int32 *)(((size_t)ptr + n-1) & -n); +} + +static vx_float32 * alignPtr_f(vx_float32* ptr, size_t n) +{ + return (vx_float32 *)(((size_t)ptr + n-1) & -n); +} + + +static void remapNearestNeighborConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32)); + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)dstBase + y * dstStride); + + for (size_t x = 0; x < width; ++x) + { + vx_int32 src_idx = map_row[x]; + dst_row[x] = src_idx >= 0 ? srcBase[map_row[x]] : borderValue; + } + } +} + +static void remapLinearConst(const size_t height, + const size_t width, + const vx_uint8 * srcBase, + const vx_int32 * map, + const vx_float32 * coeffs, + vx_uint8 * dstBase, ptrdiff_t dstStride, + vx_uint8 borderValue) +{ + int16x8_t v_zero16 = vdupq_n_s16(0); + + for (size_t y = 0; y < height; ++y) + { + const vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * width * sizeof(vx_int32) * 4); + const vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * width * sizeof(vx_float32) * 2); + + vx_uint8 * dst_row = (vx_uint8 *)((vx_int8 *)(dstBase) + y * dstStride); + + size_t x = 0; + + for ( ; x + 8 < width; x += 8) + { + int16x8_t v_src00 = vsetq_lane_s16(map_row[(x << 2)] >= 0 ? srcBase[map_row[(x << 2)]] : borderValue, v_zero16, 0); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 4] >= 0 ? srcBase[map_row[(x << 2) + 4]] : borderValue, v_src00, 1); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 8] >= 0 ? srcBase[map_row[(x << 2) + 8]] : borderValue, v_src00, 2); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 12] >= 0 ? srcBase[map_row[(x << 2) + 12]] : borderValue, v_src00, 3); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 16] >= 0 ? srcBase[map_row[(x << 2) + 16]] : borderValue, v_src00, 4); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 20] >= 0 ? srcBase[map_row[(x << 2) + 20]] : borderValue, v_src00, 5); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 24] >= 0 ? srcBase[map_row[(x << 2) + 24]] : borderValue, v_src00, 6); + v_src00 = vsetq_lane_s16(map_row[(x << 2) + 28] >= 0 ? srcBase[map_row[(x << 2) + 28]] : borderValue, v_src00, 7); + + int16x8_t v_src01 = vsetq_lane_s16(map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue, v_zero16, 0); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 5] >= 0 ? srcBase[map_row[(x << 2) + 5]] : borderValue, v_src01, 1); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 9] >= 0 ? srcBase[map_row[(x << 2) + 9]] : borderValue, v_src01, 2); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 13] >= 0 ? srcBase[map_row[(x << 2) + 13]] : borderValue, v_src01, 3); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 17] >= 0 ? srcBase[map_row[(x << 2) + 17]] : borderValue, v_src01, 4); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 21] >= 0 ? srcBase[map_row[(x << 2) + 21]] : borderValue, v_src01, 5); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 25] >= 0 ? srcBase[map_row[(x << 2) + 25]] : borderValue, v_src01, 6); + v_src01 = vsetq_lane_s16(map_row[(x << 2) + 29] >= 0 ? srcBase[map_row[(x << 2) + 29]] : borderValue, v_src01, 7); + + int16x8_t v_src10 = vsetq_lane_s16(map_row[(x << 2) + 2] >= 0 ? 
srcBase[map_row[(x << 2) + 2]] : borderValue, v_zero16, 0); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 6] >= 0 ? srcBase[map_row[(x << 2) + 6]] : borderValue, v_src10, 1); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 10] >= 0 ? srcBase[map_row[(x << 2) + 10]] : borderValue, v_src10, 2); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 14] >= 0 ? srcBase[map_row[(x << 2) + 14]] : borderValue, v_src10, 3); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 18] >= 0 ? srcBase[map_row[(x << 2) + 18]] : borderValue, v_src10, 4); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 22] >= 0 ? srcBase[map_row[(x << 2) + 22]] : borderValue, v_src10, 5); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 26] >= 0 ? srcBase[map_row[(x << 2) + 26]] : borderValue, v_src10, 6); + v_src10 = vsetq_lane_s16(map_row[(x << 2) + 30] >= 0 ? srcBase[map_row[(x << 2) + 30]] : borderValue, v_src10, 7); + + int16x8_t v_src11 = vsetq_lane_s16(map_row[(x << 2) + 3] >= 0 ? srcBase[map_row[(x << 2) + 3]] : borderValue, v_zero16, 0); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 7] >= 0 ? srcBase[map_row[(x << 2) + 7]] : borderValue, v_src11, 1); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 11] >= 0 ? srcBase[map_row[(x << 2) + 11]] : borderValue, v_src11, 2); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 15] >= 0 ? srcBase[map_row[(x << 2) + 15]] : borderValue, v_src11, 3); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 19] >= 0 ? srcBase[map_row[(x << 2) + 19]] : borderValue, v_src11, 4); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 23] >= 0 ? srcBase[map_row[(x << 2) + 23]] : borderValue, v_src11, 5); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 27] >= 0 ? srcBase[map_row[(x << 2) + 27]] : borderValue, v_src11, 6); + v_src11 = vsetq_lane_s16(map_row[(x << 2) + 31] >= 0 ? srcBase[map_row[(x << 2) + 31]] : borderValue, v_src11, 7); + + // first part + float32x4_t v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00))); + float32x4_t v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10))); + + float32x4x2_t v_coeff = vld2q_f32(coeff_row + (x << 1)); + float32x4_t v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src01), + vget_low_s16(v_src00))), v_coeff.val[0]); + float32x4_t v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_low_s16(v_src11), + vget_low_s16(v_src10))), v_coeff.val[0]); + + float32x4_t v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst0 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // second part + v_src00_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00))); + v_src10_f = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10))); + + v_coeff = vld2q_f32(coeff_row + (x << 1) + 8); + v_dst_0 = vmlaq_f32(v_src00_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src01), + vget_high_s16(v_src00))), v_coeff.val[0]); + v_dst_1 = vmlaq_f32(v_src10_f, vcvtq_f32_s32(vsubl_s16(vget_high_s16(v_src11), + vget_high_s16(v_src10))), v_coeff.val[0]); + + v_dst = vmlaq_f32(v_dst_0, vsubq_f32(v_dst_1, v_dst_0), v_coeff.val[1]); + uint16x4_t v_dst1 = vmovn_u32(vcvtq_u32_f32(v_dst)); + + // store + vst1_u8(dst_row + x, vmovn_u16(vcombine_u16(v_dst0, v_dst1))); + } + for ( ; x < width; ++x) + { + int16_t src00 = map_row[(x << 2) + 0] >= 0 ? srcBase[map_row[(x << 2) + 0]] : borderValue; + int16_t src01 = map_row[(x << 2) + 1] >= 0 ? srcBase[map_row[(x << 2) + 1]] : borderValue; + int16_t src10 = map_row[(x << 2) + 2] >= 0 ? srcBase[map_row[(x << 2) + 2]] : borderValue; + int16_t src11 = map_row[(x << 2) + 3] >= 0 ? 
srcBase[map_row[(x << 2) + 3]] : borderValue; + + vx_float32 dst_val_0 = (src01 - src00) * coeff_row[(x << 1)] + src00; + vx_float32 dst_val_1 = (src11 - src10) * coeff_row[(x << 1)] + src10; + dst_row[x] = floorf((dst_val_1 - dst_val_0) * coeff_row[(x << 1) + 1] + dst_val_0); + } + } +} + + +//BLOCK_SIZE is the same as tile_size set in "vx_remap.c" +#define BLOCK_SIZE 16 + +void Remap_image_tiling_fast(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_remap *table = (vx_remap *)parameters[1]; + vx_scalar *stype = (vx_scalar *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint8 *src_base = in->base[0]; + vx_uint8 *dst_base = out->base[0]; + + vx_uint32 low_y = out->tile_y; + vx_uint32 high_y = out->tile_y + out->tile_block.height; + + vx_int32 policy = (vx_int32)*stype; + + vx_uint32 src_width = in->image.width; + vx_uint32 src_height = in->image.height; + vx_uint32 srcStride = in->addr->stride_y; + + vx_uint32 dst_width = out->image.width; + vx_uint32 dst_height = out->image.height; + vx_uint32 dstStride = out->addr->stride_y; + + int32x4_t v_width4 = vdupq_n_s32(src_width - 1), v_height4 = vdupq_n_s32(src_height - 1); + int32x4_t v_step4 = vdupq_n_s32(srcStride), v_1 = vdupq_n_s32(1); + float32x4_t v_zero4f = vdupq_n_f32(0.0f), v_one4f = vdupq_n_f32(1.0f); + + vx_uint8 borderValue = 0; + + size_t i = out->tile_y; + size_t blockHeight = MIN(BLOCK_SIZE, dst_height - i); + size_t j = out->tile_x; + size_t blockWidth = MIN(BLOCK_SIZE, dst_width - j); + + size_t tableStride = (&(*table)->memory)->strides[0][VX_DIM_Y]; + vx_float32 * tableBase = (vx_float32 *)(&(*table)->memory)->ptrs[0]; + + if (policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) + { + vx_int32 _map[BLOCK_SIZE * BLOCK_SIZE + 16]; + vx_int32 * map = alignPtr(_map, 16); + + int32x4_t v_m1_4 = vdupq_n_s32(-1); + int32x2_t v_m1_2 = vdup_n_s32(-1); + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + float32x2_t v_zero2 = vdup_n_f32(0.0f); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const vx_float32 * table_row = (vx_float32 *)((vx_int8 *)(tableBase) + (i + y) * tableStride) + (j << 1); + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(&map[0]) + y * blockWidth * sizeof(vx_int32)); + + size_t x = 0; + for ( ; x + 8 <= blockWidth; x += 8) + { + float32x4x2_t v_table0 = vld2q_f32(table_row + (x << 1)), + v_table1 = vld2q_f32(table_row + (x << 1) + 8); + + int32x4_t v_dst_x = vcvtq_s32_f32(v_table0.val[0]); + int32x4_t v_dst_y = vcvtq_s32_f32(v_table0.val[1]); + uint32x4_t v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table0.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table0.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + int32x4_t v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x, v_dst_index); + + v_dst_x = vcvtq_s32_f32(v_table1.val[0]); + v_dst_y = vcvtq_s32_f32(v_table1.val[1]); + v_mask = vandq_u32(vandq_u32(vcgeq_f32(v_table1.val[0], v_zero4), vcleq_s32(v_dst_x, v_width4)), + vandq_u32(vcgeq_f32(v_table1.val[1], v_zero4), vcleq_s32(v_dst_y, v_height4))); + v_dst_index = vbslq_s32(v_mask, vmlaq_s32(v_dst_x, v_dst_y, v_step4), v_m1_4); + vst1q_s32(map_row + x + 4, v_dst_index); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + // make remap + remapNearestNeighborConst(blockHeight, blockWidth, src_base, &map[0], dstBase + j, dstStride, borderValue); + } + else if (policy == 
VX_INTERPOLATION_BILINEAR) + { + vx_int32 _map[((BLOCK_SIZE * BLOCK_SIZE) << 2) + 16]; + vx_float32 _coeffs[((BLOCK_SIZE * BLOCK_SIZE) << 1) + 16]; + vx_int32 * map = alignPtr(_map, 16); + vx_float32 * coeffs = alignPtr_f(_coeffs, 16); + + float32x4_t v_zero4 = vdupq_n_f32(0.0f); + int32x4_t v_m1_4 = vdupq_n_s32(-1); + + // compute table + for (size_t y = 0; y < blockHeight; ++y) + { + const vx_float32 * table_row = (vx_float32 *)((vx_int8 *)(tableBase) + (i + y) * tableStride) + (j << 1); + vx_int32 * map_row = (vx_int32 *)((vx_int8 *)(map) + y * blockWidth * sizeof(vx_int32) * 4); + vx_float32 * coeff_row = (vx_float32 *)((vx_int8 *)(coeffs) + y * blockWidth * sizeof(vx_float32) * 2); + + size_t x = 0; + for ( ; x + 4 <= blockWidth; x += 4) + { + float32x4x2_t v_table = vld2q_f32(table_row + (x << 1)); + + int32x4_t v_src_x0 = vcvtq_s32_f32(v_table.val[0]); + int32x4_t v_src_y0 = vcvtq_s32_f32(v_table.val[1]); + + float32x4x2_t v_coeff; + v_coeff.val[0] = vsubq_f32(v_table.val[0], vcvtq_f32_s32(v_src_x0)); + v_coeff.val[1] = vsubq_f32(v_table.val[1], vcvtq_f32_s32(v_src_y0)); + uint32x4_t v_maskx = vcltq_f32(v_coeff.val[0], v_zero4f); + uint32x4_t v_masky = vcltq_f32(v_coeff.val[1], v_zero4f); + v_coeff.val[0] = vbslq_f32(v_maskx, vaddq_f32(v_one4f, v_coeff.val[0]), v_coeff.val[0]); + v_coeff.val[1] = vbslq_f32(v_masky, vaddq_f32(v_one4f, v_coeff.val[1]), v_coeff.val[1]); + v_src_x0 = vbslq_s32(v_maskx, vsubq_s32(v_src_x0, v_1), v_src_x0); + v_src_y0 = vbslq_s32(v_masky, vsubq_s32(v_src_y0, v_1), v_src_y0); + + int32x4_t v_src_x1 = vaddq_s32(v_src_x0, v_1); + int32x4_t v_src_y1 = vaddq_s32(v_src_y0, v_1); + + int32x4x4_t v_dst_index; + v_dst_index.val[0] = vmlaq_s32(v_src_x0, v_src_y0, v_step4); + v_dst_index.val[1] = vmlaq_s32(v_src_x1, v_src_y0, v_step4); + v_dst_index.val[2] = vmlaq_s32(v_src_x0, v_src_y1, v_step4); + v_dst_index.val[3] = vmlaq_s32(v_src_x1, v_src_y1, v_step4); + + uint32x4_t v_mask_x0 = vandq_u32(vcgeq_f32(v_table.val[0], v_zero4), vcleq_s32(v_src_x0, v_width4)); + uint32x4_t v_mask_x1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[0], v_one4f), v_zero4), vcleq_s32(v_src_x1, v_width4)); + uint32x4_t v_mask_y0 = vandq_u32(vcgeq_f32(v_table.val[1], v_zero4), vcleq_s32(v_src_y0, v_height4)); + uint32x4_t v_mask_y1 = vandq_u32(vcgeq_f32(vaddq_f32(v_table.val[1], v_one4f), v_zero4), vcleq_s32(v_src_y1, v_height4)); + + v_dst_index.val[0] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y0), v_dst_index.val[0], v_m1_4); + v_dst_index.val[1] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y0), v_dst_index.val[1], v_m1_4); + v_dst_index.val[2] = vbslq_s32(vandq_u32(v_mask_x0, v_mask_y1), v_dst_index.val[2], v_m1_4); + v_dst_index.val[3] = vbslq_s32(vandq_u32(v_mask_x1, v_mask_y1), v_dst_index.val[3], v_m1_4); + + vst2q_f32(coeff_row + (x << 1), v_coeff); + vst4q_s32(map_row + (x << 2), v_dst_index); + } + } + vx_uint8 * dstBase = (vx_uint8 *)((vx_int8 *)dst_base + i * dstStride); + remapLinearConst(blockHeight, blockWidth, src_base, &map[0], &coeffs[0], dstBase + j, dstStride, borderValue); + } +} + + +static vx_bool read_pixel(void *base, vx_imagepatch_addressing_t *addr, vx_uint32 src_height, vx_uint32 src_width, + vx_float32 x, vx_float32 y, vx_uint8 *pixel) +{ + vx_bool out_of_bounds = (x < 0 || y < 0 || x >= src_width || y >= src_height); + vx_uint32 bx, by; + vx_uint8 *bpixel; + + if (out_of_bounds) + { + return vx_false_e; + } + + // bounded x/y + bx = x < 0 ? 0 : x >= src_width ? src_width - 1 : (vx_uint32)x; + by = y < 0 ? 0 : y >= src_height ? 
src_height - 1 : (vx_uint32)y; + + vx_uint8 *new_ptr = NULL; + vx_uint32 offset = (addr->stride_y * by + addr->stride_x * bx); + new_ptr = (vx_uint8 *)base; + bpixel = &new_ptr[offset]; + + *pixel = *bpixel; + + return vx_true_e; +} + +#define REMAP(low_y, high_y, low_x) \ + for (y = low_y; y < high_y; y++) \ + { \ + vx_uint8 *dst = (vx_uint8 *)dst_base + y * out->addr->stride_y; \ + for (x = low_x; x < high_x; x++) \ + { \ + vx_float32 src_x = 0.0f; \ + vx_float32 src_y = 0.0f; \ + \ + vxGetRemapPoint(*table, x, y, &src_x, &src_y); \ + \ + if (policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) \ + { \ + read_pixel(src_base, in->addr, in->image.height, in->image.width, src_x + 0.5f, src_y + 0.5f, dst); \ + dst++; \ + } \ + else if (policy == VX_INTERPOLATION_BILINEAR) \ + { \ + vx_uint8 tl = 0; \ + vx_uint8 tr = 0; \ + vx_uint8 bl = 0; \ + vx_uint8 br = 0; \ + vx_float32 xf = floorf(src_x); \ + vx_float32 yf = floorf(src_y); \ + vx_float32 dx = src_x - xf; \ + vx_float32 dy = src_y - yf; \ + vx_float32 a[] = { (1.0f - dx) * (1.0f - dy), (1.0f - dx) * (dy), (dx)* (1.0f - dy), (dx)* (dy), }; \ + vx_bool defined = vx_true_e; \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 0, yf + 0, &tl); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 1, yf + 0, &tr); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 0, yf + 1, &bl); \ + defined &= read_pixel(src_base, in->addr, in->image.height, in->image.width, xf + 1, yf + 1, &br); \ + if (defined) \ + *dst = (vx_uint8)(a[0] * tl + a[2] * tr + a[1] * bl + a[3] * br); \ + dst++; \ + } \ + } \ + } + + +void Remap_image_tiling_flexible(void * parameters[], void * tile_memory, vx_size tile_memory_size) +{ + vx_uint32 x = 0, y = 0; + + vx_tile_t *in = (vx_tile_t *)parameters[0]; + vx_remap *table = (vx_remap *)parameters[1]; + vx_scalar *stype = (vx_scalar *)parameters[2]; + vx_tile_t *out = (vx_tile_t *)parameters[3]; + + vx_uint32 low_y = in->tile_y; + vx_uint32 high_y = vxTileHeight(in, 0); + + vx_uint32 low_x = in->tile_x; + vx_uint32 high_x = vxTileWidth(in, 0); + + vx_uint8 *src_base = in->base[0] + in->tile_x; + vx_uint8 *dst_base = out->base[0] + out->tile_x; + + vx_int32 policy = (vx_int32)*stype; + + if (low_y == 0 && low_x == 0) + { + REMAP(low_y, high_y, low_x) + } + else + { + REMAP(0, low_y, low_x) + + src_base = in->base[0]; + dst_base = out->base[0]; + REMAP(low_y, high_y, 0) + } +} + +static vx_status VX_CALLBACK vxRemapInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_remap table; + vxQueryParameter(param, VX_PARAMETER_REF, &table, sizeof(table)); + if (table) + { + /* \todo what are we checking? 
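+                 *       Nothing about the table is validated here; the remap's
+                 *       source geometry is cross-checked against the input
+                 *       image in vxRemapOutputValidator, so the reference is
+                 *       accepted as soon as it resolves.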
*/ + status = VX_SUCCESS; + vxReleaseRemap(&table); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum policy = 0; + vxCopyScalar(scalar, &policy, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((policy == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (policy == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxRemapOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter tbl_param = vxGetParameterByIndex(node, 1); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)tbl_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vx_remap tbl = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + vxQueryParameter(tbl_param, VX_PARAMETER_REF, &tbl, sizeof(tbl)); + if ((src) && (dst) && (tbl)) + { + vx_uint32 w1 = 0, h1 = 0; + vx_uint32 w2 = 0, h2 = 0; + vx_uint32 w3 = 0, h3 = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryRemap(tbl, VX_REMAP_SOURCE_WIDTH, &w2, sizeof(w2)); + vxQueryRemap(tbl, VX_REMAP_SOURCE_HEIGHT, &h2, sizeof(h2)); + vxQueryRemap(tbl, VX_REMAP_DESTINATION_WIDTH, &w3, sizeof(w3)); + vxQueryRemap(tbl, VX_REMAP_DESTINATION_HEIGHT, &h3, sizeof(h3)); + + if ((w1 == w2) && (h1 == h2)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w3; + ptr->dim.image.height = h3; + status = VX_SUCCESS; + } + vxReleaseImage(&src); + vxReleaseRemap(&tbl); + vxReleaseImage(&dst); + } + vxReleaseParameter(&src_param); + vxReleaseParameter(&tbl_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +vx_tiling_kernel_t remap_kernel = +{ + "org.khronos.openvx.tiling_remap", + VX_KERNEL_REMAP_TILING, + NULL, + Remap_image_tiling_flexible, + Remap_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_REMAP, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxRemapInputValidator, + vxRemapOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_scale.c b/sample/targets/tiling/vx_scale.c new file mode 100644 index 0000000..29bfd6b --- /dev/null +++ b/sample/targets/tiling/vx_scale.c @@ -0,0 +1,523 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include +#include +#include + +static vx_status VX_CALLBACK vxScaleImageInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + + if (input) + { + vx_df_image format = 0; + + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + else if (format == VX_DF_IMAGE_S16) + { + /* enable internal S16 format support (needed for laplacian pyramid reconstruction) */ + vx_scalar scalar = 0; + vx_parameter param1 = vxGetParameterByIndex(node, 2); + vxQueryParameter(param1, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (VX_TYPE_ENUM == stype) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (VX_INTERPOLATION_NEAREST_NEIGHBOR == interp) + { + /* only NN interpolation is required for laplacian pyramid */ + status = VX_SUCCESS; + } + } + + vxReleaseScalar(&scalar); + } + + vxReleaseParameter(¶m1); + } + + vxReleaseImage(&input); + } + + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR) || + (interp == VX_INTERPOLATION_AREA)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxScaleImageOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if ((src) && (dst)) + { + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT, f2 = VX_DF_IMAGE_VIRT; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, 
VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_WIDTH, &w2, sizeof(w2)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h2, sizeof(h2)); + vxQueryImage(src, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f2, sizeof(f2)); + /* output can not be virtual */ + if ((w2 != 0) && (h2 != 0) && (f2 != VX_DF_IMAGE_VIRT) && (f1 == f2)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = f2; + ptr->dim.image.width = w2; + ptr->dim.image.height = h2; + status = VX_SUCCESS; + } + vxReleaseImage(&src); + vxReleaseImage(&dst); + } + vxReleaseParameter(&src_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +/* scale image kernel */ +static vx_param_description_t scale_kernel_params[] = +{ + { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, +}; + +static vx_status VX_CALLBACK vxScaleImageInitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (num == dimof(scale_kernel_params)) + { + vx_image src = (vx_image)parameters[0]; + vx_image dst = (vx_image)parameters[1]; + vx_uint32 w1 = 0, h1 = 0, w2 = 0, h2 = 0; +#if AREA_SCALE_ENABLE + vx_uint32 gcd_w = 0, gcd_h = 0; +#endif + vx_size size = 0; + vx_size kernel_data_size = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_WIDTH, &w2, sizeof(w2)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h2, sizeof(h2)); + + /* AREA interpolation requires a scratch buffer, however, if AREA + * implementation is disabled, then no scratch buffer is required, and + * size can be 0 (setting to 1 so that checks can pass in the kernel) */ +#if AREA_SCALE_ENABLE + gcd_w = math_gcd(w1, w2); + gcd_h = math_gcd(h1, h2); + /* printf("%ux%u => %ux%u :: GCD_w %u GCD_h %u\n", w1,h1, w2,h2, gcd_w, gcd_h); */ + if (gcd_w != 0 && gcd_h != 0) + { + size = (w1 / gcd_w) * (w2 / gcd_w) * (h1 / gcd_h) * (h2 / gcd_h) * sizeof(vx_float64); + } + /* printf("Requesting "VX_FMT_SIZE" bytes for resizer\n", size); */ +#else + size = 1; +#endif + vxQueryKernel(node->kernel, VX_KERNEL_LOCAL_DATA_SIZE, &kernel_data_size, sizeof(kernel_data_size)); + if (kernel_data_size == 0) + { + node->attributes.localDataSize = size; + } + status = VX_SUCCESS; + } + return status; +} + +vx_tiling_kernel_t scale_image_kernel = +{ + "org.khronos.openvx.tiling_scale_image", + VX_KERNEL_SCALE_IMAGE_TILING, + NULL, + ScaleImage_image_tiling_flexible, + ScaleImage_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }}, + NULL, + vxScaleImageInputValidator, + vxScaleImageOutputValidator, + vxScaleImageInitializer, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + +/* half scale gaussian kernel */ +static vx_status VX_CALLBACK vxHalfscaleGaussianKernel(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_FAILURE; + + if (num == dimof(scale_kernel_params)) + { + vx_graph subgraph = ownGetChildGraphOfNode(node); + status = vxProcessGraph(subgraph); + } + + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianInputValidator(vx_node node, vx_uint32 index) +{ + vx_status 
status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_INT32) + { + vx_int32 ksize = 0; + vxCopyScalar(scalar, &ksize, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((ksize == 1) || (ksize == 3) || (ksize == 5)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 1) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if ((vxGetStatus((vx_reference)src_param) == VX_SUCCESS) && + (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS)) + { + vx_image src = 0; + vx_image dst = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if ((src) && (dst)) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(src, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(src, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = f1; + ptr->dim.image.width = (w1 + 1) / 2; + ptr->dim.image.height = (h1 + 1) / 2; + status = VX_SUCCESS; + } + if (src) vxReleaseImage(&src); + if (dst) vxReleaseImage(&dst); + vxReleaseParameter(&src_param); + vxReleaseParameter(&dst_param); + } + } + return status; +} + +static const vx_uint32 gaussian5x5scale = 256; +static const vx_int16 gaussian5x5[5][5] = +{ + {1, 4, 6, 4, 1}, + {4, 16, 24, 16, 4}, + {6, 24, 36, 24, 6}, + {4, 16, 24, 16, 4}, + {1, 4, 6, 4, 1} +}; + +static vx_convolution vxCreateGaussian5x5Convolution(vx_context context) +{ + vx_convolution conv = vxCreateConvolution(context, 5, 5); + vx_status status = vxCopyConvolutionCoefficients(conv, (vx_int16 *)gaussian5x5, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + if (status != VX_SUCCESS) + { + vxReleaseConvolution(&conv); + return NULL; + } + + status = vxSetConvolutionAttribute(conv, VX_CONVOLUTION_SCALE, (void *)&gaussian5x5scale, sizeof(vx_uint32)); + if (status != VX_SUCCESS) + { + vxReleaseConvolution(&conv); + return NULL; + } + return conv; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianInitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (num == dimof(scale_kernel_params)) + { + vx_context context = vxGetContext((vx_reference)node); + + vx_graph subgraph = node->child; + if (subgraph) + 
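+        /* node->child is non-NULL when the node is re-initialized (for
+         * example when the graph is verified again); the stale child graph
+         * is released before a fresh one is built below. */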
{ + /* deallocate subgraph resources */ + status = vxReleaseGraph(&subgraph); + if (VX_SUCCESS != status) + return status; + + status = ownSetChildGraphOfNode(node, 0); + if (VX_SUCCESS != status) + return status; + } + + /* allocate subgraph resources */ + subgraph = vxCreateGraph(context); + + status = vxGetStatus((vx_reference)subgraph); + if (status == VX_SUCCESS) + { + vx_uint32 i; + vx_image input = (vx_image)parameters[0]; + vx_image output = (vx_image)parameters[1]; + vx_int32 kernel_size = 3; + vx_convolution convolution = 0; + + /* We have a child-graph; we want to make sure the parent + graph is recognized as a valid scope for sake of virtual + image parameters. */ + subgraph->parentGraph = node->graph; + + status |= vxCopyScalar((vx_scalar)parameters[2], &kernel_size, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if (kernel_size == 1) + { + vx_node nodes[] = + { + vxScaleImageNode(subgraph, input, output, VX_INTERPOLATION_NEAREST_NEIGHBOR), + }; + + vx_border_t borders; + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + for (i = 0; i < dimof(nodes); i++) + { + status |= vxSetNodeAttribute(nodes[i], VX_NODE_BORDER, &borders, sizeof(borders)); + } + + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 0); /* input image */ + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 1); /* output image */ + + status |= vxVerifyGraph(subgraph); + + /* release our references, the graph will hold it's own */ + for (i = 0; i < dimof(nodes); i++) + { + status |= vxReleaseNode(&nodes[i]); + } + + status |= ownSetChildGraphOfNode(node, subgraph); + } + else if (kernel_size == 3 || kernel_size == 5) + { + if (kernel_size == 5) + { + convolution = vxCreateGaussian5x5Convolution(context); + } + if (kernel_size == 3 || convolution) + { + vx_image virt = vxCreateVirtualImage(subgraph, 0, 0, VX_DF_IMAGE_U8); + vx_node nodes[] = + { + kernel_size == 3 ? 
vxGaussian3x3Node(subgraph, input, virt) : vxConvolveNode(subgraph, input, convolution, virt), + vxScaleImageNode(subgraph, virt, output, VX_INTERPOLATION_NEAREST_NEIGHBOR), + }; + + vx_border_t borders; + status |= vxQueryNode(node, VX_NODE_BORDER, &borders, sizeof(borders)); + for (i = 0; i < dimof(nodes); i++) + { + status |= vxSetNodeAttribute(nodes[i], VX_NODE_BORDER, &borders, sizeof(borders)); + } + + status |= vxAddParameterToGraphByIndex(subgraph, nodes[0], 0); /* input image */ + status |= vxAddParameterToGraphByIndex(subgraph, nodes[1], 1); /* output image */ + status |= vxAddParameterToGraphByIndex(subgraph, node, 2); /* gradient size - refer to self to quiet sub-graph validator */ + + status |= vxVerifyGraph(subgraph); + + /* release our references, the graph will hold it's own */ + for (i = 0; i < dimof(nodes); i++) + { + status |= vxReleaseNode(&nodes[i]); + } + + if (convolution) + status |= vxReleaseConvolution(&convolution); + + status |= vxReleaseImage(&virt); + + status |= ownSetChildGraphOfNode(node, subgraph); + } + } + } + } + + return status; +} + +static vx_status VX_CALLBACK vxHalfscaleGaussianDeinitializer(vx_node node, const vx_reference parameters[], vx_uint32 num) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + + if (num == dimof(scale_kernel_params)) + { + vx_graph subgraph = ownGetChildGraphOfNode(node); + + status = VX_SUCCESS; + + status |= vxReleaseGraph(&subgraph); + + /* set subgraph to "null" */ + status |= ownSetChildGraphOfNode(node, 0); + } + + return status; +} + + +vx_tiling_kernel_t halfscale_gaussian_kernel = +{ + "org.khronos.openvx.tiling_halfscale_gaussian", + VX_KERNEL_HALFSCALE_GAUSSIAN_TILING, + vxHalfscaleGaussianKernel, + NULL, + NULL, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL } }, + NULL, + vxHalfscaleGaussianInputValidator, + vxHalfscaleGaussianOutputValidator, + vxHalfscaleGaussianInitializer, + vxHalfscaleGaussianDeinitializer, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_threshold.c b/sample/targets/tiling/vx_threshold.c new file mode 100644 index 0000000..87f669d --- /dev/null +++ b/sample/targets/tiling/vx_threshold.c @@ -0,0 +1,138 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "vx_interface.h" + +#include "vx_internal.h" + +#include "tiling.h" + +static vx_status VX_CALLBACK vxThresholdInputValidator(vx_node node, vx_uint32 index) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_image input = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if ((format == VX_DF_IMAGE_U8 || format == VX_DF_IMAGE_S16)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_FORMAT; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_threshold threshold = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &threshold, sizeof(threshold)); + if (threshold) + { + vx_enum type = 0; + vxQueryThreshold(threshold, VX_THRESHOLD_TYPE, &type, sizeof(type)); + if ((type == VX_THRESHOLD_TYPE_BINARY) || + (type == VX_THRESHOLD_TYPE_RANGE)) + { + vx_enum data_type = 0; + vxQueryThreshold(threshold, VX_THRESHOLD_DATA_TYPE, &data_type, sizeof(data_type)); + if (data_type == VX_TYPE_UINT8) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseThreshold(&threshold); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxThresholdOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 2) + { + vx_parameter src_param = vxGetParameterByIndex(node, 0); + if (vxGetStatus((vx_reference)src_param) == VX_SUCCESS) + { + vx_image src = 0; + vxQueryParameter(src_param, VX_PARAMETER_REF, &src, sizeof(src)); + if (src) + { + vx_uint32 width = 0, height = 0; + + vxQueryImage(src, VX_IMAGE_WIDTH, &width, sizeof(height)); + vxQueryImage(src, VX_IMAGE_HEIGHT, &height, sizeof(height)); + + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = width; + ptr->dim.image.height = height; + status = VX_SUCCESS; + vxReleaseImage(&src); + } + vxReleaseParameter(&src_param); + } + } + return status; +} + +vx_tiling_kernel_t threshold_kernel = +{ + "org.khronos.openvx.tiling_threshold", + VX_KERNEL_THRESHOLD_TILING, + NULL, + Threshold_image_tiling_flexible, + Threshold_image_tiling_fast, + 3, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_THRESHOLD, VX_PARAMETER_STATE_REQUIRED }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxThresholdInputValidator, + vxThresholdOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_warp.c b/sample/targets/tiling/vx_warp.c new file mode 100644 index 0000000..84056bc --- /dev/null +++ b/sample/targets/tiling/vx_warp.c @@ -0,0 +1,200 @@ +/* + + * Copyright (c) 2012-2017 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "vx_interface.h" +#include "vx_internal.h" + +#include + +static vx_status vxWarpInputValidator(vx_node node, vx_uint32 index, vx_size mat_columns) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 0) + { + vx_image input = 0; + vx_parameter param = vxGetParameterByIndex(node, index); + + vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input)); + if (input) + { + vx_df_image format = 0; + vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format)); + if (format == VX_DF_IMAGE_U8) + { + status = VX_SUCCESS; + } + vxReleaseImage(&input); + } + vxReleaseParameter(¶m); + } + else if (index == 1) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_matrix matrix; + vxQueryParameter(param, VX_PARAMETER_REF, &matrix, sizeof(matrix)); + if (matrix) + { + vx_enum data_type = 0; + vx_size rows = 0ul, columns = 0ul; + vxQueryMatrix(matrix, VX_MATRIX_TYPE, &data_type, sizeof(data_type)); + vxQueryMatrix(matrix, VX_MATRIX_ROWS, &rows, sizeof(rows)); + vxQueryMatrix(matrix, VX_MATRIX_COLUMNS, &columns, sizeof(columns)); + if ((data_type == VX_TYPE_FLOAT32) && (columns == mat_columns) && (rows == 3)) + { + status = VX_SUCCESS; + } + vxReleaseMatrix(&matrix); + } + vxReleaseParameter(¶m); + } + } + else if (index == 2) + { + vx_parameter param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)param) == VX_SUCCESS) + { + vx_scalar scalar = 0; + vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar)); + if (scalar) + { + vx_enum stype = 0; + vxQueryScalar(scalar, VX_SCALAR_TYPE, &stype, sizeof(stype)); + if (stype == VX_TYPE_ENUM) + { + vx_enum interp = 0; + vxCopyScalar(scalar, &interp, VX_READ_ONLY, VX_MEMORY_TYPE_HOST); + if ((interp == VX_INTERPOLATION_NEAREST_NEIGHBOR) || + (interp == VX_INTERPOLATION_BILINEAR)) + { + status = VX_SUCCESS; + } + else + { + status = VX_ERROR_INVALID_VALUE; + } + } + else + { + status = VX_ERROR_INVALID_TYPE; + } + vxReleaseScalar(&scalar); + } + vxReleaseParameter(¶m); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr) +{ + vx_status status = VX_ERROR_INVALID_PARAMETERS; + if (index == 3) + { + vx_parameter dst_param = vxGetParameterByIndex(node, index); + if (vxGetStatus((vx_reference)dst_param) == VX_SUCCESS) + { + vx_image dst = 0; + vxQueryParameter(dst_param, VX_PARAMETER_REF, &dst, sizeof(dst)); + if (dst) + { + vx_uint32 w1 = 0, h1 = 0; + vx_df_image f1 = VX_DF_IMAGE_VIRT; + + vxQueryImage(dst, VX_IMAGE_WIDTH, &w1, sizeof(w1)); + vxQueryImage(dst, VX_IMAGE_HEIGHT, &h1, sizeof(h1)); + vxQueryImage(dst, VX_IMAGE_FORMAT, &f1, sizeof(f1)); + /* output can not be virtual */ + if ((w1 != 0) && (h1 != 0) && (f1 == VX_DF_IMAGE_U8)) + { + /* fill in the meta data with the attributes so that the checker will pass */ + ptr->type = VX_TYPE_IMAGE; + ptr->dim.image.format = VX_DF_IMAGE_U8; + ptr->dim.image.width = w1; + ptr->dim.image.height = h1; + status = VX_SUCCESS; + } + vxReleaseImage(&dst); + } + 
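+            /* Note that the meta data is taken from the destination image
+             * itself rather than from the input: warp kernels may legally
+             * produce an output whose size differs from the source image. */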
vxReleaseParameter(&dst_param); + } + } + return status; +} + +static vx_status VX_CALLBACK vxWarpAffineInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 2); +} + +static vx_status VX_CALLBACK vxWarpPerspectiveInputValidator(vx_node node, vx_uint32 index) +{ + return vxWarpInputValidator(node, index, 3); +} + +static vx_param_description_t warp_kernel_params[] = { + {VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED}, +}; + +vx_tiling_kernel_t warp_affine_kernel = +{ + "org.khronos.openvx.tiling_warp_affine", + VX_KERNEL_WARP_AFFINE_TILING, + NULL, + WarpAffine_image_tiling_flexible, + WarpAffine_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxWarpAffineInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; + + +vx_tiling_kernel_t warp_perspective_kernel = +{ + "org.khronos.openvx.tiling_warp_perspective", + VX_KERNEL_WARP_PERSPECTIVE_TILING, + NULL, + WarpPerspective_image_tiling_flexible, + WarpPerspective_image_tiling_fast, + 4, + { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_MATRIX, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } }, + NULL, + vxWarpPerspectiveInputValidator, + vxWarpOutputValidator, + NULL, + NULL, + { 16, 16 }, + { -1, 1, -1, 1 }, + { VX_BORDER_MODE_UNDEFINED, 0 }, +}; diff --git a/sample/targets/tiling/vx_weighted_average.c b/sample/targets/tiling/vx_weighted_average.c new file mode 100644 index 0000000..87760f1 --- /dev/null +++ b/sample/targets/tiling/vx_weighted_average.c @@ -0,0 +1,163 @@ +/* + +* Copyright (c) 2012-2017 The Khronos Group Inc. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/
+
+#include "vx_interface.h"
+#include "vx_internal.h"
+#include
+
+static vx_status VX_CALLBACK vxWeightedAverageInputValidator(vx_node node, vx_uint32 index)
+{
+    vx_status status = VX_ERROR_INVALID_PARAMETERS;
+    if (index == 0)
+    {
+        vx_image input = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+
+        vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input));
+        if (input)
+        {
+            vx_df_image format = 0;
+            vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format));
+            if (format == VX_DF_IMAGE_U8)
+            {
+                status = VX_SUCCESS;
+            }
+            vxReleaseImage(&input);
+        }
+        vxReleaseParameter(&param);
+    }
+    else if (index == 1)
+    {
+        vx_scalar scalar = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+        if (vxGetStatus((vx_reference)param) == VX_SUCCESS)
+        {
+            vxQueryParameter(param, VX_PARAMETER_REF, &scalar, sizeof(scalar));
+            if (scalar)
+            {
+                vx_enum type = -1;
+                vxQueryScalar(scalar, VX_SCALAR_TYPE, &type, sizeof(type));
+                if (type == VX_TYPE_FLOAT32)
+                {
+                    vx_float32 scale = 0.0f;
+                    if ((vxCopyScalar(scalar, &scale, VX_READ_ONLY, VX_MEMORY_TYPE_HOST) == VX_SUCCESS) &&
+                        (scale >= 0) && (scale <= 1.0))
+                    {
+                        status = VX_SUCCESS;
+                    }
+                    else
+                    {
+                        status = VX_ERROR_INVALID_VALUE;
+                    }
+                }
+                else
+                {
+                    status = VX_ERROR_INVALID_TYPE;
+                }
+                vxReleaseScalar(&scalar);
+            }
+            vxReleaseParameter(&param);
+        }
+    }
+    else if (index == 2)
+    {
+        vx_image input = 0;
+        vx_parameter param = vxGetParameterByIndex(node, index);
+        vxQueryParameter(param, VX_PARAMETER_REF, &input, sizeof(input));
+        if (input)
+        {
+            vx_df_image format = 0;
+            vxQueryImage(input, VX_IMAGE_FORMAT, &format, sizeof(format));
+            if (format == VX_DF_IMAGE_U8)
+            {
+                status = VX_SUCCESS;
+            }
+            vxReleaseImage(&input);
+        }
+        vxReleaseParameter(&param);
+    }
+    return status;
+}
+
+static vx_status VX_CALLBACK vxWeightedAverageOutputValidator(vx_node node, vx_uint32 index, vx_meta_format_t *ptr)
+{
+    vx_status status = VX_ERROR_INVALID_PARAMETERS;
+    if (index == 3)
+    {
+        vx_parameter param[] = {
+            vxGetParameterByIndex(node, 0),
+            vxGetParameterByIndex(node, 2),
+            vxGetParameterByIndex(node, index),
+        };
+        if ((vxGetStatus((vx_reference)param[0]) == VX_SUCCESS) &&
+            (vxGetStatus((vx_reference)param[1]) == VX_SUCCESS) &&
+            (vxGetStatus((vx_reference)param[2]) == VX_SUCCESS))
+        {
+            vx_image images[3];
+            vxQueryParameter(param[0], VX_PARAMETER_REF, &images[0], sizeof(images[0]));
+            vxQueryParameter(param[1], VX_PARAMETER_REF, &images[1], sizeof(images[1]));
+            vxQueryParameter(param[2], VX_PARAMETER_REF, &images[2], sizeof(images[2]));
+            if (images[0] && images[1] && images[2])
+            {
+                vx_uint32 width = 0, height = 0;
+                vx_df_image informat[2] = { VX_DF_IMAGE_VIRT, VX_DF_IMAGE_VIRT };
+                vx_df_image outformat = VX_DF_IMAGE_VIRT;
+                vxQueryImage(images[0], VX_IMAGE_WIDTH, &width, sizeof(width));
+                vxQueryImage(images[0], VX_IMAGE_HEIGHT, &height, sizeof(height));
+                vxQueryImage(images[0], VX_IMAGE_FORMAT, &informat[0], sizeof(informat[0]));
+                vxQueryImage(images[1], VX_IMAGE_FORMAT, &informat[1], sizeof(informat[1]));
+                vxQueryImage(images[2], VX_IMAGE_FORMAT, &outformat, sizeof(outformat));
+                if (informat[0] == VX_DF_IMAGE_U8 && informat[1] == VX_DF_IMAGE_U8 && outformat == VX_DF_IMAGE_U8)
+                {
+                    status = VX_SUCCESS;
+                }
+                ptr->type = VX_TYPE_IMAGE;
+                ptr->dim.image.format = outformat;
+                ptr->dim.image.width = width;
+                ptr->dim.image.height = height;
+                vxReleaseImage(&images[0]);
+                vxReleaseImage(&images[1]);
+                vxReleaseImage(&images[2]);
+            }
+            vxReleaseParameter(&param[0]);
+            vxReleaseParameter(&param[1]);
+            vxReleaseParameter(&param[2]);
+        }
+    }
+    return status;
+}
+
+vx_tiling_kernel_t weightedaverage_kernel =
+{
+    "org.khronos.openvx.tiling_weightedaverage",
+    VX_KERNEL_WEIGHTED_AVERAGE_TILING,
+    NULL,
+    WeightedAverage_image_tiling_flexible,
+    WeightedAverage_image_tiling_fast,
+    4,
+    { { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },
+      { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED },
+      { VX_INPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED },
+      { VX_OUTPUT, VX_TYPE_IMAGE, VX_PARAMETER_STATE_REQUIRED } },
+    NULL,
+    vxWeightedAverageInputValidator,
+    vxWeightedAverageOutputValidator,
+    NULL,
+    NULL,
+    { 16, 16 },
+    { -1, 1, -1, 1 },
+    { VX_BORDER_MODE_UNDEFINED, 0 },
+};
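
For context, a minimal usage sketch (not part of the patch) of how an application might invoke the tiling warp-affine kernel once the target has published it. The kernel name string and the 2-column / 3-row VX_TYPE_FLOAT32 matrix requirement come from warp_affine_kernel and vxWarpAffineInputValidator above; the image sizes, matrix coefficients, and the main() wrapper are placeholder assumptions.

#include <VX/vx.h>

int main(void)
{
    vx_context context = vxCreateContext();
    vx_graph graph = vxCreateGraph(context);

    /* Placeholder U8 images; the validators only require VX_DF_IMAGE_U8. */
    vx_image src = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);
    vx_image dst = vxCreateImage(context, 640, 480, VX_DF_IMAGE_U8);

    /* Affine validator expects VX_TYPE_FLOAT32, 2 columns, 3 rows. */
    vx_matrix mat = vxCreateMatrix(context, VX_TYPE_FLOAT32, 2, 3);
    vx_float32 coeffs[3][2] = { {1.0f, 0.0f}, {0.0f, 1.0f}, {10.0f, 5.0f} };
    vxCopyMatrix(mat, coeffs, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);

    /* Look up the kernel by the name registered in warp_affine_kernel. */
    vx_kernel kernel = vxGetKernelByName(context, "org.khronos.openvx.tiling_warp_affine");
    vx_node node = vxCreateGenericNode(graph, kernel);
    vxSetParameterByIndex(node, 0, (vx_reference)src);
    vxSetParameterByIndex(node, 1, (vx_reference)mat);
    /* Index 2 (interpolation scalar) is optional and left unset here. */
    vxSetParameterByIndex(node, 3, (vx_reference)dst);

    if (vxVerifyGraph(graph) == VX_SUCCESS)
        vxProcessGraph(graph);

    vxReleaseNode(&node);
    vxReleaseKernel(&kernel);
    vxReleaseMatrix(&mat);
    vxReleaseImage(&src);
    vxReleaseImage(&dst);
    vxReleaseGraph(&graph);
    vxReleaseContext(&context);
    return 0;
}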