From 7f1c190c1a419d9eff946018638dfdc57b207799 Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 13 Jun 2017 16:31:42 +0800 Subject: Use aligned16 and aligne4 kernel to copy for large 3D image with TILE_Y. It is similar with 2D image for avoiding extended image width truncated. Signed-off-by: Yan Wang Reviewed-by: Yang Rong --- src/CMakeLists.txt | 2 + src/cl_context.h | 60 ++++++++++++---------- src/cl_mem.c | 50 ++++++++++++++---- .../cl_internal_copy_buffer_to_image_3d_align16.cl | 18 +++++++ .../cl_internal_copy_buffer_to_image_3d_align4.cl | 18 +++++++ .../cl_internal_copy_image_3d_to_buffer_align16.cl | 19 +++++++ .../cl_internal_copy_image_3d_to_buffer_align4.cl | 19 +++++++ 7 files changed, 149 insertions(+), 37 deletions(-) create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87ad48bb..ecb98b96 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -54,6 +54,8 @@ cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d cl_internal_copy_buffer_to_image_2d_align4 cl_internal_copy_image_2d_to_buffer_align4 +cl_internal_copy_buffer_to_image_3d_align4 cl_internal_copy_image_3d_to_buffer_align4 +cl_internal_copy_buffer_to_image_3d_align16 cl_internal_copy_image_3d_to_buffer_align16 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign cl_internal_fill_buf_align128 cl_internal_fill_image_1d diff --git a/src/cl_context.h b/src/cl_context.h index 75bf8952..3a2e13be 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -49,38 +49,42 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, CL_ENQUEUE_COPY_BUFFER_RECT, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, - CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d - CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d - CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d - CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, //copy image 2d to image 3d - CL_ENQUEUE_COPY_IMAGE_3D_TO_3D, //copy image 3d to image 3d - CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY, //copy image 2d to image 2d array - CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY, //copy image 1d array to image 1d array - CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY, //copy image 2d array to image 2d array - CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D, //copy image 2d array to image 2d - CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D, //copy image 2d array to image 3d - CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY, //copy image 3d to image 2d array - CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, //copy image 2d to buffer + CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, // copy image 1d to image 1d + CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, // copy image 2d to image 2d + CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, // copy image 3d to image 2d + CL_ENQUEUE_COPY_IMAGE_2D_TO_3D, // copy image 2d to image 3d + CL_ENQUEUE_COPY_IMAGE_3D_TO_3D, // copy image 3d to image 3d + CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY, // copy image 2d to image 2d array + CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY, // copy image 1d array to image 1d array + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY, // copy image 2d array to image 2d array + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D, // copy image 2d array to image 2d + CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D, // copy image 2d array to image 3d + CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY, // copy image 3d to image 2d array + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, // copy image 2d to buffer CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4, - CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d + CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, // copy image 3d tobuffer + CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16, + CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN4, + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, // copy buffer to image 2d CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4, - CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d - CL_ENQUEUE_FILL_BUFFER_UNALIGN, //fill buffer with 1 aligne pattern, pattern size=1 - CL_ENQUEUE_FILL_BUFFER_ALIGN2, //fill buffer with 2 aligne pattern, pattern size=2 - CL_ENQUEUE_FILL_BUFFER_ALIGN4, //fill buffer with 4 aligne pattern, pattern size=4 - CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, //fill buffer with 8 aligne pattern, pattern size=8 - CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, //fill buffer with 16 aligne pattern, pattern size=16 - CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, //fill buffer with 16 aligne pattern, pattern size=32 - CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, //fill buffer with 16 aligne pattern, pattern size=64 - CL_ENQUEUE_FILL_BUFFER_ALIGN128, //fill buffer with 128 aligne pattern, pattern size=128 - CL_ENQUEUE_FILL_IMAGE_1D, //fill image 1d - CL_ENQUEUE_FILL_IMAGE_1D_ARRAY, //fill image 1d array - CL_ENQUEUE_FILL_IMAGE_2D, //fill image 2d - CL_ENQUEUE_FILL_IMAGE_2D_ARRAY, //fill image 2d array - CL_ENQUEUE_FILL_IMAGE_3D, //fill image 3d + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, // copy buffer to image 3d + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN16, + CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN4, + CL_ENQUEUE_FILL_BUFFER_UNALIGN, // fill buffer with 1 aligne pattern, pattern size=1 + CL_ENQUEUE_FILL_BUFFER_ALIGN2, // fill buffer with 2 aligne pattern, pattern size=2 + CL_ENQUEUE_FILL_BUFFER_ALIGN4, // fill buffer with 4 aligne pattern, pattern size=4 + CL_ENQUEUE_FILL_BUFFER_ALIGN8_8, // fill buffer with 8 aligne pattern, pattern size=8 + CL_ENQUEUE_FILL_BUFFER_ALIGN8_16, // fill buffer with 16 aligne pattern, pattern size=16 + CL_ENQUEUE_FILL_BUFFER_ALIGN8_32, // fill buffer with 16 aligne pattern, pattern size=32 + CL_ENQUEUE_FILL_BUFFER_ALIGN8_64, // fill buffer with 16 aligne pattern, pattern size=64 + CL_ENQUEUE_FILL_BUFFER_ALIGN128, // fill buffer with 128 aligne pattern, pattern size=128 + CL_ENQUEUE_FILL_IMAGE_1D, // fill image 1d + CL_ENQUEUE_FILL_IMAGE_1D_ARRAY, // fill image 1d array + CL_ENQUEUE_FILL_IMAGE_2D, // fill image 2d + CL_ENQUEUE_FILL_IMAGE_2D_ARRAY, // fill image 2d array + CL_ENQUEUE_FILL_IMAGE_3D, // fill image 3d CL_INTERNAL_KERNEL_MAX }; diff --git a/src/cl_mem.c b/src/cl_mem.c index b6dce3f3..ad92234b 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -2162,14 +2162,13 @@ get_align_size_for_copy_kernel(struct _cl_mem_image* image, const size_t origin0 const size_t offset, cl_image_format *fmt) { size_t align_size = 0; - if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % ALIGN16 == 0) && - ((origin0 * image->bpp) % ALIGN16 == 0) && (region0 % ALIGN16 == 0) && (offset % ALIGN16 == 0)){ + if (((image->w * image->bpp) % ALIGN16 == 0) && ((origin0 * image->bpp) % ALIGN16 == 0) && (region0 % ALIGN16 == 0) && + (offset % ALIGN16 == 0)) { fmt->image_channel_order = CL_RGBA; fmt->image_channel_data_type = CL_UNSIGNED_INT32; align_size = ALIGN16; - } - else if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % ALIGN4 == 0) && - ((origin0 * image->bpp) % ALIGN4 == 0) && (region0 % ALIGN4 == 0) && (offset % ALIGN4 == 0)){ + } else if (((image->w * image->bpp) % ALIGN4 == 0) && ((origin0 * image->bpp) % ALIGN4 == 0) && + (region0 % ALIGN4 == 0) && (offset % ALIGN4 == 0)) { fmt->image_channel_order = CL_R; fmt->image_channel_data_type = CL_UNSIGNED_INT32; align_size = ALIGN4; @@ -2247,11 +2246,28 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_m cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL); } }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { - extern char cl_internal_copy_image_3d_to_buffer_str[]; - extern size_t cl_internal_copy_image_3d_to_buffer_str_size; + if (align_size == ALIGN16) { + extern char cl_internal_copy_image_3d_to_buffer_align16_str[]; + extern size_t cl_internal_copy_image_3d_to_buffer_align16_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16, + cl_internal_copy_image_3d_to_buffer_align16_str, + (size_t)cl_internal_copy_image_3d_to_buffer_align16_str_size, NULL); + } else if (align_size == ALIGN4) { + extern char cl_internal_copy_image_3d_to_buffer_align4_str[]; + extern size_t cl_internal_copy_image_3d_to_buffer_align4_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN4, + cl_internal_copy_image_3d_to_buffer_align4_str, + (size_t)cl_internal_copy_image_3d_to_buffer_align4_str_size, NULL); + } else { + extern char cl_internal_copy_image_3d_to_buffer_str[]; + extern size_t cl_internal_copy_image_3d_to_buffer_str_size; - ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, - cl_internal_copy_image_3d_to_buffer_str, (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL); + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, + cl_internal_copy_image_3d_to_buffer_str, + (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL); + } } if (!ker) { @@ -2347,11 +2363,27 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffe cl_internal_copy_buffer_to_image_2d_str, (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL); } }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { + if (align_size == ALIGN16) { + extern char cl_internal_copy_buffer_to_image_3d_align16_str[]; + extern size_t cl_internal_copy_buffer_to_image_3d_align16_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN16, + cl_internal_copy_buffer_to_image_3d_align16_str, + (size_t)cl_internal_copy_buffer_to_image_3d_align16_str_size, NULL); + } else if (align_size == ALIGN4) { + extern char cl_internal_copy_buffer_to_image_3d_align4_str[]; + extern size_t cl_internal_copy_buffer_to_image_3d_align4_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN4, + cl_internal_copy_buffer_to_image_3d_align4_str, + (size_t)cl_internal_copy_buffer_to_image_3d_align4_str_size, NULL); + } else { extern char cl_internal_copy_buffer_to_image_3d_str[]; extern size_t cl_internal_copy_buffer_to_image_3d_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, cl_internal_copy_buffer_to_image_3d_str, (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL); + } } if (!ker) return CL_OUT_OF_RESOURCES; diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl new file mode 100644 index 00000000..b57b4878 --- /dev/null +++ b/src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl @@ -0,0 +1,18 @@ +kernel void __cl_copy_buffer_to_image_3d_align16(__write_only image3d_t image, global uint4 *buffer, + unsigned int region0, unsigned int region1, unsigned int region2, + unsigned int dst_origin0, unsigned int dst_origin1, + unsigned int dst_origin2, unsigned int src_offset) { + int i = get_global_id(0); + int j = get_global_id(1); + int k = get_global_id(2); + uint4 color = (uint4)(0); + int4 dst_coord; + if ((i >= region0) || (j >= region1) || (k >= region2)) + return; + dst_coord.x = dst_origin0 + i; + dst_coord.y = dst_origin1 + j; + dst_coord.z = dst_origin2 + k; + src_offset += (k * region1 + j) * region0 + i; + color = buffer[src_offset]; + write_imageui(image, dst_coord, color); +} diff --git a/src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl b/src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl new file mode 100644 index 00000000..717af979 --- /dev/null +++ b/src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl @@ -0,0 +1,18 @@ +kernel void __cl_copy_buffer_to_image_3d_align4(__write_only image3d_t image, global uint *buffer, unsigned int region0, + unsigned int region1, unsigned int region2, unsigned int dst_origin0, + unsigned int dst_origin1, unsigned int dst_origin2, + unsigned int src_offset) { + int i = get_global_id(0); + int j = get_global_id(1); + int k = get_global_id(2); + uint4 color = (uint4)(0); + int4 dst_coord; + if ((i >= region0) || (j >= region1) || (k >= region2)) + return; + dst_coord.x = dst_origin0 + i; + dst_coord.y = dst_origin1 + j; + dst_coord.z = dst_origin2 + k; + src_offset += (k * region1 + j) * region0 + i; + color.x = buffer[src_offset]; + write_imageui(image, dst_coord, color); +} diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl new file mode 100644 index 00000000..a7a3c2e3 --- /dev/null +++ b/src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl @@ -0,0 +1,19 @@ +kernel void __cl_copy_image_3d_to_buffer_align16(__read_only image3d_t image, global uint4 *buffer, + unsigned int region0, unsigned int region1, unsigned int region2, + unsigned int src_origin0, unsigned int src_origin1, + unsigned int src_origin2, unsigned int dst_offset) { + int i = get_global_id(0); + int j = get_global_id(1); + int k = get_global_id(2); + uint4 color; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + int4 src_coord; + if ((i >= region0) || (j >= region1) || (k >= region2)) + return; + src_coord.x = src_origin0 + i; + src_coord.y = src_origin1 + j; + src_coord.z = src_origin2 + k; + color = read_imageui(image, sampler, src_coord); + dst_offset += (k * region1 + j) * region0 + i; + *(buffer + dst_offset) = color; +} diff --git a/src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl b/src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl new file mode 100644 index 00000000..bb001afd --- /dev/null +++ b/src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl @@ -0,0 +1,19 @@ +kernel void __cl_copy_image_3d_to_buffer_align4(__read_only image3d_t image, global uint *buffer, unsigned int region0, + unsigned int region1, unsigned int region2, unsigned int src_origin0, + unsigned int src_origin1, unsigned int src_origin2, + unsigned int dst_offset) { + int i = get_global_id(0); + int j = get_global_id(1); + int k = get_global_id(2); + uint4 color; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + int4 src_coord; + if ((i >= region0) || (j >= region1) || (k >= region2)) + return; + src_coord.x = src_origin0 + i; + src_coord.y = src_origin1 + j; + src_coord.z = src_origin2 + k; + color = read_imageui(image, sampler, src_coord); + dst_offset += (k * region1 + j) * region0 + i; + buffer[dst_offset] = color.x; +} -- cgit v1.2.1