From 9f6001fed00bb409cd416b86b448ddbb2cb4793c Mon Sep 17 00:00:00 2001 From: Yan Wang Date: Tue, 16 May 2017 19:03:33 +0800 Subject: Still create image with TILE_Y mode when image size > 128MB, for performance. It may fail to copy data from host ptr to a large TILE_Y image. So use clEnqueueCopyBufferToImage to do this on the GPU side. Signed-off-by: Yan Wang Reviewed-by: Yang Rong --- src/cl_context.c | 6 ++++ src/cl_context.h | 2 +- src/cl_mem.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- src/cl_mem.h | 2 ++ 4 files changed, 111 insertions(+), 6 deletions(-) diff --git a/src/cl_context.c b/src/cl_context.c index 1ba23024..4b8281c1 100644 --- a/src/cl_context.c +++ b/src/cl_context.c @@ -342,6 +342,7 @@ cl_context_new(struct _cl_context_prop *props, cl_uint dev_num, cl_device_id* al TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props)); ctx->props = *props; ctx->ver = cl_driver_get_ver(ctx->drv); + ctx->image_queue = NULL; exit: return ctx; @@ -362,6 +363,11 @@ cl_context_delete(cl_context ctx) if (CL_OBJECT_DEC_REF(ctx) > 1) return; + if (ctx->image_queue) { + clReleaseCommandQueue(ctx->image_queue); + ctx->image_queue = NULL; + } + /* delete the internal programs. 
*/ for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) { if (ctx->internal_kernels[i]) { diff --git a/src/cl_context.h b/src/cl_context.h index 4812afd1..8ba499f7 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -129,7 +129,7 @@ struct _cl_context { void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *); /* User's callback when error occur in context */ void *user_data; /* A pointer to user supplied data */ - + cl_command_queue image_queue; /* A internal command queue for image data copying */ }; #define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL diff --git a/src/cl_mem.c b/src/cl_mem.c index 4a7bec82..0c49c3d7 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -153,6 +153,8 @@ cl_mem_allocate(enum cl_mem_type type, if (mem->type == CL_MEM_IMAGE_TYPE) { cl_mem_image(mem)->is_image_from_buffer = 0; cl_mem_image(mem)->is_image_from_nv12_image = 0; + cl_mem_image(mem)->is_ker_copy = 0; + cl_mem_image(mem)->tmp_ker_buf = NULL; } if (sz != 0) { @@ -750,6 +752,80 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv) return tiling; } +static cl_mem +_cl_new_image_copy_from_host_ptr(cl_context ctx, + cl_mem_flags flags, + const cl_image_format *fmt, + const cl_mem_object_type image_type, + size_t w, + size_t h, + size_t depth, + size_t pitch, + size_t slice_pitch, + size_t sz, + size_t aligned_pitch, + uint32_t intel_fmt, + uint32_t bpp, + cl_image_tiling_t tiling, + void *data, //pointer from application + cl_int *errcode_ret) +{ + cl_int err = CL_SUCCESS; + cl_mem mem = NULL; + size_t origin[3] = {0, 0, 0}; + size_t region[3] = {w, h, depth}; + size_t aligned_slice_pitch = 0; + + if (ctx->image_queue == NULL) { + ctx->image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, &err); + if (err != CL_SUCCESS || !ctx->image_queue) { + *errcode_ret = err; + ctx->image_queue = NULL; + return NULL; + } + } + + // Map host ptr to OCL buffer + cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, &err); + if 
(err != CL_SUCCESS) { + *errcode_ret = err; + return NULL; + } + + mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err); + if (mem == NULL || err != CL_SUCCESS) { + clReleaseMemObject(buf); + return NULL; + } + + cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch); + + if (image_type == CL_MEM_OBJECT_IMAGE2D) + aligned_slice_pitch = 0; + else + //SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need CL_NO_TILE's aligned_h to calc. + aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2)); + + cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt, + intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling, + 0, 0, 0); + + err = clEnqueueCopyBufferToImage(ctx->image_queue, buf, mem, 0, origin, region, 0, NULL, NULL); + if(err != CL_SUCCESS) { + clReleaseMemObject(buf); + clReleaseMemObject(mem); + return NULL; + } + + clReleaseMemObject(buf); + if (flags & CL_MEM_USE_HOST_PTR && data) { + mem->host_ptr = data; + cl_mem_image(mem)->host_row_pitch = pitch; + cl_mem_image(mem)->host_slice_pitch = slice_pitch; + } + return mem; +} + static cl_mem _cl_mem_new_image(cl_context ctx, cl_mem_flags flags, @@ -765,6 +841,7 @@ _cl_mem_new_image(cl_context ctx, cl_int *errcode_ret) { cl_int err = CL_SUCCESS; + cl_bool is_ker_copy = 0; cl_mem mem = NULL; cl_mem_object_type image_type = orig_image_type; uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT; @@ -931,11 +1008,25 @@ _cl_mem_new_image(cl_context ctx, /* If sz is large than 128MB, map gtt may fail in some system. Because there is no obviours performance drop, disable tiling. 
*/ - if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) { - tiling = CL_NO_TILE; - aligned_pitch = w * bpp; - aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)); - sz = aligned_pitch * aligned_h * depth; + if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) { + if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE3D) && + buffer == NULL) { + if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) { + mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, h, depth, pitch, + slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, &err); + if (mem != NULL) { + cl_mem_image(mem)->is_ker_copy = 1; + goto exit; + } else + goto error; + } else + is_ker_copy = 1; + } else { + tiling = CL_NO_TILE; + aligned_pitch = w * bpp; + aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)); + sz = aligned_pitch * aligned_h * depth; + } } if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) { @@ -992,6 +1083,8 @@ _cl_mem_new_image(cl_context ctx, cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data); } + cl_mem_image(mem)->is_ker_copy = is_ker_copy; + exit: if (errcode_ret) *errcode_ret = err; @@ -1389,6 +1482,10 @@ cl_mem_delete(cl_mem mem) mem->bo = NULL; } } + if (cl_mem_image(mem)->tmp_ker_buf) { + cl_mem_delete(cl_mem_image(mem)->tmp_ker_buf); + cl_mem_image(mem)->tmp_ker_buf = NULL; + } } /* Someone still mapped, unmap */ diff --git a/src/cl_mem.h b/src/cl_mem.h index edfd0436..0b33c317 100644 --- a/src/cl_mem.h +++ b/src/cl_mem.h @@ -145,6 +145,8 @@ struct _cl_mem_image { uint8_t is_image_from_buffer; /* IMAGE from Buffer*/ cl_mem nv12_image; /* if the image is created from nv12 Image, it point to the image.*/ uint8_t is_image_from_nv12_image; /* IMAGE from NV12 Image*/ + cl_bool is_ker_copy; /* this object is copied by OCL kernel */ + cl_mem tmp_ker_buf; /* this object is tmp buffer for OCL kernel copying */ }; struct _cl_mem_gl_image { -- cgit v1.2.1