/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Rong Yang
 */
#include "cl_device_enqueue.h"
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_context.h"
#include "cl_program.h"
#include "cl_alloc.h"
#include "cl_kernel.h"
#include "cl_command_queue.h"
#include "cl_event.h"

/* Pin every global/constant buffer argument of |ker| at its CPU mapping
 * address (softpin) so that pointers captured inside device-enqueued blocks
 * remain valid, and record each pointer in ker->device_enqueue_infos so the
 * child kernels can receive them via exec info.
 * Returns 0 on success. */
LOCAL cl_int
cl_device_enqueue_fix_offset(cl_kernel ker)
{
  uint32_t i;
  void *ptr;
  cl_mem mem;
  enum gbe_arg_type arg_type; /* kind of argument */

  for (i = 0; i < ker->arg_n; ++i) {
    arg_type = interp_kernel_get_arg_type(ker->opaque, i);
    /* TODO(review): image arguments are not handled here — confirm whether
     * device enqueue with image captures is supported. */
    if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) ||
        !ker->args[i].mem)
      continue;

    if (!ker->args[i].is_svm) {
      /* Regular buffer: map it once, then softpin the BO at the mapped CPU
       * address so GPU and CPU views share the same pointer value. The BO
       * must leave the reuse cache since its GPU address is now fixed. */
      mem = ker->args[i].mem;
      ptr = cl_mem_map(mem, 0);
      cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
      cl_buffer_set_bo_use_full_range(mem->bo, 1);
      cl_buffer_disable_reuse(mem->bo);
      mem->host_ptr = ptr;
      cl_mem_unmap(mem);
      ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ptr;
    } else {
      /* SVM buffer: already has a stable shared address. */
      ker->device_enqueue_infos[ker->device_enqueue_info_n++] =
          ker->args[i].mem->host_ptr;
    }
  }
  return 0;
}

/* If |ker| uses device-side enqueue (its curbe has an ENQUEUE_BUF_POINTER
 * slot), lazily allocate the 32MB SVM ring buffer the device writes enqueue
 * records into, fix up buffer-argument pointers, and bind the ring buffer at
 * binding-table index |*max_bti| for this launch. Returns 0 on success. */
LOCAL cl_int
cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker,
                              uint32_t *max_bti, cl_gpgpu_kernel *kernel)
{
  int32_t value = GBE_CURBE_ENQUEUE_BUF_POINTER;
  int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
  size_t buf_size = 32 * 1024 * 1024; /* fixed 32M enqueue buffer */
  cl_mem mem;

  if (offset > 0) {
    if (ker->useDeviceEnqueue == false) {
      /* First launch of this kernel with device enqueue: set up state once. */
      if (ker->device_enqueue_ptr == NULL)
        ker->device_enqueue_ptr =
            cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
      if (ker->device_enqueue_infos == NULL)
        ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
      ker->device_enqueue_info_n = 0;
      ker->useDeviceEnqueue = CL_TRUE;
      cl_device_enqueue_fix_offset(ker);
      /* Keep the kernel alive until parse_result releases it. */
      cl_kernel_add_ref(ker);
    }

    mem = cl_context_get_svm_from_ptr(ker->program->ctx,
                                      ker->device_enqueue_ptr);
    assert(mem);
    cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
    /* Remember which kernel ran so parse_result can find its enqueue state. */
    cl_gpgpu_set_kernel(gpgpu, ker);
  }
  return 0;
}

/* Header of one enqueue record written by the device: launch type plus the
 * three ND-range size/offset triples. */
typedef struct ndrange_info_t {
  int type;
  int global_work_size[3];
  int local_work_size[3];
  int global_work_offset[3];
} ndrange_info_t;

/* Clang Blocks ABI layout of the block literal the device copied into the
 * enqueue buffer (see the Blocks ABI specification). */
typedef struct Block_literal {
  void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
  int flags;
  int reserved;
  int index;
  struct Block_descriptor_1 {
    unsigned long int slm_size; // NULL
    unsigned long int size;     // sizeof(struct Block_literal_1)
    /* optional helper functions */
    void *copy_helper;    // IFF (1<<25)
    void *dispose_helper; // IFF (1<<25)
    /* required ABI.2010.3.16 */
    const char *signature; // IFF (1<<30)
  } * descriptor;
  /* imported variables follow */
} Block_literal;

/* After a parent kernel finishes, walk the SVM enqueue buffer it filled,
 * and for each record launch the corresponding child kernel on |queue| with
 * the recorded ND-range, block literal (arg 0), per-pointer SLM sizes and
 * the captured-pointer exec info. Releases the reference taken on the parent
 * kernel in bind_buffer. Returns 0 on success, -1 if the SVM buffer is gone. */
LOCAL cl_int
cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
{
  cl_mem mem;
  int size, type, dim, i;
  const char *kernel_name;
  cl_kernel child_ker;
  cl_event evt = NULL;

  cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
  if (ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
    return 0;

  void *buf = cl_gpgpu_ref_batch_buf(gpgpu);
  //wait the gpgpu's batch buf finish, the gpgpu in queue may be not
  //same as the param gpgpu, for example when flush event.
  cl_gpgpu_sync(buf);
  cl_gpgpu_unref_batch_buf(buf);

  mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
  if (mem == NULL)
    return -1;

  char *ptr = (char *)cl_mem_map(mem, 0);
  /* First int is the total byte count of records that follow. */
  size = *(int *)ptr;
  ptr += 4;

  while (size > 0) {
    size_t fixed_global_off[] = {0, 0, 0};
    size_t fixed_global_sz[] = {1, 1, 1};
    size_t fixed_local_sz[] = {1, 1, 1};

    ndrange_info_t *ndrange_info = (ndrange_info_t *)ptr;
    size -= sizeof(ndrange_info_t);
    ptr += sizeof(ndrange_info_t);

    Block_literal *block = (Block_literal *)ptr;
    size -= block->descriptor->size;
    ptr += block->descriptor->size;

    /* type encodes dimensionality (high nibble) and which of the three
     * ND-range triples are valid (low nibble). */
    type = ndrange_info->type;
    dim = (type & 0xf0) >> 4;
    type = type & 0xf;
    assert(dim <= 2);
    for (i = 0; i <= dim; i++) {
      fixed_global_sz[i] = ndrange_info->global_work_size[i];
      if (type > 1)
        fixed_local_sz[i] = ndrange_info->local_work_size[i];
      if (type > 2)
        fixed_global_off[i] = ndrange_info->global_work_offset[i];
    }

    /* Trailing array of per-pointer local-memory sizes for the child. */
    int *slm_sizes = (int *)ptr;
    int slm_size = block->descriptor->slm_size;
    size -= slm_size;
    ptr += slm_size;

    kernel_name = interp_program_get_device_enqueue_kernel_name(
        ker->program->opaque, block->index);
    child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
    assert(child_ker);

    /* Arg 0 is always the block literal itself (captures live inside it). */
    cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
    /* Then one local-memory argument per recorded SLM size.
     * (Repaired: the original loop condition and body were lost to
     * angle-bracket stripping; cast avoids a signed/unsigned compare.) */
    int index = 1;
    for (i = 0; i < (int)(slm_size / sizeof(int)); i++, index++) {
      cl_kernel_set_arg(child_ker, index, slm_sizes[i], NULL);
    }
    /* Hand the child the parent's captured buffer pointers. */
    cl_kernel_set_exec_info(child_ker,
                            ker->device_enqueue_info_n * sizeof(void *),
                            ker->device_enqueue_infos);

    if (evt != NULL) {
      clReleaseEvent(evt);
      evt = NULL;
    }
    clEnqueueNDRangeKernel(queue, child_ker, dim + 1, fixed_global_off,
                           fixed_global_sz, fixed_local_sz, 0, NULL, &evt);
    cl_command_queue_flush_gpgpu(gpgpu);
    cl_kernel_delete(child_ker);
  }

  if (evt != NULL) {
    //Can't call clWaitForEvents here, it may cause dead lock.
    //If evt->exec_data.gpgpu is NULL, evt has finished.
    if (evt->exec_data.gpgpu) {
      buf = cl_gpgpu_ref_batch_buf(evt->exec_data.gpgpu);
      //wait the gpgpu's batch buf finish, the gpgpu in queue may be not
      //same as the param gpgpu, for example when flush event.
      cl_gpgpu_sync(buf);
      cl_gpgpu_unref_batch_buf(buf);
    }
    clReleaseEvent(evt);
    evt = NULL;
  }

  cl_mem_unmap_auto(mem);
  /* Drop the reference taken in cl_device_enqueue_bind_buffer. */
  cl_kernel_delete(ker);
  return 0;
}