diff options
Diffstat (limited to 'liboffloadmic/runtime/offload_engine.cpp')
-rw-r--r-- | liboffloadmic/runtime/offload_engine.cpp | 428 |
1 files changed, 387 insertions, 41 deletions
diff --git a/liboffloadmic/runtime/offload_engine.cpp b/liboffloadmic/runtime/offload_engine.cpp index 2fe0d24430c..16b440d7ab6 100644 --- a/liboffloadmic/runtime/offload_engine.cpp +++ b/liboffloadmic/runtime/offload_engine.cpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 Intel Corporation. All Rights Reserved. + Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -37,6 +37,14 @@ #include "offload_host.h" #include "offload_table.h" +#include "offload_iterator.h" + +// Static members of Stream class must be described somewhere. +// This members describe the list of all streams defined in programm +// via call to _Offload_stream_create. +uint64_t Stream::m_streams_count = 0; +StreamMap Stream::all_streams; +mutex_t Stream::m_stream_lock; const char* Engine::m_func_names[Engine::c_funcs_total] = { @@ -47,7 +55,8 @@ const char* Engine::m_func_names[Engine::c_funcs_total] = #endif // MYO_SUPPORT "server_init", "server_var_table_size", - "server_var_table_copy" + "server_var_table_copy", + "server_set_stream_affinity" }; // Symbolic representation of system signals. Fix for CQ233593 @@ -115,6 +124,7 @@ void Engine::init_process(void) COIENGINE engine; COIRESULT res; const char **environ; + char buf[4096]; // For exe path name // create environment for the target process environ = (const char**) mic_env_vars.create_environ_for_card(m_index); @@ -127,39 +137,147 @@ void Engine::init_process(void) // Create execution context in the specified device OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index, m_physical_index); - res = COI::EngineGetHandle(COI_ISA_KNC, m_physical_index, &engine); + res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine); check_result(res, c_get_engine_handle, m_index, res); - // Target executable should be available by the time when we - // attempt to initialize the device + // Get engine info on threads and cores. + // The values of core number and thread number will be used later at stream + // creation by call to _Offload_stream_create(device,number_of_cpus). + + COI_ENGINE_INFO engine_info; + + res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info); + check_result(res, c_get_engine_info, m_index, res); + + // m_cpus bitset has 1 for available thread. At the begining all threads + // are available and m_cpus(i) is set to + // 1 for i = [0...engine_info.NumThreads]. + m_cpus.reset(); + for (int i = 0; i < engine_info.NumThreads; i++) { + m_cpus.set(i); + } + + // The following values will be used at pipeline creation for streams + m_num_cores = engine_info.NumCores; + m_num_threads = engine_info.NumThreads; + + // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2 + // Only the value 2 is supported in 16.0 + if (mic_dma_channel_count == 2) { + if (COI::ProcessConfigureDMA) { + // Set DMA channels using COI API + COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE); + } + else { + // Set environment variable COI_DMA_CHANNEL_COUNT + // use putenv instead of setenv as Windows has no setenv. + // Note: putenv requires its argument can't be freed or modified. + // So no free after call to putenv or elsewhere. + char * env_var = (char*) malloc(sizeof("COI_DMA_CHANNEL_COUNT=2" + 1)); + sprintf(env_var, "COI_DMA_CHANNEL_COUNT=2"); + putenv(env_var); + } + } + + // Target executable is not available then use compiler provided offload_main if (__target_exe == 0) { - LIBOFFLOAD_ERROR(c_no_target_exe); - exit(1); + if (mic_device_main == 0) + LIBOFFLOAD_ERROR(c_report_no_host_exe); + + OFFLOAD_DEBUG_TRACE(2, + "Loading target executable %s\n",mic_device_main); + + res = COI::ProcessCreateFromFile( + engine, // in_Engine + mic_device_main, // in_pBinaryName + 0, // in_Argc + 0, // in_ppArgv + environ == 0, // in_DupEnv + environ, // in_ppAdditionalEnv + mic_proxy_io, // in_ProxyActive + mic_proxy_fs_root, // in_ProxyfsRoot + mic_buffer_size, // in_BufferSpace + mic_library_path, // in_LibrarySearchPath + &m_process // out_pProcess + ); } + else { + // Target executable should be available by the time when we + // attempt to initialize the device - OFFLOAD_DEBUG_TRACE(2, - "Loading target executable \"%s\" from %p, size %lld\n", - __target_exe->name, __target_exe->data, __target_exe->size); - - res = COI::ProcessCreateFromMemory( - engine, // in_Engine - __target_exe->name, // in_pBinaryName - __target_exe->data, // in_pBinaryBuffer - __target_exe->size, // in_BinaryBufferLength, - 0, // in_Argc - 0, // in_ppArgv - environ == 0, // in_DupEnv - environ, // in_ppAdditionalEnv - mic_proxy_io, // in_ProxyActive - mic_proxy_fs_root, // in_ProxyfsRoot - mic_buffer_size, // in_BufferSpace - mic_library_path, // in_LibrarySearchPath - __target_exe->origin, // in_FileOfOrigin - __target_exe->offset, // in_FileOfOriginOffset - &m_process // out_pProcess - ); + // Need the full path of the FAT exe for VTUNE + { +#ifndef TARGET_WINNT + ssize_t len = readlink("/proc/self/exe", buf,1000); +#else + int len = GetModuleFileName(NULL, buf,1000); +#endif // TARGET_WINNT + if (len == -1) { + LIBOFFLOAD_ERROR(c_report_no_host_exe); + exit(1); + } + else if (len > 999) { + LIBOFFLOAD_ERROR(c_report_path_buff_overflow); + exit(1); + } + buf[len] = '\0'; + } + + OFFLOAD_DEBUG_TRACE(2, + "Loading target executable \"%s\" from %p, size %lld, host file %s\n", + __target_exe->name, __target_exe->data, __target_exe->size, + buf); + + res = COI::ProcessCreateFromMemory( + engine, // in_Engine + __target_exe->name, // in_pBinaryName + __target_exe->data, // in_pBinaryBuffer + __target_exe->size, // in_BinaryBufferLength, + 0, // in_Argc + 0, // in_ppArgv + environ == 0, // in_DupEnv + environ, // in_ppAdditionalEnv + mic_proxy_io, // in_ProxyActive + mic_proxy_fs_root, // in_ProxyfsRoot + mic_buffer_size, // in_BufferSpace + mic_library_path, // in_LibrarySearchPath + buf, // in_FileOfOrigin + -1, // in_FileOfOriginOffset use -1 to indicate to + // COI that is is a FAT binary + &m_process // out_pProcess + ); + } check_result(res, c_process_create, m_index, res); + if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) { + // available only in MPSS 4.2 and greater + if (COI::ProcessSetCacheSize != 0 ) { + int flags; + // Need compiler to use MPSS 3.2 or greater to get these + // definition so currently hardcoding it + // COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC; + flags = 0x00020002; + res = COI::ProcessSetCacheSize( + m_process, // in_Process + mic_2m_buffer_size, // in_HugePagePoolSize + flags, // inHugeFlags + mic_4k_buffer_size, // in_SmallPagePoolSize + flags, // inSmallFlags + 0, // in_NumDependencies + 0, // in_pDependencies + 0 // out_PCompletion + ); + OFFLOAD_DEBUG_TRACE(2, + "Reserve target buffers 4K pages = %d 2M pages = %d\n", + mic_4k_buffer_size, mic_2m_buffer_size); + check_result(res, c_process_set_cache_size, m_index, res); + } + else { + OFFLOAD_DEBUG_TRACE(2, + "Reserve target buffers not supported in current MPSS\n"); + } + } + // get function handles res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total, m_func_names, m_funcs); @@ -226,8 +344,9 @@ void Engine::load_libraries() // load libraries collected so far for (TargetImageList::iterator it = m_images.begin(); it != m_images.end(); it++) { - OFFLOAD_DEBUG_TRACE(2, "Loading library \"%s\" from %p, size %llu\n", - it->name, it->data, it->size); + OFFLOAD_DEBUG_TRACE(2, + "Loading library \"%s\" from %p, size %llu, host file %s\n", + it->name, it->data, it->size, it->origin); // load library to the device COILIBRARY lib; @@ -238,9 +357,10 @@ void Engine::load_libraries() it->name, mic_library_path, it->origin, - it->offset, + (it->origin) ? -1 : 0, COI_LOADLIBRARY_V1_FLAGS, &lib); + m_dyn_libs.push_front(DynLib(it->name, it->data, lib)); if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) { check_result(res, c_load_library, m_index, res); @@ -249,6 +369,27 @@ void Engine::load_libraries() m_images.clear(); } +void Engine::unload_library(const void *data, const char *name) +{ + if (m_process == 0) { + return; + } + for (DynLibList::iterator it = m_dyn_libs.begin(); + it != m_dyn_libs.end(); it++) { + if (it->data == data) { + COIRESULT res; + OFFLOAD_DEBUG_TRACE(2, + "Unloading library \"%s\"\n",name); + res = COI::ProcessUnloadLibrary(m_process,it->lib); + m_dyn_libs.erase(it); + if (res != COI_SUCCESS) { + check_result(res, c_unload_library, m_index, res); + } + return; + } + } +} + static bool target_entry_cmp( const VarList::BufEntry &l, const VarList::BufEntry &r @@ -273,8 +414,9 @@ void Engine::init_ptr_data(void) COIEVENT event; // Prepare table of host entries - std::vector<const VarTable::Entry*> host_table(__offload_vars.begin(), - __offload_vars.end()); + std::vector<const VarTable::Entry*> host_table( + Iterator(__offload_vars.get_head()), + Iterator()); // no need to do anything further is host table is empty if (host_table.size() <= 0) { @@ -348,17 +490,16 @@ void Engine::init_ptr_data(void) while (hi != he && ti != te) { int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name)); if (res == 0) { + bool is_new; // add matching entry to var map - std::pair<PtrSet::iterator, bool> res = - m_ptr_set.insert(PtrData((*hi)->addr, (*hi)->size)); + PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new); // store address for new entries - if (res.second) { - PtrData *ptr = const_cast<PtrData*>(res.first.operator->()); + if (is_new) { ptr->mic_addr = ti->addr; ptr->is_static = true; } - + ptr->alloc_ptr_data_lock.unlock(); hi++; ti++; } @@ -379,6 +520,7 @@ void Engine::init_ptr_data(void) } COIRESULT Engine::compute( + _Offload_stream stream, const std::list<COIBUFFER> &buffers, const void* data, uint16_t data_size, @@ -413,9 +555,11 @@ COIRESULT Engine::compute( bufs = 0; flags = 0; } - + COIPIPELINE pipeline = (stream == no_stream) ? + get_pipeline() : + get_pipeline(stream); // start computation - res = COI::PipelineRunFunction(get_pipeline(), + res = COI::PipelineRunFunction(pipeline, m_funcs[c_func_compute], num_bufs, bufs, flags, num_deps, deps, @@ -528,12 +672,214 @@ COIPIPELINE Engine::get_pipeline(void) // create pipeline for this thread res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline); check_result(res, c_pipeline_create, m_index, res); - thread->set_pipeline(m_index, pipeline); } return pipeline; } +Stream* Stream::find_stream(uint64_t handle, bool remove) +{ + Stream *stream = 0; + + m_stream_lock.lock(); + { + StreamMap::iterator it = all_streams.find(handle); + if (it != all_streams.end()) { + stream = it->second; + if (remove) { + all_streams.erase(it); + } + } + } + m_stream_lock.unlock(); + return stream; +} + +COIPIPELINE Engine::get_pipeline(_Offload_stream handle) +{ + Stream * stream = Stream::find_stream(handle, false); + + if (!stream) { + LIBOFFLOAD_ERROR(c_offload_no_stream, m_index); + LIBOFFLOAD_ABORT; + } + + COIPIPELINE pipeline = stream->get_pipeline(); + + if (pipeline == 0) { + COIRESULT res; + int proc_num; + COI_CPU_MASK in_Mask ; + +#ifndef TARGET_WINNT + proc_num = __sync_fetch_and_add(&m_proc_number, 1); +#else // TARGET_WINNT + proc_num = _InterlockedIncrement(&m_proc_number); +#endif // TARGET_WINNT + + if (proc_num > COI_PIPELINE_MAX_PIPELINES) { + LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES); + LIBOFFLOAD_ABORT; + } + + m_stream_lock.lock(); + + // start process if not done yet + if (m_process == 0) { + init_process(); + } + + // create CPUmask + res = COI::PipelineClearCPUMask(in_Mask); + check_result(res, c_clear_cpu_mask, m_index, res); + + int stream_cpu_num = stream->get_cpu_number(); + + stream->m_stream_cpus.reset(); + + int threads_per_core = m_num_threads / m_num_cores; + + // The "stream_cpu_num" available threads is set in mask. + // Available threads are defined by examining of m_cpus bitset. + // We skip thread 0 . + for (int i = 1; i < m_num_threads; i++) { + // for available thread i m_cpus[i] is equal to 1 + if (m_cpus[i]) { + res = COI::PipelineSetCPUMask(m_process, + i / threads_per_core, + i % threads_per_core, + in_Mask); + + check_result(res, c_set_cpu_mask, res); + // mark thread i as nonavailable + m_cpus.set(i,0); + // Mark thread i as given for the stream. + // In case of stream destroying by call to + // _Offload_stream_destroy we can mark the thread i as + // available. + stream->m_stream_cpus.set(i); + if (--stream_cpu_num <= 0) { + break; + } + } + } + + // if stream_cpu_num is greater than 0 there are not enough + // available threads + if (stream_cpu_num > 0) { + LIBOFFLOAD_ERROR(c_create_pipeline_for_stream, m_num_threads); + LIBOFFLOAD_ABORT; + } + // create pipeline for this thread + OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask\n" + "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n" + "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n", + in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3], + in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7], + in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11], + in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]); + res = COI::PipelineCreate(m_process, in_Mask, + mic_stack_size, &pipeline); + check_result(res, c_pipeline_create, m_index, res); + + // Set stream's affinities + { + struct affinity_spec affinity_spec; + char* affinity_type; + int i; + + // "compact" by default + affinity_spec.affinity_type = affinity_compact; + + // Check if user has specified type of affinity + if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) != + NULL) + { + char affinity_str[16]; + int affinity_str_len; + + OFFLOAD_DEBUG_TRACE(2, + "User has specified OFFLOAD_STREAM_AFFINITY=%s\n", + affinity_type); + + // Set type of affinity requested + affinity_str_len = strlen(affinity_type); + for (i=0; i<affinity_str_len && i<15; i++) + { + affinity_str[i] = tolower(affinity_type[i]); + } + affinity_str[i] = '\0'; + if (strcmp(affinity_str, "compact") == 0) { + affinity_spec.affinity_type = affinity_compact; + OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n"); + } else if (strcmp(affinity_str, "scatter") == 0) { + affinity_spec.affinity_type = affinity_scatter; + OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n"); + } else { + LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str); + affinity_spec.affinity_type = affinity_compact; + OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n"); + } + } + // Make flat copy of sink mask because COI's mask is opaque + for (i=0; i<16; i++) { + affinity_spec.sink_mask[i] = in_Mask[i]; + } + // Set number of cores and threads + affinity_spec.num_cores = m_num_cores; + affinity_spec.num_threads = m_num_threads; + + COIEVENT event; + res = COI::PipelineRunFunction(pipeline, + m_funcs[c_func_set_stream_affinity], + 0, 0, 0, + 0, 0, + &affinity_spec, sizeof(affinity_spec), + 0, 0, + &event); + check_result(res, c_pipeline_run_func, m_index, res); + + res = COI::EventWait(1, &event, -1, 1, 0, 0); + check_result(res, c_event_wait, res); + } + + m_stream_lock.unlock(); + stream->set_pipeline(pipeline); + } + return pipeline; +} + +void Engine::stream_destroy(_Offload_stream handle) +{ + // get stream + Stream * stream = Stream::find_stream(handle, true); + + if (stream) { + // return cpus for future use + for (int i = 0; i < m_num_threads; i++) { + if (stream->m_stream_cpus.test(i)) { + m_cpus.set(i); + } + } + delete stream; + } + else { + LIBOFFLOAD_ERROR(c_offload_no_stream, m_index); + LIBOFFLOAD_ABORT; + } +} + +uint64_t Engine::get_thread_id(void) +{ + Thread* thread = (Thread*) thread_getspecific(mic_thread_key); + if (thread == 0) { + thread = new Thread(&m_proc_number); + thread_setspecific(mic_thread_key, thread); + } + + return reinterpret_cast<uint64_t>(thread); +} + AutoSet& Engine::get_auto_vars(void) { Thread* thread = (Thread*) thread_getspecific(mic_thread_key); |