summaryrefslogtreecommitdiff
path: root/liboffloadmic/runtime/offload_engine.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'liboffloadmic/runtime/offload_engine.cpp')
-rw-r--r--liboffloadmic/runtime/offload_engine.cpp428
1 files changed, 387 insertions, 41 deletions
diff --git a/liboffloadmic/runtime/offload_engine.cpp b/liboffloadmic/runtime/offload_engine.cpp
index 2fe0d24430c..16b440d7ab6 100644
--- a/liboffloadmic/runtime/offload_engine.cpp
+++ b/liboffloadmic/runtime/offload_engine.cpp
@@ -1,5 +1,5 @@
/*
- Copyright (c) 2014 Intel Corporation. All Rights Reserved.
+ Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@@ -37,6 +37,14 @@
#include "offload_host.h"
#include "offload_table.h"
+#include "offload_iterator.h"
+
+// Static members of Stream class must be described somewhere.
+// This members describe the list of all streams defined in programm
+// via call to _Offload_stream_create.
+uint64_t Stream::m_streams_count = 0;
+StreamMap Stream::all_streams;
+mutex_t Stream::m_stream_lock;
const char* Engine::m_func_names[Engine::c_funcs_total] =
{
@@ -47,7 +55,8 @@ const char* Engine::m_func_names[Engine::c_funcs_total] =
#endif // MYO_SUPPORT
"server_init",
"server_var_table_size",
- "server_var_table_copy"
+ "server_var_table_copy",
+ "server_set_stream_affinity"
};
// Symbolic representation of system signals. Fix for CQ233593
@@ -115,6 +124,7 @@ void Engine::init_process(void)
COIENGINE engine;
COIRESULT res;
const char **environ;
+ char buf[4096]; // For exe path name
// create environment for the target process
environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
@@ -127,39 +137,147 @@ void Engine::init_process(void)
// Create execution context in the specified device
OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
m_physical_index);
- res = COI::EngineGetHandle(COI_ISA_KNC, m_physical_index, &engine);
+ res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine);
check_result(res, c_get_engine_handle, m_index, res);
- // Target executable should be available by the time when we
- // attempt to initialize the device
+ // Get engine info on threads and cores.
+ // The values of core number and thread number will be used later at stream
+ // creation by call to _Offload_stream_create(device,number_of_cpus).
+
+ COI_ENGINE_INFO engine_info;
+
+ res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info);
+ check_result(res, c_get_engine_info, m_index, res);
+
+ // m_cpus bitset has 1 for available thread. At the begining all threads
+ // are available and m_cpus(i) is set to
+ // 1 for i = [0...engine_info.NumThreads].
+ m_cpus.reset();
+ for (int i = 0; i < engine_info.NumThreads; i++) {
+ m_cpus.set(i);
+ }
+
+ // The following values will be used at pipeline creation for streams
+ m_num_cores = engine_info.NumCores;
+ m_num_threads = engine_info.NumThreads;
+
+ // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
+ // Only the value 2 is supported in 16.0
+ if (mic_dma_channel_count == 2) {
+ if (COI::ProcessConfigureDMA) {
+ // Set DMA channels using COI API
+ COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE);
+ }
+ else {
+ // Set environment variable COI_DMA_CHANNEL_COUNT
+ // use putenv instead of setenv as Windows has no setenv.
+ // Note: putenv requires its argument can't be freed or modified.
+ // So no free after call to putenv or elsewhere.
+ char * env_var = (char*) malloc(sizeof("COI_DMA_CHANNEL_COUNT=2" + 1));
+ sprintf(env_var, "COI_DMA_CHANNEL_COUNT=2");
+ putenv(env_var);
+ }
+ }
+
+ // Target executable is not available then use compiler provided offload_main
if (__target_exe == 0) {
- LIBOFFLOAD_ERROR(c_no_target_exe);
- exit(1);
+ if (mic_device_main == 0)
+ LIBOFFLOAD_ERROR(c_report_no_host_exe);
+
+ OFFLOAD_DEBUG_TRACE(2,
+ "Loading target executable %s\n",mic_device_main);
+
+ res = COI::ProcessCreateFromFile(
+ engine, // in_Engine
+ mic_device_main, // in_pBinaryName
+ 0, // in_Argc
+ 0, // in_ppArgv
+ environ == 0, // in_DupEnv
+ environ, // in_ppAdditionalEnv
+ mic_proxy_io, // in_ProxyActive
+ mic_proxy_fs_root, // in_ProxyfsRoot
+ mic_buffer_size, // in_BufferSpace
+ mic_library_path, // in_LibrarySearchPath
+ &m_process // out_pProcess
+ );
}
+ else {
+ // Target executable should be available by the time when we
+ // attempt to initialize the device
- OFFLOAD_DEBUG_TRACE(2,
- "Loading target executable \"%s\" from %p, size %lld\n",
- __target_exe->name, __target_exe->data, __target_exe->size);
-
- res = COI::ProcessCreateFromMemory(
- engine, // in_Engine
- __target_exe->name, // in_pBinaryName
- __target_exe->data, // in_pBinaryBuffer
- __target_exe->size, // in_BinaryBufferLength,
- 0, // in_Argc
- 0, // in_ppArgv
- environ == 0, // in_DupEnv
- environ, // in_ppAdditionalEnv
- mic_proxy_io, // in_ProxyActive
- mic_proxy_fs_root, // in_ProxyfsRoot
- mic_buffer_size, // in_BufferSpace
- mic_library_path, // in_LibrarySearchPath
- __target_exe->origin, // in_FileOfOrigin
- __target_exe->offset, // in_FileOfOriginOffset
- &m_process // out_pProcess
- );
+ // Need the full path of the FAT exe for VTUNE
+ {
+#ifndef TARGET_WINNT
+ ssize_t len = readlink("/proc/self/exe", buf,1000);
+#else
+ int len = GetModuleFileName(NULL, buf,1000);
+#endif // TARGET_WINNT
+ if (len == -1) {
+ LIBOFFLOAD_ERROR(c_report_no_host_exe);
+ exit(1);
+ }
+ else if (len > 999) {
+ LIBOFFLOAD_ERROR(c_report_path_buff_overflow);
+ exit(1);
+ }
+ buf[len] = '\0';
+ }
+
+ OFFLOAD_DEBUG_TRACE(2,
+ "Loading target executable \"%s\" from %p, size %lld, host file %s\n",
+ __target_exe->name, __target_exe->data, __target_exe->size,
+ buf);
+
+ res = COI::ProcessCreateFromMemory(
+ engine, // in_Engine
+ __target_exe->name, // in_pBinaryName
+ __target_exe->data, // in_pBinaryBuffer
+ __target_exe->size, // in_BinaryBufferLength,
+ 0, // in_Argc
+ 0, // in_ppArgv
+ environ == 0, // in_DupEnv
+ environ, // in_ppAdditionalEnv
+ mic_proxy_io, // in_ProxyActive
+ mic_proxy_fs_root, // in_ProxyfsRoot
+ mic_buffer_size, // in_BufferSpace
+ mic_library_path, // in_LibrarySearchPath
+ buf, // in_FileOfOrigin
+ -1, // in_FileOfOriginOffset use -1 to indicate to
+ // COI that is is a FAT binary
+ &m_process // out_pProcess
+ );
+ }
check_result(res, c_process_create, m_index, res);
+ if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) {
+ // available only in MPSS 4.2 and greater
+ if (COI::ProcessSetCacheSize != 0 ) {
+ int flags;
+ // Need compiler to use MPSS 3.2 or greater to get these
+ // definition so currently hardcoding it
+ // COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
+ flags = 0x00020002;
+ res = COI::ProcessSetCacheSize(
+ m_process, // in_Process
+ mic_2m_buffer_size, // in_HugePagePoolSize
+ flags, // inHugeFlags
+ mic_4k_buffer_size, // in_SmallPagePoolSize
+ flags, // inSmallFlags
+ 0, // in_NumDependencies
+ 0, // in_pDependencies
+ 0 // out_PCompletion
+ );
+ OFFLOAD_DEBUG_TRACE(2,
+ "Reserve target buffers 4K pages = %d 2M pages = %d\n",
+ mic_4k_buffer_size, mic_2m_buffer_size);
+ check_result(res, c_process_set_cache_size, m_index, res);
+ }
+ else {
+ OFFLOAD_DEBUG_TRACE(2,
+ "Reserve target buffers not supported in current MPSS\n");
+ }
+ }
+
// get function handles
res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
m_func_names, m_funcs);
@@ -226,8 +344,9 @@ void Engine::load_libraries()
// load libraries collected so far
for (TargetImageList::iterator it = m_images.begin();
it != m_images.end(); it++) {
- OFFLOAD_DEBUG_TRACE(2, "Loading library \"%s\" from %p, size %llu\n",
- it->name, it->data, it->size);
+ OFFLOAD_DEBUG_TRACE(2,
+ "Loading library \"%s\" from %p, size %llu, host file %s\n",
+ it->name, it->data, it->size, it->origin);
// load library to the device
COILIBRARY lib;
@@ -238,9 +357,10 @@ void Engine::load_libraries()
it->name,
mic_library_path,
it->origin,
- it->offset,
+ (it->origin) ? -1 : 0,
COI_LOADLIBRARY_V1_FLAGS,
&lib);
+ m_dyn_libs.push_front(DynLib(it->name, it->data, lib));
if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
check_result(res, c_load_library, m_index, res);
@@ -249,6 +369,27 @@ void Engine::load_libraries()
m_images.clear();
}
+void Engine::unload_library(const void *data, const char *name)
+{
+ if (m_process == 0) {
+ return;
+ }
+ for (DynLibList::iterator it = m_dyn_libs.begin();
+ it != m_dyn_libs.end(); it++) {
+ if (it->data == data) {
+ COIRESULT res;
+ OFFLOAD_DEBUG_TRACE(2,
+ "Unloading library \"%s\"\n",name);
+ res = COI::ProcessUnloadLibrary(m_process,it->lib);
+ m_dyn_libs.erase(it);
+ if (res != COI_SUCCESS) {
+ check_result(res, c_unload_library, m_index, res);
+ }
+ return;
+ }
+ }
+}
+
static bool target_entry_cmp(
const VarList::BufEntry &l,
const VarList::BufEntry &r
@@ -273,8 +414,9 @@ void Engine::init_ptr_data(void)
COIEVENT event;
// Prepare table of host entries
- std::vector<const VarTable::Entry*> host_table(__offload_vars.begin(),
- __offload_vars.end());
+ std::vector<const VarTable::Entry*> host_table(
+ Iterator(__offload_vars.get_head()),
+ Iterator());
// no need to do anything further is host table is empty
if (host_table.size() <= 0) {
@@ -348,17 +490,16 @@ void Engine::init_ptr_data(void)
while (hi != he && ti != te) {
int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
if (res == 0) {
+ bool is_new;
// add matching entry to var map
- std::pair<PtrSet::iterator, bool> res =
- m_ptr_set.insert(PtrData((*hi)->addr, (*hi)->size));
+ PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new);
// store address for new entries
- if (res.second) {
- PtrData *ptr = const_cast<PtrData*>(res.first.operator->());
+ if (is_new) {
ptr->mic_addr = ti->addr;
ptr->is_static = true;
}
-
+ ptr->alloc_ptr_data_lock.unlock();
hi++;
ti++;
}
@@ -379,6 +520,7 @@ void Engine::init_ptr_data(void)
}
COIRESULT Engine::compute(
+ _Offload_stream stream,
const std::list<COIBUFFER> &buffers,
const void* data,
uint16_t data_size,
@@ -413,9 +555,11 @@ COIRESULT Engine::compute(
bufs = 0;
flags = 0;
}
-
+ COIPIPELINE pipeline = (stream == no_stream) ?
+ get_pipeline() :
+ get_pipeline(stream);
// start computation
- res = COI::PipelineRunFunction(get_pipeline(),
+ res = COI::PipelineRunFunction(pipeline,
m_funcs[c_func_compute],
num_bufs, bufs, flags,
num_deps, deps,
@@ -528,12 +672,214 @@ COIPIPELINE Engine::get_pipeline(void)
// create pipeline for this thread
res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
check_result(res, c_pipeline_create, m_index, res);
-
thread->set_pipeline(m_index, pipeline);
}
return pipeline;
}
+Stream* Stream::find_stream(uint64_t handle, bool remove)
+{
+ Stream *stream = 0;
+
+ m_stream_lock.lock();
+ {
+ StreamMap::iterator it = all_streams.find(handle);
+ if (it != all_streams.end()) {
+ stream = it->second;
+ if (remove) {
+ all_streams.erase(it);
+ }
+ }
+ }
+ m_stream_lock.unlock();
+ return stream;
+}
+
+COIPIPELINE Engine::get_pipeline(_Offload_stream handle)
+{
+ Stream * stream = Stream::find_stream(handle, false);
+
+ if (!stream) {
+ LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
+ LIBOFFLOAD_ABORT;
+ }
+
+ COIPIPELINE pipeline = stream->get_pipeline();
+
+ if (pipeline == 0) {
+ COIRESULT res;
+ int proc_num;
+ COI_CPU_MASK in_Mask ;
+
+#ifndef TARGET_WINNT
+ proc_num = __sync_fetch_and_add(&m_proc_number, 1);
+#else // TARGET_WINNT
+ proc_num = _InterlockedIncrement(&m_proc_number);
+#endif // TARGET_WINNT
+
+ if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
+ LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
+ LIBOFFLOAD_ABORT;
+ }
+
+ m_stream_lock.lock();
+
+ // start process if not done yet
+ if (m_process == 0) {
+ init_process();
+ }
+
+ // create CPUmask
+ res = COI::PipelineClearCPUMask(in_Mask);
+ check_result(res, c_clear_cpu_mask, m_index, res);
+
+ int stream_cpu_num = stream->get_cpu_number();
+
+ stream->m_stream_cpus.reset();
+
+ int threads_per_core = m_num_threads / m_num_cores;
+
+ // The "stream_cpu_num" available threads is set in mask.
+ // Available threads are defined by examining of m_cpus bitset.
+ // We skip thread 0 .
+ for (int i = 1; i < m_num_threads; i++) {
+ // for available thread i m_cpus[i] is equal to 1
+ if (m_cpus[i]) {
+ res = COI::PipelineSetCPUMask(m_process,
+ i / threads_per_core,
+ i % threads_per_core,
+ in_Mask);
+
+ check_result(res, c_set_cpu_mask, res);
+ // mark thread i as nonavailable
+ m_cpus.set(i,0);
+ // Mark thread i as given for the stream.
+ // In case of stream destroying by call to
+ // _Offload_stream_destroy we can mark the thread i as
+ // available.
+ stream->m_stream_cpus.set(i);
+ if (--stream_cpu_num <= 0) {
+ break;
+ }
+ }
+ }
+
+ // if stream_cpu_num is greater than 0 there are not enough
+ // available threads
+ if (stream_cpu_num > 0) {
+ LIBOFFLOAD_ERROR(c_create_pipeline_for_stream, m_num_threads);
+ LIBOFFLOAD_ABORT;
+ }
+ // create pipeline for this thread
+ OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask\n"
+ "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
+ "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
+ in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
+ in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
+ in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
+ in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
+ res = COI::PipelineCreate(m_process, in_Mask,
+ mic_stack_size, &pipeline);
+ check_result(res, c_pipeline_create, m_index, res);
+
+ // Set stream's affinities
+ {
+ struct affinity_spec affinity_spec;
+ char* affinity_type;
+ int i;
+
+ // "compact" by default
+ affinity_spec.affinity_type = affinity_compact;
+
+ // Check if user has specified type of affinity
+ if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) !=
+ NULL)
+ {
+ char affinity_str[16];
+ int affinity_str_len;
+
+ OFFLOAD_DEBUG_TRACE(2,
+ "User has specified OFFLOAD_STREAM_AFFINITY=%s\n",
+ affinity_type);
+
+ // Set type of affinity requested
+ affinity_str_len = strlen(affinity_type);
+ for (i=0; i<affinity_str_len && i<15; i++)
+ {
+ affinity_str[i] = tolower(affinity_type[i]);
+ }
+ affinity_str[i] = '\0';
+ if (strcmp(affinity_str, "compact") == 0) {
+ affinity_spec.affinity_type = affinity_compact;
+ OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
+ } else if (strcmp(affinity_str, "scatter") == 0) {
+ affinity_spec.affinity_type = affinity_scatter;
+ OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n");
+ } else {
+ LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str);
+ affinity_spec.affinity_type = affinity_compact;
+ OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
+ }
+ }
+ // Make flat copy of sink mask because COI's mask is opaque
+ for (i=0; i<16; i++) {
+ affinity_spec.sink_mask[i] = in_Mask[i];
+ }
+ // Set number of cores and threads
+ affinity_spec.num_cores = m_num_cores;
+ affinity_spec.num_threads = m_num_threads;
+
+ COIEVENT event;
+ res = COI::PipelineRunFunction(pipeline,
+ m_funcs[c_func_set_stream_affinity],
+ 0, 0, 0,
+ 0, 0,
+ &affinity_spec, sizeof(affinity_spec),
+ 0, 0,
+ &event);
+ check_result(res, c_pipeline_run_func, m_index, res);
+
+ res = COI::EventWait(1, &event, -1, 1, 0, 0);
+ check_result(res, c_event_wait, res);
+ }
+
+ m_stream_lock.unlock();
+ stream->set_pipeline(pipeline);
+ }
+ return pipeline;
+}
+
+void Engine::stream_destroy(_Offload_stream handle)
+{
+ // get stream
+ Stream * stream = Stream::find_stream(handle, true);
+
+ if (stream) {
+ // return cpus for future use
+ for (int i = 0; i < m_num_threads; i++) {
+ if (stream->m_stream_cpus.test(i)) {
+ m_cpus.set(i);
+ }
+ }
+ delete stream;
+ }
+ else {
+ LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
+ LIBOFFLOAD_ABORT;
+ }
+}
+
+uint64_t Engine::get_thread_id(void)
+{
+ Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
+ if (thread == 0) {
+ thread = new Thread(&m_proc_number);
+ thread_setspecific(mic_thread_key, thread);
+ }
+
+ return reinterpret_cast<uint64_t>(thread);
+}
+
AutoSet& Engine::get_auto_vars(void)
{
Thread* thread = (Thread*) thread_getspecific(mic_thread_key);