| author | bviyer <bviyer@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-10-29 18:37:47 +0000 |
|---|---|---|
| committer | bviyer <bviyer@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-10-29 18:37:47 +0000 |
| commit | 4710dd5101f8103638ffe082a220f701f592df36 (patch) | |
| tree | 235d812c6202e962d45c0cce844b2afcc5a0596d /libcilkrts/runtime | |
| parent | d037099fed7476ffedb6784a1f544132f258d792 (diff) | |
| download | gcc-4710dd5101f8103638ffe082a220f701f592df36.tar.gz | |
Added Cilk runtime library (libcilkrts) into GCC.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@204173 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libcilkrts/runtime')
67 files changed, 23137 insertions, 0 deletions
diff --git a/libcilkrts/runtime/acknowledgements.dox b/libcilkrts/runtime/acknowledgements.dox
new file mode 100644
index 00000000000..79b5d876f33
--- /dev/null
+++ b/libcilkrts/runtime/acknowledgements.dox
@@ -0,0 +1,51 @@
+/* acknowledgements.dox
+ *
+ *************************************************************************
+ *
+ * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/
+
+/*
+ * This file contains acknowledgements of community contributions to the
+ * Cilk Plus runtime.
+ */
+
+/**
+ * @mainpage
+ *
+ * @section Acknowledgements Acknowledgements
+ *
+ * Modifications to build the Cilk Plus runtime for VxWorks provided by
+ * Brian Kuhl of Wind River.
+ */
diff --git a/libcilkrts/runtime/bug.cpp b/libcilkrts/runtime/bug.cpp new file mode 100644 index 00000000000..dbdf1fd3216 --- /dev/null +++ b/libcilkrts/runtime/bug.cpp @@ -0,0 +1,139 @@ +/* bug.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "bug.h" + +#include <exception> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#ifdef _WIN32 +# include "windows-clean.h" +# include "internal/abi.h" +# include "cilktools/cilkscreen.h" +# include <crtdbg.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +COMMON_PORTABLE const char *const __cilkrts_assertion_failed = + "%s:%d: cilk assertion failed: %s\n"; + +COMMON_PORTABLE void __cilkrts_bug(const char *fmt,...) cilk_nothrow +{ +#if defined (_WIN32) && defined(_DEBUG) + _CRTIMP void __cdecl _wassert(__in_z const wchar_t * _Message, + __in_z const wchar_t *_File, + __in unsigned _Line); + char message[256]; + wchar_t wmessage[256]; + va_list l; + va_start(l, fmt); + _vsnprintf_s(message, 256, _TRUNCATE, fmt, l); + va_end(l); + _snwprintf_s(wmessage, 256, _TRUNCATE, _CRT_WIDE("%S"), + message); /* widen */ + + // Force asserts to go to stderr and the debugger. This isn't polite, but + // we're about to kill the app anyway and it will prevent our tests from + // hanging + _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE| _CRTDBG_MODE_DEBUG); + _CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); + + _wassert(wmessage, _CRT_WIDE(__FILE__), __LINE__); + + // If there's a debugger attached, give it a chance to look at the failure + if (IsDebuggerPresent()) + DebugBreak(); + + abort(); +/* __asm int 3 */ +#else + /* To reduce user confusion, write all user-generated output + before the system-generated error message. 
*/ + va_list l; + fflush(NULL); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fflush(stderr); + +#ifndef _WIN32 + abort(); +#endif + +#endif + + exit(1); +} + +COMMON_PORTABLE void cilkbug_assert_no_uncaught_exception(void) +{ + bool uncaught = std::uncaught_exception(); + CILK_ASSERT(!uncaught); +} + +COMMON_SYSDEP void abort_because_rts_is_corrupted(void) +{ + __cilkrts_bug("The Cilk Plus runtime system detected a corruption " + "in its data structures. This is most likely caused " + "by an application bug. Aborting execution.\n"); +} + +#ifdef WIN32 +COMMON_SYSDEP void __cilkrts_dbgprintf(const char *fmt,...) +{ + char message[2048]; + va_list l; + + // Cilkscreen shouldn't watch this + __cilkscreen_disable_checking(); + + va_start(l, fmt); + _vsnprintf_s(message, 2048, _TRUNCATE, fmt, l); + va_end(l); + OutputDebugStringA (message); + + // Re-enable Cilkscreen + __cilkscreen_enable_checking(); +} +#endif + +__CILKRTS_END_EXTERN_C + +/* End bug.cpp */ diff --git a/libcilkrts/runtime/bug.h b/libcilkrts/runtime/bug.h new file mode 100644 index 00000000000..bb18913787d --- /dev/null +++ b/libcilkrts/runtime/bug.h @@ -0,0 +1,141 @@ +/* bug.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file bug.h + * + * @brief Support for reporting bugs and debugging. + */ + +#ifndef INCLUDED_BUG_DOT_H +#define INCLUDED_BUG_DOT_H + +#include "rts-common.h" +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Flush all output, write error message to stderr and abort the execution. + * On Windows the error is also written to the debugger. + * + * @param fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +COMMON_PORTABLE NORETURN __cilkrts_bug(const char *fmt,...) 
cilk_nothrow; + +#ifndef CILK_ASSERT + +/** Standard text for failed assertion */ +COMMON_PORTABLE extern const char *const __cilkrts_assertion_failed; + +/** + * Macro to assert an invariant that must be true. If the statement evalutes + * to false, __cilkrts_bug will be called to report the failure and terminate + * the application. + */ +#define CILK_ASSERT(ex) \ + (__builtin_expect((ex) != 0, 1) ? (void)0 : \ + __cilkrts_bug(__cilkrts_assertion_failed, __FILE__, __LINE__, #ex)) + +#define CILK_ASSERT_MSG(ex, msg) \ + (__builtin_expect((ex) != 0, 1) ? (void)0 : \ + __cilkrts_bug(__cilkrts_assertion_failed, __FILE__, __LINE__, \ + #ex "\n " msg)) +#endif // CILK_ASSERT + +/** + * Assert that there is no uncaught exception. + * + * Not valid on Windows or Android. + * + * On Android, calling std::uncaught_exception with the stlport library causes + * a seg fault. Since we're not supporting exceptions there at this point, + * just don't do the check. It works with the GNU STL library, but that's + * GPL V3 licensed. + */ +COMMON_PORTABLE void cilkbug_assert_no_uncaught_exception(void); +#if defined(_WIN32) || defined(ANDROID) +# define CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION() +#else +# define CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION() \ + cilkbug_assert_no_uncaught_exception() +#endif + + +/** + * Call __cilkrts_bug with a standard message that the runtime state is + * corrupted and the application is being terminated. + */ +COMMON_SYSDEP void abort_because_rts_is_corrupted(void); + +// Debugging aids +#ifndef _DEBUG +# define DBGPRINTF(_fmt, ...) +#elif defined(_WIN32) + +/** + * Write debugging output. On windows this is written to the debugger. + * + * @param fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +COMMON_SYSDEP void __cilkrts_dbgprintf(const char *fmt,...) cilk_nothrow; + +/** + * Macro to write debugging output which will be elided if this is not a + * debug build. The macro is currently always elided on non-Windows builds. + * + * @param _fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +# define DBGPRINTF(_fmt, ...) __cilkrts_dbgprintf(_fmt, __VA_ARGS__) + +#else /* if _DEBUG && !_WIN32 */ + /* Non-Windows debug logging. Someday we should make GetCurrentFiber() + * and GetWorkerFiber() do something. + */ +# include <stdio.h> + __CILKRTS_INLINE void* GetCurrentFiber() { return 0; } + __CILKRTS_INLINE void* GetWorkerFiber(__cilkrts_worker* w) { return 0; } +# define DBGPRINTF(_fmt, ...) fprintf(stderr, _fmt, __VA_ARGS__) +#endif // _DEBUG + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_BUG_DOT_H) diff --git a/libcilkrts/runtime/c_reducers.c b/libcilkrts/runtime/c_reducers.c new file mode 100644 index 00000000000..52615e93f43 --- /dev/null +++ b/libcilkrts/runtime/c_reducers.c @@ -0,0 +1,57 @@ +/* c_reducers.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* Implementation of C reducers */ + +// Disable warning about integer conversions losing significant bits. +// The code is correct as is. +#ifdef __INTEL_COMPILER +#pragma warning(disable:2259) +#endif + +#define CILK_C_DEFINE_REDUCERS + +#include <cilk/reducer_opadd.h> +#include <cilk/reducer_opand.h> +#include <cilk/reducer_opmul.h> +#include <cilk/reducer_opor.h> +#include <cilk/reducer_opxor.h> +#include <cilk/reducer_min_max.h> + +/* End reducer_opadd.c */ diff --git a/libcilkrts/runtime/cilk-abi-cilk-for.cpp b/libcilkrts/runtime/cilk-abi-cilk-for.cpp new file mode 100644 index 00000000000..4fa6dcec82a --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-cilk-for.cpp @@ -0,0 +1,406 @@ +/* cilk-abi-cilk-for.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2011, 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
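Editor's note: c_reducers.c above relies on the define-one-macro-then-include idiom — defining CILK_C_DEFINE_REDUCERS before including the reducer headers appears to make this single translation unit emit the out-of-line reducer definitions, while every other includer sees only declarations. Below is a minimal sketch of that idiom with hypothetical names (my_ops.h, MY_OPS_DEFINE, my_add); it is not the contents of the actual Cilk headers.

```c
/* my_ops.h -- hypothetical header: declarations for everyone,
 * definitions only where MY_OPS_DEFINE was set before inclusion. */
#ifndef MY_OPS_H
#define MY_OPS_H

int my_add(int a, int b);                        /* seen by every includer */

#ifdef MY_OPS_DEFINE
int my_add(int a, int b) { return a + b; }       /* emitted exactly once   */
#endif

#endif /* MY_OPS_H */

/* my_ops.c -- the single translation unit that emits the definitions,
 * analogous to what c_reducers.c does with CILK_C_DEFINE_REDUCERS. */
#define MY_OPS_DEFINE
#include "my_ops.h"
```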
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* Implementation of cilk_for ABI. + * + * This file must be C++, not C, in order to handle C++ exceptions correctly + * from within the body of the cilk_for loop + */ + +#include "internal/abi.h" +#include "metacall_impl.h" +#include "global_state.h" + +// Icky macros to determine if we're compiled with optimization. Based on +// the declaration of __CILKRTS_ASSERT in common.h +#if defined(_WIN32) +# if defined (_DEBUG) +# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od +# else +# define CILKRTS_OPTIMIZED 1 +# endif // defined(_DEBUG) +#else +# if defined(__OPTIMIZE__) +# define CILKRTS_OPTIMIZED 1 +# else +# define CILKRTS_OPTIMIZED 0 +# endif +#endif + +template <typename count_t> +static inline int grainsize(int req, count_t count) +{ + // A positive requested grain size comes from the user. A very high grain + // size risks losing parallelism, but the user told us what they want for + // grainsize. Who are we to argue? + if (req > 0) + return req; + + // At present, a negative requested grain size is treated the same way as + // a zero grain size, i.e., the runtime computes the actual grainsize + // using a hueristic. In the future, the compiler may give us additional + // information about the size of the cilk_for body by passing a negative + // grain size. + + // Avoid generating a zero grainsize, even for empty loops. + if (count < 1) + return 1; + + global_state_t* g = cilkg_get_global_state(); + if (g->under_ptool) + { + // Grainsize = 1, when running under PIN, and when the grainsize has + // not explicitly been set by the user. + return 1; + } + else + { + // Divide loop count by 8 times the worker count and round up. + const int Px8 = g->P * 8; + count_t n = (count + Px8 - 1) / Px8; + + // 2K should be enough to amortize the cost of the cilk_for. Any + // larger grainsize risks losing parallelism. + if (n > 2048) + return 2048; + return (int) n; // n <= 2048, so no loss of precision on cast to int + } +} + +/* + * call_cilk_for_loop_body + * + * Centralizes the code to call the loop body. 
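Editor's note: for reference, here is a standalone sketch of the grainsize heuristic the comments above describe — honor a positive user request, otherwise divide the trip count by 8×P and clamp to [1, 2048]. The constants 8 and 2048 come from the source; the free-standing function and its parameters are ours, not part of the runtime's API.

```c
#include <stdint.h>

/* Sketch: grain = min(2048, ceil(count / (8 * workers))), but never 0. */
static int sketch_grainsize(int requested, uint64_t count, int workers)
{
    if (requested > 0)                    /* explicit user grainsize wins  */
        return requested;
    if (count < 1)                        /* avoid a zero grain on empty loops */
        return 1;
    uint64_t px8 = (uint64_t)workers * 8;
    uint64_t n = (count + px8 - 1) / px8; /* round up                      */
    return n > 2048 ? 2048 : (int)n;
}
```

For example, with 8 workers a trip count of 10,000 gives ceil(10000/64) = 157, while a trip count of 1,000,000 hits the 2048 cap.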
The compiler should be + * inlining this code + * + * low - Low loop index we're considering in this portion of the algorithm + * high - High loop index we're considering in this portion of the algorithm + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * w - __cilkrts_worker we're currently executing on + * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of + * the cilk_for loop to flatten out the internal nodes + */ +template <typename count_t, typename F> +inline static +void call_cilk_for_loop_body(count_t low, count_t high, + F body, void *data, + __cilkrts_worker *w, + __cilkrts_pedigree *loop_root_pedigree) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // The worker is only valid until the first spawn. Fetch the + // __cilkrts_stack_frame out of the worker, since it will be stable across + // steals. The sf pointer actually points to the *parent's* + // __cilkrts_stack_frame, since this function is a non-spawning function + // and therefore has no cilk stack frame of its own. + __cilkrts_stack_frame *sf = w->current_stack_frame; + + // Save the pedigree node pointed to by the worker. We'll need to restore + // that when we exit since the spawn helpers in the cilk_for call tree + // will assume that it's valid + const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent; + + // Add the leaf pedigree node to the chain. The parent is the root node + // to flatten the tree regardless of the DAG branches in the cilk_for + // divide-and-conquer recursion. + // + // The rank is initialized to the low index. The user is + // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for + // loop body. + __cilkrts_pedigree loop_leaf_pedigree; + + loop_leaf_pedigree.rank = (uint64_t)low; + loop_leaf_pedigree.parent = loop_root_pedigree; + + // The worker's pedigree always starts with a rank of 0 + w->pedigree.rank = 0; + w->pedigree.parent = &loop_leaf_pedigree; + + // Call the compiler generated cilk_for loop body lambda function + body(data, low, high); + + // The loop body may have included spawns, so we must refetch the worker + // from the __cilkrts_stack_frame, which is stable regardless of which + // worker we're executing on. + w = sf->worker; + + // Restore the pedigree chain. It must be valid because the spawn helpers + // generated by the cilk_for implementation will access it. + w->pedigree.parent = saved_next_pedigree_node; +} + +/* capture_spawn_arg_stack_frame + * + * Efficiently get the address of the caller's __cilkrts_stack_frame. The + * preconditons are that 'w' is the worker at the time of the call and + * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the + * spawn helper. This function should be called only within the argument list + * of a function that is being spawned because that is the only situation in + * which these preconditions hold. This function returns the worker + * (unchanged) after storing the captured stack frame pointer is stored in the + * sf argument. + * + * The purpose of this function is to get the caller's stack frame in a + * context where the caller's worker is known but its stack frame is not + * necessarily initialized. The "shrink wrap" optimization delays + * initializing the contents of a spawning function's '__cilkrts_stack_frame' + * as well as the 'current_stack_frame' pointer within the worker. 
By calling + * this function within a spawning function's argument list, we can ensure + * that these initializations have occured but that a detach (which would + * invalidate the worker pointer in the caller) has not yet occured. Once the + * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the + * remainder of the caller's execution, and becomes an efficient way to get + * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'), + * even after a spawn or sync. + */ +inline __cilkrts_worker* +capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w) +{ + // Get current stack frame + sf = w->current_stack_frame; +#ifdef __INTEL_COMPILER +# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101 + // In older compilers 'w->current_stack_frame' points to the + // spawn-helper's stack frame. In newer compiler's however, it points + // directly to the pointer's stack frame. (This change was made to avoid + // having the spawn helper in the frame list when evaluating function + // arguments, thus avoiding corruption when those arguments themselves + // contain cilk_spawns.) + + // w->current_stack_frame is the spawn helper's stack frame. + // w->current_stack_frame->call_parent is the caller's stack frame. + sf = sf->call_parent; +# endif +#endif + return w; +} + +/* + * cilk_for_recursive + * + * Templatized function to implement the recursive divide-and-conquer + * algorithm that's how we implement a cilk_for. + * + * low - Low loop index we're considering in this portion of the algorithm + * high - High loop index we're considering in this portion of the algorithm + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * grain - grain size (0 if it should be computed) + * w - __cilkrts_worker we're currently executing on + * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of + * the cilk_for loop to flatten out the internal nodes + */ +template <typename count_t, typename F> +static +void cilk_for_recursive(count_t low, count_t high, + F body, void *data, int grain, + __cilkrts_worker *w, + __cilkrts_pedigree *loop_root_pedigree) +{ +tail_recurse: + // Cilkscreen should not report this call in a stack trace + // This needs to be done everytime the worker resumes + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + count_t count = high - low; + // Invariant: count > 0, grain >= 1 + if (count > grain) + { + // Invariant: count >= 2 + count_t mid = low + count / 2; + // The worker is valid only until the first spawn and is expensive to + // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The + // '__cilkrts_stack_frame' is more stable, but isn't initialized until + // the first spawn. Thus, we want to grab the address of the + // '__cilkrts_stack_frame' after it is initialized but before the + // spawn detaches. The only place we can do that is within the + // argument list of the spawned function, hence the call to + // capture_spawn_arg_stack_frame(). 
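Editor's note: stripped of the worker and pedigree bookkeeping, the control flow that cilk_for_recursive implements below is plain divide-and-conquer over the index range — spawn the lower half, loop (rather than recurse) on the upper half, and run the body once the range is no larger than the grain. A sequential sketch of that shape, with hypothetical names and no spawns; it is not the runtime's code.

```c
/* Sequential sketch of the range-splitting shape; 'body' runs one grain. */
typedef void (*loop_body_t)(void *data, unsigned long low, unsigned long high);

static void split_range(unsigned long low, unsigned long high,
                        loop_body_t body, void *data, int grain)
{
    while (high - low > (unsigned long)grain) {
        unsigned long mid = low + (high - low) / 2;
        split_range(low, mid, body, data, grain); /* the runtime spawns this half */
        low = mid;                                /* then iterates on the rest    */
    }
    body(data, low, high);                        /* one grain executed directly  */
}
```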
+ __cilkrts_stack_frame *sf; + _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, + capture_spawn_arg_stack_frame(sf, w), + loop_root_pedigree); + w = sf->worker; + low = mid; + + goto tail_recurse; + } + + // Call the cilk_for loop body lambda function passed in by the compiler to + // execute one grain + call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree); +} + +static void noop() { } + +/* + * cilk_for_root + * + * Templatized function to implement the top level of a cilk_for loop. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ +template <typename count_t, typename F> +static void cilk_for_root(F body, void *data, count_t count, int grain) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // Pedigree computation: + // + // If the last pedigree node on entry to the _Cilk_for has value X, + // then at the start of each iteration of the loop body, the value of + // the last pedigree node should be 0, the value of the second-to-last + // node should equal the loop counter, and the value of the + // third-to-last node should be X. On return from the _Cilk_for, the + // value of the last pedigree should be incremented to X+2. The + // pedigree within the loop is thus flattened, such that the depth of + // recursion does not affect the results either inside or outside of + // the loop. Note that the pedigree after the loop exists is the same + // as if a single spawn and sync were executed within this function. + + // TBD: Since the shrink-wrap optimization was turned on in the compiler, + // it is not possible to get the current stack frame without actually + // forcing a call to bind-thread. This spurious spawn is a temporary + // stopgap until the correct intrinsics are added to give us total control + // over frame initialization. + _Cilk_spawn noop(); + + // Fetch the current worker. From that we can get the current stack frame + // which will be constant even if we're stolen + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_stack_frame *sf = w->current_stack_frame; + + // Decrement the rank by one to undo the pedigree change from the + // _Cilk_spawn + --w->pedigree.rank; + + // Save the current worker pedigree into loop_root_pedigree, which will be + // the root node for our flattened pedigree. + __cilkrts_pedigree loop_root_pedigree = w->pedigree; + + // Don't splice the loop_root node in yet. It will be done when we + // call the loop body lambda function +// w->pedigree.rank = 0; +// w->pedigree.next = &loop_root_pedigree; + + /* Spawn is necessary at top-level to force runtime to start up. + * Runtime must be started in order to call the grainsize() function. + */ + int gs = grainsize(grain, count); + cilk_for_recursive((count_t) 0, count, body, data, gs, w, + &loop_root_pedigree); + + // Need to refetch the worker after calling a spawning function. + w = sf->worker; + + // Restore the pedigree in the worker. + w->pedigree = loop_root_pedigree; + + // Bump the worker pedigree. + ++w->pedigree.rank; + + // Implicit sync will increment the pedigree leaf rank again, for a total + // of two increments. 
If the noop spawn above is removed, then we'll need + // to re-enable the following code: +// // If this is an optimized build, then the compiler will have optimized +// // out the increment of the worker's pedigree in the implied sync. We +// // need to add one to make the pedigree_loop test work correctly. +// #if CILKRTS_OPTIMIZED +// ++sf->worker->pedigree.rank; +// #endif +} + +// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and +// __cilkrts_cilk_for_64. +extern "C" { + +/* + * __cilkrts_cilk_for_32 + * + * Implementation of cilk_for for 32-bit trip counts (regardless of processor + * word size). Assumes that the range is 0 - count. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ + +CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data, + cilk32_t count, int grain) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // Check for an empty range here as an optimization - don't need to do any + // __cilkrts_stack_frame initialization + if (count > 0) + cilk_for_root(body, data, count, grain); +} + +/* + * __cilkrts_cilk_for_64 + * + * Implementation of cilk_for for 64-bit trip counts (regardless of processor + * word size). Assumes that the range is 0 - count. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ +CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data, + cilk64_t count, int grain) +{ + // Check for an empty range here as an optimization - don't need to do any + // __cilkrts_stack_frame initialization + if (count > 0) + cilk_for_root(body, data, count, grain); +} + +} // end extern "C" + +/* End cilk-abi-cilk-for.cpp */ diff --git a/libcilkrts/runtime/cilk-abi-vla-internal.c b/libcilkrts/runtime/cilk-abi-vla-internal.c new file mode 100644 index 00000000000..6fb92677ad0 --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-vla-internal.c @@ -0,0 +1,83 @@ +/* cilk-abi-vla-internal.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * These functions are provided in their own compilation unit so I can debug + * them. cilk-abi-vla.c must always be compiled with optimization on so that + * inlining occurs. + */ + +#include "internal/abi.h" +#include "cilk-abi-vla-internal.h" +#include "bug.h" +#include "full_frame.h" +#include "local_state.h" + +#include <stdlib.h> +#include <stdint.h> + +#include "bug.h" + +void *vla_internal_heap_alloc(__cilkrts_stack_frame *sf, + size_t full_size, + uint32_t align) +{ + return malloc(full_size); +} + +void vla_internal_heap_free(void *t, size_t size) +{ + free(t); +} + +void vla_free_from_original_stack(__cilkrts_stack_frame *sf, + size_t full_size) +{ + // The __cilkrts_stack_frame must be initialized + CILK_ASSERT(sf->worker); + +#if 1 + // Add full_size to ff->sync_sp so that when we return, the VLA will no + // longer be allocated on the stack + __cilkrts_adjust_stack(sf->worker->l->frame_ff, full_size); +#else + // Inline __cilkrts_adjust_stack for Kevin + full_frame *ff = sf->worker->l->frame_ff; + ff->sync_sp = ff->sync_sp + full_size; +#endif +} diff --git a/libcilkrts/runtime/cilk-abi-vla-internal.h b/libcilkrts/runtime/cilk-abi-vla-internal.h new file mode 100644 index 00000000000..909f08fa471 --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-vla-internal.h @@ -0,0 +1,90 @@ +/* cilk-abi-vla-internal.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk-abi-vla-internal.h + * + * @brief Allocation/deallocation function for use with Variable Length + * Arrays in spawning functions. + * + * These should be the only functions in the Cilk runtime allocating memory + * from the standard C runtime heap. This memory will be provided to user + * code for use in VLAs, when the memory cannot be allocated from the stack. + * + * While these functions are simply passthroughs to malloc and free at the + * moment, once we've got the basics of VLA allocations working we'll make + * them do fancier tricks. + */ + +/** + * @brief Allocate memory from the heap for use by a Variable Length Array in + * a spawning function. + * + * @param sf The __cilkrts_stack_frame for the spawning function containing + * the VLA. + * @param full_size The number of bytes to be allocated, including any tags + * needed to identify this as allocated from the heap. + * @param align Any alignment necessary for the allocation. + */ + +void *vla_internal_heap_alloc(__cilkrts_stack_frame *sf, + size_t full_size, + uint32_t align); + +/** + * @brief Deallocate memory from the heap used by a Variable Length Array in + * a spawning function. + * + * @param t The address of the memory block to be freed. + * @param size The size of the memory block to be freed. + */ + +void vla_internal_heap_free(void *t, + size_t size); + +/** + * @brief Deallocate memory from the original stack. We'll do this by adding + * full_size to ff->sync_sp. So after the sync, the Variable Length Array + * will no longer be allocated on the stack. + * + * @param sf The __cilkrts_stack_frame for the spawning function that is + * deallocating a VLA. + * @param full_size The size of the VLA, including any alignment and tags. + */ +void vla_free_from_original_stack(__cilkrts_stack_frame *sf, + size_t full_size); diff --git a/libcilkrts/runtime/cilk-abi.c b/libcilkrts/runtime/cilk-abi.c new file mode 100644 index 00000000000..1da05239ebc --- /dev/null +++ b/libcilkrts/runtime/cilk-abi.c @@ -0,0 +1,733 @@ +/* Cilk_abi.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file cilk-abi.c + * + * @brief cilk-abi.c implements all of the entrypoints to the Intel Cilk + * Plus runtime. + */ + +/* + * Define this macro so that compiliation of this file generates the + * non-inlined versions of certain functions in cilk_api.h. + */ +#include "internal/abi.h" +#include "cilk/cilk_api.h" +#include "cilk/cilk_undocumented.h" +#include "cilktools/cilkscreen.h" + +#include "global_state.h" +#include "os.h" +#include "os_mutex.h" +#include "bug.h" +#include "local_state.h" +#include "full_frame.h" +#include "pedigrees.h" +#include "scheduler.h" +#include "sysdep.h" +#include "except.h" +#include "cilk_malloc.h" +#include "record-replay.h" + +#include <errno.h> +#include <string.h> +#include <stdlib.h> + +#ifdef _MSC_VER +/* Some versions of icc don't support limits.h on Linux if + gcc 4.3 or newer is installed. */ +#include <limits.h> + +/* Declare _ReturnAddress compiler intrinsic */ +void * _ReturnAddress(void); +#pragma intrinsic(_ReturnAddress) + +#include "sysdep-win.h" // Needed for sysdep_init_module() +#endif /* _WIN32 */ + +#include "metacall_impl.h" +#include "reducer_impl.h" +#include "cilk-ittnotify.h" +#include "cilk-tbb-interop.h" + +#define TBB_INTEROP_DATA_DELAYED_UNTIL_BIND (void *)-1 + +/** + * __cilkrts_bind_thread is a versioned entrypoint. The runtime should be + * exporting copies of __cilkrts_bind_version for the current and all previous + * versions of the ABI. + * + * This macro should always be set to generate a version to match the current + * version; __CILKRTS_ABI_VERSION. 
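Editor's note: the versioned-entrypoint scheme described in this file keeps the real definition under the newest name and exports the older names as aliases of it; on Linux/BSD this uses GCC's alias attribute (the ALIASED_NAME macro later in cilk-abi.c), while the source notes that Mach-O lacks aliases, so macOS falls back to a forwarding call. A minimal sketch of the mechanism with made-up function names, not the runtime's real symbols:

```c
/* The real, current definition carries the versioned name. */
int do_work_v2(void) { return 2; }

/* Older name kept for binaries built against the previous ABI;
 * on ELF targets it is simply another label for do_work_v2's code. */
int do_work(void) __attribute__((alias("do_work_v2")));
```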
+ */ +#define BIND_THREAD_RTN __cilkrts_bind_thread_1 + +static inline +void enter_frame_internal(__cilkrts_stack_frame *sf, uint32_t version) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w == 0) { /* slow path */ + w = BIND_THREAD_RTN(); + + sf->flags = CILK_FRAME_LAST | (version << 24); + CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == CILK_FRAME_LAST); + } else { + sf->flags = (version << 24); + CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == 0); + } + sf->call_parent = w->current_stack_frame; + sf->worker = w; + w->current_stack_frame = sf; +} + +CILK_ABI_VOID __cilkrts_enter_frame(__cilkrts_stack_frame *sf) +{ + enter_frame_internal(sf, 0); +} + +CILK_ABI_VOID __cilkrts_enter_frame_1(__cilkrts_stack_frame *sf) +{ + enter_frame_internal(sf, 1); + sf->reserved = 0; +} + +static inline +void enter_frame_fast_internal(__cilkrts_stack_frame *sf, uint32_t version) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker_fast(); + sf->flags = version << 24; + sf->call_parent = w->current_stack_frame; + sf->worker = w; + w->current_stack_frame = sf; +} + +CILK_ABI_VOID __cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf) +{ + enter_frame_fast_internal(sf, 0); +} + +CILK_ABI_VOID __cilkrts_enter_frame_fast_1(__cilkrts_stack_frame *sf) +{ + enter_frame_fast_internal(sf, 1); + sf->reserved = 0; +} + +/** + * A component of the THE protocol. __cilkrts_undo_detach checks whether + * this frame's parent has been stolen. If it hasn't, the frame can return + * normally. If the parent has been stolen, of if we suspect it might be, + * then __cilkrts_leave_frame() needs to call into the runtime. + * + * @note __cilkrts_undo_detach() is comparing the exception pointer against + * the tail pointer. The exception pointer is modified when another worker + * is considering whether it can steal a frame. The head pointer is updated + * to match when the worker lock is taken out and the thief is sure that + * it can complete the steal. If the steal cannot be completed, the thief + * will restore the exception pointer. + * + * @return true if undo-detach failed. + */ +static int __cilkrts_undo_detach(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + __cilkrts_stack_frame *volatile *t = w->tail; + +/* DBGPRINTF("%d - __cilkrts_undo_detach - sf %p\n", w->self, sf); */ + + --t; + w->tail = t; + /* On x86 the __sync_fetch_and_<op> family includes a + full memory barrier. In theory the sequence in the + second branch of the #if should be faster, but on + most x86 it is not. */ +#if defined __i386__ || defined __x86_64__ + __sync_fetch_and_and(&sf->flags, ~CILK_FRAME_DETACHED); +#else + __cilkrts_fence(); /* membar #StoreLoad */ + sf->flags &= ~CILK_FRAME_DETACHED; +#endif + + return __builtin_expect(t < w->exc, 0); +} + +CILK_ABI_VOID __cilkrts_leave_frame(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + +/* DBGPRINTF("%d-%p __cilkrts_leave_frame - sf %p, flags: %x\n", w->self, GetWorkerFiber(w), sf, sf->flags); */ + +#ifdef _WIN32 + /* if leave frame was called from our unwind handler, leave_frame should + proceed no further. */ + if (sf->flags & CILK_FRAME_UNWINDING) + { +/* DBGPRINTF("%d - __cilkrts_leave_frame - aborting due to UNWINDING flag\n", w->self); */ + + // If this is the frame of a spawn helper (indicated by the + // CILK_FRAME_DETACHED flag) we must update the pedigree. The pedigree + // points to nodes allocated on the stack. Failing to update it will + // result in a accvio/segfault if the pedigree is walked. 
This must happen + // for all spawn helper frames, even if we're processing an exception + if ((sf->flags & CILK_FRAME_DETACHED)) + { + update_pedigree_on_leave_frame(w, sf); + } + return; + } +#endif + +#if CILK_LIB_DEBUG + /* ensure the caller popped itself */ + CILK_ASSERT(w->current_stack_frame != sf); +#endif + + /* The exiting function should have checked for zero flags, + so there is no check for flags == 0 here. */ + +#if CILK_LIB_DEBUG + if (__builtin_expect(sf->flags & (CILK_FRAME_EXITING|CILK_FRAME_UNSYNCHED), 0)) + __cilkrts_bug("W%u: function exiting with invalid flags %02x\n", + w->self, sf->flags); +#endif + + /* Must return normally if (1) the active function was called + and not spawned, or (2) the parent has never been stolen. */ + if ((sf->flags & CILK_FRAME_DETACHED)) { +/* DBGPRINTF("%d - __cilkrts_leave_frame - CILK_FRAME_DETACHED\n", w->self); */ + +#ifndef _WIN32 + if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) { +// Pedigree will be updated in __cilkrts_leave_frame. We need the +// pedigree before the update for record/replay +// update_pedigree_on_leave_frame(w, sf); + __cilkrts_return_exception(sf); + /* If return_exception returns the caller is attached. + leave_frame is called from a cleanup (destructor) + for the frame object. The caller will reraise the + exception. */ + return; + } +#endif + + // During replay, check whether w was the last worker to continue + replay_wait_for_steal_if_parent_was_stolen(w); + + // Attempt to undo the detach + if (__builtin_expect(__cilkrts_undo_detach(sf), 0)) { + // The update of pedigree for leaving the frame occurs + // inside this call if it does not return. + __cilkrts_c_THE_exception_check(w, sf); + } + + update_pedigree_on_leave_frame(w, sf); + + /* This path is taken when undo-detach wins the race with stealing. + Otherwise this strand terminates and the caller will be resumed + via setjmp at sync. */ + if (__builtin_expect(sf->flags & CILK_FRAME_FLAGS_MASK, 0)) + __cilkrts_bug("W%u: frame won undo-detach race with flags %02x\n", + w->self, sf->flags); + + return; + } + +#if CILK_LIB_DEBUG + sf->flags |= CILK_FRAME_EXITING; +#endif + + if (__builtin_expect(sf->flags & CILK_FRAME_LAST, 0)) + __cilkrts_c_return_from_initial(w); /* does return */ + else if (sf->flags & CILK_FRAME_STOLEN) + __cilkrts_return(w); /* does return */ + +/* DBGPRINTF("%d-%p __cilkrts_leave_frame - returning, StackBase: %p\n", w->self, GetWorkerFiber(w)); */ +} + +/* Caller must have called setjmp. */ +CILK_ABI_VOID __cilkrts_sync(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; +/* DBGPRINTF("%d-%p __cilkrts_sync - sf %p\n", w->self, GetWorkerFiber(w), sf); */ + if (__builtin_expect(!(sf->flags & CILK_FRAME_UNSYNCHED), 0)) + __cilkrts_bug("W%u: double sync %p\n", w->self, sf); +#ifndef _WIN32 + if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) { + __cilkrts_c_sync_except(w, sf); + } +#endif + + __cilkrts_c_sync(w, sf); +} + +/* + * __cilkrts_get_sf + * + * Debugging aid to provide access to the current __cilkrts_stack_frame. + * + * Not documented! + */ + +CILK_API_VOID_PTR +__cilkrts_get_sf(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (0 == w) + return NULL; + + return w->current_stack_frame; +} + +/* Call with global lock held */ +static __cilkrts_worker *find_free_worker(global_state_t *g) +{ + __cilkrts_worker *w = 0; + int i; + + // Scan the non-system workers looking for one which is free so we can + // use it. 
+ for (i = g->P - 1; i < g->total_workers; ++i) { + w = g->workers[i]; + CILK_ASSERT(WORKER_SYSTEM != w->l->type); + if (w->l->type == WORKER_FREE) { + w->l->type = WORKER_USER; + w->l->team = w; + return w; + } + } + + // If we ran out of workers, create a new one. It doesn't actually belong + // to the Cilk global state so nobody will ever try to steal from it. + w = (__cilkrts_worker *)__cilkrts_malloc(sizeof(*w)); + __cilkrts_cilkscreen_ignore_block(w, w+1); + make_worker(g, -1, w); + w->l->type = WORKER_USER; + w->l->team = w; + return w; +} + +/* + * __cilkrts_bind_thread + * + * Exported function to bind a thread to the runtime. + * + * This function name should always have a trailing suffix for the latest ABI + * version. This means that code built with a new compiler will not load + * against an old copy of the runtime. + * + * Symbols for the function called by code compiled with old versions of the + * compiler are created in an OS-specific manner: + * - On Windows the old symbols are defined in the cilk-exports.def linker + * definitions file as aliases of BIND_THREAD_RTN + * - On Linux aliased symbols are created for BIND_THREAD_RTN in this file + * - On MacOS the alternate entrypoints are implemented and simply call + * BIND_THREAD_RTN. + */ +CILK_ABI_WORKER_PTR BIND_THREAD_RTN(void) +{ + __cilkrts_worker *w; + int start_cilkscreen = 0; +#ifdef USE_ITTNOTIFY + static int unique_obj; +#endif + + // Cannot set this pointer until after __cilkrts_init_internal() call: + global_state_t* g; + + ITT_SYNC_CREATE (&unique_obj, "Initialization"); + ITT_SYNC_PREPARE(&unique_obj); + ITT_SYNC_ACQUIRED(&unique_obj); + + + /* 1: Initialize and start the Cilk runtime */ + __cilkrts_init_internal(1); + + /* + * 2: Choose a worker for this thread (fail if none left). The table of + * user workers is protected by the global OS mutex lock. + */ + g = cilkg_get_global_state(); + global_os_mutex_lock(); + if (__builtin_expect(g->work_done, 0)) + __cilkrts_bug("Attempt to enter Cilk while Cilk is shutting down"); + w = find_free_worker(g); + CILK_ASSERT(w); + + __cilkrts_set_tls_worker(w); + __cilkrts_cilkscreen_establish_worker(w); + { + full_frame *ff = __cilkrts_make_full_frame(w, 0); + + ff->fiber_self = cilk_fiber_allocate_from_thread(); + CILK_ASSERT(ff->fiber_self); + + cilk_fiber_set_owner(ff->fiber_self, w); + cilk_fiber_tbb_interop_use_saved_stack_op_info(ff->fiber_self); + + CILK_ASSERT(ff->join_counter == 0); + ff->join_counter = 1; + w->l->frame_ff = ff; + w->reducer_map = __cilkrts_make_reducer_map(w); + __cilkrts_set_leftmost_reducer_map(w->reducer_map, 1); + load_pedigree_leaf_into_user_worker(w); + } + + // Make sure that the head and tail are reset, and saved_protected_tail + // allows all frames to be stolen. + // + // Note that we must NOT check w->exc, since workers that are trying to + // steal from it will be updating w->exc and we don't own the worker lock. + // It's not worth taking out the lock just for an assertion. + CILK_ASSERT(w->head == w->l->ltq); + CILK_ASSERT(w->tail == w->l->ltq); + CILK_ASSERT(w->protected_tail == w->ltq_limit); + + // There may have been an old pending exception which was freed when the + // exception was caught outside of Cilk + w->l->pending_exception = NULL; + + w->reserved = NULL; + + // If we've already created a scheduling fiber for this worker, we'll just + // reuse it. If w->self < 0, it means that this is an ad-hoc user worker + // not known to the global state. 
Thus, we need to create a scheduling + // stack only if we don't already have one and w->self >= 0. + if (NULL == w->l->scheduling_fiber && w->self >= 0) + { + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) { + // Create a scheduling fiber for this worker. + w->l->scheduling_fiber = + cilk_fiber_allocate_from_heap(CILK_SCHEDULING_STACK_SIZE); + cilk_fiber_reset_state(w->l->scheduling_fiber, + scheduler_fiber_proc_for_user_worker); + cilk_fiber_set_owner(w->l->scheduling_fiber, w); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE); + } + + // If the scheduling fiber is NULL, we've either exceeded our quota for + // fibers or workers or we're out of memory, so we should lose parallelism + // by disallowing stealing. + if (NULL == w->l->scheduling_fiber) + __cilkrts_disallow_stealing(w, NULL); + + start_cilkscreen = (0 == w->g->Q); + + if (w->self != -1) { + // w->self != -1, means that w is a normal user worker and must be + // accounted for by the global state since other workers can steal from + // it. + + // w->self == -1, means that w is an overflow worker and was created on + // demand. I.e., it does not need to be accounted for by the global + // state. + + __cilkrts_enter_cilk(w->g); + } + + global_os_mutex_unlock(); + + /* If there's only 1 worker, the counts will be started in + * __cilkrts_scheduler */ + if (g->P > 1) + { + START_INTERVAL(w, INTERVAL_IN_SCHEDULER); + START_INTERVAL(w, INTERVAL_WORKING); + } + + ITT_SYNC_RELEASING(&unique_obj); + + /* Turn on Cilkscreen if this is the first worker. This needs to be done + * when we are NOT holding the os mutex. */ + if (start_cilkscreen) + __cilkrts_cilkscreen_enable_instrumentation(); + + return w; +} + +#ifndef _MSC_VER +/* + * Define old version-specific symbols for binding threads (since they exist in + * all Cilk code). These aliases prohibit newly compiled code from loading an + * old version of the runtime. We can handle old code with a new runtime, but + * new code with an old runtime is verboten! + * + * For Windows, the aliased symbol is exported in cilk-exports.def. + */ +#if defined(_DARWIN_C_SOURCE) || defined(__APPLE__) +/** + * Mac OS X: Unfortunately, Darwin doesn't allow aliasing, so we just make a + * call and hope the optimizer does the right thing. + */ +CILK_ABI_WORKER_PTR __cilkrts_bind_thread (void) { + return BIND_THREAD_RTN(); +} +#else + +/** + * Macro to convert a parameter to a string. Used on Linux or BSD. + */ +#define STRINGIFY(x) #x + +/** + * Macro to generate an __attribute__ for an aliased name + */ +#define ALIASED_NAME(x) __attribute__ ((alias (STRINGIFY(x)))) + +/** + * Linux or BSD: Use the alias attribute to make the labels for the versioned + * functions point to the same place in the code as the original. Using + * the two macros is annoying but required. + */ + +CILK_ABI_WORKER_PTR __cilkrts_bind_thread(void) + ALIASED_NAME(BIND_THREAD_RTN); + +#endif // defined _DARWIN_C_SOURCE || defined __APPLE__ +#endif // !defined _MSC_VER + +CILK_API_SIZET +__cilkrts_get_stack_size(void) { + return cilkg_get_stack_size(); +} + +// Method for debugging. 
+CILK_API_VOID __cilkrts_dump_stats(void) +{ + // While the stats aren't protected by the global OS mutex, the table + // of workers is, so take out the global OS mutex while we're doing this + global_os_mutex_lock(); + if (cilkg_is_published()) { + global_state_t *g = cilkg_get_global_state(); + __cilkrts_dump_stats_to_stderr(g); + } + else { + __cilkrts_bug("Attempting to report Cilk stats before the runtime has started\n"); + } + global_os_mutex_unlock(); +} + +#ifndef _WIN32 +CILK_ABI_THROWS_VOID __cilkrts_rethrow(__cilkrts_stack_frame *sf) +{ + __cilkrts_gcc_rethrow(sf); +} +#endif + +/* + * __cilkrts_unwatch_stack + * + * Callback for TBB to tell us they don't want to watch the stack anymore + */ + +static __cilk_tbb_retcode __cilkrts_unwatch_stack(void *data) +{ + __cilk_tbb_stack_op_thunk o; + + // If the cilk_fiber wasn't available fetch it now + if (TBB_INTEROP_DATA_DELAYED_UNTIL_BIND == data) + { + full_frame *ff; + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (NULL == w) + { + // Free any saved stack op information + cilk_fiber_tbb_interop_free_stack_op_info(); + + return 0; /* Success! */ + } + + __cilkrts_worker_lock(w); + ff = w->l->frame_ff; + __cilkrts_frame_lock(w,ff); + data = ff->fiber_self; + __cilkrts_frame_unlock(w,ff); + __cilkrts_worker_unlock(w); + } + +#if CILK_LIB_DEBUG /* Debug code */ + /* Get current stack */ + full_frame *ff; + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_worker_lock(w); + ff = w->l->frame_ff; + __cilkrts_frame_lock(w,ff); + CILK_ASSERT (data == ff->fiber_self); + __cilkrts_frame_unlock(w,ff); + __cilkrts_worker_unlock(w); +#endif + + /* Clear the callback information */ + o.data = NULL; + o.routine = NULL; + cilk_fiber_set_stack_op((cilk_fiber*)data, o); + + // Note. Do *NOT* free any saved stack information here. If they want to + // free the saved stack op information, they'll do it when the thread is + // unbound + + return 0; /* Success! */ +} + +/* + * __cilkrts_watch_stack + * + * Called by TBB, defined by Cilk. + * + * Requests that Cilk invoke the stack op routine when it orphans a stack. + * Cilk sets *u to a thunk that TBB should call when it is no longer interested + * in watching the stack. + */ + +CILK_API_TBB_RETCODE +__cilkrts_watch_stack(__cilk_tbb_unwatch_thunk *u, + __cilk_tbb_stack_op_thunk o) +{ + cilk_fiber* current_fiber; + __cilkrts_worker *w; + +#ifdef _MSC_VER + // This may be called by TBB *before* the OS has given us our + // initialization call. Make sure the module is initialized. + sysdep_init_module(); +#endif + + // Fetch the __cilkrts_worker bound to this thread + w = __cilkrts_get_tls_worker(); + if (NULL == w) + { + // Save data for later. We'll deal with it when/if this thread binds + // to the runtime + cilk_fiber_tbb_interop_save_stack_op_info(o); + + u->routine = __cilkrts_unwatch_stack; + u->data = TBB_INTEROP_DATA_DELAYED_UNTIL_BIND; + + return 0; + } + + /* Get current stack */ + __cilkrts_worker_lock(w); + current_fiber = w->l->frame_ff->fiber_self; + __cilkrts_worker_unlock(w); + +/* CILK_ASSERT( !sd->stack_op_data ); */ +/* CILK_ASSERT( !sd->stack_op_routine ); */ + + /* Give TBB our callback */ + u->routine = __cilkrts_unwatch_stack; + u->data = current_fiber; + /* Save the callback information */ + cilk_fiber_set_stack_op(current_fiber, o); + + return 0; /* Success! */ +} + + +// This function must be called only within a continuation, within the stack +// frame of the continuation itself. 
+CILK_API_INT __cilkrts_synched(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + // If we don't have a worker, then we're synched by definition :o) + if (NULL == w) + return 1; + + // Check to see if we are in a stolen continuation. If not, then + // we are synched. + uint32_t flags = w->current_stack_frame->flags; + if (0 == (flags & CILK_FRAME_UNSYNCHED)) + return 1; + + // We are in a stolen continutation, but the join counter might have been + // decremented to one, making us synched again. Get the full frame so + // that we can check the join counter. ASSUME: frame_ff is stable (can be + // read without a lock) in a stolen continuation -- it can't be stolen + // while it's currently executing. + full_frame *ff = w->l->frame_ff; + + // Make sure we have a full frame + // TBD: Don't think that we should ever not have a full frame here. + // CILK_ASSERT(NULL != ff); ? + if (NULL == ff) + return 1; + + // We're synched if there are no outstanding children at this instant in + // time. Note that this is a known race, but it's ok since we're only + // reading. We can get false negatives, but not false positives. (I.e., + // we can read a non-one join_counter just before it goes to one, but the + // join_counter cannot go from one to greater than one while we're + // reading.) + return 1 == ff->join_counter; +} + + + + +CILK_API_INT +__cilkrts_bump_loop_rank_internal(__cilkrts_worker* w) +{ + // If we don't have a worker, then the runtime is not bound to this + // thread and there is no rank to increment + if (NULL == w) + return -1; + + // We're at the start of the loop body. Advance the cilk_for loop + // body pedigree by following the parent link and updating its + // rank. + + // Normally, we'd just write "w->pedigree.parent->rank++" + // But we need to cast away the "const". + ((__cilkrts_pedigree*) w->pedigree.parent)->rank++; + + // Zero the worker's pedigree rank since this is the start of a new + // pedigree domain. + w->pedigree.rank = 0; + + return 0; +} + +CILK_ABI_VOID +__cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ + // Pass call onto OS/architecture dependent function + sysdep_save_fp_ctrl_state(sf); +} + +/* end cilk-abi.c */ diff --git a/libcilkrts/runtime/cilk-ittnotify.h b/libcilkrts/runtime/cilk-ittnotify.h new file mode 100644 index 00000000000..ff995db6fbb --- /dev/null +++ b/libcilkrts/runtime/cilk-ittnotify.h @@ -0,0 +1,100 @@ +/* cilk-ittnotify.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_CILK_ITTNOTIFY_DOT_H +#define INCLUDED_CILK_ITTNOTIFY_DOT_H + +#ifdef __INTEL_COMPILER +#endif +#include <stdio.h> + +// ITTNOTIFY does not support ARM at this time +#ifdef __arm__ +#undef USE_ITTNOTIFY +#endif + +#ifdef USE_ITTNOTIFY +#include <ittnotify.h> + +#ifdef _WIN32 +# define ITT_SYNC_CREATE(_address, _description) \ + __itt_sync_createA(_address, \ + "Intel Cilk Plus " _description, \ + "", \ + __itt_attr_barrier) +#else +# define ITT_SYNC_CREATE(_address, _description) \ + __itt_sync_create(_address, \ + "Intel Cilk Plus " _description, \ + "", \ + __itt_attr_barrier) +#endif + +#define ITT_SYNC_PREPARE(_address) __itt_sync_prepare(_address) +#define ITT_SYNC_ACQUIRED(_address) __itt_sync_acquired(_address) +#define ITT_SYNC_RELEASING(_address) __itt_sync_releasing(_address) +#define ITT_SYNC_DESTROY(_address) __itt_sync_destroy(_address) +// Note that we subtract 5 from the return address to find the CALL instruction +// to __cilkrts_sync +#if 1 // Disable renaming for now. Piersol isn't ready yet +#define ITT_SYNC_SET_NAME_AND_PREPARE(_address, _sync_ret_address) __itt_sync_prepare(_address) +#else +#define ITT_SYNC_SET_NAME_AND_PREPARE(_address, _sync_ret_address) \ + if (NULL != __itt_sync_prepare_ptr) { \ + if (0 == _sync_ret_address) \ + __itt_sync_renameA(_address, ""); \ + else \ + { \ + char buf[128]; \ + sprintf_s(buf, 128, "IP:0x%p", (DWORD_PTR)_sync_ret_address - 5); \ + __itt_sync_renameA(_address, buf); \ + _sync_ret_address = 0; \ + } \ + __itt_sync_prepare(_address); \ + } +#endif +#else // USE_ITTNOTIFY not defined, compile out all calls +#define ITT_SYNC_CREATE(_address, _description) +#define ITT_SYNC_PREPARE(_address) +#define ITT_SYNC_ACQUIRED(_address) +#define ITT_SYNC_RELEASING(_addresss) +#define ITT_SYNC_DESTROY(_address) +#define ITT_SYNC_SET_NAME_AND_PREPARE(_sync_address, _wait_address) +#endif + +#endif // ! defined(INCLUDED_CILK_ITTNOTIFY_DOT_H) diff --git a/libcilkrts/runtime/cilk-tbb-interop.h b/libcilkrts/runtime/cilk-tbb-interop.h new file mode 100644 index 00000000000..cc5cff4b57e --- /dev/null +++ b/libcilkrts/runtime/cilk-tbb-interop.h @@ -0,0 +1,192 @@ +/* cilk-tbb-interop.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk-tbb-interop.h + * + * @brief Interface between TBB and Cilk to allow TBB to associate it's + * per-thread data with Cilk workers, and maintain the association as work + * moves between worker threads. This handles the case where TBB calls + * into a Cilk function which may later call back to a function making + * TBB calls. + * + * Each thunk structure has two pointers: \"routine\" and \"data\". + * The caller of the thunk invokes *routine, passing \"data\" as the void* + * parameter. + */ + +#ifndef INCLUDED_CILK_TBB_INTEROP_DOT_H +#define INCLUDED_CILK_TBB_INTEROP_DOT_H + +#include <cilk/common.h> // for CILK_EXPORT + +__CILKRTS_BEGIN_EXTERN_C + +/** A return code. 0 indicates success. */ +typedef int __cilk_tbb_retcode; + +/** + * Enumeration of reasons that Cilk will call the TBB stack operation + * function. + * + * When a non-empty stack is transfered between threads, the first thread must + * orphan it and the second thread must adopt it. + * + * An empty stack can be transfered similarly, or simply released by the first + * thread. + * + * Here is a summary of the actions as transitions on a state machine. +@verbatim + watch ORPHAN + -->--> -->-- + / \ / \ + (freed empty stack) (TBB sees stack running on thread) (stack in limbo) + \ / \ / + --<-- --<-- + RELEASE or ADOPT + unwatch +@endverbatim + */ +typedef enum __cilk_tbb_stack_op { + /** + * Disconnecting stack from a thread. + * + * The thunk must be invoked on the thread disconnecting itself from the + * stack. Must \"happen before\" the stack is adopted elsewhere. + */ + CILK_TBB_STACK_ORPHAN, + + /** + * Reconnecting orphaned stack to a thread. + * + * The thunk must be invoked on the thread adopting the stack. + */ + CILK_TBB_STACK_ADOPT, + + /** + * Releasing stack. + * + * The thunk must be invoked on the thread doing the releasing, Must + * \"happen before\" the stack is used elsewhere. 
+ */ + CILK_TBB_STACK_RELEASE +} __cilk_tbb_stack_op; + +/** + * Function that will be called by the Cilk runtime to inform TBB of a change + * in the stack associated with the current thread. + * + * It does not matter what stack the thunk runs on. + * The thread (not fiber) on which the thunk runs is important. + * + * @param op Enumerated value indicating what type of change is ocurring. + * @param data Context value provided by TBB in the __cilkrts_watch_stack + * call. This data is opaque to Cilk. + * + * @return 0 indicates success. + */ +typedef __cilk_tbb_retcode (*__cilk_tbb_pfn_stack_op)(enum __cilk_tbb_stack_op op, + void* data); + +/** + * Function that will be called by TBB to inform the Cilk runtime that TBB + * is no longer interested in watching the stack bound to the current thread. + * + * @param data Context value provided to TBB by the __cilkrts_watch_stack + * call. This data is opaque to TBB. + * + * @return 0 indicates success. + */ +typedef __cilk_tbb_retcode (*__cilk_tbb_pfn_unwatch_stacks)(void *data); + +/** + * Thunk invoked by Cilk to call back to TBB to tell it about a change in + * the stack bound to the current thread. + */ +typedef struct __cilk_tbb_stack_op_thunk { + /// Function in TBB the Cilk runtime should call when something + // "interesting" happens involving a stack + __cilk_tbb_pfn_stack_op routine; + + /// TBB context data to pass with the call to the stack_op routine + void* data; +} __cilk_tbb_stack_op_thunk; + +/** + * Thunk invoked by TBB when it is no longer interested in watching the stack + * bound to the current thread. + */ +typedef struct __cilk_tbb_unwatch_thunk { + /// Function in Cilk runtime to call when TBB no longer wants to watch + // stacks + __cilk_tbb_pfn_unwatch_stacks routine; + + /// Cilk runtime context data to pass with the call to the unwatch_stacks + /// routine + void* data; +} __cilk_tbb_unwatch_thunk; + +/** + * Requests that Cilk invoke __cilk_tbb_orphan_thunk when it orphans a stack. + * Cilk sets *u to a thunk that TBB should call when it is no longer + * interested in watching the stack. + * + * If the thread is not yet bound to the Cilk runtime, the Cilk runtime should + * save this data in thread-local storage until __cilkrts_bind_thread is called. + * + * Called by TBB, defined by Cilk. This function is exported from the Cilk + * runtime DLL/shared object. This declaration also appears in + * cilk/cilk_undocumented.h -- don't change one declaration without also + * changing the other. + * + * @param u __cilk_tbb_unwatch_thunk. This structure will be filled in by + * the Cilk runtime to allow TBB to register that it is no longer interested + * in watching the stack bound to the current thread. + * @param o __cilk_tbb_stack_op_thunk. This structure specifies the routine + * that the Cilk runtime should call when an "interesting" change in the stack + * associate with the current worker occurs. + * + * @return 0 indicates success. + */ +CILK_EXPORT +__cilk_tbb_retcode __cilkrts_watch_stack(__cilk_tbb_unwatch_thunk* u, + __cilk_tbb_stack_op_thunk o); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_CILK_TBB_INTEROP_DOT_H) diff --git a/libcilkrts/runtime/cilk_api.c b/libcilkrts/runtime/cilk_api.c new file mode 100644 index 00000000000..bbca984bc03 --- /dev/null +++ b/libcilkrts/runtime/cilk_api.c @@ -0,0 +1,255 @@ +/* cilk_api.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * Implementation of functions declared in cilk_api.h + */ + +/* + * Define the COMPILING_CILK_ABI_FUNCTIONS macro, so that + * compilation of this file generates non-inlined definitions for the + * functions marked as CILK_EXPORT_AND_INLINE in cilk_api.h. + * + * We must deal with these functions differently because we need to + * continue to ship nonlined versions of these functions. + * + * CILK_EXPORT_AND_INLINE int __cilkrts_get_worker_rank(uint64_t *rank); + * CILK_EXPORT_AND_INLINE int __cilkrts_bump_worker_rank(); + * CILK_EXPORT_AND_INLINE int __cilkrts_bump_loop_rank(); + */ +#define COMPILING_CILK_API_FUNCTIONS + +#include <internal/abi.h> +#include <cilk/cilk_api.h> + +#include "os.h" +#include "os_mutex.h" +#include "bug.h" +#include "global_state.h" +#include "local_state.h" +#include "scheduler.h" +#include "sysdep.h" + +CILK_API_VOID __cilkrts_init(void) +{ + // Initialize, but don't start, the cilk runtime. 
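    /* A hedged reading of the 0 argument below: it appears to request
     * initialization of the global runtime state without starting the worker
     * threads, which are brought up later when Cilk work actually begins.
     * Calling __cilkrts_init() early (for example at the top of main) is a
     * way to move that one-time setup cost out of the first parallel region.
     */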
+ __cilkrts_init_internal(0); +} + +CILK_API_VOID __cilkrts_end_cilk(void) +{ + // Take out the global OS mutex while we do this to protect against + // another thread attempting to bind while we do this + global_os_mutex_lock(); + + if (cilkg_is_published()) { + global_state_t *g = cilkg_get_global_state(); + if (g->Q || __cilkrts_get_tls_worker()) + __cilkrts_bug("Attempt to shut down Cilk while Cilk is still " + "running"); + __cilkrts_stop_workers(g); + __cilkrts_deinit_internal(g); + } + + global_os_mutex_unlock(); +} + +CILK_API_INT +__cilkrts_get_nworkers() +{ + return cilkg_get_nworkers(); +} + +CILK_API_INT +__cilkrts_get_total_workers() +{ + return cilkg_get_total_workers(); +} + +CILK_API_INT __cilkrts_get_force_reduce(void) +{ + return cilkg_get_force_reduce(); +} + +CILK_API_INT __cilkrts_set_param(const char* param, const char* value) +{ + return cilkg_set_param(param, value); +} + +#ifdef _WIN32 +CILK_API_INT __cilkrts_set_param_w(const wchar_t* param, const wchar_t* value) +{ + return cilkg_set_param_w(param, value); +} +#endif // _WIN32 + +/* Return a small integer indicating which Cilk worker the function is + * currently running on. Each thread started by the Cilk runtime library + * (system worker) has a unique worker number in the range 1..P-1, where P is + * the valued returned by __cilkrts_get_nworkers(). All threads started by + * the user or by other libraries (user workers) share the worker number 0. + * Therefore, the worker number is not unique across multiple user threads. + * + * Implementor's note: The value returned from this function is different from + * the value, w->self, used in most debug messages. + */ +CILK_API_INT +__cilkrts_get_worker_number(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + if (0 == w) + /* A non-worker always has a worker number of zero. */ + return 0; + else if (WORKER_USER == w->l->type) + /* User worker was once a non-worker, so its number should still be + * zero. */ + return 0; + else + /* w->self for a system worker is in range 0..(P-1); adjust to 1..P + * to avoid conflicting with the user thread's worker number. */ + return w->self + 1; +} + +/** + * Internal definition of the pedigree context. The size of the + * structure must match __cilkrts_pedigree_context_t defined in abi.i + */ +typedef struct pedigree_context_t +{ + /** Size of the structure, in bytes */ + size_t size; + + /** Next __cilkrts_pedigree to return */ + const __cilkrts_pedigree *pedigree; + + /** Unused. Left over from previous implementation */ + void *unused1; + + /** Unused. Left over from previous implementation */ + void *unused2; + + // // Debugging aid for pedigree-test: + // __cilkrts_stack_frame *expected_sf; +} pedigree_context_t; + +/* + * __cilkrts_get_pedigree_info + * + * Fetch the birthrank for a stack frame. To initialize the walk, both sf_in + * and frame_in should be NULL. parent_sf_ptr and parent_frame_ptr provide + * context for the stackwalk and should be returned as sf_in and frame_in on + * the next call. 
+ * + * Returns: + * 0 - Success - birthrank, parent_sf_out and parent_frame_out are valid + * >1 - Pedigree walk completed + * <1 - Failure - -1: No worker bound to thread, -2: Sanity check failed + */ + +#define PEDIGREE_WALK_COMPLETE (__cilkrts_pedigree *)-1 + +CILK_API_INT +__cilkrts_get_pedigree_info(__cilkrts_pedigree_context_t *external_context, + uint64_t *sf_birthrank) +{ + pedigree_context_t *context = (pedigree_context_t *)external_context; + + CILK_ASSERT(sizeof(__cilkrts_pedigree_context_t) == + sizeof(pedigree_context_t)); + if (context->size != sizeof(pedigree_context_t)) + return -3; // Invalid size + + // If the pointer to the last __cilkrts_pedigree is -1, we've + // finished the walk. We're still done. + if (PEDIGREE_WALK_COMPLETE == context->pedigree) + return 1; + + // The passed in context value contains a pointer to the last + // __cilkrts_pedigree returned, or NULL if we're starting a + // new walk + if (NULL == context->pedigree) + { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_pedigree* pedigree_node; + if (NULL != w) { + pedigree_node = &w->pedigree; + } + else { + pedigree_node = __cilkrts_get_tls_pedigree_leaf(1); + } + context->pedigree = pedigree_node->parent; + } + else + context->pedigree = context->pedigree->parent; + + // Note: If we want to omit the user root node, + // stop at context->pedigree->parent instead. + if (NULL == context->pedigree) + { + context->pedigree = PEDIGREE_WALK_COMPLETE; + return 1; + } + + *sf_birthrank = context->pedigree->rank; + return 0; +} + +CILK_API_PEDIGREE +__cilkrts_get_pedigree_internal(__cilkrts_worker *w) +{ + if (NULL != w) { + return w->pedigree; + } + else { + const __cilkrts_pedigree *pedigree = + __cilkrts_get_tls_pedigree_leaf(1); + return *pedigree; + } +} + + +CILK_API_INT __cilkrts_bump_worker_rank_internal(__cilkrts_worker *w) +{ + __cilkrts_pedigree *pedigree; + pedigree = (w ? &w->pedigree : __cilkrts_get_tls_pedigree_leaf(1)); + pedigree->rank++; + return 0; +} + +/* End cilk_api.c */ diff --git a/libcilkrts/runtime/cilk_fiber-unix.cpp b/libcilkrts/runtime/cilk_fiber-unix.cpp new file mode 100644 index 00000000000..b9b47e364a5 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber-unix.cpp @@ -0,0 +1,273 @@ +/* cilk_fiber-unix.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "cilk_fiber-unix.h" +#include "cilk_malloc.h" +#include "bug.h" +#include "os.h" + +#include <cstdio> +#include <cstdlib> + +#include <alloca.h> +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> + +// MAP_ANON is deprecated on Linux, but seems to be required on Mac... +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +// Magic number for sanity checking fiber structure +const unsigned magic_number = 0x5afef00d; + +int cilk_fiber_sysdep::s_page_size = getpagesize(); + +cilk_fiber_sysdep::cilk_fiber_sysdep(std::size_t stack_size) + : cilk_fiber(stack_size) + , m_magic(magic_number) +{ + // Set m_stack and m_stack_base. + make_stack(stack_size); + + // Get high-address of stack, with 32-bytes of spare space, and rounded + // down to the nearest 32-byte boundary. + const uintptr_t align_mask = 32 - 1; + m_stack_base -= ((std::size_t) m_stack_base) & align_mask; +} + +cilk_fiber_sysdep::cilk_fiber_sysdep(from_thread_t) + : cilk_fiber() + , m_magic(magic_number) +{ + this->set_allocated_from_thread(true); + + // Dummy stack data for thread-main fiber + m_stack = NULL; + m_stack_base = NULL; +} + +void cilk_fiber_sysdep::convert_fiber_back_to_thread() +{ + // Does nothing on Linux. +} + +cilk_fiber_sysdep::~cilk_fiber_sysdep() +{ + CILK_ASSERT(magic_number == m_magic); + if (!this->is_allocated_from_thread()) + free_stack(); +} + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber_sysdep* cilk_fiber_sysdep::get_current_fiber_sysdep() +{ + return cilkos_get_tls_cilk_fiber(); +} +#endif + +// Jump to resume other fiber. We may or may not come back. +inline void cilk_fiber_sysdep::resume_other_sysdep(cilk_fiber_sysdep* other) +{ + if (other->is_resumable()) { + other->set_resumable(false); + // Resume by longjmp'ing to the place where we suspended. + CILK_LONGJMP(other->m_resume_jmpbuf); + } + else { + // Otherwise, we've never ran this fiber before. Start the + // proc method. + other->run(); + } +} + +void cilk_fiber_sysdep::suspend_self_and_resume_other_sysdep(cilk_fiber_sysdep* other) +{ +#if SUPPORT_GET_CURRENT_FIBER + cilkos_set_tls_cilk_fiber(other); +#endif + CILK_ASSERT(this->is_resumable()); + + + // Jump to the other fiber. We expect to come back. + if (! CILK_SETJMP(m_resume_jmpbuf)) { + resume_other_sysdep(other); + } + + // Return here when another fiber resumes me. + // If the fiber that switched to me wants to be deallocated, do it now. + do_post_switch_actions(); +} + +NORETURN cilk_fiber_sysdep::jump_to_resume_other_sysdep(cilk_fiber_sysdep* other) +{ +#if SUPPORT_GET_CURRENT_FIBER + cilkos_set_tls_cilk_fiber(other); +#endif + CILK_ASSERT(!this->is_resumable()); + + // Jump to the other fiber. But we are never coming back because + // this fiber is being reset. + resume_other_sysdep(other); + + // We should never come back here... 
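    /* resume_other_sysdep() above either longjmps into the other fiber's
     * saved context or starts its proc via run(); since this path is
     * NORETURN and the current fiber is being reset, control must never
     * arrive here.  The __cilkrts_bug() call below is a loud guard for that
     * invariant.
     */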
+ __cilkrts_bug("Should not get here"); +} + + +NORETURN cilk_fiber_sysdep::run() +{ + // Only fibers created from a pool have a proc method to run and execute. + CILK_ASSERT(m_start_proc); + CILK_ASSERT(!this->is_allocated_from_thread()); + CILK_ASSERT(!this->is_resumable()); + + // TBD: This setjmp/longjmp pair simply changes the stack pointer. + // We could probably replace this code with some assembly. + if (! CILK_SETJMP(m_resume_jmpbuf)) + { + // Calculate the size of the current stack frame (i.e., this + // run() function. + size_t frame_size = (size_t)JMPBUF_FP(m_resume_jmpbuf) - (size_t)JMPBUF_SP(m_resume_jmpbuf); + + // Macs require 16-byte alignment. Do it always because it just + // doesn't matter + if (frame_size & (16-1)) + frame_size += 16 - (frame_size & (16-1)); + + // Assert that we are getting a reasonable frame size out of + // it. If this run() function is using more than 4096 bytes + // of space for its local variables / any state that spills to + // registers, something is probably *very* wrong here... + // + // 4096 bytes just happens to be a number that seems "large + // enough" --- for an example GCC 32-bit compilation, the + // frame size was 48 bytes. + CILK_ASSERT(frame_size < 4096); + + // Change stack pointer to fiber stack. Offset the + // calculation by the frame size, so that we've allocated + // enough extra space from the top of the stack we are + // switching to for any temporaries required for this run() + // function. + JMPBUF_SP(m_resume_jmpbuf) = m_stack_base - frame_size; + CILK_LONGJMP(m_resume_jmpbuf); + } + + // Note: our resetting of the stack pointer is valid only if the + // compiler has not saved any temporaries onto the stack for this + // function before the longjmp that we still care about at this + // point. + + // Verify that 1) 'this' is still valid and 2) '*this' has not been + // corrupted. + CILK_ASSERT(magic_number == m_magic); + + // If the fiber that switched to me wants to be deallocated, do it now. + do_post_switch_actions(); + + // Now call the user proc on the new stack + m_start_proc(this); + + // alloca() to force generation of frame pointer. The argument to alloca + // is contrived to prevent the compiler from optimizing it away. This + // code should never actually be executed. + int* dummy = (int*) alloca((sizeof(int) + (std::size_t) m_start_proc) & 0x1); + *dummy = 0xface; + + // User proc should never return. + __cilkrts_bug("Should not get here"); +} + +void cilk_fiber_sysdep::make_stack(size_t stack_size) +{ + char* p; + // We've already validated that the stack size is page-aligned and + // is a reasonable value. No need to do any extra rounding here. + size_t rounded_stack_size = stack_size; + + // Normally, we have already validated that the stack size is + // aligned to 4K. In the rare case that pages are huge though, we + // need to do some extra checks. + if (rounded_stack_size < 3 * (size_t)s_page_size) { + // If the specified stack size is too small, round up to 3 + // pages. We need at least 2 extra for the guard pages. + rounded_stack_size = 3 * (size_t)s_page_size; + } + else { + // Otherwise, the stack size is large enough, but might not be + // a multiple of page size. Round up to nearest multiple of + // s_page_size, just to be safe. 
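    /* Illustrative arithmetic (not from the original sources): with
     * s_page_size == 4096 and a requested stack_size of 1,000,000 bytes,
     * remainder == 1,000,000 % 4096 == 576, so the size is rounded up by
     * 4096 - 576 == 3520 to 1,003,520 bytes (245 pages), two of which become
     * the mprotect'ed guard pages below.
     */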
+ size_t remainder = rounded_stack_size % s_page_size; + if (remainder) { + rounded_stack_size += s_page_size - remainder; + } + } + + p = (char*)mmap(0, rounded_stack_size, + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, + -1, 0); + if (MAP_FAILED == p) { + // For whatever reason (probably ran out of memory), mmap() failed. + // There is no stack to return, so the program loses parallelism. + m_stack = NULL; + m_stack_base = NULL; + return; + } + + // mprotect guard pages. + mprotect(p + rounded_stack_size - s_page_size, s_page_size, PROT_NONE); + mprotect(p, s_page_size, PROT_NONE); + + m_stack = p; + m_stack_base = p + rounded_stack_size - s_page_size; +} + + +void cilk_fiber_sysdep::free_stack() +{ + if (m_stack) { + size_t rounded_stack_size = m_stack_base - m_stack + s_page_size; + if (munmap(m_stack, rounded_stack_size) < 0) + __cilkrts_bug("Cilk: stack munmap failed error %d\n", errno); + } +} + +/* End cilk_fiber-unix.cpp */ diff --git a/libcilkrts/runtime/cilk_fiber-unix.h b/libcilkrts/runtime/cilk_fiber-unix.h new file mode 100644 index 00000000000..9f47d5b0437 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber-unix.h @@ -0,0 +1,149 @@ +/* cilk_fiber-unix.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_CILK_FIBER_UNIX_DOT_H +#define INCLUDED_CILK_FIBER_UNIX_DOT_H + +#ifndef __cplusplus +# error cilk_fiber-unix.h is a C++-only header +#endif + +#include "cilk_fiber.h" +#include "jmpbuf.h" + +/** + * @file cilk_fiber-unix.h + * + * @brief Unix-specific implementation for cilk_fiber. + */ + +/** + * @brief Unix-specific fiber class derived from portable fiber class + */ +struct cilk_fiber_sysdep : public cilk_fiber +{ + public: + +#if SUPPORT_GET_CURRENT_FIBER + /** + * @brief Gets the current fiber from TLS. 
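 * (This accessor is compiled only when SUPPORT_GET_CURRENT_FIBER is
 * defined; per the implementation file above it simply forwards to the
 * TLS helper cilkos_get_tls_cilk_fiber().)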
+ */ + static cilk_fiber_sysdep* get_current_fiber_sysdep(); +#endif + + /** + * @brief Construct the system-dependent portion of a fiber. + * + * @param stack_size The size of the stack for this fiber. + */ + cilk_fiber_sysdep(std::size_t stack_size); + + /** + * @brief Construct the system-dependent of a fiber created from a + * thread. + */ + cilk_fiber_sysdep(from_thread_t); + + /** + * @brief Destructor + */ + ~cilk_fiber_sysdep(); + + /** + * @brief OS-specific calls to convert this fiber back to thread. + * + * Nothing to do for Linux. + */ + void convert_fiber_back_to_thread(); + + /** + * @brief System-dependent function to suspend self and resume execution of "other". + * + * This fiber is suspended. + * + * @pre @c is_resumable() should be true. + * + * @param other Fiber to resume. + */ + void suspend_self_and_resume_other_sysdep(cilk_fiber_sysdep* other); + + /** + * @brief System-dependent function called to jump to @p other + * fiber. + * + * @pre @c is_resumable() should be false. + * + * @param other Fiber to resume. + */ + NORETURN jump_to_resume_other_sysdep(cilk_fiber_sysdep* other); + + /** + * @brief Runs the start_proc. + * @pre is_resumable() should be false. + * @pre is_allocated_from_thread() should be false. + * @pre m_start_proc must be valid. + */ + NORETURN run(); + + /** + * @brief Returns the base of this fiber's stack. + */ + inline char* get_stack_base_sysdep() { return m_stack_base; } + + private: + char* m_stack_base; ///< The base of this fiber's stack. + char* m_stack; // Stack memory (low address) + __CILK_JUMP_BUFFER m_resume_jmpbuf; // Place to resume fiber + unsigned m_magic; // Magic number for checking + + static int s_page_size; // Page size for + // stacks. + + // Allocate memory for a stack. This method + // initializes m_stack and m_stack_base. + void make_stack(size_t stack_size); + + // Deallocates memory for the stack. + void free_stack(); + + // Common helper method for implementation of resume_other_sysdep + // variants. + inline void resume_other_sysdep(cilk_fiber_sysdep* other); +}; + +#endif // ! defined(INCLUDED_CILK_FIBER_UNIX_DOT_H) diff --git a/libcilkrts/runtime/cilk_fiber.cpp b/libcilkrts/runtime/cilk_fiber.cpp new file mode 100644 index 00000000000..0c66f234d3b --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber.cpp @@ -0,0 +1,1078 @@ +/* cilk_fiber.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* Implementations of non-platform-specific aspects of cilk_fiber, especially + * the cilk_fiber_pool interface. + */ +#include "cilk_fiber.h" +#ifdef _WIN32 +# include "cilk_fiber-win.h" +#else +# include "cilk_fiber-unix.h" +#endif +#include "cilk_malloc.h" +#include "bug.h" +#include <new> + +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> + +#include "sysdep.h" + + +extern "C" { + +inline int cilk_fiber_pool_sanity_check(cilk_fiber_pool *pool, const char* desc) +{ + int errors = 0; +#if FIBER_DEBUG >= 1 + if ((NULL != pool) && pool->total > 0) { + + // Root pool should not allocate more fibers than alloc_max + errors += ((pool->parent == NULL) && + (pool->total > pool->alloc_max)); + errors += (pool->total > pool->high_water); + + if (errors) { + fprintf(stderr, "ERROR at %s: pool=%p has max_size=%u, total=%d, high_water=%d\n", + desc, + pool, pool->max_size, pool->total, pool->high_water); + } + } +#endif + return (errors == 0); +} + +inline void increment_pool_total(cilk_fiber_pool* pool) +{ + ++pool->total; + if (pool->high_water < pool->total) + pool->high_water = pool->total; +} + +inline void decrement_pool_total(cilk_fiber_pool* pool, int fibers_freed) +{ + pool->total -= fibers_freed; +} + + +/** + * @brief Free fibers from this pool until we have at most @c + * num_to_keep fibers remaining, and then put a fiber back. + * + * @pre We do not hold @c pool->lock + * @post After completion, we do not hold @c pool->lock + */ +static void cilk_fiber_pool_free_fibers_from_pool(cilk_fiber_pool* pool, + unsigned num_to_keep, + cilk_fiber* fiber_to_return) +{ + // Free our own fibers, until we fall below our desired threshold. + // Each iteration of this loop proceeds in the following stages: + // 1. Acquire the pool lock, + // 2. Grabs up to B fibers from the pool, stores them into a buffer. + // 3. Check if pool is empty enough. If yes, put the last fiber back, + // and remember that we should quit. + // 4. Release the pool lock, and actually free any buffered fibers. + // 5. Check if we are done and should exit the loop. Otherwise, try again. + // + const bool need_lock = pool->lock; + bool last_fiber_returned = false; + + do { + const int B = 10; // Pull at most this many fibers from the + // parent for one lock acquisition. Make + // this value large enough to amortize + // against the cost of acquiring and + // releasing the lock. + int num_to_free = 0; + cilk_fiber* fibers_to_free[B]; + + // Stage 1: Grab the lock. + if (need_lock) { + spin_mutex_lock(pool->lock); + } + + // Stage 2: Grab up to B fibers to free. + int fibers_freed = 0; + while ((pool->size > num_to_keep) && (num_to_free < B)) { + fibers_to_free[num_to_free++] = pool->fibers[--pool->size]; + fibers_freed++; + } + decrement_pool_total(pool, fibers_freed); + + // Stage 3. Pool is below threshold. Put extra fiber back. 
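        /* A worked example of the loop (illustration, not from the original
         * sources): with pool->size == 25, num_to_keep == 7 and B == 10, the
         * first pass buffers 10 fibers (size 25 -> 15); the second buffers 8
         * more (size 15 -> 7), the threshold test below then succeeds, the
         * returned fiber (if any) goes back into the pool (size 7 -> 8), and
         * the loop exits after freeing the buffered fibers outside the lock.
         */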
+ if (pool->size <= num_to_keep) { + // Put the last fiber back into the pool. + if (fiber_to_return) { + CILK_ASSERT(pool->size < pool->max_size); + pool->fibers[pool->size] = fiber_to_return; + pool->size++; + } + last_fiber_returned = true; + } + + // Stage 4: Release the lock, and actually free any fibers + // buffered. + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + + for (int i = 0; i < num_to_free; ++i) { + fibers_to_free[i]->deallocate_to_heap(); + } + + } while (!last_fiber_returned); +} + + +/****************************************************************** + * TBD: We want to simplify / rework the logic for allocating and + * deallocating fibers, so that they are hopefully simpler and work + * more elegantly for more than two levels. + ******************************************************************/ + +/** + * @brief Transfer fibers from @c pool to @c pool->parent. + * + * @pre Must hold @c pool->lock if it exists. + * @post After completion, some number of fibers + * have been moved from this pool to the parent. + * The lock @c pool->lock is still held. + * + * TBD: Do we wish to guarantee that the lock has never been + * released? It may depend on the implementation... + */ +static void cilk_fiber_pool_move_fibers_to_parent_pool(cilk_fiber_pool* pool, + unsigned num_to_keep) +{ + // ASSERT: We should hold the lock on pool (if it has one). + CILK_ASSERT(pool->parent); + cilk_fiber_pool* parent_pool = pool->parent; + + // Move fibers from our pool to the parent until we either run out + // of space in the parent, or hit our threshold. + // + // This operation must be done while holding the parent lock. + + // If the parent pool appears to be full, just return early. + if (parent_pool->size >= parent_pool->max_size) + return; + + spin_mutex_lock(pool->parent->lock); + while ((parent_pool->size < parent_pool->max_size) && + (pool->size > num_to_keep)) { + parent_pool->fibers[parent_pool->size++] = + pool->fibers[--pool->size]; + } + + // If the child pool has deallocated more than fibers to the heap + // than it has allocated, then transfer this "surplus" to the + // parent, so that the parent is free to allocate more from the + // heap. + // + // This transfer means that the total in the parent can + // temporarily go negative. + if (pool->total < 0) { + // Reduce parent total by the surplus we have in the local + // pool. + parent_pool->total += pool->total; + pool->total = 0; + } + + spin_mutex_unlock(pool->parent->lock); +} + +void cilk_fiber_pool_init(cilk_fiber_pool* pool, + cilk_fiber_pool* parent, + size_t stack_size, + unsigned buffer_size, + int alloc_max, + int is_shared) +{ +#if FIBER_DEBUG >= 1 + fprintf(stderr, "fiber_pool_init, pool=%p, parent=%p, alloc_max=%u\n", + pool, parent, alloc_max); +#endif + + pool->lock = (is_shared ? spin_mutex_create() : NULL); + pool->parent = parent; + pool->stack_size = stack_size; + pool->max_size = buffer_size; + pool->size = 0; + pool->total = 0; + pool->high_water = 0; + pool->alloc_max = alloc_max; + pool->fibers = + (cilk_fiber**) __cilkrts_malloc(buffer_size * sizeof(cilk_fiber*)); + CILK_ASSERT(NULL != pool->fibers); + +#ifdef __MIC__ +#define PREALLOCATE_FIBERS +#endif + +#ifdef PREALLOCATE_FIBERS + // Pre-allocate 1/4 of fibers in the pools ahead of time. This + // value is somewhat arbitrary. It was chosen to be less than the + // threshold (of about 3/4) of fibers to keep in the pool when + // transferring fibers to the parent. 
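    /* Illustrative numbers (not from the original sources): with
     * buffer_size == 32 this pre-allocates 8 fibers, while deallocate_self()
     * keeps up to max_size/2 + max_size/4 == 24 fibers before pushing the
     * surplus to the parent pool, so the pre-allocated fibers stay well
     * below that transfer threshold.
     */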
+ + int pre_allocate_count = buffer_size/4; + for (pool->size = 0; pool->size < pre_allocate_count; pool->size++) { + pool->fibers[pool->size] = cilk_fiber::allocate_from_heap(pool->stack_size); + } +#endif +} + + +void cilk_fiber_pool_set_fiber_limit(cilk_fiber_pool* root_pool, + unsigned max_fibers_to_allocate) +{ + // Should only set limit on root pool, not children. + CILK_ASSERT(NULL == root_pool->parent); + root_pool->alloc_max = max_fibers_to_allocate; +} + +void cilk_fiber_pool_destroy(cilk_fiber_pool* pool) +{ + CILK_ASSERT(cilk_fiber_pool_sanity_check(pool, "pool_destroy")); + + // Lock my own pool, if I need to. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + + // Give any remaining fibers to parent pool. + if (pool->parent) { + cilk_fiber_pool_move_fibers_to_parent_pool(pool, 0); + } + + // Unlock pool. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // If I have any left in my pool, just free them myself. + // This method may acquire the pool lock. + cilk_fiber_pool_free_fibers_from_pool(pool, 0, NULL); + + // Destroy the lock if there is one. + if (pool->lock) { + spin_mutex_destroy(pool->lock); + } + __cilkrts_free(pool->fibers); +} + + +cilk_fiber* cilk_fiber_allocate(cilk_fiber_pool* pool) +{ + CILK_ASSERT(cilk_fiber_pool_sanity_check(pool, "allocate")); + return cilk_fiber::allocate(pool); +} + +cilk_fiber* cilk_fiber_allocate_from_heap(size_t stack_size) +{ + return cilk_fiber::allocate_from_heap(stack_size); +} + +void cilk_fiber_reset_state(cilk_fiber* fiber, cilk_fiber_proc start_proc) +{ + fiber->reset_state(start_proc); +} + +int cilk_fiber_remove_reference(cilk_fiber *fiber, cilk_fiber_pool *pool) +{ + return fiber->remove_reference(pool); +} + +cilk_fiber* cilk_fiber_allocate_from_thread() +{ + return cilk_fiber::allocate_from_thread(); +} + +int cilk_fiber_deallocate_from_thread(cilk_fiber *fiber) +{ + return fiber->deallocate_from_thread(); +} + +int cilk_fiber_remove_reference_from_thread(cilk_fiber *fiber) +{ + return fiber->remove_reference_from_thread(); +} + +int cilk_fiber_is_allocated_from_thread(cilk_fiber *fiber) +{ + return fiber->is_allocated_from_thread(); +} + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber* cilk_fiber_get_current_fiber(void) +{ + return cilk_fiber::get_current_fiber(); +} +#endif + +void cilk_fiber_suspend_self_and_resume_other(cilk_fiber* self, + cilk_fiber* other) +{ + self->suspend_self_and_resume_other(other); +} + + +void cilk_fiber::reset_state(cilk_fiber_proc start_proc) +{ + // Setup the fiber and return. + this->m_start_proc = start_proc; + + CILK_ASSERT(!this->is_resumable()); + CILK_ASSERT(NULL == this->m_pending_remove_ref); + CILK_ASSERT(NULL == this->m_pending_pool); +} + +NORETURN +cilk_fiber_remove_reference_from_self_and_resume_other(cilk_fiber* self, + cilk_fiber_pool* self_pool, + cilk_fiber* other) +{ +#if FIBER_DEBUG >= 3 + __cilkrts_worker* w = __cilkrts_get_tls_worker(); + fprintf(stderr, "W=%d: cilk_fiber_deactivate_self_and_resume_other: self=%p, other=%p\n", + w->self, + self, other); +#endif + CILK_ASSERT(cilk_fiber_pool_sanity_check(self_pool, "remove_reference_from_self_resume_other")); + self->remove_reference_from_self_and_resume_other(self_pool, other); + + // We should never return here. 
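    /* The fiber cannot free itself while still running on its own stack, so
     * the member function called above records `self` in
     * other->m_pending_remove_ref (and self_pool in m_pending_pool); the
     * reference is actually dropped by do_post_switch_actions() once
     * execution is safely running on the other fiber.
     */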
+} + +void cilk_fiber_set_post_switch_proc(cilk_fiber *self, + cilk_fiber_proc post_switch_proc) +{ + self->set_post_switch_proc(post_switch_proc); +} + +void cilk_fiber_invoke_tbb_stack_op(cilk_fiber* fiber, + __cilk_tbb_stack_op op) +{ + fiber->invoke_tbb_stack_op(op); +} + +cilk_fiber_data* cilk_fiber_get_data(cilk_fiber* fiber) +{ + return fiber->get_data(); + + /// TBD: Change this code to "return (cilk_fiber_data*)fiber;" + // plus a static assert, so that this function is + // more easily inlined by the compiler. +} + +int cilk_fiber_is_resumable(cilk_fiber *fiber) +{ + return fiber->is_resumable(); +} + +char* cilk_fiber_get_stack_base(cilk_fiber *fiber) +{ + return fiber->get_stack_base(); +} + + +#if defined(_WIN32) && 0 // Only works on Windows. Disable debugging for now. +#define DBG_STACK_OPS(_fmt, ...) __cilkrts_dbgprintf(_fmt, __VA_ARGS__) +#else +#define DBG_STACK_OPS(_fmt, ...) +#endif + +void cilk_fiber_set_stack_op(cilk_fiber *fiber, + __cilk_tbb_stack_op_thunk o) +{ + cilk_fiber_data *fdata = cilk_fiber_get_data(fiber); + DBG_STACK_OPS ("cilk_fiber_set_stack_op - cilk_fiber %p, routine: %p, data: %p\n", + fiber, + o.routine, + o.data); + fdata->stack_op_routine = o.routine; + fdata->stack_op_data = o.data; +} + +#if 0 // Debugging function +static +const char *NameStackOp (enum __cilk_tbb_stack_op op) +{ + switch(op) + { + case CILK_TBB_STACK_ORPHAN: return "CILK_TBB_STACK_ORPHAN"; + case CILK_TBB_STACK_ADOPT: return "CILK_TBB_STACK_ADOPT"; + case CILK_TBB_STACK_RELEASE: return "CILK_TBB_STACK_RELEASE"; + default: return "Unknown"; + } +} +#endif + +/* + * Save TBB interop information for an unbound thread. It will get picked + * up when the thread is bound to the runtime. + */ +void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + DBG_STACK_OPS("Calling save_stack_op; o.routine=%p, o.data=%p, saved_thunk=%p\n", + o.routine, o.data, saved_thunk); + + // If there is not already space allocated, allocate some. + if (NULL == saved_thunk) { + saved_thunk = (__cilk_tbb_stack_op_thunk*) + __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk)); + __cilkrts_set_tls_tbb_interop(saved_thunk); + } + + *saved_thunk = o; + + DBG_STACK_OPS ("Unbound Thread %04x: tbb_interop_save_stack_op_info - saved info\n", + cilkos_get_current_thread_id()); +} + +/* + * Save TBB interop information from the cilk_fiber. It will get picked + * up when the thread is bound to the runtime next time. + */ +void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber *fiber) +{ + __cilk_tbb_stack_op_thunk *saved_thunk; + cilk_fiber_data* fdata; + + if (NULL == fiber) + return; + + fdata = cilk_fiber_get_data(fiber); + // If there is no TBB interop data, just return + if (NULL == fdata->stack_op_routine) + return; + + saved_thunk = __cilkrts_get_tls_tbb_interop(); + + // If there is not already space allocated, allocate some. 
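    /* The TLS slot written here holds a single heap-allocated
     * __cilk_tbb_stack_op_thunk.  It is consumed by
     * cilk_fiber_tbb_interop_use_saved_stack_op_info() when the thread binds
     * to the runtime, and released by
     * cilk_fiber_tbb_interop_free_stack_op_info().
     */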
+ if (NULL == saved_thunk) { + saved_thunk = (__cilk_tbb_stack_op_thunk*) + __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk)); + __cilkrts_set_tls_tbb_interop(saved_thunk); + } + + saved_thunk->routine = fdata->stack_op_routine; + saved_thunk->data = fdata->stack_op_data; +} + +/* + * If there's TBB interop information that was saved before the thread was + * bound, apply it now + */ +void cilk_fiber_tbb_interop_use_saved_stack_op_info(cilk_fiber* fiber) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + CILK_ASSERT(fiber); + // If we haven't allocated a TBB interop index, we don't have any saved info + if (NULL == saved_thunk) { + DBG_STACK_OPS ("cilk_fiber %p: tbb_interop_use_saved_stack_op_info - no saved info\n", + fiber); + return; + } + + DBG_STACK_OPS ("cilk_fiber %p: tbb_interop_use_saved_stack_op_info - using saved info\n", + fiber); + + // Associate the saved info with the __cilkrts_stack + cilk_fiber_set_stack_op(fiber, *saved_thunk); + + // Free the saved data. We'll save it again if needed when the code + // returns from the initial function + cilk_fiber_tbb_interop_free_stack_op_info(); +} + +/* + * Free saved TBB interop memory. Should only be called when the thread is + * not bound. + */ +void cilk_fiber_tbb_interop_free_stack_op_info(void) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + // If we haven't allocated a TBB interop index, we don't have any saved info + if (NULL == saved_thunk) + return; + + DBG_STACK_OPS ("tbb_interop_free_stack_op_info - freeing saved info\n"); + + // Free the memory and wipe out the TLS value + __cilkrts_free(saved_thunk); + __cilkrts_set_tls_tbb_interop(NULL); +} + + + +#if NEED_FIBER_REF_COUNTS +int cilk_fiber_has_references(cilk_fiber *fiber) +{ + return (fiber->get_ref_count() > 0); +} + +int cilk_fiber_get_ref_count(cilk_fiber *fiber) +{ + return fiber->get_ref_count(); +} + +void cilk_fiber_add_reference(cilk_fiber *fiber) +{ + fiber->inc_ref_count(); +} +#endif // NEED_FIBER_REF_COUNTS + + +} // End extern "C" + + +cilk_fiber_sysdep* cilk_fiber::sysdep() +{ + return static_cast<cilk_fiber_sysdep*>(this); +} + + +cilk_fiber::cilk_fiber() + : m_start_proc(NULL) + , m_post_switch_proc(NULL) + , m_pending_remove_ref(NULL) + , m_pending_pool(NULL) + , m_flags(0) +{ + // Clear cilk_fiber_data base-class data members + std::memset((cilk_fiber_data*) this, 0, sizeof(cilk_fiber_data)); + + // cilk_fiber data members + init_ref_count(0); +} + +cilk_fiber::cilk_fiber(std::size_t stack_size) +{ + *this = cilk_fiber(); // A delegating constructor would be nice here + this->stack_size = stack_size; +} + +cilk_fiber::~cilk_fiber() +{ + // Empty destructor. +} + + +char* cilk_fiber::get_stack_base() +{ + return this->sysdep()->get_stack_base_sysdep(); +} + +cilk_fiber* cilk_fiber::allocate_from_heap(std::size_t stack_size) +{ + // Case 1: pool is NULL. create a new fiber from the heap + // No need for locks here. + cilk_fiber_sysdep* ret = + (cilk_fiber_sysdep*) __cilkrts_malloc(sizeof(cilk_fiber_sysdep)); + + // Error condition. If we failed to allocate a fiber from the + // heap, we are in trouble though... 
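    /* A NULL return here propagates to callers such as
     * cilk_fiber_allocate_from_heap(); as seen earlier in this patch, the
     * scheduler then calls __cilkrts_disallow_stealing(), trading
     * parallelism for survival rather than crashing outright.
     */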
+ if (!ret) + return NULL; + + ::new(ret) cilk_fiber_sysdep(stack_size); + + CILK_ASSERT(0 == ret->m_flags); + CILK_ASSERT(NULL == ret->m_pending_remove_ref); + CILK_ASSERT(NULL == ret->m_pending_pool); + ret->init_ref_count(1); + return ret; +} + + +#if USE_FIBER_TRY_ALLOCATE_FROM_POOL +/** + * Helper method: try to allocate a fiber from this pool or its + * ancestors without going to the OS / heap. + * + * Returns allocated pool, or NULL if no pool is found. + * + * If pool contains a suitable fiber. Return it. Otherwise, try to + * recursively grab a fiber from the parent pool, if there is one. + * + * This method will not allocate a fiber from the heap. + * + * This method could be written either recursively or iteratively. + * It probably does not matter which one we do. + * + * @note This method is compiled, but may not be used unless the + * USE_FIBER_TRY_ALLOCATE_FROM_POOL switch is set. + */ +cilk_fiber* cilk_fiber::try_allocate_from_pool_recursive(cilk_fiber_pool* pool) +{ + cilk_fiber* ret = NULL; + + if (pool->size > 0) { + // Try to get the lock. + if (pool->lock) { + // For some reason, it seems to be better to just block on the parent + // pool lock, instead of using a try-lock? +#define USE_TRY_LOCK_IN_FAST_ALLOCATE 0 +#if USE_TRY_LOCK_IN_FAST_ALLOCATE + int got_lock = spin_mutex_trylock(pool->lock); + if (!got_lock) { + // If we fail, skip to the parent. + if (pool->parent) { + return try_allocate_from_pool_recursive(pool->parent); + } + } +#else + spin_mutex_lock(pool->lock); +#endif + } + + // Check in the pool if we have the lock. + if (pool->size > 0) { + ret = pool->fibers[--pool->size]; + } + + // Release the lock once we are done updating pool fields. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + } + + if ((!ret) && (pool->parent)) { + return try_allocate_from_pool_recursive(pool->parent); + } + + if (ret) { + // When we pull a fiber out of the pool, set its reference + // count before we return it. + ret->init_ref_count(1); + } + return ret; +} +#endif // USE_FIBER_TRY_ALLOCATE_FROM_POOL + + +cilk_fiber* cilk_fiber::allocate(cilk_fiber_pool* pool) +{ + // Pool should not be NULL in this method. But I'm not going to + // actually assert it, because we are likely to seg fault anyway + // if it is. + // CILK_ASSERT(NULL != pool); + + cilk_fiber *ret = NULL; + +#if USE_FIBER_TRY_ALLOCATE_FROM_POOL + // "Fast" path, which doesn't go to the heap or OS until checking + // the ancestors first. + ret = try_allocate_from_pool_recursive(pool); + if (ret) + return ret; +#endif + + // If we don't get anything from the "fast path", then go through + // a slower path to look for a fiber. + // + // 1. Lock the pool if it is shared. + // 2. Look in our local pool. If we find one, release the lock + // and quit searching. + // 3. Otherwise, check whether we can allocate from heap. + // 4. Release the lock if it was acquired. + // 5. Try to allocate from the heap, if step 3 said we could. + // If we find a fiber, then quit searching. + // 6. If none of these steps work, just recursively try again + // from the parent. + + // 1. Lock the pool if it is shared. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + + // 2. Look in local pool. + if (pool->size > 0) { + ret = pool->fibers[--pool->size]; + if (ret) { + // If we found one, release the lock once we are + // done updating pool fields, and break out of the + // loop. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // When we pull a fiber out of the pool, set its reference + // count just in case. 
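            /* Fibers sitting in a pool hold no references; the single
             * reference set here belongs to the caller that now owns the
             * fiber, matching the count given to fibers freshly allocated
             * from the heap.
             */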
+ ret->init_ref_count(1); + return ret; + } + } + + // 3. Check whether we can allocate from the heap. + bool can_allocate_from_heap = false; + if (pool->total < pool->alloc_max) { + // Track that we are allocating a new fiber from the + // heap, originating from this pool. + // This increment may be undone if we happen to fail to + // allocate from the heap. + increment_pool_total(pool); + can_allocate_from_heap = true; + } + + // 4. Unlock the pool, and then allocate from the heap. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // 5. Actually try to allocate from the heap / OS. + if (can_allocate_from_heap) { + ret = allocate_from_heap(pool->stack_size); + // If we got something from the heap, just return it. + if (ret) { + return ret; + } + + // Otherwise, we failed in our attempt to allocate a + // fiber from the heap. Grab the lock and decrement + // the total again. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + decrement_pool_total(pool, 1); + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + } + + // 6. If we get here, then searching this pool failed. Go search + // the parent instead if we have one. + if (pool->parent) { + return allocate(pool->parent); + } + + return ret; +} + +int cilk_fiber::remove_reference(cilk_fiber_pool* pool) +{ + int ref_count = this->dec_ref_count(); + if (ref_count == 0) { + if (pool) { + deallocate_self(pool); + } + else { + deallocate_to_heap(); + } + } + return ref_count; +} + +cilk_fiber* cilk_fiber::allocate_from_thread() +{ + void* retmem = __cilkrts_malloc(sizeof(cilk_fiber_sysdep)); + CILK_ASSERT(retmem); + cilk_fiber_sysdep* ret = ::new(retmem) cilk_fiber_sysdep(from_thread); + + // A fiber allocated from a thread begins with a reference count + // of 2. The first is for being created, and the second is for + // being running. + // + // Suspending this fiber will decrement the count down to 1. + ret->init_ref_count(2); + +#if SUPPORT_GET_CURRENT_FIBER + // We're creating the main fiber for this thread. Set this fiber as the + // current fiber. + cilkos_set_tls_cilk_fiber(ret); +#endif + return ret; +} + +int cilk_fiber::deallocate_from_thread() +{ + CILK_ASSERT(this->is_allocated_from_thread()); +#if SUPPORT_GET_CURRENT_FIBER + CILK_ASSERT(this == cilkos_get_tls_cilk_fiber()); + // Reverse of "allocate_from_thread". + cilkos_set_tls_cilk_fiber(NULL); +#endif + + this->assert_ref_count_at_least(2); + + // Suspending the fiber should conceptually decrement the ref + // count by 1. + cilk_fiber_sysdep* self = this->sysdep(); + self->convert_fiber_back_to_thread(); + + // Then, freeing the fiber itself decrements the ref count again. 
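    /* Worked ref-count example (illustration, not from the original
     * sources): allocate_from_thread() started this fiber at a count of 2
     * (created + running), so subtracting 2 normally lands on 0 and the
     * fiber is destroyed immediately below.  If another party still holds a
     * reference (possible when NEED_FIBER_REF_COUNTS is in play), the result
     * stays positive and destruction is deferred to the final
     * remove_reference_from_thread() call.
     */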
+ int ref_count = this->sub_from_ref_count(2); + if (ref_count == 0) { + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); + } + return ref_count; +} + +int cilk_fiber::remove_reference_from_thread() +{ + int ref_count = dec_ref_count(); + if (ref_count == 0) { + cilk_fiber_sysdep* self = this->sysdep(); + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); + } + return ref_count; +} + + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber* cilk_fiber::get_current_fiber() +{ + return cilk_fiber_sysdep::get_current_fiber_sysdep(); +} +#endif + +void cilk_fiber::do_post_switch_actions() +{ + if (m_post_switch_proc) + { + cilk_fiber_proc proc = m_post_switch_proc; + m_post_switch_proc = NULL; + proc(this); + } + + if (m_pending_remove_ref) + { + m_pending_remove_ref->remove_reference(m_pending_pool); + + // Even if we don't free it, + m_pending_remove_ref = NULL; + m_pending_pool = NULL; + } +} + +void cilk_fiber::suspend_self_and_resume_other(cilk_fiber* other) +{ +#if FIBER_DEBUG >=1 + fprintf(stderr, "suspend_self_and_resume_other: self =%p, other=%p [owner=%p, resume_sf=%p]\n", + this, other, other->owner, other->resume_sf); +#endif + + // Decrement my reference count (to suspend) + // Increment other's count (to resume) + // Suspended fiber should have a reference count of at least 1. (It is not in a pool). + this->dec_ref_count(); + other->inc_ref_count(); + this->assert_ref_count_at_least(1); + + // Pass along my owner. + other->owner = this->owner; + this->owner = NULL; + + // Change this fiber to resumable. + CILK_ASSERT(!this->is_resumable()); + this->set_resumable(true); + + // Normally, I'd assert other->is_resumable(). But this flag may + // be false the first time we try to "resume" a fiber. + cilk_fiber_sysdep* self = this->sysdep(); + self->suspend_self_and_resume_other_sysdep(other->sysdep()); + + // HAVE RESUMED EXECUTION + // When we come back here, we should have at least two references: + // one for the fiber being allocated / out of a pool, and one for it being active. + this->assert_ref_count_at_least(2); +} + +NORETURN +cilk_fiber::remove_reference_from_self_and_resume_other(cilk_fiber_pool* self_pool, + cilk_fiber* other) +{ + // Decrement my reference count once (to suspend) + // Increment other's count (to resume) + // Suspended fiber should have a reference count of at least 1. (It is not in a pool). + this->dec_ref_count(); + other->inc_ref_count(); + + // Set a pending remove reference for this fiber, once we have + // actually switched off. + other->m_pending_remove_ref = this; + other->m_pending_pool = self_pool; + + // Pass along my owner. + other->owner = this->owner; + this->owner = NULL; + + // Since we are deallocating self, this fiber does not become + // resumable. + CILK_ASSERT(!this->is_resumable()); + + cilk_fiber_sysdep* self = this->sysdep(); + self->jump_to_resume_other_sysdep(other->sysdep()); + + __cilkrts_bug("Deallocating fiber. We should never come back here."); + std::abort(); +} + + +void cilk_fiber::deallocate_to_heap() +{ + cilk_fiber_sysdep* self = this->sysdep(); + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); +} + +void cilk_fiber::deallocate_self(cilk_fiber_pool* pool) +{ + this->set_resumable(false); + + CILK_ASSERT(NULL != pool); + CILK_ASSERT(!this->is_allocated_from_thread()); + this->assert_ref_count_equals(0); + + // Cases: + // + // 1. pool has space: Add to this pool. + // 2. pool is full: Give some fibers to parent, and then free + // enough to make space for the fiber we are deallocating. 
+ // Then put the fiber back into the pool. + + const bool need_lock = pool->lock; + // Grab the lock for the remaining cases. + if (need_lock) { + spin_mutex_lock(pool->lock); + } + + // Case 1: this pool has space. Return the fiber. + if (pool->size < pool->max_size) + { + // Add this fiber to pool + pool->fibers[pool->size++] = this; + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + return; + } + + // Case 2: Pool is full. + // + // First free up some space by giving fibers to the parent. + if (pool->parent) + { + // Pool is full. Move all but "num_to_keep" fibers to parent, + // if we can. + unsigned num_to_keep = pool->max_size/2 + pool->max_size/4; + cilk_fiber_pool_move_fibers_to_parent_pool(pool, num_to_keep); + } + + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + + // Now, free a fiber to make room for the one we need to put back, + // and then put this fiber back. This step may actually return + // fibers to the heap. + cilk_fiber_pool_free_fibers_from_pool(pool, pool->max_size -1, this); +} + + +// NOTE: Except for print-debug, this code is the same as in Windows. +void cilk_fiber::invoke_tbb_stack_op(__cilk_tbb_stack_op op) +{ + cilk_fiber_data *fdata = this->get_data(); + + if (0 == fdata->stack_op_routine) + { + if (CILK_TBB_STACK_RELEASE != op) + DBG_STACK_OPS ("Wkr %p: invoke_tbb_stack_op - %s (%d) for cilk_fiber %p, fiber %p, thread id %04x - No stack op routine\n", + fdata->owner, + NameStackOp(op), + op, + fdata, + this, + cilkos_get_current_thread_id()); + return; + } + + // Call TBB to do it's thing + DBG_STACK_OPS ("Wkr %p: invoke_tbb_stack_op - op %s data %p for cilk_fiber %p, fiber %p, thread id %04x\n", + fdata->owner, + NameStackOp(op), + fdata->stack_op_data, + fdata, + this, + cilkos_get_current_thread_id()); + + (*fdata->stack_op_routine)(op, fdata->stack_op_data); + if (op == CILK_TBB_STACK_RELEASE) + { + fdata->stack_op_routine = 0; + fdata->stack_op_data = 0; + } +} + + + +#if NEED_FIBER_REF_COUNTS + +void cilk_fiber::atomic_inc_ref_count() +{ + cilkos_atomic_add(&m_outstanding_references, 1); +} + +long cilk_fiber::atomic_dec_ref_count() +{ + return cilkos_atomic_add(&m_outstanding_references, -1); +} + +long cilk_fiber::atomic_sub_from_ref_count(long v) +{ + return cilkos_atomic_add(&m_outstanding_references, -v); +} + +#endif // NEED_FIBER_REF_COUNTS + +/* End cilk_fibers.cpp */ diff --git a/libcilkrts/runtime/cilk_fiber.h b/libcilkrts/runtime/cilk_fiber.h new file mode 100644 index 00000000000..2671f924681 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber.h @@ -0,0 +1,882 @@ +/* cilk_fiber.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
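The reference-count helpers at the end of cilk_fiber.cpp above (atomic_inc_ref_count() and friends) funnel through cilkos_atomic_add(), whose definition lives in the OS layer and is not part of this hunk. Judging from the way dec_ref_count() compares the result against zero, the helper returns the updated value. A stand-in sketch of that behaviour, using the same GCC __sync builtin family that os-unix-sysdep.c uses (illustration only, not the runtime's own definition):

    /* Illustrative stand-in for cilkos_atomic_add(): atomically add v to *p
     * and return the new value. */
    static long toy_atomic_add(volatile long *p, long v)
    {
        return __sync_add_and_fetch(p, v);
    }

    /* dec_ref_count()-style use: whichever caller observes 0 owns the cleanup.
     *     long remaining = toy_atomic_add(&refs, -1);
     *     if (remaining == 0) destroy();
     */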
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk_fiber.h + * + * @brief Abstraction of a "fiber": A coprocess-like stack and auxiliary data + */ + +#ifndef INCLUDED_CILK_FIBER_DOT_H +#define INCLUDED_CILK_FIBER_DOT_H + +#include <cilk/common.h> +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +#include "bug.h" +#include "cilk-tbb-interop.h" +#include "spin_mutex.h" +#include "internal/abi.h" // Define __cilkrts_stack_frame + +/** + * @brief Debugging level for Cilk fiber code. + * + * A value of 0 means no debugging. + * Higher values generate more debugging output. + */ +#define FIBER_DEBUG 0 + +/** + * @brief Flag for validating reference counts. + * + * Set to 1 to assert that fiber reference counts are reasonable. + */ +#define FIBER_CHECK_REF_COUNTS 1 + +/** + * @brief Flag to determine whether fibers support reference counting. + * We require reference counting only on Windows, for exception + * processing. Unix does not need reference counting. + */ +#if defined(_WIN32) +# define NEED_FIBER_REF_COUNTS 1 +#endif + +/** + * @brief Flag to enable support for the + * cilk_fiber_get_current_fiber() method. + * + * I'd like this flag to be 0. However, the cilk_fiber test depends + * on being able to call this method. + */ +#if !defined(SUPPORT_GET_CURRENT_FIBER) +# define SUPPORT_GET_CURRENT_FIBER 0 +#endif + +/** + * @brief Switch for enabling "fast path" check for fibers, which + * doesn't go to the heap or OS until checking the ancestors first. + * + * Doing this check seems to make the stress test in + * cilk_fiber_pool.t.cpp run faster. But it doesn't seem to make much + * difference in other benchmarks, so it is disabled by default. + */ +#define USE_FIBER_TRY_ALLOCATE_FROM_POOL 0 + + +__CILKRTS_BEGIN_EXTERN_C + +/// @brief Forward reference to fiber pool. +typedef struct cilk_fiber_pool cilk_fiber_pool; + +/** @brief Opaque data structure representing a fiber */ +typedef struct cilk_fiber cilk_fiber; + +/** @brief Function pointer type for use as a fiber's "main" procedure */ +typedef void (*cilk_fiber_proc)(cilk_fiber*); + +/** @brief Data structure associated with each fiber. 
*/ +typedef struct cilk_fiber_data +{ + __STDNS size_t stack_size; /**< Size of stack for fiber */ + __cilkrts_worker* owner; /**< Worker using this fiber */ + __cilkrts_stack_frame* resume_sf; /**< Stack frame to resume */ + __cilk_tbb_pfn_stack_op stack_op_routine; /**< Cilk/TBB interop callback */ + void* stack_op_data; /**< Data for Cilk/TBB callback */ + void* client_data; /**< Data managed by client */ + +#ifdef _WIN32 + char *initial_sp; /**< Initalized in fiber_stub */ +# ifdef _WIN64 + char *steal_frame_sp; /**< RSP for frame stealing work */ + // Needed for exception handling so we can + // identify when about to unwind off stack +# endif +#endif + +} cilk_fiber_data; + +/** @brief Pool of cilk_fiber for fiber reuse + * + * Pools form a hierarchy, with each pool pointing to its parent. When the + * pool undeflows, it gets a fiber from its parent. When a pool overflows, + * it returns some fibers to its parent. If the root pool underflows, it + * allocates and initializes a new fiber from the heap but only if the total + * is less than max_size; otherwise, fiber creation fails. + */ +struct cilk_fiber_pool +{ + spin_mutex* lock; ///< Mutual exclusion for pool operations + __STDNS size_t stack_size; ///< Size of stacks for fibers in this pool. + cilk_fiber_pool* parent; ///< @brief Parent pool. + ///< If this pool is empty, get from parent + + // Describes inactive fibers stored in the pool. + cilk_fiber** fibers; ///< Array of max_size fiber pointers + unsigned max_size; ///< Limit on number of fibers in pool + unsigned size; ///< Number of fibers currently in the pool + + // Statistics on active fibers that were allocated from this pool, + // but no longer in the pool. + int total; ///< @brief Fibers allocated - fiber deallocated from pool + ///< total may be negative for non-root pools. + int high_water; ///< High water mark of total fibers + int alloc_max; ///< Limit on number of fibers allocated from the heap/OS +}; + +/** @brief Initializes a cilk_fiber_pool structure + * + * @param pool - The address of the pool that is to be initialized + * @param parent - The address of this pool's parent, or NULL for root pool + * @param stack_size - Size of stacks for fibers allocated from this pool. + * @param buffer_size - The maximum number of fibers that may be pooled. + * @param alloc_max - Limit on # of fibers this pool can allocate from the heap. + * @param is_shared - True if accessing this pool needs a lock, false otherwise. + */ +void cilk_fiber_pool_init(cilk_fiber_pool* pool, + cilk_fiber_pool* parent, + size_t stack_size, + unsigned buffer_size, + int alloc_max, + int is_shared); + +/** @brief Sets the maximum number of fibers to allocate from a root pool. + * + * @param root_pool - A root fiber pool + * @param max_fibers_to_allocate - The limit on # of fibers to allocate. + * + * Sets the maximum number of fibers that can be allocated from this + * pool and all its descendants. This pool must be a root pool. + */ +void cilk_fiber_pool_set_fiber_limit(cilk_fiber_pool* root_pool, + unsigned max_fibers_to_allocate); + +/** @brief De-initalizes a cilk_fiber_pool + * + * @param pool - The address of the pool that is to be destroyed + */ +void cilk_fiber_pool_destroy(cilk_fiber_pool* pool); + +/** @brief Allocates a new cilk_fiber. + * + * If the specified pool is empty, this method may choose to either + * allocate a fiber from the heap (if pool->total < pool->alloc_max), + * or retrieve a fiber from the parent pool. 
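The pool structure above is a bounded cache that defers to its parent on underflow and is capped by alloc_max when it has to fall back on the heap. A self-contained sketch of that get/put discipline, with invented toy_* names and with the locking and total/high-water bookkeeping left out (the real logic is in cilk_fiber::allocate and cilk_fiber::deallocate_self in cilk_fiber.cpp above):

    #include <stdlib.h>

    typedef struct toy_pool {
        struct toy_pool *parent;    /* NULL for the root pool             */
        void           **items;     /* cache of up to max_size free items */
        unsigned         size;      /* items currently cached             */
        unsigned         max_size;  /* cache capacity                     */
        int              total;     /* items handed out by this pool      */
        int              alloc_max; /* cap on heap allocations            */
    } toy_pool;

    /* Search order used on allocation: local cache, then the heap while the
     * cap allows it, then the parent pool. */
    static void *toy_pool_get(toy_pool *p, size_t bytes)
    {
        if (p->size > 0)
            return p->items[--p->size];
        if (p->total < p->alloc_max) {
            p->total++;
            return malloc(bytes);
        }
        if (p->parent)
            return toy_pool_get(p->parent, bytes);
        return NULL;
    }

    /* On release: keep the item if there is room, overflow to the parent,
     * and let the root return excess items to the heap. */
    static void toy_pool_put(toy_pool *p, void *item)
    {
        if (p->size < p->max_size)
            p->items[p->size++] = item;
        else if (p->parent)
            toy_pool_put(p->parent, item);
        else
            free(item);
    }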
+ * + * @note If a non-null fiber is returned, @c cilk_fiber_reset_state + * should be called on this fiber before using it. + * + * An allocated fiber begins with a reference count of 1. + * This method may lock @c pool or one of its ancestors. + * + * @pre pool should not be NULL. + * + * @param pool The fiber pool from which to retrieve a fiber. + * @return An allocated fiber, or NULL if failed to allocate. + */ +cilk_fiber* cilk_fiber_allocate(cilk_fiber_pool* pool); + +/** @brief Allocate and initialize a new cilk_fiber using memory from + * the heap and/or OS. + * + * The allocated fiber begins with a reference count of 1. + * + * @param stack_size The size (in bytes) to be allocated for the fiber's + * stack. + * @return An initialized fiber. This method should not return NULL + * unless some exceptional condition has occurred. + */ +cilk_fiber* cilk_fiber_allocate_from_heap(size_t stack_size); + + +/** @brief Resets an fiber object just allocated from a pool with the + * specified proc. + * + * After this call, cilk_fiber_data object associated with this fiber + * is filled with zeros. + * + * This function can be called only on a fiber that has been allocated + * from a pool, but never used. + * + * @param fiber The fiber to reset and initialize. + * @param start_proc The function to run when switching to the fiber. If + * null, the fiber can be used with cilk_fiber_run_proc() + * but not with cilk_fiber_resume(). + */ +void cilk_fiber_reset_state(cilk_fiber* fiber, + cilk_fiber_proc start_proc); + +/** @brief Remove a reference from this fiber, possibly deallocating it. + * + * This fiber is deallocated only when there are no other references + * to it. Deallocation happens either by returning the fiber to the + * specified pool, or returning it to the heap. + * + * A fiber that is currently executing should not remove the last + * reference to itself. + * + * When a fiber is deallocated, destructors are not called for the + * objects (if any) still on its stack. The fiber's stack and fiber + * data is returned to the stack pool but the client fiber data is not + * deallocated. + * + * If the pool overflows because of a deallocation, then some fibers + * will be returned to the parent pool. If the root pool overflows, + * then the fiber is returned to the heap. + * + * @param fiber The Cilk fiber to remove a reference to. + * @param pool The fiber pool to which the fiber should be returned. The + * caller is assumed to have exclusive access to the pool + * either because there is no contention for it or because + * its lock has been acquired. If pool is NULL, any + * deallocated fiber is destroyed and returned to the + * heap. + * + * @return Final reference count. If the count is 0, the fiber was + * returned to a pool or the heap. + */ +int cilk_fiber_remove_reference(cilk_fiber *fiber, cilk_fiber_pool *pool); + +/** @brief Allocates and intializes this thread's main fiber + * + * Each thread has an "implicit" main fiber that control's the + * thread's initial stack. This function makes this fiber visible to + * the client and allocates the Cilk-specific aspects of the implicit + * fiber. A call to this function must be paired with a call to + * cilk_fiber_deallocate_fiber_from_thread() + * or a memory leak (or worse) will result. + * + * A fiber allocated from a thread begins with a reference count of 2. + * One is for being allocated, and one is for being active. + * (A fiber created from a thread is automatically currently executing.) 
+ * The matching calls above each decrement the reference count by 1. + * + * @return A fiber for the currently executing thread. + */ +cilk_fiber* cilk_fiber_allocate_from_thread(void); + +/** @brief Remove a fiber created from a thread, + * possibly deallocating it. + * + * Same as cilk_fiber_remove_reference, except that it works on fibers + * created via cilk_fiber_allocate_from_thread(). + * + * Fibers created from a thread are never returned to a pool. + * + * @param fiber The Cilk fiber to remove a reference from. + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ +int cilk_fiber_remove_reference_from_thread(cilk_fiber *fiber); + +/** @brief Deallocate a fiber created from a thread, + * possibly destroying it. + * + * This method decrements the reference count of the fiber by 2, and + * destroys the fiber struct if the reference count is 0. + * + * OS-specific cleanup for the fiber executes unconditionally with + * this method. The destruction of the actual object, however, does + * not occur unless the reference count is 0. + * + * @param fiber The cilk_fiber to deallocate from a thread. + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ +int cilk_fiber_deallocate_from_thread(cilk_fiber *fiber); + +/** @brief Returns true if this fiber is allocated from a thread. + */ +int cilk_fiber_is_allocated_from_thread(cilk_fiber *fiber); + + +/** @brief Suspend execution on current fiber resumes other fiber. + * + * Suspends the current fiber and transfers control to a new fiber. Execution + * on the new fiber resumes from the point at which fiber suspended itself to + * run a different fiber. If fiber was freshly allocated, then runs the + * start_proc function specified at allocation. This function returns when + * another fiber resumes the self fiber. Note that the state of the + * floating-point control register (i.e., the register that controls rounding + * mode, etc.) is valid but indeterminate on return -- different + * implementations will have different results. + * + * When the @c self fiber is resumed, execution proceeds as though + * this function call returns. + * + * This operation increments the reference count of @p other. + * This operation decrements the reference count of @p self. + * + * @param self Fiber to switch from. Must equal current fiber. + * @param other Fiber to switch to. + */ +void cilk_fiber_suspend_self_and_resume_other(cilk_fiber* self, + cilk_fiber* other); + +/** @brief Removes a reference from the currently executing fiber and + * resumes other fiber. + * + * Removes a reference from @p self and transfer control to @p other + * fiber. Execution on @p other resumes from the point at which @p + * other suspended itself to run a different fiber. If @p other fiber + * was freshly allocated, then runs the function specified at + * creation. + * + * + * This operation increments the reference count of @p other. + * + * This operation conceptually decrements the reference count of + * @p self twice, once to suspend it, and once to remove a reference to + * it. Then, if the count is 0, it is returned to the specified pool + * or destroyed. + * + * @pre @p self is the currently executing fiber. + * + * @param self Fiber to remove reference switch from. + * @param self_pool Pool to which the current fiber should be returned + * @param other Fiber to switch to. 
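Taken together, the allocation, reference-count, and switching rules documented above admit a small host-side usage sketch. Only the cilk_fiber_* calls below come from this header; the include path, the 1 MB stack size, and the work_proc/host_sketch glue are invented for illustration, and a real start procedure must switch away rather than return:

    #include <stddef.h>
    #include "cilk_fiber.h"            /* hypothetical include path */

    static cilk_fiber_pool g_pool;
    static cilk_fiber     *g_main_fiber;

    /* Runs on the new fiber's stack.  When done it hands the fiber back to
     * the pool and resumes the thread's main fiber in one step. */
    static void work_proc(cilk_fiber *self)
    {
        /* ... real work on the fiber's own stack ... */
        cilk_fiber_remove_reference_from_self_and_resume_other(self, &g_pool,
                                                               g_main_fiber);
    }

    static void host_sketch(void)
    {
        cilk_fiber_pool_init(&g_pool, /*parent=*/NULL, /*stack_size=*/1 << 20,
                             /*buffer_size=*/8, /*alloc_max=*/64,
                             /*is_shared=*/0);

        g_main_fiber = cilk_fiber_allocate_from_thread();  /* ref count 2 */

        cilk_fiber *f = cilk_fiber_allocate(&g_pool);      /* ref count 1 */
        if (f) {
            cilk_fiber_reset_state(f, work_proc);
            /* Starts work_proc on f; control returns here when work_proc
             * resumes the main fiber. */
            cilk_fiber_suspend_self_and_resume_other(g_main_fiber, f);
        }

        cilk_fiber_deallocate_from_thread(g_main_fiber);    /* 2 -> 0 */
        cilk_fiber_pool_destroy(&g_pool);
    }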
+ */ +NORETURN +cilk_fiber_remove_reference_from_self_and_resume_other(cilk_fiber* self, + cilk_fiber_pool* self_pool, + cilk_fiber* other); + +/** @brief Set the proc method to execute immediately after a switch + * to this fiber. + * + * The @c post_switch_proc method executes immediately after switching + * away form @p self fiber to some other fiber, but before @c self + * gets cleaned up. + * + * @note A fiber can have only one post_switch_proc method at a time. + * If this method is called multiple times before switching to the + * fiber, only the last proc method will execute. + * + * @param self Fiber. + * @param post_switch_proc Proc method to execute immediately after switching to this fiber. + */ +void cilk_fiber_set_post_switch_proc(cilk_fiber* self, cilk_fiber_proc post_switch_proc); + +/** @brief Invoke TBB stack op for this fiber. + * + * @param fiber Fiber to invoke stack op for. + * @param op The stack op to invoke + */ +void cilk_fiber_invoke_tbb_stack_op(cilk_fiber* fiber, __cilk_tbb_stack_op op); + +/** @brief Returns the fiber data associated with the specified fiber. + * + * The returned struct is owned by the fiber and is deallocated automatically + * when the fiber is destroyed. However, the client_data field is owned by + * the client and must be deallocated separately. When called for a + * newly-allocated fiber, the returned data is zero-filled. + * + * @param fiber The fiber for which data is being requested. + * @return The fiber data for the specified fiber + */ +cilk_fiber_data* cilk_fiber_get_data(cilk_fiber* fiber); + +/** @brief Retrieve the owner field from the fiber. + * + * This method is provided for convenience. One can also get the + * fiber data, and then get the owner field. + */ +__CILKRTS_INLINE +__cilkrts_worker* cilk_fiber_get_owner(cilk_fiber* fiber) +{ + // TBD: We really want a static assert here, that this cast is + // doing the right thing. + cilk_fiber_data* fdata = (cilk_fiber_data*)fiber; + return fdata->owner; +} + +/** @brief Sets the owner field of a fiber. + * + * This method is provided for convenience. One can also get the + * fiber data, and then get the owner field. + */ +__CILKRTS_INLINE +void cilk_fiber_set_owner(cilk_fiber* fiber, __cilkrts_worker* owner) +{ + // TBD: We really want a static assert here, that this cast is + // doing the right thing. + cilk_fiber_data* fdata = (cilk_fiber_data*)fiber; + fdata->owner = owner; +} + +/** @brief Returns true if this fiber is resumable. + * + * A fiber is considered resumable when it is not currently being + * executed. + * + * This function is used by Windows exception code. + * @param fiber The fiber to check. + * @return Nonzero value if fiber is resumable. + */ +int cilk_fiber_is_resumable(cilk_fiber* fiber); + +/** + * @brief Returns the base of this fiber's stack. + * + * On some platforms (e.g., Windows), the fiber must have started + * running before we can get this information. + * + * @param fiber The fiber to get the stack pointer from. + * @return The base of the stack, or NULL if this + * information is not available yet. + */ +char* cilk_fiber_get_stack_base(cilk_fiber* fiber); + + +/**************************************************************************** + * TBB interop functions + * **************************************************************************/ +/** + * @brief Set the TBB callback information for a stack + * + * @param fiber The fiber to set the TBB callback information for + * @param o The TBB callback thunk. 
Specifies the callback address and + * context value. + */ +void cilk_fiber_set_stack_op(cilk_fiber *fiber, + __cilk_tbb_stack_op_thunk o); + +/** + * @brief Save the TBB callback address and context value in + * thread-local storage. + * + * We'll use it later when the thread binds to a worker. + * + * @param o The TBB callback thunk which is to be saved. + */ +void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o); + +/** + * @brief Move TBB stack-op info from thread-local storage and store + * it into the fiber. + * + * Called when we bind a thread to the runtime. If there is any TBB + * interop information in thread-local storage, bind it to the stack + * now. + * + * @pre \c fiber should not be NULL. + * @param fiber The fiber that should take over the TBB interop information. + */ +void cilk_fiber_tbb_interop_use_saved_stack_op_info(cilk_fiber *fiber); + +/** + * @brief Free any TBB interop information saved in thread-local storage + */ +void cilk_fiber_tbb_interop_free_stack_op_info(void); + +/** + * @brief Migrate any TBB interop information from a cilk_fiber to + * thread-local storage. + * + * Returns immediately if no TBB interop information has been + * associated with the stack. + * + * @param fiber The cilk_fiber who's TBB interop information should be + * saved in thread-local storage. + */ +void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber* fiber); + + +#if SUPPORT_GET_CURRENT_FIBER +/** @brief Returns the fiber associated with the currently executing thread + * + * @note This function is currently used only for testing the Cilk + * runtime. + * + * @return Fiber associated with the currently executing thread or NULL if no + * fiber was associated with this thread. + */ +cilk_fiber* cilk_fiber_get_current_fiber(void); +#endif + + +#if NEED_FIBER_REF_COUNTS +/** @brief Returns true if this fiber has reference count > 0. + * + * @param fiber The fiber to check for references. + * @return Nonzero value if the fiber has references. + */ +int cilk_fiber_has_references(cilk_fiber *fiber); + +/** @brief Returns the value of the reference count. + * + * @param fiber The fiber to check for references. + * @return The value of the reference count of fiber. + */ +int cilk_fiber_get_ref_count(cilk_fiber *fiber); + +/** @brief Adds a reference to this fiber. + * + * Increments the reference count of a current fiber. Fibers with + * nonzero reference count will not be freed or returned to a fiber + * pool. + * + * @param fiber The fiber to add a reference to. + */ +void cilk_fiber_add_reference(cilk_fiber *fiber); + +#endif // NEED_FIBER_REF_COUNTS + +__CILKRTS_END_EXTERN_C + +#ifdef __cplusplus +// Some C++ implementation details + +/// Opaque declaration of a cilk_fiber_sysdep object. +struct cilk_fiber_sysdep; + +/** + * cilk_fiber is a base-class for system-dependent fiber implementations. + */ +struct cilk_fiber : protected cilk_fiber_data +{ + protected: + // This is a rare acceptable use of protected inheritence and protected + // variable access: when the base class and derived class collaborate + // tightly to comprise a single component. + + /// For overloading constructor of cilk_fiber. + enum from_thread_t { from_thread = 1 }; + + // Boolean flags capturing the status of the fiber. + // Each one can be set independently. + // A default fiber is constructed with a flag value of 0. + static const int RESUMABLE = 0x01; ///< True if the fiber is in a suspended state and can be resumed. 
+ static const int ALLOCATED_FROM_THREAD = 0x02; ///< True if fiber was allocated from a thread. + + cilk_fiber_proc m_start_proc; ///< Function to run on start up/reset + cilk_fiber_proc m_post_switch_proc; ///< Function that executes when we first switch to a new fiber from a different one. + + cilk_fiber* m_pending_remove_ref;///< Fiber to possibly delete on start up or resume + cilk_fiber_pool* m_pending_pool; ///< Pool where m_pending_remove_ref should go if it is deleted. + unsigned m_flags; ///< Captures the status of this fiber. + +#if NEED_FIBER_REF_COUNTS + volatile long m_outstanding_references; ///< Counts references to this fiber. +#endif + + /// Creates a fiber with NULL data. + cilk_fiber(); + + /** + * @brief Creates a fiber with user-specified arguments. + * + * @param stack_size Size of stack to use for this fiber. + */ + cilk_fiber(std::size_t stack_size); + + /// Empty destructor. + ~cilk_fiber(); + + /** + * @brief Performs any actions that happen after switching from + * one fiber to another. + * + * These actions are: + * 1. Execute m_post_switch_proc on a fiber. + * 2. Do any pending deallocations from the previous fiber. + */ + void do_post_switch_actions(); + + /** + *@brief Helper method that converts a @c cilk_fiber object into a + * @c cilk_fiber_sysdep object. + * + * The @c cilk_fiber_sysdep object contains the system-dependent parts + * of the implementation of a @\c cilk_fiber. + * + * We could have @c cilk_fiber_sysdep inherit from @c cilk_fiber and + * then use virtual functions. But since a given platform only uses + * one definition of @c cilk_fiber_sysdep at a time, we statically + * cast between them. + */ + inline cilk_fiber_sysdep* sysdep(); + + /** + * @brief Set resumable flag to specified state. + */ + inline void set_resumable(bool state) { + m_flags = state ? (m_flags | RESUMABLE) : (m_flags & (~RESUMABLE)); + } + + /** + *@brief Set the allocated_from_thread flag. + */ + inline void set_allocated_from_thread(bool state) { + m_flags = state ? (m_flags | ALLOCATED_FROM_THREAD) : (m_flags & (~ALLOCATED_FROM_THREAD)); + } + + public: + + /** + * @brief Allocates and initializes a new cilk_fiber, either from + * the specified pool or from the heap. + * + * @pre pool should not be NULL. + */ + static cilk_fiber* allocate(cilk_fiber_pool* pool); + + /** + * @brief Allocates a fiber from the heap. + */ + static cilk_fiber* allocate_from_heap(size_t stack_size); + + /** + * @brief Return a fiber to the heap. + */ + void deallocate_to_heap(); + + /** + * @brief Reset the state of a fiber just allocated from a pool. + */ + void reset_state(cilk_fiber_proc start_proc); + + /** + * @brief Remove a reference from this fiber, possibly + * deallocating it if the reference count becomes 0. + * + * @param pool The fiber pool to which this fiber should be returned. + * @return The final reference count. + */ + int remove_reference(cilk_fiber_pool* pool); + + /** + * @brief Deallocate the fiber by returning it to the pool. + * @pre This method should only be called if the reference count + * is 0. + * + * @param pool The fiber pool to return this fiber to. If NULL, + * fiber is returned to the heap. + */ + void deallocate_self(cilk_fiber_pool *pool); + + /** @brief Allocates and intializes this thread's main fiber. */ + static cilk_fiber* allocate_from_thread(); + + /** @brief Deallocate a fiber created from a thread, + * possibly destroying it. 
+ * + * This method decrements the reference count of this fiber by 2, + * and destroys the fiber if the reference count is 0. + * + * OS-specific cleanup for the fiber executes unconditionally with for + * this method. The destruction of the actual object, however, does + * not occur unless the reference count is 0. + * + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ + int deallocate_from_thread(); + + /** @brief Removes a reference from this fiber. + * + * This method deallocates this fiber if the reference count + * becomes 0. + * + * @pre This fiber must be allocated from a thread. + * @return The final reference count of this fiber. + */ + int remove_reference_from_thread(); + +#if SUPPORT_GET_CURRENT_FIBER + /** @brief Get the current fiber from TLS. + * + * @note This function is only used for testing the runtime. + */ + static cilk_fiber* get_current_fiber(); +#endif + + /** @brief Suspend execution on current fiber resumes other fiber. + * + * Control returns after resuming execution of the self fiber. + */ + void suspend_self_and_resume_other(cilk_fiber* other); + + + /** @brief Removes a reference from the currently executing fiber + * and resumes other fiber. + * + * This fiber may be returned to a pool or deallocated. + */ + NORETURN remove_reference_from_self_and_resume_other(cilk_fiber_pool* self_pool, + cilk_fiber* other); + + /** @brief Set the proc method to execute immediately after a switch + * to this fiber. + * + * @param post_switch_proc Proc method to execute immediately + * after switching to this fiber. + */ + inline void set_post_switch_proc(cilk_fiber_proc post_switch_proc) { + m_post_switch_proc = post_switch_proc; + } + + /** @brief Returns true if this fiber is resumable. + * + * A fiber is considered resumable when it is not currently being + * executed. + */ + inline bool is_resumable(void) { + return (m_flags & RESUMABLE); + } + + /** @brief Returns true if fiber was allocated from a thread. */ + inline bool is_allocated_from_thread(void) { + return (m_flags & ALLOCATED_FROM_THREAD); + } + + /** + *@brief Get the address at the base of the stack for this fiber. + */ + inline char* get_stack_base(); + + /** @brief Return the data for this fiber. */ + cilk_fiber_data* get_data() { return this; } + + /** @brief Return the data for this fiber. */ + cilk_fiber_data const* get_data() const { return this; } + + +#if NEED_FIBER_REF_COUNTS + /** @brief Verifies that this fiber's reference count equals v. */ + inline void assert_ref_count_equals(long v) { + #if FIBER_CHECK_REF_COUNTS + CILK_ASSERT(m_outstanding_references >= v); + #endif + } + + /** @brief Verifies that this fiber's reference count is at least v. */ + inline void assert_ref_count_at_least(long v) { + #if FIBER_CHECK_REF_COUNTS + CILK_ASSERT(m_outstanding_references >= v); + #endif + } + + /** @brief Get reference count. */ + inline long get_ref_count() { return m_outstanding_references; } + + /** @brief Initialize reference count. + * Operation is not atomic. + */ + inline void init_ref_count(long v) { m_outstanding_references = v; } + + // For Windows, updates to the fiber reference count need to be + // atomic, because exceptions can live on a stack that we are not + // currently executing on. Thus, we can update the reference + // count of a fiber we are not currently executing on. + + /** @brief Increment reference count for this fiber [Windows]. 
*/ + inline void inc_ref_count() { atomic_inc_ref_count(); } + + /** @brief Decrement reference count for this fiber [Windows]. */ + inline long dec_ref_count() { return atomic_dec_ref_count(); } + + /** @brief Subtract v from the reference count for this fiber [Windows]. */ + inline long sub_from_ref_count(long v) { return atomic_sub_from_ref_count(v); } +#else // NEED_FIBER_REF_COUNTS + + // Without reference counting, we have placeholder methods. + inline void init_ref_count(long v) { } + + inline void inc_ref_count() { } + + // With no reference counting, dec_ref_count always return 0. + // Thus, anyone checking is always the "last" one. + inline long dec_ref_count() { return 0; } + inline long sub_from_ref_count(long v) { return 0; } + + // The assert methods do nothing. + inline void assert_ref_count_equals(long v) { } + inline void assert_ref_count_at_least(long v) { } +#endif + + /** + * @brief Call TBB to tell it about an "interesting" event. + * + * @param op Value specifying the event to track. + */ + void invoke_tbb_stack_op(__cilk_tbb_stack_op op); + +private: + + /** + * @brief Helper method: try to allocate a fiber from this pool or + * its ancestors without going to the OS / heap. + * + * Returns allocated pool, or NULL if no pool is found. + * + * If pool contains a suitable fiber. Return it. Otherwise, try to + * recursively grab a fiber from the parent pool, if there is one. + * + * This method will not allocate a fiber from the heap. + */ + static cilk_fiber* try_allocate_from_pool_recursive(cilk_fiber_pool* pool); + + +#if NEED_FIBER_REF_COUNTS + /** + * @brief Atomic increment of reference count. + */ + void atomic_inc_ref_count(); + + /** + * @brief Atomic decrement of reference count. + */ + long atomic_dec_ref_count(); + + /** + * @brief Atomic subtract of v from reference count. + * @param v Value to subtract. + */ + long atomic_sub_from_ref_count(long v); +#endif // NEED_FIBER_REF_COUNTS + +}; + +#endif // __cplusplus + +#endif // ! defined(INCLUDED_CILK_FIBER_DOT_H) diff --git a/libcilkrts/runtime/cilk_malloc.c b/libcilkrts/runtime/cilk_malloc.c new file mode 100644 index 00000000000..9d02c52d037 --- /dev/null +++ b/libcilkrts/runtime/cilk_malloc.c @@ -0,0 +1,84 @@ +/* cilk_malloc.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "cilk_malloc.h" + +#include <stdlib.h> +#if defined _WIN32 || defined _WIN64 || defined __linux__ +#include <malloc.h> +#define HAS_MEMALIGN 1 +#endif +#ifdef __VXWORKS__ +#define HAS_MEMALIGN 1 +#include <memLib.h> +#endif + +#define PREFERRED_ALIGNMENT 64 /* try to keep runtime system data + structures within one cache line */ + +void *__cilkrts_malloc(size_t size) +{ + /* TODO: check for out of memory */ +#ifdef _WIN32 + return _aligned_malloc(size, PREFERRED_ALIGNMENT); +#elif defined HAS_MEMALIGN + return memalign(PREFERRED_ALIGNMENT, size); +#else + return malloc(size); +#endif +} + +void *__cilkrts_realloc(void *ptr, size_t size) +{ +#ifdef _WIN32 + return _aligned_realloc(ptr, size, PREFERRED_ALIGNMENT); +#else + return realloc(ptr, size); +#endif +} + +void __cilkrts_free(void *ptr) +{ +#ifdef _WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +/* End cilk_malloc.c */ diff --git a/libcilkrts/runtime/cilk_malloc.h b/libcilkrts/runtime/cilk_malloc.h new file mode 100644 index 00000000000..fa0fa6d5c9d --- /dev/null +++ b/libcilkrts/runtime/cilk_malloc.h @@ -0,0 +1,90 @@ +/* cilk_malloc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
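The allocator above asks for 64-byte alignment so that runtime structures sit on their own cache line. The same guarantee can be demonstrated outside the runtime with the standard POSIX allocator (standalone illustration, not runtime code):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define CACHE_LINE 64

    int main(void)
    {
        void *p = NULL;

        /* posix_memalign is the portable spelling of the memalign() used above. */
        if (posix_memalign(&p, CACHE_LINE, 256) != 0)
            return 1;

        /* The point of the exercise: the block starts on a cache-line boundary. */
        assert(((uintptr_t)p % CACHE_LINE) == 0);

        free(p);
        return 0;
    }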
+ **************************************************************************/ + +/** + * @file cilk_malloc.h + * + * @brief Provides replacement memory allocation functions to allocate + * (and free) memory on cache line boundaries, if supported by the OS. + * + * If aligned memory functions are not provided by the OS, the calls just + * pass through to the standard memory allocation functions. + */ + +#ifndef INCLUDED_CILK_MALLOC_DOT_H +#define INCLUDED_CILK_MALLOC_DOT_H + +#include <cilk/common.h> +#include <stddef.h> + +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * malloc replacement function to allocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param size Number of bytes to allocate. + * + * @return pointer to memory block allocated, or NULL if unsuccessful. + */ +COMMON_PORTABLE void *__cilkrts_malloc(size_t size); + +/** + * realloc replacement function to allocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param ptr Block to be reallocated. + * @param size Number of bytes to allocate. + * + * @return pointer to memory block allocated, or NULL if unsuccessful. + */ +COMMON_PORTABLE void *__cilkrts_realloc(void *ptr, size_t size); + +/** + * free replacement function to deallocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param ptr Block to be freed. + */ +COMMON_PORTABLE void __cilkrts_free(void *ptr); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_CILK_MALLOC_DOT_H) diff --git a/libcilkrts/runtime/component.h b/libcilkrts/runtime/component.h new file mode 100644 index 00000000000..64ff3e5fc42 --- /dev/null +++ b/libcilkrts/runtime/component.h @@ -0,0 +1,52 @@ +/* component.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_COMPONENT_DOT_H +#define INCLUDED_COMPONENT_DOT_H + +#define COMPONENT_NAME "Intel® Cilk™ Plus Runtime" + +#define COMPONENT_INTERNAL_NAME COMPONENT_NAME + +#define COMPONENT_FILENAME "CILKRTS20" + +#define BuildVersionString(_major, _minor, _build, _rev) #_major "," #_minor "," #_build "," #_rev + +#define COMPONENT_VERSION_STRING BuildVersionString (VERSION_MAJOR, VERSION_MINOR, VERSION_BUILD, VERSION_REVISION) + +#endif // ! defined(INCLUDED_COMPONENT_DOT_H) diff --git a/libcilkrts/runtime/config/generic/cilk-abi-vla.c b/libcilkrts/runtime/config/generic/cilk-abi-vla.c new file mode 100644 index 00000000000..98fefa101bd --- /dev/null +++ b/libcilkrts/runtime/config/generic/cilk-abi-vla.c @@ -0,0 +1,107 @@ +/* cilk-abi-vla.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of Variable Length Array (VLA) ABI. + * + * The compiler calls these functions to allocate Variable Length Arrays + * at runtime. The compiler must guarantee that __cilkrts_stack_free() is + * called to cleanup any memory allocated by __cilkrts_stack_alloc(). 
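One wrinkle in component.h above: arguments used with the # operator are not macro-expanded, so stringizing VERSION_MAJOR and friends directly yields the macro names rather than their values. The usual remedy is a second expansion level, sketched below with made-up version numbers (illustration only, not part of the patch):

    #include <stdio.h>

    /* Two-level stringization: STR1 expands its argument first, then STR0
     * turns the expanded tokens into a string literal. */
    #define STR0(x) #x
    #define STR1(x) STR0(x)

    #define VERSION_MAJOR    2
    #define VERSION_MINOR    0
    #define VERSION_BUILD    1
    #define VERSION_REVISION 0

    #define VERSION_STRING \
        STR1(VERSION_MAJOR) "," STR1(VERSION_MINOR) "," \
        STR1(VERSION_BUILD) "," STR1(VERSION_REVISION)

    int main(void)
    {
        puts(VERSION_STRING);   /* prints: 2,0,1,0 */
        return 0;
    }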
+ *
+ * This generic implementation always allocates the memory from the heap.
+ * Optimally, the implementation should expand the frame of the calling
+ * function if possible, since that will be faster. See the x86 version
+ * for one possible implementation.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "internal/abi.h"
+#include "cilk-abi-vla-internal.h"
+
+#define c_cilk_ptr_from_heap 0xc2f2f00d
+#define c_cilk_ptr_from_stack 0xc3f30d0f
+
+// Allocate space for a variable length array
+CILK_ABI(__cilkrts_void_ptr)
+__cilkrts_stack_alloc(
+ __cilkrts_stack_frame *sf,
+ size_t size,
+ size_t distance_from_sp_to_alloca_area,
+ uint32_t align, // align is always >= minimum stack alignment and
+ // >= ptr_size as well, and must be a power of 2.
+ uint32_t needs_tag // non-zero if the pointer being returned needs to
+ // be tagged
+)
+{
+ // full_size will be a multiple of align, and contains
+ // enough extra space to allocate a marker.
+ size_t full_size = (size + align - 1) & ~(align - 1);
+
+ // Allocate memory from the heap. The compiler is responsible
+ // for guaranteeing us a chance to free it before the function
+ // exits.
+
+ return (void *)vla_internal_heap_alloc(sf, full_size, align);
+}
+
+// Free the space allocated for a variable length array.
+CILK_ABI(void)
+__cilkrts_stack_free(
+ __cilkrts_stack_frame *sf,
+ void *p,
+ size_t size,
+ size_t distance_from_sp_to_alloca_area,
+ uint32_t align, // same requirements as for align in allocation,
+ // and must match alignment that was passed when
+ // doing the allocation
+ uint32_t known_from_stack // non-zero if this is known to be allocated
+ // on the stack, and therefore has no tag
+)
+{
+ // full_size will be a multiple of align, and contains
+ // enough extra space to allocate a marker if one was needed.
+ size_t full_size = (size + align - 1) & ~(align - 1);
+
+ // Just free the allocated memory to the heap since we don't know
+ // how to expand/contract the calling frame.
+ vla_internal_heap_free(p, full_size);
+}
diff --git a/libcilkrts/runtime/config/generic/os-fence.h b/libcilkrts/runtime/config/generic/os-fence.h new file mode 100644 index 00000000000..841307a5296 --- /dev/null +++ b/libcilkrts/runtime/config/generic/os-fence.h @@ -0,0 +1,53 @@ +/* os.h -*-C++-*-
+ *
+ *************************************************************************
+ *
+ * @copyright
+ * Copyright (C) 2009-2013, Intel Corporation
+ * All rights reserved.
+ *
+ * @copyright
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * @copyright
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * void __cilkrts_fence(void) + * + * Executes an MFENCE instruction to serialize all load and store instructions + * that were issued prior the MFENCE instruction. This serializing operation + * guarantees that every load and store instruction that precedes the MFENCE + * instruction is globally visible before any load or store instruction that + * follows the MFENCE instruction. The MFENCE instruction is ordered with + * respect to all load and store instructions, other MFENCE instructions, any + * SFENCE and LFENCE instructions, and any serializing instructions (such as + * the CPUID instruction). + */ + +COMMON_SYSDEP void __cilkrts_fence(void); ///< MFENCE instruction + diff --git a/libcilkrts/runtime/config/generic/os-unix-sysdep.c b/libcilkrts/runtime/config/generic/os-unix-sysdep.c new file mode 100644 index 00000000000..fda7fc414bc --- /dev/null +++ b/libcilkrts/runtime/config/generic/os-unix-sysdep.c @@ -0,0 +1,94 @@ +/* os-unix-sysdep.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
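The full fence described above is what lets one thread publish data before raising a flag and another thread trust the data after seeing the flag. A standalone illustration using the GCC builtin barrier (the x86 build of the runtime emits an actual MFENCE; this is only a sketch of the pattern):

    #include <stdio.h>

    static int          payload;
    static volatile int ready;

    /* Producer: make the payload globally visible before the flag. */
    static void publish(int value)
    {
        payload = value;
        __sync_synchronize();   /* full barrier, comparable to MFENCE */
        ready = 1;
    }

    /* Consumer: read the payload only after the flag is seen. */
    static int consume(void)
    {
        while (!ready)
            ;                   /* spin; a real consumer would pause or yield */
        __sync_synchronize();
        return payload;
    }

    int main(void)
    {
        publish(42);
        printf("%d\n", consume());   /* single-threaded demo: prints 42 */
        return 0;
    }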
+ ************************************************************************* + * + * This file contains generic implementations of system-specific code for + * Unix-based systems + */ + +#include "os.h" +#include "sysdep.h" + +/* + * The cycle counter is used for debugging. This funciton is only called if + * CILK_PROFILE is defined when the runtime is built. + */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void) +{ +# warning "unimplemented cycle counter" + return 0; +} + +/* + * A "short pause" - called from the Cilk runtime's spinloops. + */ +COMMON_SYSDEP void __cilkrts_short_pause(void) +{ +# warning __cilkrts_short_pause empty +} + +/* + * Interlocked exchange - used to implement the Cilk runtime's spinloops + */ +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x) +{ + x = __sync_lock_test_and_set(ptr, x); + return x; +} + + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. + * + * Only valid for IA32 and Intel64 processors. + */ +void restore_x86_fp_state (__cilkrts_stack_frame *sf) +{ +} + + +/* + * Save the floating point state to the __cilkrts_stack_frame at each spawn. + * + * Architecture-specific - Should only be needed on IA32 and Intel64 + * processors. + */ +void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ +} + diff --git a/libcilkrts/runtime/config/x86/cilk-abi-vla.c b/libcilkrts/runtime/config/x86/cilk-abi-vla.c new file mode 100644 index 00000000000..2d38e7f9a56 --- /dev/null +++ b/libcilkrts/runtime/config/x86/cilk-abi-vla.c @@ -0,0 +1,422 @@ +/* cilk-abi-vla.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of Variable Length Array (VLA) ABI. 
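An interlocked exchange such as the __cilkrts_xchg() shown above is all that is needed for the simple test-and-set spin locks the comment mentions; the runtime's own spin_mutex lives in a separate file of this patch. A minimal sketch built on the same builtin (illustration only):

    /* 0 = unlocked, 1 = locked. */
    typedef volatile int toy_spinlock;

    static void toy_spin_lock(toy_spinlock *lock)
    {
        /* __sync_lock_test_and_set atomically stores 1 and returns the old
         * value; the lock is ours once that old value was 0. */
        while (__sync_lock_test_and_set(lock, 1) != 0) {
            while (*lock)
                ;   /* spin on a plain read; a real lock would pause here */
        }
    }

    static void toy_spin_unlock(toy_spinlock *lock)
    {
        __sync_lock_release(lock);   /* store 0 with release semantics */
    }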
+ * + * __cilkrts_stack_alloc() and __cilkrts_stack_free must be compiled + * such that ebp/rbp is used for the stack frames. This is done by having + * each of them use alloca, which forces the special frame types needed on + * each of the ABIs. Additionally, for some forms of stack frame, special + * care must be taken because the alloca space may not be at the bottom of the + * stack frame of the caller. For Intel64 windows, and for some options + * with other ABIs, a preallocated parameter block may exist on the stack + * at a lower address than the alloca. If this is the case, the parameter + * distance_from_sp_to_alloca_area will be non-zero, and will indicate how + * much pre-allocated parameter space resides in the caller's stack frame + * between the alloca area, and the bottom of the stack when the call to + * the cilkrts is made. As such, when non-zero it also includes any space + * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters. + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdint.h> +#ifdef _WIN32 +# define alloca _alloca +# define INLINE static __inline +# pragma warning(disable:1025) // Don't whine about zero extending result of unary operation +#else +# include <alloca.h> +# define INLINE static inline +#endif + +#include "internal/abi.h" +#include "cilk-abi-vla-internal.h" + +#if defined(__x86_64) || defined(_M_X64) +INLINE void setsp(void *val) +{ + __asm__("movq %0, %%rsp" : : "r"(val): "rsp"); +} +INLINE char* getsp(void) +{ + void *res; + + __asm__("movq %%rsp, %0" : "=r"(res): : "rsp"); + return res; +} +INLINE char* getbp(void) +{ + void *res; + + __asm__("movq %%rbp, %0" : "=r"(res): : "rbp"); + return res; +} +INLINE void copy_frame_down_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be lower address than src, + // therefore copying upwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 8, and the copy is done in 64 bit word chunks for + // best efficiency. + __asm__( + "movq %0, %%rdi;" + "movq %1, %%rsi;" + "movq %2, %%rcx;" + "shrq $3, %%rcx;" + "rep movsq;" + "movq %3, %%rbp" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "rsi", "rdi", "rcx", "rbp", "memory"); +} +INLINE void copy_frame_up_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be higher address than src, + // therefore copying downwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 8, and the copy is done in 64 bit word chunks for + // best efficiency. 
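+    // Start from the last 64-bit word of each region: the string move below
+    // runs with the direction flag set (std), so rep movsq walks rdi/rsi
+    // downwards through memory, and cld restores the normal forward
+    // direction once the copy is done.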
+ dst += cpy_bytes - 8; + src += cpy_bytes - 8; + __asm__( + "movq %0, %%rdi;" + "movq %1, %%rsi;" + "movq %2, %%rcx;" + "shrq $3, %%rcx;" + "std; rep movsq; cld;" + "movl %3, %%rbp;" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "rsi", "rdi", "rcx", "rbp", "memory"); +} +#else +INLINE void setsp(void *val) +{ + __asm__("movl %0, %%esp" : : "r"(val): "esp"); +} +INLINE char* getsp(void) +{ + void *res; + + __asm__("movl %%esp, %0" : "=r"(res): : "esp"); + return res; +} +INLINE char* getbp(void) +{ + void *res; + + __asm__("movl %%ebp, %0" : "=r"(res): : "ebp"); + return res; +} +INLINE void copy_frame_down_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be lower address than src, + // therefore copying upwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 4, and the copy is done in 32 bit word chunks for + // best efficiency. + __asm__( + "movl %0, %%edi;" + "movl %1, %%esi;" + "movl %2, %%ecx;" + "shrl $2, %%ecx;" + "rep movsd;" + "movl %3, %%ebp" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "esi", "edi", "ecx", "ebp", "memory"); +} +INLINE void copy_frame_up_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be higher address than src, + // therefore copying downwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 4, and the copy is done in 32 bit word chunks for + // best efficiency. + dst += cpy_bytes - 4; + src += cpy_bytes - 4; + __asm__( + "movl %0, %%edi;" + "movl %1, %%esi;" + "movl %2, %%ecx;" + "shrl $2, %%ecx;" + "std; rep movsd; cld;" + "movl %3, %%ebp" : + // "=D"(dst), "=S"(src), "=C"(cpy_bytes) : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "esi", "edi", "ecx", "ebp", "memory"); +} +#endif + + +#define c_cilk_ptr_from_heap 0xc2f2f00d +#define c_cilk_ptr_from_stack 0xc3f30d0f + +CILK_ABI(__cilkrts_void_ptr) +__cilkrts_stack_alloc( + __cilkrts_stack_frame *sf, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // align is always >= minimum stack alignment and + // >= ptr_size as well, and must be a power of 2. + uint32_t needs_tag // non-zero if the pointer being returned needs to + // be tagged +) +{ +#ifdef __INTEL_COMPILER + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker. + size_t full_size = (size + align - 1) & ~(align - 1); + + if (needs_tag) { + full_size += align; + } + + char *t; + if (sf->worker != 0 && + ((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) { + t = vla_internal_heap_alloc(sf, full_size, align); + if (needs_tag) { + t += align; + ((uint32_t*)t)[-1] = c_cilk_ptr_from_heap; + } + return (void *)t; + } + + // stack is still synced, allocate full_size from esp, + // and record in 32 bits immediately below the space + // allocated that this was space that this was + // allocated in the stack. + char *old_ebp = getbp(); + char *old_esp = getsp(); + + // make top_ptr point to base of first parameter. 
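+    // _AddressOfReturnAddress() gives the address of the stack slot holding
+    // this function's return address, so one pointer-size above that slot is
+    // where the first stack-passed parameter (if any) lives.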
+ char *top_ptr = ((char *)(_AddressOfReturnAddress()) + + sizeof(char *)); + size_t param_size = 0; + +#if defined(__x86_64) + // For Intel64 linux & MACH ABI, all the parameters were passed in + // register, so top of the stack frame above the return address + // is just the size of the return address plus + // distance_from_sp_to_alloca_area on the chance that the alloca + // area isn't at the very bottom of the calling functions stack. +#elif defined(__MACH__) + // For ia32 MACH, parameter size is always a mutliple of 16 + // bytes to keep the stack 16 byte aligned. So we need to round + // number of parameters up to multiple of 4. + param_size = 8 * sizeof(char *); +#else + // For both windows Intel64 ABI, and the IA32 windows and + // linux ABIs, space is reserved on the stack for all these + // parameters. param_size is 5 * size of a stack slot. + param_size = 5 * sizeof(char *); +#endif + + // now make top_ptr point above the params, or if + // distance_from_sp_to_alloca_area is not zero, make + // it point above that area. When non-zero, + // distance_from_sp_to_alloca area is expected to contain + // the parameter space, so we only add one or the other, + // not both. + top_ptr += (distance_from_sp_to_alloca_area != 0) ? + distance_from_sp_to_alloca_area : param_size; + + // t needs to end up at current value of top_ptr less full_size and less + // distance_from_sp_to_alloca_area and + // then rounded down to the alignment needed. Then we have to bump + // esp down by current frame_size, so that when all is done with respect + // to executing the return sequence, the final value of esp will be the + // same value as t. + t = (top_ptr - full_size) - distance_from_sp_to_alloca_area; + intptr_t temp = (intptr_t)t; + temp &= ~((intptr_t)(align - 1)); + t = (char *)temp; + + // ok, the value of t is set where we need it. Now set esp + // to the value of t less the current frame size. + // So now when we do regular return esp should be left such + // that it has moved down by full_size. + size_t cur_fm_size = (top_ptr - old_esp); + char *new_esp = t - cur_fm_size; + char *new_ebp = old_ebp - (old_esp - new_esp); + + // extend the stack down by at least the difference between where + // I want it to be and where it currently is. This should take care + // of touching any pages necessary. + char *foo = alloca(old_esp - new_esp); + setsp(foo < new_esp ? foo : new_esp); + + // Now set esp exactly where I want it. + // setsp(new_esp); + + copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); + + if (needs_tag) { + t += align; + ((uint32_t*)t)[-1] = c_cilk_ptr_from_stack; + } + + return t; +#else // Not __INTEL_COMPILER + // Not supported unless we can figure out how to get the size of the frame + return NULL; +#endif +} + +// This frees the space allocated for a variable length array. +CILK_ABI(void) +__cilkrts_stack_free( + __cilkrts_stack_frame *sf, + void *p, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // same requirements as for align in allocation, + // and must match alignment that was passed when + // doing the allocation + uint32_t known_from_stack // non-zero if this is known to be allocated + // on the stack, and therefore has no tag +) +{ +#ifdef __INTEL_COMPILER + uint32_t *t = (uint32_t*)p; + + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker if one was needed. 
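+    // Round size up to the next multiple of align (align is a power of 2);
+    // e.g. size == 20 with align == 16 gives (20 + 15) & ~15 == 32.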
+ size_t full_size = (size + align - 1) & ~(align - 1); + if (known_from_stack == 0) { + // if the compiler hasn't told the run-time that this is + // known to be on the stack, then this pointer must have been + // tagged such that the run-time can tell. + assert(t[-1] == c_cilk_ptr_from_stack || + t[-1] == c_cilk_ptr_from_heap); + + known_from_stack = t[-1] == c_cilk_ptr_from_stack; + full_size += align; // accounts for extra space for marker + t = (uint32_t *)(((char *)t) - align); + } + + if (known_from_stack) { + // alloca useage forces an ebp/rbp based stack frame even though + // 0 and unused. + char *foo = alloca(0); + if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) { + // p was allocated from current stack frame and we + // are synced on current stack frame. Return the + // amount of the stack that needs to be freed. + char *old_ebp = getbp(); + char *old_esp = getsp(); + + // make top_ptr point to base of first parameter. + char *top_ptr = ((char *)(_AddressOfReturnAddress()) + + sizeof(char *)); + size_t param_size = 0; + +#if defined(__x86_64) + // For Intel64 linux & MACH ABI, all the parameters were passed in + // register, so top of the stack frame above the return address + // is just the size of the return address plus + // distance_from_sp_to_alloca_area on the chance that the alloca + // area isn't at the very bottom of the calling functions stack. +#elif defined(__MACH__) + // For ia32 MACH, parameter size is always a mutliple of 16 + // bytes to keep the stack 16 byte aligned. So we need to round + // number of parameters up to multiple of 4. + param_size = 8 * sizeof(char *); +#else + // For both windows Intel64 ABI, and the IA32 windows and + // linux ABIs, space is reserved on the stack for all these + // parameters. param_size is 5 * size of a stack slot. + param_size = 6 * sizeof(char *); +#endif + + // now make top_ptr point above the params, or if + // distance_from_sp_to_alloca_area is not zero, make + // it point above that area. When non-zero, + // distance_from_sp_to_alloca area is expected to contain + // the parameter space, so we only add one or the other, + // not both. + top_ptr += (distance_from_sp_to_alloca_area != 0) ? + distance_from_sp_to_alloca_area : param_size; + + size_t cur_fm_size = (top_ptr - old_esp); + char *new_esp = old_esp + full_size; + char *new_ebp = old_ebp + full_size; + + copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); + setsp(new_esp); + } + else { + // p was allocated on stack frame, but that is + // no longer the current stack frame. Need to adjust the + // saved esp that is somewhere in the cilk runtime so that + // on sync, esp will be cut back correctly. + vla_free_from_original_stack(sf, full_size); + } + } + else { + vla_internal_heap_free(t, full_size); + } +#else // Not __INTEL_COMPILER + // Not supported unless we can figure out how to get the size of the frame +#endif +} diff --git a/libcilkrts/runtime/config/x86/os-fence.h b/libcilkrts/runtime/config/x86/os-fence.h new file mode 100644 index 00000000000..ec704e94ef2 --- /dev/null +++ b/libcilkrts/runtime/config/x86/os-fence.h @@ -0,0 +1,72 @@ +/* os.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* gcc before 4.4 does not implement __sync_synchronize properly */ +#if (__ICC >= 1110 && !(__MIC__ || __MIC2__)) \ + || (!defined __ICC && __GNUC__ * 10 + __GNUC_MINOR__ > 43) +# define HAVE_SYNC_INTRINSICS 1 +#endif + + +/* + * void __cilkrts_fence(void) + * + * Executes an MFENCE instruction to serialize all load and store instructions + * that were issued prior the MFENCE instruction. This serializing operation + * guarantees that every load and store instruction that precedes the MFENCE + * instruction is globally visible before any load or store instruction that + * follows the MFENCE instruction. The MFENCE instruction is ordered with + * respect to all load and store instructions, other MFENCE instructions, any + * SFENCE and LFENCE instructions, and any serializing instructions (such as + * the CPUID instruction). + */ +#ifdef HAVE_SYNC_INTRINSICS +# define __cilkrts_fence() __sync_synchronize() +#elif defined __ICC || defined __GNUC__ + /* mfence is a strict subset of lock add but takes longer on many + * processors. */ +// # define __cilkrts_fence() __asm__ volatile ("mfence") + /* On MIC, fence seems to be completely unnecessary. 
+ * Just for simplicity of 1st implementation, it defaults to x86 */ +# define __cilkrts_fence() __asm__ volatile ("lock addl $0,(%rsp)") +// #elif defined _WIN32 +// # pragma intrinsic(_ReadWriteBarrier) +// # define __cilkrts_fence() _ReadWriteBarrier() +#else +COMMON_SYSDEP void __cilkrts_fence(void); ///< MFENCE instruction +#endif diff --git a/libcilkrts/runtime/config/x86/os-unix-sysdep.c b/libcilkrts/runtime/config/x86/os-unix-sysdep.c new file mode 100644 index 00000000000..881bc3f4283 --- /dev/null +++ b/libcilkrts/runtime/config/x86/os-unix-sysdep.c @@ -0,0 +1,123 @@ +/* os-unix-sysdep.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ************************************************************************* + * + * This file contains system-specific code for Unix systems + */ + +#include "os.h" +#include "sysdep.h" +#include <internal/abi.h> + +// On x86 processors (but not MIC processors), the compiler generated code to +// save the FP state (rounding mode and the like) before calling setjmp. We +// will need to restore that state when we resume. 
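+//
+// Concretely (see restore_x86_fp_state() and sysdep_save_fp_ctrl_state() at
+// the bottom of this file): the SSE control/status register is captured with
+// stmxcsr into sf->mxcsr and reloaded with ldmxcsr, and the x87 word saved
+// into sf->fpcsr is reloaded with fldcw, when a frame is resumed.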
+#ifndef __MIC__ +# if defined(__i386__) || defined(__x86_64) +# define RESTORE_X86_FP_STATE +# endif // defined(__i386__) || defined(__x86_64) +#endif // __MIC__ + +/* timer support */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void) +{ +#if defined __i386__ || defined __x86_64 + unsigned a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((unsigned long long)a) | (((unsigned long long)d) << 32); +#else +# warning "unimplemented cycle counter" + return 0; +#endif +} + +COMMON_SYSDEP void __cilkrts_short_pause(void) +{ +#if __ICC >= 1110 +# if __MIC__ || __MIC2__ + _mm_delay_32(16); // stall for 16 cycles +# else + _mm_pause(); +# endif +#elif defined __i386__ || defined __x86_64 + __asm__("pause"); +#else +# warning __cilkrts_short_pause empty +#endif +} + +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x) +{ +#if defined __i386__ || defined __x86_64 + /* asm statement here works around icc bugs */ + __asm__("xchgl %0,%a1" :"=r" (x) : "r" (ptr), "0" (x) :"memory"); +#else + x = __sync_lock_test_and_set(ptr, x); +#endif + return x; +} + + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. + * + * Only valid for IA32 and Intel64 processors. + */ +void restore_x86_fp_state (__cilkrts_stack_frame *sf) { +#ifdef RESTORE_X86_FP_STATE + __asm__ ( "ldmxcsr %0\n\t" + "fnclex\n\t" + "fldcw %1" + : + : "m" (sf->mxcsr), "m" (sf->fpcsr)); +#endif +} + + +void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ +// If we're not going to restore, don't bother saving it +#ifdef RESTORE_X86_FP_STATE + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + __asm__ ("stmxcsr %0" : "=m" (sf->mxcsr)); + __asm__ ("fnstsw %0" : "=m" (sf->fpcsr)); + } +#endif +} + diff --git a/libcilkrts/runtime/doxygen-layout.xml b/libcilkrts/runtime/doxygen-layout.xml new file mode 100644 index 00000000000..fabe0ab3cd8 --- /dev/null +++ b/libcilkrts/runtime/doxygen-layout.xml @@ -0,0 +1,222 @@ +<doxygenlayout version="1.0"> + +<!-- +# @copyright +# Copyright (C) 2011-2013, Intel Corporation +# All rights reserved. +# +# @copyright +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# @copyright +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +--> + + <!-- Navigation index tabs for HTML output --> + <navindex> + <tab type="mainpage" visible="yes" title=""/> + <tab type="pages" visible="yes" title="" intro=""/> + <tab type="modules" visible="yes" title="" intro=""/> + <tab type="namespaces" visible="yes" title=""> + <tab type="namespaces" visible="yes" title="" intro=""/> + <tab type="namespacemembers" visible="yes" title="" intro=""/> + </tab> + <tab type="classes" visible="yes" title="Classes, Structs and Unions"> + <tab type="classes" visible="yes" title="Classes, Structs and Unions" intro=""/> + <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/> + <tab type="hierarchy" visible="yes" title="" intro=""/> + <tab type="classmembers" visible="yes" title="" intro=""/> + </tab> + <tab type="files" visible="yes" title=""> + <tab type="files" visible="yes" title="" intro=""/> + <tab type="globals" visible="yes" title="" intro=""/> + </tab> + <tab type="globals" visible="yes" title="Global Functions" intro=""/> + <tab type="dirs" visible="yes" title="" intro=""/> + <tab type="examples" visible="yes" title="" intro=""/> + </navindex> + + <!-- Layout definition for a class page --> + <class> + <briefdescription visible="yes"/> + <includes visible="$SHOW_INCLUDE_FILES"/> + <inheritancegraph visible="$CLASS_GRAPH"/> + <collaborationgraph visible="$COLLABORATION_GRAPH"/> + <allmemberslink visible="yes"/> + <memberdecl> + <nestedclasses visible="yes" title=""/> + <publictypes title=""/> + <publicslots title=""/> + <signals title=""/> + <publicmethods title=""/> + <publicstaticmethods title=""/> + <publicattributes title=""/> + <publicstaticattributes title=""/> + <protectedtypes title=""/> + <protectedslots title=""/> + <protectedmethods title=""/> + <protectedstaticmethods title=""/> + <protectedattributes title=""/> + <protectedstaticattributes title=""/> + <packagetypes title=""/> + <packagemethods title=""/> + <packagestaticmethods title=""/> + <packageattributes title=""/> + <packagestaticattributes title=""/> + <properties title=""/> + <events title=""/> + <privatetypes title=""/> + <privateslots title=""/> + <privatemethods title=""/> + <privatestaticmethods title=""/> + <privateattributes title=""/> + <privatestaticattributes title=""/> + <friends title=""/> + <related title="" subtitle=""/> + <membergroups visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + <memberdef> + <typedefs title=""/> + <enums title=""/> + <constructors title=""/> + <functions title=""/> + <related title=""/> + <variables title=""/> + <properties title=""/> + <events title=""/> + </memberdef> + <usedfiles visible="$SHOW_USED_FILES"/> + <authorsection visible="yes"/> + </class> + + <!-- Layout definition for a namespace page --> + <namespace> + <briefdescription visible="yes"/> + <memberdecl> + <nestednamespaces visible="yes" title=""/> + <classes visible="yes" title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + <membergroups visible="yes"/> + 
</memberdecl> + <detaileddescription title=""/> + <memberdef> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + </memberdef> + <authorsection visible="yes"/> + </namespace> + + <!-- Layout definition for a file page --> + <file> + <briefdescription visible="no"/> + <includegraph visible="$INCLUDE_GRAPH"/> + <includedbygraph visible="$INCLUDED_BY_GRAPH"/> + <detaileddescription title="Description"/> + <includes visible="no"/> + <sourcelink visible="yes"/> + <memberdecl> + <classes visible="yes" title="Structures and Classes"/> + <namespaces visible="yes" title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + <membergroups visible="yes"/> + </memberdecl> + <memberdef> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + </memberdef> + <authorsection/> + </file> + + <!-- Layout definition for a group page --> + <group> + <briefdescription visible="yes"/> + <groupgraph visible="$GROUP_GRAPHS"/> + <memberdecl> + <classes visible="yes" title=""/> + <namespaces visible="yes" title=""/> + <dirs visible="yes" title=""/> + <nestedgroups visible="yes" title=""/> + <files visible="yes" title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <enumvalues title=""/> + <functions title=""/> + <variables title=""/> + <signals title=""/> + <publicslots title=""/> + <protectedslots title=""/> + <privateslots title=""/> + <events title=""/> + <properties title=""/> + <friends title=""/> + <membergroups visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + <memberdef> + <pagedocs/> + <inlineclasses title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <enumvalues title=""/> + <functions title=""/> + <variables title=""/> + <signals title=""/> + <publicslots title=""/> + <protectedslots title=""/> + <privateslots title=""/> + <events title=""/> + <properties title=""/> + <friends title=""/> + </memberdef> + <authorsection visible="yes"/> + </group> + + <!-- Layout definition for a directory page --> + <directory> + <briefdescription visible="yes"/> + <directorygraph visible="yes"/> + <memberdecl> + <dirs visible="yes"/> + <files visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + </directory> +</doxygenlayout> diff --git a/libcilkrts/runtime/doxygen.cfg b/libcilkrts/runtime/doxygen.cfg new file mode 100644 index 00000000000..684dcb51b51 --- /dev/null +++ b/libcilkrts/runtime/doxygen.cfg @@ -0,0 +1,1774 @@ +# Doxyfile 1.7.4
+
+# @copyright +# Copyright (C) 2011-2013, Intel Corporation +# All rights reserved. +# +# @copyright +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# @copyright +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "Intel Cilk Plus Runtime"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
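+# (Illustrative only - no aliases are defined for this project. One could,
+# for example, write: ALIASES = "sideeffect=\par Side Effects:\n")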
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance, to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = YES
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE = doxygen-layout.xml
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./ \
+ ../include/internal/abi.h \
+ ../include/cilk/cilk_api.h \
+ ../include/cilk/common.h \
+ ./readme.dox
+
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = attributes.h \
+ cilk-ittnotify.h \
+ component.h \
+ rts-common.h \
+ windows-clean.h
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS = _UNWIND_INFO \
+ _UNWIND_CODE \
+ _DISPATCHER_CONTEXT \
+ __cilkrts_stack \
+ pending_exception_info
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
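+
+# A commented-out sketch of the pattern=filter syntax described above. It
+# reuses the hypothetical my_cpp_filter name from the comment; no such
+# filter program is provided with this runtime.
+#
+# FILTER_PATTERNS = *.cpp=my_cpp_filter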
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = com.Intel.CilkPlusRuntime
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = com.Intel.CilkPlusRuntime
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = "Intel Corporation"
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
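+
+# If a local copy of MathJax were installed next to the HTML output, the
+# relative form mentioned above would be used instead. The commented-out
+# line below is only a sketch of that setup, not a path present in this tree.
+#
+# MATHJAX_RELPATH = ../mathjax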
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED = _WIN32 \
+ COMMON_SYSDEP= \
+ COMMON_PORTABLE= \
+ NON_COMMON= \
+ __CILKRTS_BEGIN_EXTERN_C= \
+ __CILKRTS_END_EXTERN_C= \
+ CILK_API(t)=t \
+ CILK_ABI(t)=t \
+ CILK_ABI_THROWS(t)=t \
+ CALLBACK= \
+ __CILKRTS_INLINE=inline \
+ __CILKRTS_ABI_VERSION=1 \
+ __cplusplus \
+
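+
+# As a sketch of how the expansions above affect parsing: with CILK_API(t)=t
+# predefined, a declaration written in the public headers along the lines of
+#
+#     CILK_API(int) __cilkrts_get_nworkers(void);
+#
+# is seen by doxygen's preprocessor as the plain prototype
+#
+#     int __cilkrts_get_nworkers(void);
+#
+# so the entry is documented with its real return type. The exact declaration
+# shown here is assumed for illustration rather than quoted from the headers.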
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
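+
+# A commented-out sketch of the file=location form described above; both the
+# tag file name and the URL are hypothetical and do not exist in this tree.
+#
+# TAGFILES = cilkrts.tag=http://example.com/cilkplus-docs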
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that doxygen if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/libcilkrts/runtime/except-gcc.cpp b/libcilkrts/runtime/except-gcc.cpp new file mode 100644 index 00000000000..bd08d1826b3 --- /dev/null +++ b/libcilkrts/runtime/except-gcc.cpp @@ -0,0 +1,597 @@ +/* except-gcc.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "except-gcc.h" +#include "except.h" +#include "sysdep.h" +#include "bug.h" +#include "local_state.h" +#include "full_frame.h" +#include "scheduler.h" +#include "frame_malloc.h" +#include "pedigrees.h" + +#include <stdint.h> +#include <typeinfo> + +#define DEBUG_EXCEPTIONS 0 + +struct pending_exception_info +{ + void make(__cxa_eh_globals *, _Unwind_Exception *, bool); + void destruct(); + bool empty() const; + void check() const; + /* Active exception at time of suspend. */ + _Unwind_Exception *active; + /* If true the most recently caught exception is to be rethrown + on resume. This handling is technically incorrect but allows + running without compiler support; the proper standards-compliant + method is to save the exception in the previous field. */ + bool rethrow; + struct __cxa_eh_globals runtime_state; +}; + +void pending_exception_info::check() const +{ + if (active) + CILK_ASSERT((int)runtime_state.uncaughtExceptions > 0); +} + +void pending_exception_info::make(__cxa_eh_globals *state_in, + _Unwind_Exception *exc_in, bool rethrow_in) +{ + active = exc_in; + rethrow = rethrow_in; + runtime_state = *state_in; + /* Read and clear C++ runtime state. 
*/ + state_in->caughtExceptions = 0; + state_in->uncaughtExceptions = 0; +#if CILK_LIB_DEBUG + check(); +#endif +} + +bool +pending_exception_info::empty() const +{ + return !active && !rethrow && !runtime_state.caughtExceptions && + !runtime_state.uncaughtExceptions; +} + +#if DEBUG_EXCEPTIONS +#include <stdio.h> +static void +decode_exceptions(char *out, size_t len, struct pending_exception_info *info) +{ + if (info->empty()) + snprintf(out, len, "[empty]"); + else if (info->rethrow) + snprintf(out, len, "[rethrow %p]", + info->runtime_state.caughtExceptions); + else + snprintf(out, len, "[throw %p]", (void *)info->active); +} +#endif + +static void +save_exception_info(__cilkrts_worker *w, + __cxa_eh_globals *state, + _Unwind_Exception *exc, + bool rethrow, + const char *why) +{ + struct pending_exception_info *info = + (struct pending_exception_info *)__cilkrts_frame_malloc(w, sizeof (struct pending_exception_info)); + CILK_ASSERT(info); + info->make(state, exc, rethrow); + +#if DEBUG_EXCEPTIONS + { + char buf[40]; + decode_exceptions(buf, sizeof buf, info); + fprintf(stderr, "make exception info W%u %p %s (%s)\n", + w->self, info, buf, why); + } +#endif + + CILK_ASSERT(w->l->pending_exception == 0); + w->l->pending_exception = info; +} + +#if DEBUG_EXCEPTIONS +#include <stdio.h> /* DEBUG */ + +static void decode_flags(int flags, char out[9]) +{ + out[0] = (flags & CILK_FRAME_STOLEN) ? 'S' : '_'; + out[1] = (flags & CILK_FRAME_UNSYNCHED) ? 'U' : '_'; + out[2] = (flags & CILK_FRAME_DETACHED) ? 'D' : '_'; + out[3] = (flags & CILK_FRAME_EXCEPTING) ? 'X' : '_'; + out[4] = '\0'; +} +#endif + +/* __cilkrts_save_except is called from the runtime epilogue + when a function is returning with an exception pending. + + If the function has a parent to which it could return normally, + return and have the caller call _Unwind_Resume, the same as if + an exception filter had not matched. + + Otherwise save the exception in the worker. + + If this is a return from a ordinary call that must go through + the runtime, the assembly epilogue must have saved the call-saved + register state in the parent frame. */ + +extern "C" +CILK_ABI_THROWS_VOID +__cilkrts_return_exception(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + _Unwind_Exception *exc = (_Unwind_Exception *)sf->except_data; + + CILK_ASSERT(sf->flags & CILK_FRAME_DETACHED); + sf->flags &= ~CILK_FRAME_DETACHED; + + /* + * If we are in replay mode, and a steal occurred during the recording + * phase, stall till a steal actually occurs. + */ + replay_wait_for_steal_if_parent_was_stolen(w); + + /* If this is to be an abnormal return, save the active exception. */ + if (!__cilkrts_pop_tail(w)) { + /* Write a record to the replay log for an attempt to return to a + stolen parent. This must be done before the exception handler + invokes __cilkrts_leave_frame which will bump the pedigree so + the replay_wait_for_steal_if_parent_was_stolen() above will match on + replay */ + replay_record_orphaned(w); + + /* Now that the record/replay stuff is done, update the pedigree */ + update_pedigree_on_leave_frame(w, sf); + + /* Inline pop_frame; this may not be needed. 
*/ + w->current_stack_frame = sf->call_parent; + sf->call_parent = 0; + __cxa_eh_globals *state = __cxa_get_globals(); + +#if DEBUG_EXCEPTIONS + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + fprintf(stderr, "__cilkrts_save_except W%u sf %p/%s exc %p [%u %p] suspend\n", + w->self, sf, decoded, exc, + state->uncaughtExceptions, + state->caughtExceptions); +#endif + + /* Like __cilkrts_save_exception_state except for setting the + rethrow flag. */ + save_exception_info(w, state, exc, exc == NULL, "save_except"); + { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } + __cilkrts_exception_from_spawn(w, sf); /* does not return */ + } + /* This code path is taken when the parent is attached. It is on + the same stack and part of the same full frame. The caller is + cleaning up the Cilk frame during unwind and will reraise the + exception */ + + /* Now that the record/replay stuff is done, update the pedigree */ + update_pedigree_on_leave_frame(w, sf); + +#if DEBUG_EXCEPTIONS /* DEBUG ONLY */ + { + __cxa_eh_globals *state = __cxa_get_globals(); + + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + fprintf(stderr, "__cilkrts_save_except W%d %p/%s %p->%p [%u %p] escape\n", + w->self, sf, decoded, exc, + exc ? to_cxx(exc)->nextException : 0, + state->uncaughtExceptions, + state->caughtExceptions); + + /* XXX This is triggering in the user thread which gets an exception + from somewhere but does not get the corresponding runtime exception + state. + XXX There might be two or more uncaught exceptions. Test could be + (uncaught != 0) == (exc != 0). First, design tests to see if that + case is otherwise handled correctly. And what if there's an uncaught + exception that does not belong to this function? I.e. this is a return + from spawn in a destructor. */ + if (exc) + CILK_ASSERT((int)state->uncaughtExceptions > 0); + /*CILK_ASSERT(state->uncaughtExceptions == (exc != 0));*/ + } +#endif + + /* The parent is attached so this exception can be propagated normally. */ + return; +} + +/* Save the exception state into the full frame, which is exiting + or suspending. */ +extern "C" +void __cilkrts_save_exception_state(__cilkrts_worker *w, full_frame *ff) +{ + save_exception_info(w, __cxa_get_globals(), 0, false, "undo-detach"); + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +} + +/* __cilkrts_c_sync_except is like __cilkrts_c_sync except that it + saves exception state. __cilkrts_c_sync never returns here and + always reinstalls the saved exception state. + + This function must be used because a parent of this function may + be propagating an uncaught exception. The uncaught exception + count must be saved by the child and passed back to the parent. 
*/ + +extern "C" +NORETURN __cilkrts_c_sync_except (__cilkrts_worker *w, __cilkrts_stack_frame *sf) +{ + __cxa_eh_globals *state = __cxa_get_globals(); + _Unwind_Exception *exc = (_Unwind_Exception *)sf->except_data; + + CILK_ASSERT((sf->flags & (CILK_FRAME_UNSYNCHED|CILK_FRAME_EXCEPTING)) == + (CILK_FRAME_UNSYNCHED|CILK_FRAME_EXCEPTING)); + sf->flags &= ~CILK_FRAME_EXCEPTING; + +#if DEBUG_EXCEPTIONS + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + if (exc) + fprintf(stderr, "__cilkrts_sync_except W%u %p/%s %p->%p [%u %p]\n", + w->self, sf, decoded, exc, + to_cxx(exc)->nextException, + state->uncaughtExceptions, + state->caughtExceptions); + else + fprintf(stderr, "__cilkrts_sync_except W%d %p/%s none [%u %p]\n", + w->self, sf, decoded, + state->uncaughtExceptions, + state->caughtExceptions); +#endif + + /* Here the identity of an rethrown exception is always known. + If exc is NULL this call is only to preserve parent state. */ + save_exception_info(w, state, exc, false, "sync_except"); +#if 0 + { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } +#endif + CILK_ASSERT(!std::uncaught_exception()); + __cilkrts_c_sync(w, sf); +} + +void +pending_exception_info::destruct() +{ + if (active) { +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy exception info %p %p\n", this, active); +#endif + _Unwind_DeleteException(active); + active = 0; + } else { +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy exception info %p\n", this); +#endif + } + while (runtime_state.caughtExceptions) { + __cxa_exception *exc = runtime_state.caughtExceptions; + runtime_state.caughtExceptions = exc->nextException; +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy caught exception %p\n", this); +#endif + _Unwind_DeleteException(&exc->unwindHeader); + } +} + +/* + * __cilkrts_merge_pending_exceptions + * + * Merge the right exception record into the left. The left is logically + * earlier. + * + * The active exception of E is + * E->active if it is non-NULL (in which case E->rethrow is false) + * unresolved if E->active is NULL and E->rethrow is true + * nil if E->active is NULL and E->rethrow is false + * + * The merged active exception is left active exception if it is not + * nil, otherwise the right. + * + * On entry the left state is synched and can not have an unresolved + * exception. The merge may result in an unresolved exception. + * + * Due to scoping rules at most one of the caught exception lists is + * non-NULL. 
+ */ + +struct pending_exception_info * +__cilkrts_merge_pending_exceptions ( + __cilkrts_worker *w, + struct pending_exception_info *left, + struct pending_exception_info *right) +{ + /* If we've only got one exception, return it */ + + if (NULL == left) { +#if DEBUG_EXCEPTIONS + if (right) { + char buf[40]; + decode_exceptions(buf, sizeof buf, right); + fprintf(stderr, "__cilkrts merge W%u nil %p -> %p %s\n", + w->self, right, right, buf); + } +#endif + return right; + } + + if (NULL == right) { +#if DEBUG_EXCEPTIONS + if (left) { + char buf[40]; + decode_exceptions(buf, sizeof buf, left); + fprintf(stderr, "__cilkrts merge W%u %p nil -> %p %s\n", + w->self, left, left, buf); + } +#endif + return left; + } + +#if CILK_LIB_DEBUG + /*volatile struct pending_exception_info left_in = *left, right_in = *right;*/ + left->check(); + right->check(); +#endif + +#if DEBUG_EXCEPTIONS + { + char buf1[40], buf2[40]; + decode_exceptions(buf1, sizeof buf1, left); + decode_exceptions(buf2, sizeof buf2, right); + fprintf(stderr, "__cilkrts merge W%u %p %s %p %s\n", + w->self, left, buf1, right, buf2); + } +#endif + + /* It should not be possible for both left and right to + have accumulated catch blocks. + + The left exception record may always have a catch + chain it kept when its parent was stolen. + + If they are siblings, the right sibling should not + have accumulated any net catches. (Catch is lexically + scoped.) + + If the right frame is a parent, it should not have entered + a catch block without syncing first. If it spawned in a + catch block, the child got its catch. */ + __cxa_exception *caught = left->runtime_state.caughtExceptions; + if (caught) + CILK_ASSERT(!right->runtime_state.caughtExceptions); + else { + CILK_ASSERT(!left->rethrow); + left->rethrow = right->rethrow; + left->runtime_state.caughtExceptions = caught = right->runtime_state.caughtExceptions; + right->runtime_state.caughtExceptions = NULL; + } + + /* Merge the uncaught exception and count of uncaught exceptions. */ + const unsigned int right_uncaught = right->runtime_state.uncaughtExceptions; + if (!left->active){ + left->active = right->active; /* could be NULL */ + right->active = 0; + left->runtime_state.uncaughtExceptions += right_uncaught; + if (left->active) + /* assert is C++ exception */ + /*CILK_ASSERT(__cxxabiv1::__is_gxx_exception_class(left->active->exception_class))*/; + } else { + /* Subtract 1 if the right exception is being destructed. */ + left->runtime_state.uncaughtExceptions += right_uncaught - (right->active != 0); + } + + right->destruct(); + __cilkrts_frame_free(w, right, sizeof *right); + + /* If there is no state left, return NULL. */ + if (left->empty()) { + left->destruct(); + __cilkrts_frame_free(w, left, sizeof *left); + left = NULL; + } + +#if CILK_LIB_DEBUG + if (left) + left->check(); +#endif + + return left; +} + +#if 0 +/* __cilkrts_c_resume_except is called from the assembly language + restart code when a resumed frame has a pending exception. + + The handler count negation on rethrow was done when the throw was + resolved. + + The assembly language runtime must make the throw unwind to + the sync, spawn, or other location where the exception should + be injected. (This should not happen after a spawn but nothing + here depends on there being no exception on steal.) + + This function is unused in the Intel stack based system. 
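A worked example of the uncaught-exception accounting at the end of __cilkrts_merge_pending_exceptions, with made-up counts; the subtraction mirrors the right_uncaught - (right->active != 0) term in the code, since the right record's active exception object is destructed along with the record.

    #include <cassert>

    // Sketch only: illustrative counts, not taken from a real run.
    int main()
    {
        // Left strand: one uncaught exception, and it is the left active exception.
        unsigned left_uncaught   = 1;
        bool     left_has_active = true;

        // Right strand: two uncaught exceptions, one of which is its active exception.
        unsigned right_uncaught   = 2;
        bool     right_has_active = true;

        if (!left_has_active) {
            // Left adopts the right active exception and all of its uncaught count.
            left_has_active = right_has_active;
            left_uncaught  += right_uncaught;
        } else {
            // Right's active exception is destructed with the right record,
            // so it no longer counts as uncaught.
            left_uncaught  += right_uncaught - (right_has_active ? 1 : 0);
        }

        assert(left_uncaught == 2);   // 1 (left) + 2 (right) - 1 (right active dropped)
        return 0;
    }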
*/ +extern "C" +void __cilkrts_c_resume_except (_Unwind_Exception *exc) +{ +#if DEBUG_EXCEPTIONS + fprintf(stderr, "resume exception %p\n", exc); +#endif + _Unwind_Reason_Code why = _Unwind_RaiseException(exc); + __cilkrts_bug ("Cilk runtime error: failed to reinstate suspended exception %p (%d)\n", exc, why); +} +#endif + +/* Restore the caught exception chain. This assumes no C++ exception + code will run before the frame is resumed. If there is no exception + to be resumed free the object. */ + +extern "C" +void __cilkrts_setup_for_execution_sysdep(__cilkrts_worker *w, full_frame *ff) +{ + // ASSERT: We own w->lock and ff->lock || P == 1 + + __cxa_eh_globals *state = __cxa_get_globals (); + struct pending_exception_info *info = w->l->pending_exception; + + if (info == NULL) + return; + + w->l->pending_exception = 0; + +#if DEBUG_EXCEPTIONS + _Unwind_Exception *exc = info->active; + if (exc) { + fflush(stdout); + fprintf(stderr, "__cilkrts_resume_except W%u %p->%p [%u %p]\n", + w->self, exc, + to_cxx(exc)->nextException, + info->runtime_state.uncaughtExceptions, + info->runtime_state.caughtExceptions); + /*CILK_ASSERT(info->runtime_state.uncaughtExceptions > 0);*/ + } +#endif + + if (state->uncaughtExceptions || state->caughtExceptions) + __cilkrts_bug("W%u: resuming with non-empty prior exception state %u %p\n", state->uncaughtExceptions, state->caughtExceptions); + + *state = info->runtime_state; + info->runtime_state.caughtExceptions = 0; + info->runtime_state.uncaughtExceptions = 0; + + if (info->rethrow) { + info->rethrow = false; + /* Resuming function will rethrow. Runtime calls + std::terminate if there is no caught exception. */ + ff->call_stack->flags |= CILK_FRAME_EXCEPTING; + } + if (info->active) { + ff->call_stack->flags |= CILK_FRAME_EXCEPTING; + ff->call_stack->except_data = info->active; + info->active = 0; + } + + if (info->empty()) { + info->destruct(); + __cilkrts_frame_free(w, info, sizeof *info); + w->l->pending_exception = NULL; + } + +#if CILK_LIB_DEBUG + if (ff->call_stack->except_data) + CILK_ASSERT(std::uncaught_exception()); +#endif +} + +#if 0 +extern "C" +struct pending_exception_info *__cilkrts_get_exception(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + struct pending_exception_info *info = w->l->pending_exception; + + if (info == NULL) { + sf->flags &= ~CILK_FRAME_EXCEPTING; + return 0; + } + + w->l->pending_exception = NULL; + + /* This exception goes into the frame. 
*/ + + _Unwind_Exception *exc = info->active; + info->active = NULL; + info->destruct(); + __cilkrts_frame_free(w, info, sizeof *info); + info = 0; + sf->flags |= CILK_FRAME_EXCEPTING; + sf->exception = exc; + return 0; +} +#endif + +extern "C" +void __attribute__((nonnull)) __cilkrts_gcc_rethrow(__cilkrts_stack_frame *sf) +{ +#ifdef __CYGWIN__ + // Cygwin doesn't support exceptions, so _Unwind_Resume isn't available + // Which means we can't support exceptions either + __cilkrts_bug("The Cygwin implementation of the Intel Cilk Plus runtime doesn't support exceptions\n"); +#else + if (sf->except_data) { +#if CILK_LIB_DEBUG + CILK_ASSERT(std::uncaught_exception()); +#endif + _Unwind_Resume ((_Unwind_Exception *)sf->except_data); + } else { + throw; + } +#endif // __CYGWIN__ +} + +/* End except-gcc.cpp */ + diff --git a/libcilkrts/runtime/except-gcc.h b/libcilkrts/runtime/except-gcc.h new file mode 100644 index 00000000000..aa76adbc233 --- /dev/null +++ b/libcilkrts/runtime/except-gcc.h @@ -0,0 +1,146 @@ +/* except-gcc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file except-gcc.h + * + * @brief ABI for gcc exception handling. + * + * @par Origin + * The code below is generally copied from the Intel Itanium ABI (Intel + * download 245370). + */ + +#ifndef INCLUDED_EXCEPT_GCC_DOT_H +#define INCLUDED_EXCEPT_GCC_DOT_H + +#ifndef __cplusplus +# error except-gcc.h should be used in C++ code only. 
+#endif + +#include <cilk/common.h> +#include <exception> +#include <typeinfo> + +struct __cxa_exception; + +__CILKRTS_BEGIN_EXTERN_C + +/** Unwind reason code (Itanium ABI 6.1.2.1) */ +typedef enum _Unwind_Reason_Code { + _URC_NO_REASON = 0, + _URC_FOREIGN_EXCEPTION_CAUGHT = 1, + _URC_FATAL_PHASE2_ERROR = 2, + _URC_FATAL_PHASE1_ERROR = 3, + _URC_NORMAL_STOP = 4, + _URC_END_OF_STACK = 5, + _URC_HANDLER_FOUND = 6, + _URC_INSTALL_CONTEXT = 7, + _URC_CONTINUE_UNWIND = 8 +} _Unwind_Reason_Code; + +typedef struct _Unwind_Exception _Unwind_Exception; + +/** Exception cleanup function pointer (Itanium ABI 6.1.2.2) */ +typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code reason, + _Unwind_Exception *exc); + +/** + * @brief Exception undwinding information + * + * This is copied from the Intel Itanium ABI except that the + * private fields are declared unsigned long for binary + * compatibility with gcc/g++ on 32 bit machines. + */ +struct _Unwind_Exception +{ + uint64_t exception_class; + _Unwind_Exception_Cleanup_Fn exception_cleanup; + unsigned long private_1; + unsigned long private_2; +}; + +/** Throw or rethrow an exception */ +_Unwind_Reason_Code +_Unwind_RaiseException(_Unwind_Exception *exception_object); + +/** Resume an exception other than by rethrowing it. */ +void _Unwind_Resume(_Unwind_Exception *exception_object); + +/** Delete an exception object */ +void _Unwind_DeleteException(_Unwind_Exception *exception_object); + +/** + * C++ exception ABI. + * The following declarations are from + * + * http://www.codesourcery.com/public/cxx-abi/abi-eh.html#cxx-abi + */ + +struct __cxa_exception { + std::type_info * exceptionType; + void (*exceptionDestructor)(void *); + std::unexpected_handler unexpectedHandler; + std::terminate_handler terminateHandler; + __cxa_exception * nextException; + + int handlerCount; + int handlerSwitchValue; + const char * actionRecord; + const char * languageSpecificData; + void * catchTemp; + void * adjustedPtr; + + _Unwind_Exception unwindHeader; +}; + +static inline __cxa_exception *to_cxx(_Unwind_Exception *e) +{ + return ((__cxa_exception *)(e+1)) - 1; +} + +typedef struct __cxa_eh_globals { + __cxa_exception *caughtExceptions; + unsigned int uncaughtExceptions; +} __cxa_eh_globals; + +__cxa_eh_globals*__cxa_get_globals(void) throw(); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_EXCEPT_GCC_DOT_H) diff --git a/libcilkrts/runtime/except.h b/libcilkrts/runtime/except.h new file mode 100644 index 00000000000..58e2238c581 --- /dev/null +++ b/libcilkrts/runtime/except.h @@ -0,0 +1,123 @@ +/* except.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
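The to_cxx() conversion above relies on the Itanium C++ ABI placing the _Unwind_Exception header at the end of __cxa_exception, with no tail padding: stepping one header past e and then one whole record back recovers the start of the enclosing __cxa_exception. A self-contained sketch of that layout argument, using stand-in types rather than the real ABI structs:

    // Sketch only: stand-ins with the same "header last" layout as __cxa_exception.
    #include <cstddef>
    #include <cassert>

    struct UnwindHeader  { unsigned long long cls; void (*cleanup)(); unsigned long p1, p2; };
    struct CxxException {
        void *fields[10];             // type info, handlers, chain pointer, etc.
        UnwindHeader unwindHeader;    // the ABI keeps the unwind header last
    };

    static CxxException *to_cxx_sketch(UnwindHeader *e)
    {
        return reinterpret_cast<CxxException *>(e + 1) - 1;
    }

    int main()
    {
        // The conversion is valid only when the header ends the record with no tail padding.
        static_assert(offsetof(CxxException, unwindHeader) + sizeof(UnwindHeader)
                      == sizeof(CxxException), "unwindHeader must end the record");
        CxxException rec = {};
        assert(to_cxx_sketch(&rec.unwindHeader) == &rec);   // round trip recovers the record
        return 0;
    }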
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file except.h + * + * @brief Common definitions for the various implementations of exception + * handling. + */ + +#ifndef INCLUDED_EXCEPT_DOT_H +#define INCLUDED_EXCEPT_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include "full_frame.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * OS-dependent information about an exception that's being moved between + * strands. + */ +typedef struct pending_exception_info pending_exception_info; + +/** + * Merge the right exception record into the left. The left is logically + * earlier. + * + * On entry the left state is synched and can not have an unresolved + * exception. The merge may result in an unresolved exception. + * + * If there is both a right and left exception, the right exception will + * be disposed of in preference to the left exception, destructing the + * exception object. + * + * @param w The worker that is preparing to resume execution. + * @param left_exception The exception that would have happened earlier + * if the code executed serially. Can be NULL if the left strand has not + * raised an exception. + * @param right_exception The exception that would have happened later + * if the code executed serially. Can be NULL if the right strand has not + * raised an exception. + * + * @return NULL if there both the right and left exception are NULL. This + * indicates that there are no pending exceptions. + * @return The pending exception that is to be raised to continue searching + * for a catch block to handle the exception. + */ +COMMON_SYSDEP +struct pending_exception_info *__cilkrts_merge_pending_exceptions( + __cilkrts_worker *w, + pending_exception_info *left_exception, + pending_exception_info *right_exception); + +/** + * Move the exception information from the worker to the full_frame. + * + * @param w The worker which is suspending work on a full_frame. + * @param ff The full_frame which is being suspended. + */ +COMMON_SYSDEP +void __cilkrts_save_exception_state(__cilkrts_worker *w, + full_frame *ff); + +/** + * Function to delete pending exception. This will delete the + * exception object and then free the stack/fiber. + * + * @param w The worker we're running on. + * @param pei The pending exception to be delete + * @param delete_object Unused. Should always be 1. 
+ */ +void delete_exception_obj (__cilkrts_worker *w, + struct pending_exception_info *pei, + int delete_object); + +#ifndef _WIN32 +/* gcc-style exception handling */ +NON_COMMON NORETURN __cilkrts_c_sync_except(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); +NON_COMMON void __attribute__((nonnull)) +__cilkrts_gcc_rethrow(__cilkrts_stack_frame *sf); +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_EXCEPT_DOT_H) diff --git a/libcilkrts/runtime/frame_malloc.c b/libcilkrts/runtime/frame_malloc.c new file mode 100644 index 00000000000..0b38bd209a9 --- /dev/null +++ b/libcilkrts/runtime/frame_malloc.c @@ -0,0 +1,462 @@ +/* frame_malloc.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "frame_malloc.h" +#include "bug.h" +#include "local_state.h" +#include "cilk_malloc.h" + +#ifndef __VXWORKS__ +#include <memory.h> +#endif + +/* #define USE_MMAP 1 */ +#if USE_MMAP +#define __USE_MISC 1 +#include <sys/mman.h> +#include <errno.h> +#endif + +// Define to fill the stack frame header with the fill character when pushing +// it on a free list. Note that this should be #ifdef'd out when checked in! 
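A toy illustration of what the debug-only header fill buys: using a stale header pointer (for example a dangling cdr link) then shows up as an unmistakable 0xbf byte pattern instead of a plausible-looking address. The snippet below is a sketch, not part of the patch.

    // Sketch only; 0xbf matches the fill value chosen above.
    #include <stdio.h>
    #include <string.h>

    struct free_list_sketch { struct free_list_sketch *cdr; };

    int main(void)
    {
        struct free_list_sketch node = { 0 };
        memset(&node, 0xbf, sizeof node);                 // what push() does under _DEBUG
        printf("poisoned cdr = %p\n", (void *)node.cdr);  // 0xbfbfbf... on typical targets
        return 0;
    }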
+ +#ifdef _DEBUG +#define HEADER_FILL_CHAR 0xbf +#endif + +// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning +// message if this is a release build + +#if defined(NDEBUG) && defined (HEADER_FILL_CHAR) +#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build") +#endif + +static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size); + +#ifndef _WIN32 + +const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] = +{ + 64, 128, 256, 512, 1024, 2048 +}; + +#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket] + +/* threshold above which we use slow malloc */ +#define FRAME_MALLOC_MAX_SIZE 2048 + +#else // _WIN32 + +/* Note that this must match the implementation of framesz_to_bucket in + * asmilator/layout.ml! */ +#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket))) + +/* threshold above which we use slow malloc */ +#define FRAME_MALLOC_MAX_SIZE \ + FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1) + +#endif // _WIN32 + +/* utility procedures */ +static void push(struct free_list **b, struct free_list *p) +{ +#ifdef HEADER_FILL_CHAR + memset (p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0)); +#endif + /* cons! onto free list */ + p->cdr = *b; + *b = p; +} + +static struct free_list *pop(struct free_list **b) +{ + struct free_list *p = *b; + if (p) + *b = p->cdr; + return p; +} + +/************************************************************* + global allocator: +*************************************************************/ +/* request slightly less than 2^K from the OS, which after malloc + overhead and alignment should end up filling each VM page almost + completely. 128 is a guess of the total malloc overhead and cache + line alignment */ +#define FRAME_MALLOC_CHUNK (32 * 1024 - 128) + +/** Implements linked list of frames */ +struct pool_cons { + char *p; /**< This element of the list */ + struct pool_cons *cdr; /**< Remainder of the list */ +}; + +static void extend_global_pool(global_state_t *g) +{ + /* FIXME: memalign to a cache line? */ + struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c)); + g->frame_malloc.pool_begin = + (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK); + g->frame_malloc.pool_end = + g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK; + g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK; + c->p = g->frame_malloc.pool_begin; + c->cdr = g->frame_malloc.pool_list; + g->frame_malloc.pool_list = c; +} + +/* the size is already canonicalized at this point */ +static struct free_list *global_alloc(global_state_t *g, int bucket) +{ + struct free_list *mem; + size_t size; + + CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + g->frame_malloc.allocated_from_global_pool += size; + + if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) { + + CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end); + if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) { + /* We waste the fragment of pool. 
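push() and pop() above implement an intrusive LIFO free list: the freed block itself stores the link, so the cache needs no extra memory, and the most recently freed (likely still cache-warm) block is handed out first. A standalone sketch of the same cons-list idea:

    // Sketch only: the same cons-list behavior, outside the runtime.
    #include <assert.h>
    #include <stddef.h>

    struct free_list_sketch { struct free_list_sketch *cdr; };

    static void push_sketch(struct free_list_sketch **b, struct free_list_sketch *p)
    {
        p->cdr = *b;      // cons onto the head of the list
        *b = p;
    }

    static struct free_list_sketch *pop_sketch(struct free_list_sketch **b)
    {
        struct free_list_sketch *p = *b;
        if (p)
            *b = p->cdr;
        return p;
    }

    int main(void)
    {
        struct free_list_sketch a, b, c, *head = NULL;
        push_sketch(&head, &a);
        push_sketch(&head, &b);
        push_sketch(&head, &c);
        assert(pop_sketch(&head) == &c);   // LIFO: last block freed is reused first
        assert(pop_sketch(&head) == &b);
        assert(pop_sketch(&head) == &a);
        assert(pop_sketch(&head) == NULL);
        return 0;
    }

When a worker's list runs dry, allocate_batch() below refills it from the global pool in one locked operation.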
*/ + g->frame_malloc.wasted += + g->frame_malloc.pool_end - g->frame_malloc.pool_begin; + extend_global_pool(g); + } + mem = (struct free_list *)g->frame_malloc.pool_begin; + g->frame_malloc.pool_begin += size; + } + + return mem; +} + +static void global_free(global_state_t *g, void *mem, int bucket) +{ + size_t size; + + CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + g->frame_malloc.allocated_from_global_pool -= size; + + push(&g->frame_malloc.global_free_list[bucket], mem); +} + +void __cilkrts_frame_malloc_global_init(global_state_t *g) +{ + int i; + + __cilkrts_mutex_init(&g->frame_malloc.lock); + g->frame_malloc.check_for_leaks = 1; + g->frame_malloc.pool_list = 0; + g->frame_malloc.pool_begin = 0; + g->frame_malloc.pool_end = 0; + g->frame_malloc.batch_size = 8000; + g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size; + g->frame_malloc.allocated_from_os = 0; + g->frame_malloc.allocated_from_global_pool = 0; + g->frame_malloc.wasted = 0; + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) + g->frame_malloc.global_free_list[i] = 0; +} + +// Counts how many bytes are in the global free list. +static size_t count_memory_in_global_list(global_state_t *g) +{ + + // Count the memory remaining in the global free list. + size_t size_remaining_in_global_list = 0; + int i; + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + struct free_list *p; + size_t size_in_bucket = 0; + p = g->frame_malloc.global_free_list[i]; + + while (p) { + size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i); + p = p->cdr; + } + size_remaining_in_global_list += size_in_bucket; + } + return size_remaining_in_global_list; +} + + +void __cilkrts_frame_malloc_global_cleanup(global_state_t *g) +{ + struct pool_cons *c; + + if (g->frame_malloc.check_for_leaks) { + size_t memory_in_global_list = count_memory_in_global_list(g); + // TBD: This check is weak. Short of memory corruption, + // I don't see how we have more memory in the free list + // than allocated from the os. + // Ideally, we should count the memory in the global free list + // and check that we have it all. But I believe the runtime + // itself also uses some memory, which is not being tracked. + if (memory_in_global_list > g->frame_malloc.allocated_from_os) { + __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n"); + } + } + + while ((c = g->frame_malloc.pool_list)) { + g->frame_malloc.pool_list = c->cdr; + __cilkrts_free(c->p); + __cilkrts_free(c); + } + + __cilkrts_mutex_destroy(0, &g->frame_malloc.lock); + + // Check that all the memory moved from the global pool into + // workers has been returned to the global pool. + if (g->frame_malloc.check_for_leaks + && (g->frame_malloc.allocated_from_global_pool != 0)) + { + __cilkrts_bug("\n" + "---------------------------" "\n" + " MEMORY LEAK DETECTED!!! 
" "\n" + "---------------------------" "\n" + "\n" + ); + } +} + +/************************************************************* + per-worker allocator +*************************************************************/ +/* allocate a batch of frames of size SIZE from the global pool and + store them in the worker's free list */ +static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size) +{ + global_state_t *g = w->g; + + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { +#if USE_MMAP + char *p = mmap(0, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + __cilkrts_bug("mmap failed %d", errno); + assert(size < 4096); + assert(p != MAP_FAILED); + mprotect(p, 4096, PROT_NONE); + mprotect(p + 8192, 4096, PROT_NONE); + w->l->bucket_potential[bucket] += size; + push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size)); +#else + size_t bytes_allocated = 0; + do { + w->l->bucket_potential[bucket] += size; + bytes_allocated += size; + push(&w->l->free_list[bucket], global_alloc(g, bucket)); + } while (bytes_allocated < g->frame_malloc.batch_size); +#endif + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + +} + +static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size) +{ + struct free_list *p, *q; + global_state_t *g = w->g; + size_t pot = w->l->bucket_potential[bucket]; + size_t newpot; + + /* Keep up to POT/2 elements in the free list. The cost of + counting up to POT/2 is amortized against POT. */ + newpot = 0; + for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot; + p = p->cdr, newpot += size) + ; + w->l->bucket_potential[bucket] = newpot; + + if (p) { + /* free the rest of the list. The cost of grabbing the lock + is amortized against POT/2; the cost of traversing the rest + of the list is amortized against the free operation that + puts the element on the list. */ + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { + while ((q = pop(&p->cdr))) +#if USE_MMAP + munmap((char *)q + size - 8192, 12288); +#else + global_free(g, q, bucket); +#endif + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + } +} + +// Free all the memory in this bucket for the specified worker, +// returning it to the global pool's free list. +static void move_bucket_to_global_free_list(__cilkrts_worker *w, + int bucket) +{ + struct free_list *p, *q; + global_state_t *g = w->g; + p = w->l->free_list[bucket]; + + if (p) { + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { + while ((q = pop(&p))) { +#if USE_MMAP + size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + munmap((char *)q + size - 8192, 12288); +#else + global_free(g, q, bucket); +#endif + } + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + } + + // I'm not sure this does anything useful now, since + // the worker is about to be destroyed. But why not? 
+ w->l->bucket_potential[bucket] = 0; +} + +static int bucket_of_size(size_t size) +{ + int i; + + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) + if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i)) + return i; + + CILK_ASSERT(0 /* can't happen */); + return -1; +} + +size_t __cilkrts_frame_malloc_roundup(size_t size) +{ + if (size > FRAME_MALLOC_MAX_SIZE) { + /* nothing, leave it alone */ + } else { + int bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + } + return size; +} + +size_t __cilkrts_size_of_bucket(int bucket) +{ + CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS); + return FRAME_MALLOC_BUCKET_TO_SIZE(bucket); +} + +void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size) +{ + int bucket; + void *mem; + + /* if too large, or if no worker, fall back to __cilkrts_malloc() */ + if (!w || size > FRAME_MALLOC_MAX_SIZE) { + NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE); + return __cilkrts_malloc(size); + } + + START_INTERVAL(w, INTERVAL_FRAME_ALLOC); { + bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + + while (!(mem = pop(&w->l->free_list[bucket]))) { + /* get a batch of frames from the global pool */ + START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) { + allocate_batch(w, bucket, size); + } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL); + } + } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC); + + return mem; +} + +void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size) +{ + int bucket; + struct free_list *p = (struct free_list *)p0; + + /* if too large, or if no worker, fall back to __cilkrts_free() */ + if (!w || size > FRAME_MALLOC_MAX_SIZE) { + NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE); + __cilkrts_free(p); + return; + } + +#if CILK_LIB_DEBUG + *(volatile long *)w; +#endif + + START_INTERVAL(w, INTERVAL_FRAME_FREE); { + bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + w->l->bucket_potential[bucket] += size; + push(&w->l->free_list[bucket], p); + if (w->l->bucket_potential[bucket] > + w->g->frame_malloc.potential_limit) { + START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) { + gc_bucket(w, bucket, size); + } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL); + } + } STOP_INTERVAL(w, INTERVAL_FRAME_FREE); +} + +void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w) +{ + int i; + local_state *l = w->l; + + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + l->free_list[i] = 0; + l->bucket_potential[i] = 0; + } +} + +void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w) +{ + int i; + // Move memory to the global pool. This operation + // ensures the memory does not become unreachable / leak + // when the worker is destroyed. + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + move_bucket_to_global_free_list(w, i); + } +} + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/frame_malloc.h b/libcilkrts/runtime/frame_malloc.h new file mode 100644 index 00000000000..d412fb620fe --- /dev/null +++ b/libcilkrts/runtime/frame_malloc.h @@ -0,0 +1,205 @@ +/* frame_malloc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
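A few concrete values for the size-class mapping implemented by bucket_of_size() and __cilkrts_frame_malloc_roundup() above, assuming the non-Windows bucket table of 64/128/256/512/1024/2048 bytes; requests above 2048 bytes bypass the buckets and fall back to the heap.

    // Sketch only: mirrors the non-Windows bucket table declared above.
    #include <assert.h>
    #include <stddef.h>

    static const unsigned short bucket_sizes[6] = { 64, 128, 256, 512, 1024, 2048 };

    static size_t roundup_sketch(size_t size)
    {
        for (int i = 0; i < 6; i++)
            if (size <= bucket_sizes[i])
                return bucket_sizes[i];   // smallest bucket that holds the request
        return size;                      // above 2048: left unchanged, heap malloc is used
    }

    int main(void)
    {
        assert(roundup_sketch(1)    == 64);
        assert(roundup_sketch(100)  == 128);
        assert(roundup_sketch(512)  == 512);
        assert(roundup_sketch(2049) == 2049);  // too large for any bucket
        return 0;
    }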
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file frame_malloc.h + * + * @brief The frame allocation routines manage memory in a per-worker pool. + * + * The name "frame malloc" refers to an earlier implementation of Cilk which + * allocated frames from the heap using this allocator. + */ + +#ifndef INCLUDED_FRAME_MALLOC_DOT_H +#define INCLUDED_FRAME_MALLOC_DOT_H + +#include "worker_mutex.h" +#include "rts-common.h" +#include <internal/abi.h> // __cilkrts_worker + +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Number of buckets. Gives us buckets to hold 64, 128, 256, 512, 1024 + * and 2048 bytes + */ +#define FRAME_MALLOC_NBUCKETS 6 + +/** Layout of frames when unallocated */ +struct free_list { + /** Pointer to next free frame */ + struct free_list *cdr; +}; + +/** per-worker memory cache */ +struct __cilkrts_frame_cache +{ + /** Mutex to serialize access */ + struct mutex lock; + + /** Linked list of frames */ + struct pool_cons *pool_list; + + /** Low bound of memory in pool */ + char *pool_begin; + + /** High bound of memory in pool */ + char *pool_end; + + /** Global free-list buckets */ + struct free_list *global_free_list[FRAME_MALLOC_NBUCKETS]; + + /** + * How many bytes to obtain at once from the global pool + * (approximately) + */ + size_t batch_size; + + /** Garbage-collect a bucket when its potential exceeds the limit */ + size_t potential_limit; + + /** If TRUE, check for memory leaks at the end of execution */ + int check_for_leaks; + + /** Bytes of memory allocated from the OS by the global cache */ + size_t allocated_from_os; + + /** Tracks memory allocated by a chunk that isn't a full bucket size */ + size_t wasted; + + /** Bytes of memory allocated from the global cache */ + size_t allocated_from_global_pool; +}; + +/** + * Allocate memory from the per-worker pool. 
If the size is too large, or + * if we're given a NULL worker, the memory is allocated using + * __cilkrts_malloc(). + * + * @param w The worker to allocate the memory from. + * @param size The number of bytes to allocate. + * + * @return pointer to allocated memory block. + */ +COMMON_PORTABLE +void *__cilkrts_frame_malloc(__cilkrts_worker *w, + size_t size) cilk_nothrow; + +/** + * Return memory to the per-worker pool. If the size is too large, or + * if we're given a NULL worker, the memory is freed using + * __cilkrts_free(). + * + * @param w The worker to return the memory to. + * @param p The memory block to be released. + * @param size The size of the block, in bytes. + */ +COMMON_PORTABLE +void __cilkrts_frame_free(__cilkrts_worker *w, + void* p, + size_t size) cilk_nothrow; + +/** + * Destroy the global cache stored in the global state, freeing all memory + * to the global heap. Checks whether any memory has been allocated but + * not freed. + * + * @param g The global state. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_global_cleanup(global_state_t *g); + +/** + * Initialize a worker's memory cache. Initially it is empty. + * + * @param w The worker whose memory cache is to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w); + +/** + * If check_for_leaks is set in the global state's memory cache, free any + * memory in the worker's memory cache. + * + * If check_for_leaks is not set, nothing happens. + * + * @param w The worker whose memory cache is to be cleaned up. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w); + +/** + * Round a number of bytes to the size of the smallest bucket that will + * hold it. If the size is bigger than the largest bucket, the value is + * unchanged. + * + * @param size Number of bytes to be rounded up to the nearest bucket size. + * + * @return The size of the smallest bucket that will hold the specified bytes. + */ +COMMON_PORTABLE +size_t __cilkrts_frame_malloc_roundup(size_t size) cilk_nothrow; + +/** + * Return the number of bytes that can fit into a bucket. + * + * Preconditions: + * - The index must be in the range 0 to FRAME_MALLOC_NBUCKETS - 1 + * + * @param bucket Index of the bucket to be sized. + */ +COMMON_PORTABLE +size_t __cilkrts_size_of_bucket(int bucket) cilk_nothrow; + +/** + * Initialize the global memory cache. + * + * @param g The global state. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_global_init(global_state_t *g); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_FRAME_MALLOC_DOT_H) diff --git a/libcilkrts/runtime/full_frame.c b/libcilkrts/runtime/full_frame.c new file mode 100644 index 00000000000..9ccfd110d6b --- /dev/null +++ b/libcilkrts/runtime/full_frame.c @@ -0,0 +1,181 @@ +/* full_frame.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
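A hedged sketch of how callers are expected to pair these entry points, based on the contracts documented above: the size passed to __cilkrts_frame_free must match the allocation request so the block returns to the same bucket, and a NULL worker silently degrades to the ordinary heap. The record type here is hypothetical, and the snippet compiles only inside the runtime tree where frame_malloc.h and the worker type are visible.

    // Sketch only: 'w' may be NULL, in which case the heap path is used.
    struct fake_record { int a; double b; };   // hypothetical payload

    static struct fake_record *alloc_record(__cilkrts_worker *w)
    {
        return (struct fake_record *)
            __cilkrts_frame_malloc(w, sizeof(struct fake_record));
    }

    static void free_record(__cilkrts_worker *w, struct fake_record *r)
    {
        // The size must match the allocation request so the block lands
        // back in the same bucket (or takes the heap path, if it was large).
        __cilkrts_frame_free(w, r, sizeof(struct fake_record));
    }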
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "full_frame.h" +#include "stats.h" +#include "os.h" +#include "bug.h" +#include "jmpbuf.h" +#include "frame_malloc.h" + +COMMON_PORTABLE +full_frame *__cilkrts_make_full_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + full_frame *ff; + + START_INTERVAL(w, INTERVAL_ALLOC_FULL_FRAME) { + ff = (full_frame *)__cilkrts_frame_malloc(w, sizeof(*ff)); + __cilkrts_mutex_init(&ff->lock); + + ff->full_frame_magic_0 = FULL_FRAME_MAGIC_0; + ff->join_counter = 0; + ff->parent = 0; + ff->rightmost_child = 0; + ff->left_sibling = ff->right_sibling = 0; + ff->call_stack = sf; + ff->is_call_child = 0; + ff->simulated_stolen = 0; + ff->children_reducer_map = ff->right_reducer_map = 0; + ff->pending_exception = + ff->child_pending_exception = + ff->right_pending_exception = NULL; + + ff->sync_sp = 0; +#ifdef _WIN32 + ff->exception_sp = 0; + ff->trylevel = (unsigned long)-1; + ff->registration = 0; +#endif + ff->frame_size = 0; + ff->fiber_self = 0; + ff->fiber_child = 0; + + ff->sync_master = 0; + + /*__cilkrts_init_full_frame_sysdep(w, ff);*/ + ff->full_frame_magic_1 = FULL_FRAME_MAGIC_1; + } STOP_INTERVAL(w, INTERVAL_ALLOC_FULL_FRAME); + return ff; +} + +COMMON_PORTABLE void __cilkrts_put_stack(full_frame *ff, + __cilkrts_stack_frame *sf) +{ + /* When suspending frame ff prior to stealing it, __cilkrts_put_stack is + * used to store the stack pointer for eventual sync. When suspending + * frame ff prior to a sync, __cilkrts_put_stack is called to re-establish + * the sync stack pointer, offsetting it by any change in the stack depth + * that occured between the spawn and the sync. + * Although it is not usually meaningful to add two pointers, the value of + * ff->sync_sp at the time of this call is really an integer, not a + * pointer. + */ + ptrdiff_t sync_sp_i = (ptrdiff_t) ff->sync_sp; + char* sp = (char*) __cilkrts_get_sp(sf); + + ff->sync_sp = sp + sync_sp_i; + + DBGPRINTF("%d- __cilkrts_put_stack - adjust (+) sync " + "stack of full frame %p (+sp: %p) to %p\n", + __cilkrts_get_tls_worker()->self, ff, sp, ff->sync_sp); +} + +COMMON_PORTABLE void __cilkrts_take_stack(full_frame *ff, void *sp) +{ + /* When resuming the parent after a steal, __cilkrts_take_stack is used to + * subtract the new stack pointer from the current stack pointer, storing + * the offset in ff->sync_sp. 
When resuming after a sync, + * __cilkrts_take_stack is used to subtract the new stack pointer from + * itself, leaving ff->sync_sp at zero (null). Although the pointers being + * subtracted are not part of the same contiguous chunk of memory, the + * flat memory model allows us to subtract them and get a useable offset. + */ + ptrdiff_t sync_sp_i = ff->sync_sp - (char*) sp; + + ff->sync_sp = (char *) sync_sp_i; + + DBGPRINTF("%d- __cilkrts_take_stack - adjust (-) sync " + "stack of full frame %p to %p (-sp: %p)\n", + __cilkrts_get_tls_worker()->self, ff, ff->sync_sp, sp); +} + +COMMON_PORTABLE void __cilkrts_adjust_stack(full_frame *ff, size_t size) +{ + /* When resuming the parent after a steal, __cilkrts_take_stack is used to + * subtract the new stack pointer from the current stack pointer, storing + * the offset in ff->sync_sp. When resuming after a sync, + * __cilkrts_take_stack is used to subtract the new stack pointer from + * itself, leaving ff->sync_sp at zero (null). Although the pointers being + * subtracted are not part of the same contiguous chunk of memory, the + * flat memory model allows us to subtract them and get a useable offset. + * + * __cilkrts_adjust_stack() is used to deallocate a Variable Length Array + * by adding it's size to ff->sync_sp. + */ + ff->sync_sp = ff->sync_sp + size; + + DBGPRINTF("%d- __cilkrts_adjust_stack - adjust (+) sync " + "stack of full frame %p to %p (+ size: 0x%x)\n", + __cilkrts_get_tls_worker()->self, ff, ff->sync_sp, size); +} + +COMMON_PORTABLE +void __cilkrts_destroy_full_frame(__cilkrts_worker *w, full_frame *ff) +{ + validate_full_frame(ff); + CILK_ASSERT(ff->children_reducer_map == 0); + CILK_ASSERT(ff->right_reducer_map == 0); + CILK_ASSERT(NULL == ff->pending_exception); + CILK_ASSERT(NULL == ff->child_pending_exception); + CILK_ASSERT(NULL == ff->right_pending_exception); + __cilkrts_mutex_destroy(w, &ff->lock); + __cilkrts_frame_free(w, ff, sizeof(*ff)); +} + +COMMON_PORTABLE void validate_full_frame(full_frame *ff) +{ + /* check the magic numbers, for debugging purposes */ + if (ff->full_frame_magic_0 != FULL_FRAME_MAGIC_0 || + ff->full_frame_magic_1 != FULL_FRAME_MAGIC_1) + abort_because_rts_is_corrupted(); +} + +void __cilkrts_frame_lock(__cilkrts_worker *w, full_frame *ff) +{ + validate_full_frame(ff); + __cilkrts_mutex_lock(w, &ff->lock); +} + +void __cilkrts_frame_unlock(__cilkrts_worker *w, full_frame *ff) +{ + __cilkrts_mutex_unlock(w, &ff->lock); +} + +/* End full_frame.c */ diff --git a/libcilkrts/runtime/full_frame.h b/libcilkrts/runtime/full_frame.h new file mode 100644 index 00000000000..327a3337afe --- /dev/null +++ b/libcilkrts/runtime/full_frame.h @@ -0,0 +1,493 @@ +/* full_frame.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
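A worked numeric trace of the sync_sp bookkeeping implemented by __cilkrts_put_stack() and __cilkrts_take_stack() above, using made-up addresses on a downward-growing stack; sync_sp alternates between holding a real pointer (after a put) and a plain integer offset (after a take).

    // Sketch only: the same additions and subtractions, on illustrative values.
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        intptr_t sync_sp = 0;            // ff->sync_sp, initially null

        // 1. put_stack when the frame is stolen: sp = 0x9000 on the original stack.
        sync_sp = sync_sp + 0x9000;      // sync_sp now holds the pointer 0x9000

        // 2. take_stack when the continuation resumes on a new stack at sp = 0x5000.
        sync_sp = sync_sp - 0x5000;      // now an integer offset: 0x4000

        // 3. put_stack when the frame suspends at the sync, sp = 0x4FF0
        //    (16 bytes were pushed on the new stack since step 2).
        sync_sp = sync_sp + 0x4FF0;      // 0x8FF0: the step-1 value minus those 16 bytes
        assert(sync_sp == 0x9000 - 16);

        // 4. take_stack when execution resumes after the sync at exactly that address.
        sync_sp = sync_sp - 0x8FF0;
        assert(sync_sp == 0);            // back to null, as the header comments describe
        return 0;
    }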
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_FULL_FRAME_DOT_H +#define INCLUDED_FULL_FRAME_DOT_H + + +#include "rts-common.h" +#include "worker_mutex.h" + +#include <cilk/common.h> +#include <internal/abi.h> +#include <stddef.h> +#include "cilk_fiber.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** Magic numbers for full_frame, used for debugging */ +typedef unsigned long long ff_magic_t; + +/* COMMON_SYSDEP */ struct pending_exception_info; /* opaque */ + +/************************************************************* + Full frames +*************************************************************/ + +/** + * @file full_frame.h + * @brief A full frame includes additional information such as a join + * counter and parent frame. + * @defgroup FullFrames Full Frames + * A full frame includes additional information such as a join + * counter and parent frame. + * @{ + */ + +/** + * Convenience typedef so we don't have to specify "struct full_frame" + * all over the code. Putting it before the structure definition allows + * us to use the typedef within the structure itself + */ +typedef struct full_frame full_frame; + +/** + * @brief A full frame includes additional information such as a join + * counter and parent frame. + * + * The frame at the top of a worker's stack is promoted into a "full" + * frame, which carries additional information, such as join counter + * and parent frame. Full frames can be suspended at a sync, in which + * case they lie somewhere in memory and do not belong to any + * worker. + * + * Full frames are in contrast to the entries in the worker's deque which + * are only represented by a pointer to their __cilkrts_stack_frame. + * + * At any instant, we say that a full frame ff is either "suspended", + * or "owned" by some worker w. + * + * More precisely, we say that a worker w owns a frame ff under one of + * the following conditions: + * + * 1. Creation: Worker w has just created ff, but not yet linked ff + * into the tree of full frames. This situation can occur when a + * worker is unrolling a call stack to promote a + * __cilkrts_stack_frame to a full_frame. + * 2. Executing frame: We have w->l->frame_ff == ff, i.e,. ff is the + * currently executing frame for w. + * 3. Next frame: We have w->l->next_frame_ff == ff, i.e,. ff is the + * next frame that w is about to execute. + * 4. Resume execution: Worker w has popped ff from + * w->l->next_frame_ff, and is about to resume execution of ff. + * 5. 
Dying leaf: Worker w has finished executing a frame ff + * that is a leaf the tree of full frames, and is in the process + * of unlinking "ff" from the tree. + * + * Otherwise, the frame ff is suspended, and has no owner. + * Note that work-stealing changes the owner of a full frame from the + * victim to the thief. + * + * Using this notion of ownership, we classify the fields of a full + * frame into one of several categories: + * + * 1. Local: + * These fields are accessed only by the owner of the full frame. + * Because a frame can have only one owner at a time, these fields + * can be modified without any (additional) locking or + * synchronization, assuming the correct synchronization for + * changing the ownership of full frame (e.g., on a successful + * steal) is already in place. + * + * 2. Constant (i.e., read-only): + * This field is constant for the lifetime of the full frame. + * No locks are needed to access this field. + * Technically, a field could be read-only and local, but we assume + * it is shared. + * + * 3. Self-locked: + * To access this field in the frame ff, a worker should acquire + * the lock on ff. + * A self-locked field is conceptually "shared" between the worker + * that owns frame ff (which is a child) and the worker that + * owns the frame ff->parent (which is the parent of ff). + * + * 4. Parent-locked: + * To access this field in the frame ff, a worker should + * acquire the lock on ff->parent. + * A parent-locked field is conceptually "shared" between the worker + * that owns frame ff, and a worker that is either owns the + * parent frame (ff->parent) or owns a sibling frame of ff (i.e., + * any child of ff->parent). + * + * 5. Synchronization + * A field used explicitly for synchronization (i.e., locks). + */ + +/* COMMON_PORTABLE */ +struct full_frame +{ + /** + * Value to detect writes off the beginning of a full_frame. + */ +# define FULL_FRAME_MAGIC_0 ((ff_magic_t)0x361e710b9597d553ULL) + + /** + * Field to detect writes off the beginning of a full_frame. Must be + * FULL_FRAME_MAGIC_0. + * [constant] + */ + ff_magic_t full_frame_magic_0; + + /** + * Used to serialize access to this full_frame + * [synchronization] + */ + struct mutex lock; + + /** + * Count of outstanding children running in parallel + * [self-locked] + */ + int join_counter; + + /** + * If TRUE: frame was called by the parent. + * If FALSE: frame was spawned by parent. + * [constant] + */ + int is_call_child; + + /** + * TRUE if this frame is the loot of a simulated steal. + * + * This situation never happens in normal execution. However, + * when running under cilkscreen, a worker may promote frames and + * then immediately suspend them, in order to simulate an + * execution on an infinite number of processors where all spawns + * are stolen. In this case, the frame is marked as the loot of a fake + * steal. + * [local] + */ + int simulated_stolen; + + /** + * Caller of this full_frame + * [constant] + */ + full_frame *parent; + + /** + * Doubly-linked list of children. The serial execution order is + * by definition from left to right. Because of how we do work + * stealing, the parent is always to the right of all its + * children. + * + * For a frame ff, we lock the ff->parent to follow the sibling + * links for ff. 
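A hedged sketch of the locking discipline the [parent-locked] annotation implies: take the parent's lock before following any of its children's sibling links. The traversal below is illustrative only and compiles only against the runtime's own headers.

    // Sketch only: walk the children of 'parent' under the parent's lock.
    static void visit_children_sketch(__cilkrts_worker *w, full_frame *parent)
    {
        full_frame *child;

        __cilkrts_frame_lock(w, parent);                 // guards the sibling links
        for (child = parent->rightmost_child; child; child = child->left_sibling) {
            // left_sibling / right_sibling are parent-locked fields, so reading
            // them here is safe; fields marked [self-locked] would additionally
            // need child->lock.
        }
        __cilkrts_frame_unlock(w, parent);
    }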
+ * + * [parent-locked] + */ + full_frame *left_sibling; + + /** + * @copydoc left_sibling + */ + full_frame *right_sibling; + + /** + * Pointer to rightmost child + * + * [self-locked] + */ + full_frame *rightmost_child; + + /** + * Call stack associated with this frame. + * Set and reset in make_unrunnable and make_runnable + * + * [self-locked] + */ + __cilkrts_stack_frame *call_stack; + + /** + * Accumulated reducers of children + * + * [self-locked] + */ + struct cilkred_map *children_reducer_map; + + /** + * Accumulated reducers of right siblings that have already + * terminated + * + * [parent-locked] + */ + struct cilkred_map *right_reducer_map; + + /** + * Exception that needs to be pass to our parent + * + * [local] + * + * TBD: verify that the exception code satisfies this requirement. + */ + struct pending_exception_info *pending_exception; + + /** + * Exception from one of our children + * + * [self-locked] + */ + struct pending_exception_info *child_pending_exception; + + /** + * Exception from any right siblings + * + * [parent-locked] + */ + struct pending_exception_info *right_pending_exception; + + /** + * Stack pointer to restore on sync. + * [local] + */ + char *sync_sp; + +#ifdef _WIN32 + /** + * Stack pointer to restore on exception. + * [local] + */ + char *exception_sp; + + /** + * Exception trylevel at steal + * [local] + * + * TBD: this field is set but not read? + */ + unsigned long trylevel; + + /** + * Exception registration head pointer to restore on sync. + * [local] + */ + unsigned long registration; +#endif + + /** + * Size of frame to match sync sp + * [local] + * TBD: obsolete field only used in debugging? + */ + ptrdiff_t frame_size; + + /** + * Allocated fibers that need to be freed. The fibers work + * like a reducer. The leftmost frame may have @c fiber_self + * null and owner non-null. + * + * [local] + * TBD: verify exception code satisfies this requirement. + */ + cilk_fiber *fiber_self; + + /** + * Allocated fibers that need to be freed. The fibers work + * like a reducer. The leftmost frame may have @c fiber_self + * null and owner non-null. + * + * [self-locked] + */ + cilk_fiber *fiber_child; + + /** + * If the sync_master is set, this function can only be sync'd by the team + * leader, who first entered Cilk. This is set by the first worker to steal + * from the user worker. + * + * [self-locked] + */ + __cilkrts_worker *sync_master; + + /** + * Value to detect writes off the end of a full_frame. + */ +# define FULL_FRAME_MAGIC_1 ((ff_magic_t)0x189986dcc7aee1caULL) + + /** + * Field to detect writes off the end of a full_frame. Must be + * FULL_FRAME_MAGIC_1. + * + * [constant] + */ + ff_magic_t full_frame_magic_1; +}; + +/* The functions __cilkrts_put_stack and __cilkrts_take_stack keep track of + * changes in the stack's depth between when the point at which a frame is + * stolen and when it is resumed at a sync. A stolen frame typically goes + * through the following phase changes: + * + * 1. Suspend frame while stealing it. + * 2. Resume stolen frame at begining of continuation + * 3. Suspend stolen frame at a sync + * 4. Resume frame (no longer marked stolen) after the sync + * + * When the frame is suspended (steps 1 and 3), __cilkrts_put_stack is called to + * establish the stack pointer for the sync. When the frame is resumed (steps + * 2 and 4), __cilkrts_take_stack is called to indicate the stack pointer + * (which may be on a different stack) at + * the point of resume. 
If the stack pointer changes between steps 2 and 3, + * e.g., as a result of pushing 4 bytes onto the stack, + * the offset is reflected in the value of ff->sync_sp after step 3 relative to + * its value after step 1 (e.g., the value of ff->sync_sp after step 3 would be + * 4 less than its value after step 1, for a down-growing stack). + * + * Implementation detail: The actual call chains for each of these phase-change events are: + * + * 1. unroll_call_stack -> make_unrunnable -> __cilkrts_put_stack + * 2. do_work -> __cilkrts_resume -> __cilkrts_take_stack + * 3. do_sync -> disown -> make_runnable -> __cilkrts_put_stack + * 4. __cilkrts_resume -> __cilkrts_take_stack + * + * (The above is a changeable implementation detail. The resume sequence, in + * particular, is more complex on some operating systems.) + */ + +/** + * @brief Records the stack pointer within the @c sf stack frame as the + * current stack pointer at the point of suspending full frame @c ff. + * + * @pre @c ff->sync_sp must be either null or contain the result of a prior call to + * @c __cilkrts_take_stack(). + * @pre If @c ff->sync_sp is not null, then @c SP(sf) must refer to the same stack as + * the @c sp argument to the prior call to @c __cilkrts_take_stack(). + * + + * @post If @c ff->sync_sp was null before the call, then @c + * ff->sync_sp will be set to @c SP(sf). + * @post Otherwise, @c ff->sync_sp will be restored to the value it had just prior + * to the last call to @c __cilkrts_take_stack(), except offset by any change + * in the stack pointer between the call to @c __cilkrts_take_stack() and + * this call to @c __cilkrts_put_stack(). + * + * @param ff The full frame that is being suspended. + * @param sf The @c __cilkrts_stack_frame that is being suspended. The stack + * pointer will be taken from the jmpbuf contained within this + * @c __cilkrts_stack_frame. + */ +COMMON_PORTABLE void __cilkrts_put_stack(full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief Records the stack pointer @c sp as the stack pointer at the point of + * resuming execution on full frame @c ff. + * + * The value of @c sp may be on a different stack than the original + * value recorded for the stack pointer using __cilkrts_put_stack(). + * + * @pre @c ff->sync_sp must contain a value set by @c __cilkrts_put_stack(). + * + * @post @c ff->sync_sp contains an *integer* value used to compute a change in the + * stack pointer upon the next call to @c __cilkrts_take_stack(). + * @post If @c sp equals @c ff->sync_sp, then @c ff->sync_sp is set to null. + * + * @param ff The full frame that is being resumed. + * @param sp The stack pointer for the stack the function is being resumed on. + */ +COMMON_PORTABLE void __cilkrts_take_stack(full_frame *ff, void *sp); + +/* + * @brief Adjust the stack to deallocate a Variable Length Array + * + * @param ff The full frame that is being adjusted. + * @param size The size of the array being deallocated from the stack + */ +COMMON_PORTABLE void __cilkrts_adjust_stack(full_frame *ff, size_t size); + +/** + * @brief Allocates and initializes a full_frame. + * + * @param w The memory for the full_frame will be allocated out of the + * worker's pool. + * @param sf The @c __cilkrts_stack_frame which will be saved as the call_stack + * for this full_frame. + * + * @return The newly allocated and initialized full_frame. + */ +COMMON_PORTABLE +full_frame *__cilkrts_make_full_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +/** + * @brief Deallocates a full_frame. 
+ * + * @param w The memory for the full_frame will be returned to the worker's pool. + * @param ff The full_frame to be deallocated. + */ +COMMON_PORTABLE +void __cilkrts_destroy_full_frame(__cilkrts_worker *w, full_frame *ff); + +/** + * @brief Performs sanity checks to check the integrity of a full_frame. + * + * @param ff The full_frame to be validated. + */ +COMMON_PORTABLE void validate_full_frame(full_frame *ff); + +/** + * @brief Locks the mutex contained in a full_frame. + * + * The full_frame is validated before the runtime attempts to lock it. + * + * @post @c ff->lock will be owned by @c w. + * + * @param w The worker that will own the full_frame. If the runtime is + * collecting stats, the intervals will be attributed to the worker. + * @param ff The full_frame containing the mutex to be locked. + */ +COMMON_PORTABLE void __cilkrts_frame_lock(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief Unlocks the mutex contained in a full_frame. + * + * @pre @c ff->lock must must be owned by @c w. + * + * @param w The worker that currently owns the full_frame. + * @param ff The full_frame containing the mutex to be unlocked. + */ +COMMON_PORTABLE void __cilkrts_frame_unlock(__cilkrts_worker *w, + full_frame *ff); +/** @} */ + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_FULL_FRAME_DOT_H) diff --git a/libcilkrts/runtime/global_state.cpp b/libcilkrts/runtime/global_state.cpp new file mode 100644 index 00000000000..02de54f43b1 --- /dev/null +++ b/libcilkrts/runtime/global_state.cpp @@ -0,0 +1,628 @@ +/* global_state.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************/ + +#include "global_state.h" +#include "os.h" +#include "bug.h" +#include "metacall_impl.h" +#include "stats.h" +#include "cilk/cilk_api.h" +#include "cilk_malloc.h" +#include "record-replay.h" + +#include <algorithm> // For max() +#include <cstring> +#include <cstdlib> +#include <climits> +#include <cerrno> + +#ifdef _WIN32 +# include <wchar.h> +#endif + +// TBD: There is a race when multiple threads try to initialize the +// user_settable_values?? +// +// Set to true if the user settable values portion of the global state +// singleton is initialized, even if the rest of the singleton is not +// initialized. +int cilkg_user_settable_values_initialized = false; + +namespace { + +// Single copy of the global state. Zero-filled until +// cilkg_get_user_settable_values() is called and partially-zero-filled until +// cilkg_init_global_state() is called. The first field is filled in with +// the size of a void* for the debugger and must be valid before initialization +global_state_t global_state_singleton = +{ + sizeof(void *), // addr_size +}; + + +// Variables that need to export C-style names +extern "C" +{ + // Pointer to the global state singleton. + global_state_t *cilkg_singleton_ptr = NULL; + + // __cilkrts_global_state is exported and referenced by the debugger. + // The debugger expects it to be valid when the module loads. +// CILK_EXPORT_DATA + global_state_t *__cilkrts_global_state = &global_state_singleton; +} + +// Returns true if 'a' and 'b' are equal null-terminated strings +inline bool strmatch(const char* a, const char* b) +{ + return 0 == std::strcmp(a, b); +} + +// Returns the integer value represented by the null-terminated string at 's'. +inline long to_long(const char* s) +{ + char *end; + + errno = 0; + return std::strtol(s, &end, 0); +} + +#ifdef _WIN32 +// Returns true if 'a' and 'b' are equal null-terminated wide-char strings +inline bool strmatch(const wchar_t* a, const wchar_t* b) +{ + return 0 == wcscmp(a, b); +} + +// Returns true if the multi-byte character string at 'a' represents the same +// character sequence as the wide-character string at 'b'. The behavior is +// undefined if 'a' contains more than 30 multi-byte characters. +bool strmatch(const char* a, const wchar_t* b) +{ + // Convert 'a' to wide-characters, then compare. + wchar_t wa[31]; + std::size_t count; + errno_t err = mbstowcs_s(&count, wa, a, 30); + CILK_ASSERT(0 == err); + if (err) return false; + return strmatch(wa, b); +} + +// Returns true if the wide-character string at 'a' represents the same +// character sequence as the multi-byte character string at 'b'. The behavior +// id undefined if 'b' contains more than 30 multi-byte characters. +inline +bool strmatch(const wchar_t* a, const char* b) +{ + return strmatch(b, a); +} + + +// Returns the integer value represented by the null-terminated wide-char +// string at 's'. +inline long to_long(const wchar_t* s) +{ + wchar_t *end; + + errno = 0; + return wcstol(s, &end, 0); +} +#endif + +// Check if Cilkscreen or other sequential ptool wants to force reducers. +bool always_force_reduce() +{ + // Metacall *looks* like a no-op. volatile needed to keep compiler from + // optimizing away variable. + volatile char not_force_reduce = '\377'; + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ZERO_IF_FORCE_REDUCE, + const_cast<char*>(¬_force_reduce)); + return ! 
not_force_reduce; +} + +// Stores the boolean value represented by the null-terminated string at 'val' +// into the integer object at 'out'. Returns '__CILKRTS_SET_PARAM_SUCCESS' if +// 'val' is "true", "false", "0" or "1" and '__CILKRTS_SET_PARAM_INVALID' +// otherwise. +template <typename INT_T, typename CHAR_T> +int store_bool(INT_T *out, const CHAR_T *val) +{ + static const char* const s_zero = "0"; + static const char* const s_one = "1"; + static const char* const s_true = "true"; + static const char* const s_false = "false"; + + if (val == 0) + return __CILKRTS_SET_PARAM_INVALID; + + if (strmatch(s_false, val) || strmatch(s_zero, val)) { + *out = 0; + return __CILKRTS_SET_PARAM_SUCCESS; + } + + if (strmatch(s_true, val) || strmatch(s_one, val)) { + *out = 1; + return __CILKRTS_SET_PARAM_SUCCESS; + } + + return __CILKRTS_SET_PARAM_INVALID; +} + +// Stores the integer value represented by the null-terminated string at 'val' +// into the integer object at 'out', restricting the result to the range 'min' +// to 'max', inclusive. Returns '__CILKRTS_SET_PARAM_SUCCESS' if the conversion +// succeeds and is in range, '__CILKRTS_SET_PARAM_XRANGE' if the conversion +// succeeds but is out of range, and '__CILKRTS_SET_PARAM_INVALID' otherwise. In +// the case of any error, '*out' is unchanged. +template <typename INT_T, typename CHAR_T> +int store_int(INT_T *out, const CHAR_T *val, INT_T min, INT_T max) +{ + errno = 0; + long val_as_long = to_long(val); + if (val_as_long == 0 && errno != 0) + return __CILKRTS_SET_PARAM_INVALID; + if (val_as_long < min || val_as_long == LONG_MIN) + return __CILKRTS_SET_PARAM_XRANGE; + else if (val_as_long > max || val_as_long == LONG_MAX) + return __CILKRTS_SET_PARAM_XRANGE; + + *out = val_as_long; + return __CILKRTS_SET_PARAM_SUCCESS; +} + +// Implementaton of cilkg_set_param templatized on character type. +// Windows will instantiate with both char and wchar_t. +// Note that g must have its user settable values set, but need not be fully +// initialized. +template <class CHAR_T> +int set_param_imp(global_state_t* g, const CHAR_T* param, const CHAR_T* value) +{ + static const char* const s_force_reduce = "force reduce"; + static const char* const s_nworkers = "nworkers"; + static const char* const s_max_user_workers = "max user workers"; + static const char* const s_local_stacks = "local stacks"; + static const char* const s_shared_stacks = "shared stacks"; + static const char* const s_nstacks = "nstacks"; + static const char* const s_stack_size = "stack size"; + + // We must have a parameter and a value + if (0 == param) + return __CILKRTS_SET_PARAM_INVALID; + if (0 == value) + return __CILKRTS_SET_PARAM_INVALID; + + if (strmatch(param, s_force_reduce)) + { + // Sets whether we force a reduce operation at every sync. Useful for + // debugging reducers. Off by default. Overridden by Cilkscreen + // + // Documented in cilk_api_<os>.h + if (always_force_reduce()) + // Force reduce is set by cilkscreen. User cannot change it. + return __CILKRTS_SET_PARAM_LATE; + + return store_bool(&g->force_reduce, value); + } + else if (strmatch(param, s_nworkers)) + { + // Set the total number of workers. Overrides count of cores we get + // from the OS and the setting of the CILK_NWORKERS environment + // variable. Setting to 0 indicates that the default worker count + // should be used. + // + // Documented in cilk_api_<os>.h + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + + // Fetch the number of cores. 
There must be at last 1, since we're + // executing on *something*, aren't we!? + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT(hardware_cpu_count > 0); + + int max_cpu_count = 16 * hardware_cpu_count; + if (__cilkrts_running_under_sequential_ptool()) + { + hardware_cpu_count = 1; + max_cpu_count = 1; + } + // Allow a value of 0, which means "set to hardware thread count". + int ret = store_int(&g->P, value, 0, max_cpu_count); + if (0 == g->P) + g->P = hardware_cpu_count; + return ret; + } + else if (strmatch(param, s_max_user_workers)) + { + // ** UNDOCUMENTED ** + // + // Sets the number of slots allocated for user worker threads + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT (hardware_cpu_count > 0); + + return store_int(&g->max_user_workers, value, 1, + 16 * hardware_cpu_count); + } + else if (strmatch(param, s_local_stacks)) + { + // ** UNDOCUMENTED ** + // + // Number of stacks we'll hold in the per-worker stack cache. Maximum + // value is 42. See __cilkrts_make_global_state for details. + return store_int(&g->fiber_pool_size, value, 0, 42); + } + else if (strmatch(param, s_shared_stacks)) + { + // ** UNDOCUMENTED ** + // + // Maximum number of stacks we'll hold in the global stack + // cache. Maximum value is 42. See __cilkrts_make_global_state for + // details. + return store_int(&g->global_fiber_pool_size, value, 0, 42); + } + else if (strmatch(param, s_nstacks)) + { + // Sets the maximum number of stacks permitted at one time. If the + // runtime reaches this maximum, it will cease to allocate stacks and + // the app will lose parallelism. 0 means unlimited. Default is + // unlimited. Minimum is twice the number of worker threads, though + // that cannot be tested at this time. + // + // Undocumented at this time, though there are plans to expose it. + // The current implentation is for Linux debugging only and is not + // robust enough for users. + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + return store_int<unsigned>(&g->max_stacks, value, 0, INT_MAX); + } + else if (strmatch(param, s_stack_size)) + { + // ** UNDOCUMENTED ** + // + // Sets the size (in bytes) of the stacks that Cilk creates. + // Can only be set before the runtime starts. + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + + // Maximum value that can be parsed is MAX_INT (32-bit). + int ret = store_int<size_t>(&g->stack_size, value, 0, INT_MAX); + + // Process the value the user set (or 0 if the user didn't set + // anything) into something nice for the current OS. This + // processing is done immediately and stored into + // g->stack_size so that a call to get stack size will return + // the value that the runtime will actually use. + g->stack_size = cilkos_validate_stack_size(g->stack_size); + return ret; + } + + + // If got here, then didn't match any of the strings + return __CILKRTS_SET_PARAM_UNIMP; +} + +inline +int calc_max_user_workers(global_state_t *g) +{ + // If it's been set by the user, give back what we got + if (g->max_user_workers > 0) + return g->max_user_workers; + + // Calculate it + return std::max(3, g->P * 2); +} + +} // end unnamed namespace + +__CILKRTS_BEGIN_EXTERN_C + +/** + * @brief Returns the global state object. If called for the first time, + * initializes the user-settable values in the global state, but does not + * initialize the rest of the structure. + */ +global_state_t* cilkg_get_user_settable_values() +{ + // Environment variable value. More than big enough for a 64-bit signed + // integer. 
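+    // (A 64-bit signed value needs at most 20 characters including the sign,
+    // plus a terminating NUL, so 24 bytes is more than enough.)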
+ char envstr[24]; + + // Abbreviating &global_state_singleton as g is not only shorter, it also + // facilitates grepping for the string "g->", which appears ubiquitously + // in the runtime code. + global_state_t* g = &global_state_singleton; + + // TBD: We need synchronization around this loop to prevent + // multiple threads from initializing this data. + if (! cilkg_user_settable_values_initialized) + { + size_t len; + + // Preserve stealing disabled since it may have been set by the + // debugger + int stealing_disabled = g->stealing_disabled; + + // All fields will be zero until set. In particular + std::memset(g, 0, sizeof(global_state_t)); + + // Fetch the number of cores. There must be at last 1, since we're + // executing on *something*, aren't we!? + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT(hardware_cpu_count > 0); + + bool under_ptool = __cilkrts_running_under_sequential_ptool(); + if (under_ptool) + hardware_cpu_count = 1; + + g->stealing_disabled = stealing_disabled; + g->under_ptool = under_ptool; + g->force_reduce = 0; // Default Off + g->P = hardware_cpu_count; // Defaults to hardware CPU count + g->max_user_workers = 0; // 0 unless set by user + g->fiber_pool_size = 7; // Arbitrary default + + g->global_fiber_pool_size = 3 * 3* g->P; // Arbitrary default + // 3*P was the default size of the worker array (including + // space for extra user workers). This parameter was chosen + // to match previous versions of the runtime. + + if (4 == sizeof(void *)) + g->max_stacks = 1200; // Only 1GB on 32-bit machines + else + g->max_stacks = 2400; // 2GB on 64-bit machines + + // If we have 2400 1MB stacks, that is 2 gb. If we reach this + // limit on a single-socket machine, we may have other + // problems. Is 2400 too small for large multicore machines? + + // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a + // value independent of P. When running on a Xeon Phi with + // small values of P, I recall seeing a few microbenchmarks + // (e.g., fib) where a limit of 10*P seemed to be + // unnecessarily slowing things down. + // + // That being said, the code has changed sufficiently that + // this observation may no longer be true. + // + // Note: in general, the worst-case number of stacks required + // for a Cilk computation with spawn depth "d" on P workers is + // O(Pd). Code with unbalanced recursion may run into issues + // with this stack usage. + + g->max_steal_failures = 128; // TBD: depend on max_workers? + g->stack_size = 0; // 0 unless set by the user + + // Assume no record or replay log for now + g->record_replay_file_name = NULL; + g->record_or_replay = RECORD_REPLAY_NONE; // set by user + + if (always_force_reduce()) + g->force_reduce = true; + else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE")) + store_bool(&g->force_reduce, envstr); + + if (under_ptool) + g->P = 1; // Ignore environment variable if under cilkscreen + else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS")) + // Set P to environment variable, but limit to no less than 1 + // and no more than 16 times the number of hardware threads. + store_int(&g->P, envstr, 1, 16 * hardware_cpu_count); + + if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS")) + // Set max_user_workers to environment variable, but limit to no + // less than 1 and no more 16 times the number of hardware + // threads. If not specified, defaults (somewhat arbitrarily) to + // the larger of 3 and twice the number of hardware threads. 
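+            // (Illustrative numbers: with 8 hardware threads the accepted
+            // range below is 1..128; if the variable is not set at all,
+            // max_user_workers stays 0 and calc_max_user_workers() later
+            // substitutes max(3, 2 * P).)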
+ store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count); + + if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES")) + // Set the number of times a worker should fail to steal before + // it looks to see whether it should suspend itself. + store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX); + + // Compute the total number of workers to allocate. Subtract one from + // nworkers and user workers so that the first user worker isn't + // factored in twice. + // + // total_workers must be computed now to support __cilkrts_get_total_workers + g->total_workers = g->P + calc_max_user_workers(g) - 1; + +#ifdef CILK_RECORD_REPLAY + // RecordReplay: See if we've been asked to replay a log + len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG"); + if (len > 0) + { + len += 1; // Allow for trailing NUL + g->record_or_replay = REPLAY_LOG; + g->record_replay_file_name = (char *)__cilkrts_malloc(len); + cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG"); + } + + // RecordReplay: See if we've been asked to record a log + len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG"); + if (len > 0) + { + if (RECORD_REPLAY_NONE != g->record_or_replay) + cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n"); + else + { + len += 1; // Allow for trailing NUL + g->record_or_replay = RECORD_LOG; + g->record_replay_file_name = (char *)__cilkrts_malloc(len); + cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG"); + } + } +#endif + + cilkg_user_settable_values_initialized = true; + } + + return g; +} + +int cilkg_calc_total_workers() +{ + global_state_t* g = cilkg_get_user_settable_values(); + + // Compute the total number of workers to allocate. Subtract one from + // nworkers and user workers so that the first user worker isn't + // factored in twice. + return g->P + calc_max_user_workers(g) - 1; +} + +// Should be called while holding the global lock. +global_state_t* cilkg_init_global_state() +{ + if (cilkg_singleton_ptr) + return cilkg_singleton_ptr; + + // Get partially-initialized global state. + global_state_t* g = cilkg_get_user_settable_values(); + + if (g->max_stacks > 0) { + + // nstacks is currently honored on non-Windows systems only. + + // Set an upper bound on the number of stacks that are allocated. If + // nstacks is set, each worker gets up to one stack in its cache so that + // no one worker can hog all of the free stacks and keep work from being + // stolen by the other workers. + + // nstacks corresponds to the number of stacks that will be allocated by + // the runtime apart from the initial stack created for each thread by + // the system. Therefore, if a user asks for n stacks, and there are + // p workers created, the total number of stacks is actually n + p. + + // This feature is primarily for MIC which has flat memory + // instead of virtual addresses and tends to run out really quickly. + // It is not implemented for Windows and it's non-intuitive + // interaction with the local stack cache is specifically to help out + // MIC. + + // About max_stacks / P stacks, except we require at least 1 + // per pool. 
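+        // Illustrative numbers (not the defaults): with max_stacks == 16 and
+        // P == 8, fiber_pool_size is clamped from 7 down to 16 / 8 == 2, and
+        // global_fiber_pool_size below becomes 8 * (2 + 1) == 24.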
+ if (((int)g->max_stacks / g->P) < g->fiber_pool_size) + g->fiber_pool_size = g->max_stacks / g->P; + + if (g->fiber_pool_size <= 0) { + g->fiber_pool_size = 1; + } + + if ((int)g->max_stacks < g->P) + g->max_stacks = g->P; + + g->global_fiber_pool_size = g->P * (g->fiber_pool_size+1); + } + + // Number of bytes/address - validation for debugger integration + g->addr_size = sizeof(void *); + + __cilkrts_init_stats(&g->stats); + + __cilkrts_frame_malloc_global_init(g); + + g->Q = 0; + g->total_workers = cilkg_calc_total_workers(); + g->system_workers = g->P - 1; // system_workers is here for the debugger. + g->work_done = 0; + g->workers_running = 0; + g->ltqsize = 1024; /* FIXME */ + + g->stack_size = cilkos_validate_stack_size(g->stack_size); + g->failure_to_allocate_stack = 0; + + + return g; +} + +void cilkg_publish_global_state(global_state_t* g) +{ + + // TBD: which one of these needs to be executed first? I say + // cilkg_singleton_ptr needs to be set last, with a mfence in + // between, since it is the flag that cilkg_is_published_is + // checking for. + __cilkrts_global_state = g; + __cilkrts_fence(); + cilkg_singleton_ptr = g; +} + +void cilkg_deinit_global_state() +{ + cilkg_singleton_ptr = NULL; + __cilkrts_global_state = NULL; +} + +int cilkg_is_published(void) +{ + return NULL != cilkg_singleton_ptr; +} + +int cilkg_set_param(const char* param, const char* value) +{ + return set_param_imp(cilkg_get_user_settable_values(), param, value); +} + +#ifdef _WIN32 +int cilkg_set_param_w(const wchar_t* param, const wchar_t* value) +{ + return set_param_imp(cilkg_get_user_settable_values(), param, value); +} +#endif + +extern "C++" { + // C++ scheduler function (that may throw exceptions) + typedef void cpp_scheduler_t(__cilkrts_worker *w); +} + +void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w) +{ + global_state_t* g = cilkg_get_global_state(); + CILK_ASSERT(g->scheduler); + + cpp_scheduler_t* scheduler = (cpp_scheduler_t*) g->scheduler; + + try { + scheduler(w); + } catch (...) { + __cilkrts_bug("Exception escaped Cilk context"); + } +} + +__CILKRTS_END_EXTERN_C + +/* End global_state.cpp */ diff --git a/libcilkrts/runtime/global_state.h b/libcilkrts/runtime/global_state.h new file mode 100644 index 00000000000..ef455e479d5 --- /dev/null +++ b/libcilkrts/runtime/global_state.h @@ -0,0 +1,417 @@ +/* global_state.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file global_state.h + * + * @brief The global_state_t structure contains most of the global context + * maintained by the Intel Cilk runtime. + */ + +#ifndef INCLUDED_GLOBAL_STATE_DOT_H +#define INCLUDED_GLOBAL_STATE_DOT_H + +#include <cilk/common.h> + +#include "frame_malloc.h" +#include "stats.h" +#include "bug.h" +#include "cilk_fiber.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Non-null place-holder for a stack handle that has no meaningful value. + */ +#define PLACEHOLDER_FIBER ((cilk_fiber *) -2) + +/** + * States for record_or_replay + */ +enum record_replay_t { + RECORD_REPLAY_NONE, + RECORD_LOG, + REPLAY_LOG +}; + +/** + * @brief The global state is a structure that is shared by all workers in + * Cilk. + * + * Make the structure ready for use by calling + * cilkg_init_global_state() and then cilkg_publish_global_state(). + * + * The same global lock should be held while both of these methods are + * called. These methods are split because it is useful to execute + * other runtime initialization code in between. + * + * After cilkg_publish_global_state() has completed, Cilk runtime + * methods may call cilkg_get_global_state() to look at the published + * value without holding the global lock. + * + * Finally, clean up the global state by calling + * cilkg_deinit_global_state(). This method should be called only + * after all calls to cilkg_get_global_state() have completed, and + * while holding the global lock. + * + * Before initialization and after deinitialization, the fields in the + * global state have unspecified values, except for a few special + * fields labeled "USER SETTING", which can be read and written before + * initialization and after deinitialization. + */ + +struct global_state_t { /* COMMON_PORTABLE */ + + /* Fields described as "(fixed)" should not be changed after + * initialization. + */ + + /************************************************************************* + * Note that debugger integration must reach into the + * global state! The debugger integration is depending on the + * offsets of the addr_size, system_workers, total_workers, + * stealing_disabled, sysdep, and workers. If these offsets change, the + * debugger integration library will need to be changed to match!!! + *************************************************************************/ + + int addr_size; ///< Number of bytes for an address, used by debugger (fixed) + + int system_workers; ///< Number of system workers (fixed) + + /** + * @brief USER SETTING: Maximum number of user workers that can be + * bound to cilk workers. + * + * 0 unless set by user. 
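+     * When left at 0, the effective value is computed as max(3, 2 * P).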
Call cilkg_calc_max_user_workers to get + * the value. + */ + int max_user_workers; + + int total_workers; ///< Total number of worker threads allocated (fixed) + + int workers_running; ///< True when system workers have beens started */ + + /// Set by debugger to disable stealing (fixed) + int stealing_disabled; + + /// System-dependent part of the global state + struct global_sysdep_state *sysdep; + + /// Array of worker structures. + __cilkrts_worker **workers; + + /******* END OF DEBUGGER-INTEGRATION FIELDS ***************/ + + /// Number of frames in each worker's lazy task queue + __STDNS size_t ltqsize; + + /** + * @brief USER SETTING: Force all possible reductions. + * + * TRUE if running a p-tool that requires reducers to call the reduce() + * method even if no actual stealing occurs. + * + * When set to TRUE, runtime will simulate steals, forcing calls to the + * the reduce() methods of reducers. + * + */ + int force_reduce; + + /// USER SETTING: Per-worker fiber pool size + int fiber_pool_size; + + /// USER SETTING: Global fiber pool size + int global_fiber_pool_size; + + /** + * @brief TRUE when workers should exit scheduling loop so we can + * shut down the runtime and free the global state. + * + * @note @c work_done will be checked *FREQUENTLY* in the scheduling loop + * by idle workers. We need to ensure that it's not in a cache line which + * may be invalidated by other cores. The surrounding fields are either + * constant after initialization or not used until shutdown (stats) so we + * should be OK. + */ + volatile int work_done; + + int under_ptool; ///< True when running under a serial PIN tool + + statistics stats; ///< Statistics on use of runtime + + /** + * @brief USER SETTING: Maximum number of stacks the runtime will + * allocate (apart from those created by the OS when worker + * threads are created). + * + * If max_stacks == 0,there is no pre-defined maximum. + */ + unsigned max_stacks; + + /// Size of each stack + size_t stack_size; + + /// Global cache for per-worker memory + struct __cilkrts_frame_cache frame_malloc; + + /// Global fiber pool + cilk_fiber_pool fiber_pool; + + + /** + * @brief Track whether the runtime has failed to allocate a + * stack. + * + * Setting this flag prevents multiple warnings from being + * issued. + */ + int failure_to_allocate_stack; + + /** + * @brief USER SETTING: indicate record or replay log. + * Set to NULL if not used in this run. + */ + char *record_replay_file_name; + + /** + * @brief Record/replay state. + * Valid states are: + * RECORD_REPLAY_NONE - Not recording or replaying a log + * RECORD_LOG - Recording a log for replay later + * REPLAY_LOG - Replay a log recorded earlier + */ + enum record_replay_t record_or_replay; + + /** + * @brief Buffer to force max_steal_failures to appear on a + * different cache line from the previous member variables. + * + * This padding is needed because max_steal_failures is read + * constantly and other modified values in the global state will + * cause thrashing. + */ + char cache_buf[64]; + + /** + * @brief Maximum number of times a thread should fail to steal + * before checking if Cilk is shutting down. + */ + unsigned int max_steal_failures; + + /// Pointer to scheduler entry point + void (*scheduler)(__cilkrts_worker *w); + + /** + * @brief Buffer to force P and Q to appear on a different cache + * line from the previous member variables. 
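+     * (The 64-byte size is assumed to be at least one cache line on the
+     * targets of interest.)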
+ */ + char cache_buf_2[64]; + + int P; ///< USER SETTING: number of system workers + 1 (fixed) + int Q; ///< Number of user threads currently bound to workers +}; + +/** + * @brief Initialize the global state object. This method must both + * complete before referencing any fields in the global state, except + * those specified as "user-settable values". + */ +global_state_t* cilkg_init_global_state(); + +/** + * @brief Publish the global state object, so that + * cilkg_is_published can return true. + * + * @param g - the global state created by cilkg_init_global_state() to + * publish. + * + * After the global state object has been published, a thread should + * not modify this state unless it has exclusive access (i.e., holds + * the global lock). + */ +void cilkg_publish_global_state(global_state_t* g); + +/** + * @brief Return true if the global state has been fully initialized + * and published, and has not been deinitialized. + */ +int cilkg_is_published(void); + +/** + * @brief De-initializes the global state object. Must be called to free + * resources when the global state is no longer needed. + */ +void cilkg_deinit_global_state(void); + +/** + * @brief Returns the global state object. Result is valid only if the + * global state has been published (see cilkg_publish_global_state()). + */ +static inline +global_state_t* cilkg_get_global_state(void) +{ + // "private" extern declaration: + extern global_state_t *cilkg_singleton_ptr; + + __CILKRTS_ASSERT(cilkg_singleton_ptr); // Debug only + return cilkg_singleton_ptr; +} + + +/** + * @brief Implementation of __cilkrts_set_params. + * + * Set user controllable parameters + * @param param - string specifying parameter to be set + * @param value - string specifying new value + * @returns One of: CILKG_SET_PARAM_SUCCESS ( = 0), + * CILKG_SET_PARAM_UNIMP, CILKG_SET_PARAM_XRANGE, + * CILKG_SET_PARAM_INVALID, or CILKG_SET_PARAM_LATE. + * + * @attention The wide character version __cilkrts_set_param_w() is available + * only on Windows. + * + * Allowable parameter names: + * + * - "nworkers" - number of processors that should run Cilk code. + * The value is a string of digits to be parsed by strtol. + * + * - "force reduce" - test reducer callbacks by allocating new views + * for every spawn within which a reducer is accessed. This can + * significantly reduce performance. The value is "1" or "true" + * to enable, "0" or "false" to disable. + * @warning Enabling "force reduce" when running with more than a single + * worker is currently broken. + * + * - "max user workers" - (Not publicly documented) Sets the number of slots + * allocated for user worker threads + * + * - "local stacks" - (Not publicly documented) Number of stacks we'll hold in + * the per-worker stack cache. Range 1 .. 42. See + * cilkg_init_global_state for details. + * + * - "shared stacks" - (Not publicly documented) Maximum number of stacks + * we'll hold in the global stack cache. Maximum value is 42. See + * __cilkrts_make_global_state for details + * + * - "nstacks" - (Not publicly documented at this time, though it may be + * exposed in the future) Sets the maximum number of stacks permitted at one + * time. If the runtime reaches this maximum, it will cease to allocate + * stacks and the app will lose parallelism. 0 means unlimited. Default is + * unlimited. Minimum is twice the number of worker threads, though that + * cannot be tested at this time. 
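+ *
+ * Illustrative usage via the public entry point (the parameter values here
+ * are examples only):
+ *
+ *     __cilkrts_set_param("nworkers", "4");      // run Cilk code on 4 workers
+ *     __cilkrts_set_param("force reduce", "1");  // stress reducer callbacks
+ *
+ * Note that "nworkers" can only be changed before the runtime starts;
+ * afterwards the call returns __CILKRTS_SET_PARAM_LATE.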
+ */ +int cilkg_set_param(const char* param, const char* value); +#ifdef _WIN32 +/** + * @brief Implementation of __cilkrts_set_params for Unicode characters on + * Windows. See the documentation on @ref cilkg_set_param for more details. + * + * Set user controllable parameters + * @param param - string specifying parameter to be set + * @param value - string specifying new value + * @returns One of: CILKG_SET_PARAM_SUCCESS ( = 0), + * CILKG_SET_PARAM_UNIMP, CILKG_SET_PARAM_XRANGE, + * CILKG_SET_PARAM_INVALID, or CILKG_SET_PARAM_LATE. + */ +int cilkg_set_param_w(const wchar_t* param, const wchar_t* value); +#endif + +/** + * @brief implementation of __cilkrts_get_nworkers() + */ +static inline +int cilkg_get_nworkers(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->P; +} + +/** + * @brief implementation of __cilkrts_get_total_workers() + */ +static inline +int cilkg_get_total_workers(void) +{ + // "private" extern declaration + extern int cilkg_calc_total_workers(void); + + // This number can fluctate until initialization so we + // compute it from scratch + return cilkg_calc_total_workers(); +} + +/** + * @brief implementation of __cilkrts_get_force_reduce() + */ +static inline +int cilkg_get_force_reduce(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->force_reduce; +} + +/** + * @brief implementation of __cilkrts_get_stack_size() + */ +static inline +size_t cilkg_get_stack_size(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->stack_size; +} + +/** + * @brief Run the scheduler function stored in the global_state + * + * Look up the scheduler function in global_state and run it. Report a fatal + * error if an exception escapes the scheduler function. + * + * @param w - Worker structure to associate with the current thread. + * + * @attention The scheduler field of the global state must be set before this + * function is called. + */ +void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_GLOBAL_STATE_DOT_H) diff --git a/libcilkrts/runtime/jmpbuf.c b/libcilkrts/runtime/jmpbuf.c new file mode 100644 index 00000000000..39b51a593ce --- /dev/null +++ b/libcilkrts/runtime/jmpbuf.c @@ -0,0 +1,48 @@ +/* jmpbuf.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "jmpbuf.h" + +/* + * C99 requires that every inline function with external linkage have + * one extern declaration in the program. + */ +extern char *__cilkrts_get_sp(__cilkrts_stack_frame *sf); +extern ptrdiff_t __cilkrts_get_frame_size(__cilkrts_stack_frame *sf); + +/* End jmpbuf.c */ diff --git a/libcilkrts/runtime/jmpbuf.h b/libcilkrts/runtime/jmpbuf.h new file mode 100644 index 00000000000..60573f3a5fa --- /dev/null +++ b/libcilkrts/runtime/jmpbuf.h @@ -0,0 +1,136 @@ +/* jmpbuf.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file jmpbuf.h + * + * @brief Macros and functions to access the _JUMP_BUFFER initialized by a + * call to CILK_SETJMP before a cilk_spawn or cilk_sync. 
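+ * For example, __cilkrts_get_frame_size() below computes FP(sf) - SP(sf),
+ * the distance in bytes between the saved frame and stack pointers.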
The definition of + * CILK_SETJMP and CILK_LONGJMP are OS dependent and in abi.h + * + */ + +#ifndef INCLUDED_JMPBUF_DOT_H +#define INCLUDED_JMPBUF_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include <stddef.h> +#include <setjmp.h> + +#if 0 /* defined CILK_USE_C_SETJMP && defined JB_RSP */ +# define JMPBUF_SP(ctx) (ctx)[0].__jmpbuf[JB_RSP] +# define JMPBUF_FP(ctx) (ctx)[0].__jmpbuf[JB_RBP] +# define JMPBUF_PC(ctx) (ctx)[0].__jmpbuf[JB_PC] +#elif 0 /* defined CILK_USE_C_SETJMP && defined JB_SP */ +# define JMPBUF_SP(ctx) (ctx)[0].__jmpbuf[JB_SP] +# define JMPBUF_FP(ctx) (ctx)[0].__jmpbuf[JB_BP] +# define JMPBUF_PC(ctx) (ctx)[0].__jmpbuf[JB_PC] +#elif defined _WIN64 +# define JMPBUF_SP(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rsp +# define JMPBUF_FP(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rbp +# define JMPBUF_PC(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rip +#elif defined _WIN32 + /** Fetch stack pointer from a __cilkrts_stack_frame */ +# define JMPBUF_SP(ctx) (ctx).Esp + /** Fetch frame pointer from a __cilkrts_stack_frame */ +# define JMPBUF_FP(ctx) (ctx).Ebp + /** Fetch program counter from a __cilkrts_stack_frame */ +# define JMPBUF_PC(ctx) (ctx).Eip +#else /* defined __GNUC__ || defined __ICC */ + /* word 0 is frame address + * word 1 is resume address + * word 2 is stack address */ +# define JMPBUF_FP(ctx) (ctx)[0] +# define JMPBUF_PC(ctx) (ctx)[1] +# define JMPBUF_SP(ctx) (ctx)[2] +#endif + +/** + * @brief Get frame pointer from jump buffer in__cilkrts_stack_frame. + */ +#define FP(SF) JMPBUF_FP((SF)->ctx) + +/** + * @brief Get program counter from jump buffer in__cilkrts_stack_frame. + */ +#define PC(SF) JMPBUF_PC((SF)->ctx) + +/** + * @brief Get stack pointer from jump buffer in__cilkrts_stack_frame. + */ +#define SP(SF) JMPBUF_SP((SF)->ctx) + + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Fetch the stack pointer from a __cilkrts_stack_frame. The jmpbuf was + * initialized before a cilk_spawn or cilk_sync. + * + * @param sf __cilkrts_stack_frame containing the jmpbuf. + * + * @return the stack pointer from the ctx. + */ +inline char *__cilkrts_get_sp(__cilkrts_stack_frame *sf) +{ + return (char *)SP(sf); +} + +/** + * Calculate the frame size from __cilkrts_stack_frame. The jmpbuf was + * initialized before a cilk_spawn or cilk_sync. + * + * @warning Returning an arbitrary value on Windows! + * + * @param sf __cilkrts_stack_frame containing the jmpbuf. + * + * @return the stack pointer from the ctx. + */ +inline ptrdiff_t __cilkrts_get_frame_size(__cilkrts_stack_frame *sf) +{ +#ifdef _WIN32 + if (0 == SP(sf)) + return 256; // Arbitrary! +#endif + return (ptrdiff_t)FP(sf) - (ptrdiff_t)SP(sf); +} + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_JMPBUF_DOT_H) diff --git a/libcilkrts/runtime/linux-symbols.ver b/libcilkrts/runtime/linux-symbols.ver new file mode 100644 index 00000000000..aeb4a5fb13d --- /dev/null +++ b/libcilkrts/runtime/linux-symbols.ver @@ -0,0 +1,369 @@ +/* + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +CILKABI0 +{ + global: + __cilkrts_bind_thread; + __cilkrts_cilk_for_32; + __cilkrts_cilk_for_64; + __cilkrts_debugger_notification; + __cilkrts_dump_stats; + __cilkrts_end_cilk; + __cilkrts_enter_frame; + __cilkrts_enter_frame_fast; + __cilkrts_get_force_reduce; + __cilkrts_get_nworkers; + __cilkrts_get_tls_worker; + __cilkrts_get_tls_worker_fast; + __cilkrts_get_total_workers; + __cilkrts_get_worker_number; + __cilkrts_global_state; + __cilkrts_hyper_create; + __cilkrts_hyper_destroy; + __cilkrts_hyper_lookup; + __cilkrts_hyperobject_alloc; + __cilkrts_hyperobject_dealloc; + __cilkrts_hyperobject_noop_destroy; + __cilkrts_init; + __cilkrts_irml_version; + __cilkrts_leave_frame; + __cilkrts_metacall; + __cilkrts_rethrow; + __cilkrts_return_exception; + __cilkrts_set_param; + __cilkrts_sync; + __cilkrts_synched; + __cilkrts_worker_stub; + local: *; +}; + +CILKABI1 +{ + global: + __cilkrts_bind_thread_1; + __cilkrts_bump_loop_rank; + __cilkrts_bump_loop_rank_internal; + __cilkrts_bump_worker_rank; + __cilkrts_bump_worker_rank_internal; + __cilkrts_enter_frame_1; + __cilkrts_enter_frame_fast_1; + __cilkrts_get_pedigree_info; + __cilkrts_get_pedigree_internal; + __cilkrts_get_sf; + __cilkrts_get_stack_size; + __cilkrts_get_worker_rank; + __cilkrts_save_fp_ctrl_state; + __cilkrts_stack_alloc; + __cilkrts_stack_free; + __cilkrts_watch_stack; +} CILKABI0; + +CILKLIB1.02 +{ + global: + cilk_c_reducer_max_identity_char; + cilk_c_reducer_max_identity_double; + cilk_c_reducer_max_identity_float; + cilk_c_reducer_max_identity_int; + cilk_c_reducer_max_identity_long; + cilk_c_reducer_max_identity_longdouble; + cilk_c_reducer_max_identity_longlong; + cilk_c_reducer_max_identity_schar; + cilk_c_reducer_max_identity_short; + cilk_c_reducer_max_identity_uchar; + cilk_c_reducer_max_identity_uint; + cilk_c_reducer_max_identity_ulong; + cilk_c_reducer_max_identity_ulonglong; + cilk_c_reducer_max_identity_unsigned; + cilk_c_reducer_max_identity_ushort; + cilk_c_reducer_max_identity_wchar_t; + cilk_c_reducer_max_index_identity_char; + cilk_c_reducer_max_index_identity_double; + cilk_c_reducer_max_index_identity_float; + cilk_c_reducer_max_index_identity_int; + cilk_c_reducer_max_index_identity_long; + 
cilk_c_reducer_max_index_identity_longdouble; + cilk_c_reducer_max_index_identity_longlong; + cilk_c_reducer_max_index_identity_schar; + cilk_c_reducer_max_index_identity_short; + cilk_c_reducer_max_index_identity_uchar; + cilk_c_reducer_max_index_identity_uint; + cilk_c_reducer_max_index_identity_ulong; + cilk_c_reducer_max_index_identity_ulonglong; + cilk_c_reducer_max_index_identity_unsigned; + cilk_c_reducer_max_index_identity_ushort; + cilk_c_reducer_max_index_identity_wchar_t; + cilk_c_reducer_max_index_reduce_char; + cilk_c_reducer_max_index_reduce_double; + cilk_c_reducer_max_index_reduce_float; + cilk_c_reducer_max_index_reduce_int; + cilk_c_reducer_max_index_reduce_long; + cilk_c_reducer_max_index_reduce_longdouble; + cilk_c_reducer_max_index_reduce_longlong; + cilk_c_reducer_max_index_reduce_schar; + cilk_c_reducer_max_index_reduce_short; + cilk_c_reducer_max_index_reduce_uchar; + cilk_c_reducer_max_index_reduce_uint; + cilk_c_reducer_max_index_reduce_ulong; + cilk_c_reducer_max_index_reduce_ulonglong; + cilk_c_reducer_max_index_reduce_unsigned; + cilk_c_reducer_max_index_reduce_ushort; + cilk_c_reducer_max_index_reduce_wchar_t; + cilk_c_reducer_max_reduce_char; + cilk_c_reducer_max_reduce_double; + cilk_c_reducer_max_reduce_float; + cilk_c_reducer_max_reduce_int; + cilk_c_reducer_max_reduce_long; + cilk_c_reducer_max_reduce_longdouble; + cilk_c_reducer_max_reduce_longlong; + cilk_c_reducer_max_reduce_schar; + cilk_c_reducer_max_reduce_short; + cilk_c_reducer_max_reduce_uchar; + cilk_c_reducer_max_reduce_uint; + cilk_c_reducer_max_reduce_ulong; + cilk_c_reducer_max_reduce_ulonglong; + cilk_c_reducer_max_reduce_unsigned; + cilk_c_reducer_max_reduce_ushort; + cilk_c_reducer_max_reduce_wchar_t; + cilk_c_reducer_min_identity_char; + cilk_c_reducer_min_identity_double; + cilk_c_reducer_min_identity_float; + cilk_c_reducer_min_identity_int; + cilk_c_reducer_min_identity_long; + cilk_c_reducer_min_identity_longdouble; + cilk_c_reducer_min_identity_longlong; + cilk_c_reducer_min_identity_schar; + cilk_c_reducer_min_identity_short; + cilk_c_reducer_min_identity_uchar; + cilk_c_reducer_min_identity_uint; + cilk_c_reducer_min_identity_ulong; + cilk_c_reducer_min_identity_ulonglong; + cilk_c_reducer_min_identity_unsigned; + cilk_c_reducer_min_identity_ushort; + cilk_c_reducer_min_identity_wchar_t; + cilk_c_reducer_min_index_identity_char; + cilk_c_reducer_min_index_identity_double; + cilk_c_reducer_min_index_identity_float; + cilk_c_reducer_min_index_identity_int; + cilk_c_reducer_min_index_identity_long; + cilk_c_reducer_min_index_identity_longdouble; + cilk_c_reducer_min_index_identity_longlong; + cilk_c_reducer_min_index_identity_schar; + cilk_c_reducer_min_index_identity_short; + cilk_c_reducer_min_index_identity_uchar; + cilk_c_reducer_min_index_identity_uint; + cilk_c_reducer_min_index_identity_ulong; + cilk_c_reducer_min_index_identity_ulonglong; + cilk_c_reducer_min_index_identity_unsigned; + cilk_c_reducer_min_index_identity_ushort; + cilk_c_reducer_min_index_identity_wchar_t; + cilk_c_reducer_min_index_reduce_char; + cilk_c_reducer_min_index_reduce_double; + cilk_c_reducer_min_index_reduce_float; + cilk_c_reducer_min_index_reduce_int; + cilk_c_reducer_min_index_reduce_long; + cilk_c_reducer_min_index_reduce_longdouble; + cilk_c_reducer_min_index_reduce_longlong; + cilk_c_reducer_min_index_reduce_schar; + cilk_c_reducer_min_index_reduce_short; + cilk_c_reducer_min_index_reduce_uchar; + cilk_c_reducer_min_index_reduce_uint; + cilk_c_reducer_min_index_reduce_ulong; + 
cilk_c_reducer_min_index_reduce_ulonglong; + cilk_c_reducer_min_index_reduce_unsigned; + cilk_c_reducer_min_index_reduce_ushort; + cilk_c_reducer_min_index_reduce_wchar_t; + cilk_c_reducer_min_reduce_char; + cilk_c_reducer_min_reduce_double; + cilk_c_reducer_min_reduce_float; + cilk_c_reducer_min_reduce_int; + cilk_c_reducer_min_reduce_long; + cilk_c_reducer_min_reduce_longdouble; + cilk_c_reducer_min_reduce_longlong; + cilk_c_reducer_min_reduce_schar; + cilk_c_reducer_min_reduce_short; + cilk_c_reducer_min_reduce_uchar; + cilk_c_reducer_min_reduce_uint; + cilk_c_reducer_min_reduce_ulong; + cilk_c_reducer_min_reduce_ulonglong; + cilk_c_reducer_min_reduce_unsigned; + cilk_c_reducer_min_reduce_ushort; + cilk_c_reducer_min_reduce_wchar_t; + cilk_c_reducer_opadd_identity_char; + cilk_c_reducer_opadd_identity_double; + cilk_c_reducer_opadd_identity_float; + cilk_c_reducer_opadd_identity_int; + cilk_c_reducer_opadd_identity_long; + cilk_c_reducer_opadd_identity_longdouble; + cilk_c_reducer_opadd_identity_longlong; + cilk_c_reducer_opadd_identity_schar; + cilk_c_reducer_opadd_identity_short; + cilk_c_reducer_opadd_identity_uchar; + cilk_c_reducer_opadd_identity_uint; + cilk_c_reducer_opadd_identity_ulong; + cilk_c_reducer_opadd_identity_ulonglong; + cilk_c_reducer_opadd_identity_unsigned; + cilk_c_reducer_opadd_identity_ushort; + cilk_c_reducer_opadd_identity_wchar_t; + cilk_c_reducer_opadd_reduce_char; + cilk_c_reducer_opadd_reduce_double; + cilk_c_reducer_opadd_reduce_float; + cilk_c_reducer_opadd_reduce_int; + cilk_c_reducer_opadd_reduce_long; + cilk_c_reducer_opadd_reduce_longdouble; + cilk_c_reducer_opadd_reduce_longlong; + cilk_c_reducer_opadd_reduce_schar; + cilk_c_reducer_opadd_reduce_short; + cilk_c_reducer_opadd_reduce_uchar; + cilk_c_reducer_opadd_reduce_uint; + cilk_c_reducer_opadd_reduce_ulong; + cilk_c_reducer_opadd_reduce_ulonglong; + cilk_c_reducer_opadd_reduce_unsigned; + cilk_c_reducer_opadd_reduce_ushort; + cilk_c_reducer_opadd_reduce_wchar_t; + cilk_c_reducer_opand_identity_char; + cilk_c_reducer_opand_identity_int; + cilk_c_reducer_opand_identity_long; + cilk_c_reducer_opand_identity_longlong; + cilk_c_reducer_opand_identity_schar; + cilk_c_reducer_opand_identity_short; + cilk_c_reducer_opand_identity_uchar; + cilk_c_reducer_opand_identity_uint; + cilk_c_reducer_opand_identity_ulong; + cilk_c_reducer_opand_identity_ulonglong; + cilk_c_reducer_opand_identity_unsigned; + cilk_c_reducer_opand_identity_ushort; + cilk_c_reducer_opand_identity_wchar_t; + cilk_c_reducer_opand_reduce_char; + cilk_c_reducer_opand_reduce_int; + cilk_c_reducer_opand_reduce_long; + cilk_c_reducer_opand_reduce_longlong; + cilk_c_reducer_opand_reduce_schar; + cilk_c_reducer_opand_reduce_short; + cilk_c_reducer_opand_reduce_uchar; + cilk_c_reducer_opand_reduce_uint; + cilk_c_reducer_opand_reduce_ulong; + cilk_c_reducer_opand_reduce_ulonglong; + cilk_c_reducer_opand_reduce_unsigned; + cilk_c_reducer_opand_reduce_ushort; + cilk_c_reducer_opand_reduce_wchar_t; + cilk_c_reducer_opmul_identity_char; + cilk_c_reducer_opmul_identity_double; + cilk_c_reducer_opmul_identity_float; + cilk_c_reducer_opmul_identity_int; + cilk_c_reducer_opmul_identity_long; + cilk_c_reducer_opmul_identity_longdouble; + cilk_c_reducer_opmul_identity_longlong; + cilk_c_reducer_opmul_identity_schar; + cilk_c_reducer_opmul_identity_short; + cilk_c_reducer_opmul_identity_uchar; + cilk_c_reducer_opmul_identity_uint; + cilk_c_reducer_opmul_identity_ulong; + cilk_c_reducer_opmul_identity_ulonglong; + cilk_c_reducer_opmul_identity_unsigned; + 
cilk_c_reducer_opmul_identity_ushort; + cilk_c_reducer_opmul_identity_wchar_t; + cilk_c_reducer_opmul_reduce_char; + cilk_c_reducer_opmul_reduce_double; + cilk_c_reducer_opmul_reduce_float; + cilk_c_reducer_opmul_reduce_int; + cilk_c_reducer_opmul_reduce_long; + cilk_c_reducer_opmul_reduce_longdouble; + cilk_c_reducer_opmul_reduce_longlong; + cilk_c_reducer_opmul_reduce_schar; + cilk_c_reducer_opmul_reduce_short; + cilk_c_reducer_opmul_reduce_uchar; + cilk_c_reducer_opmul_reduce_uint; + cilk_c_reducer_opmul_reduce_ulong; + cilk_c_reducer_opmul_reduce_ulonglong; + cilk_c_reducer_opmul_reduce_unsigned; + cilk_c_reducer_opmul_reduce_ushort; + cilk_c_reducer_opmul_reduce_wchar_t; + cilk_c_reducer_opor_identity_char; + cilk_c_reducer_opor_identity_int; + cilk_c_reducer_opor_identity_long; + cilk_c_reducer_opor_identity_longlong; + cilk_c_reducer_opor_identity_schar; + cilk_c_reducer_opor_identity_short; + cilk_c_reducer_opor_identity_uchar; + cilk_c_reducer_opor_identity_uint; + cilk_c_reducer_opor_identity_ulong; + cilk_c_reducer_opor_identity_ulonglong; + cilk_c_reducer_opor_identity_unsigned; + cilk_c_reducer_opor_identity_ushort; + cilk_c_reducer_opor_identity_wchar_t; + cilk_c_reducer_opor_reduce_char; + cilk_c_reducer_opor_reduce_int; + cilk_c_reducer_opor_reduce_long; + cilk_c_reducer_opor_reduce_longlong; + cilk_c_reducer_opor_reduce_schar; + cilk_c_reducer_opor_reduce_short; + cilk_c_reducer_opor_reduce_uchar; + cilk_c_reducer_opor_reduce_uint; + cilk_c_reducer_opor_reduce_ulong; + cilk_c_reducer_opor_reduce_ulonglong; + cilk_c_reducer_opor_reduce_unsigned; + cilk_c_reducer_opor_reduce_ushort; + cilk_c_reducer_opor_reduce_wchar_t; + cilk_c_reducer_opxor_identity_char; + cilk_c_reducer_opxor_identity_int; + cilk_c_reducer_opxor_identity_long; + cilk_c_reducer_opxor_identity_longlong; + cilk_c_reducer_opxor_identity_schar; + cilk_c_reducer_opxor_identity_short; + cilk_c_reducer_opxor_identity_uchar; + cilk_c_reducer_opxor_identity_uint; + cilk_c_reducer_opxor_identity_ulong; + cilk_c_reducer_opxor_identity_ulonglong; + cilk_c_reducer_opxor_identity_unsigned; + cilk_c_reducer_opxor_identity_ushort; + cilk_c_reducer_opxor_identity_wchar_t; + cilk_c_reducer_opxor_reduce_char; + cilk_c_reducer_opxor_reduce_int; + cilk_c_reducer_opxor_reduce_long; + cilk_c_reducer_opxor_reduce_longlong; + cilk_c_reducer_opxor_reduce_schar; + cilk_c_reducer_opxor_reduce_short; + cilk_c_reducer_opxor_reduce_uchar; + cilk_c_reducer_opxor_reduce_uint; + cilk_c_reducer_opxor_reduce_ulong; + cilk_c_reducer_opxor_reduce_ulonglong; + cilk_c_reducer_opxor_reduce_unsigned; + cilk_c_reducer_opxor_reduce_ushort; + cilk_c_reducer_opxor_reduce_wchar_t; +}; diff --git a/libcilkrts/runtime/local_state.c b/libcilkrts/runtime/local_state.c new file mode 100644 index 00000000000..14ac8271936 --- /dev/null +++ b/libcilkrts/runtime/local_state.c @@ -0,0 +1,68 @@ +/* local_state.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "local_state.h" +#include "bug.h" +#include "full_frame.h" + +void run_scheduling_stack_fcn(__cilkrts_worker *w) +{ + scheduling_stack_fcn_t fcn = w->l->post_suspend; + full_frame *ff2 = w->l->frame_ff; + __cilkrts_stack_frame *sf2 = w->l->suspended_stack; + + w->l->post_suspend = 0; + w->l->suspended_stack = 0; + + // Conceptually, after clearing w->l->frame_ff, + // w no longer owns the full frame ff. + // The next time another (possibly different) worker takes + // ownership of ff will be at a provably_good_steal on ff. + w->l->frame_ff = NULL; + + CILK_ASSERT(fcn); + CILK_ASSERT(ff2); + fcn(w, ff2, sf2); + + // After we run the scheduling stack function, we should + // (still) not have a full frame. + CILK_ASSERT(NULL == w->l->frame_ff); +} + +/* End local_state.c */ diff --git a/libcilkrts/runtime/local_state.h b/libcilkrts/runtime/local_state.h new file mode 100644 index 00000000000..03f39897f51 --- /dev/null +++ b/libcilkrts/runtime/local_state.h @@ -0,0 +1,424 @@ +/* local_state.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file local_state.h + * + * @brief The local_state structure contains additional OS-independent + * information that's associated with a worker, but doesn't need to be visible + * to the code generated by the compiler. + */ + +#ifndef INCLUDED_LOCAL_STATE_DOT_H +#define INCLUDED_LOCAL_STATE_DOT_H + +#include <internal/abi.h> +#include "worker_mutex.h" +#include "global_state.h" +#include "record-replay.h" +#include "signal_node.h" + +#include <setjmp.h> +#include <stddef.h> +#include <stdio.h> + + +#ifndef _WIN32 +# include <pthread.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/* Opaque types. */ + +struct full_frame; +struct free_list; +struct pending_exception_info; +/// Opaque type for replay entry. +typedef struct replay_entry_t replay_entry_t; + +/** + * @brief Magic numbers for local_state, used for debugging + */ +typedef unsigned long long ls_magic_t; + +/** + * @brief Scheduling stack function: A function that is decided on the program stack, + * but that must be executed on the scheduling stack. + */ +typedef void (*scheduling_stack_fcn_t) (__cilkrts_worker *w, + struct full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief Type of this worker. + **/ +typedef enum cilk_worker_type +{ + WORKER_FREE, ///< Unused worker - available to be bound to user threads + WORKER_SYSTEM, ///< Worker created by runtime - able to steal from any worker + WORKER_USER ///< User thread - able to steal only from team members +} cilk_worker_type; + + +/** + * @brief The local_state structure contains additional OS-independent + * information that's associated with a worker, but doesn't need to be + * visible to the compiler. + * + * No compiler-generated code should need to know the layout of this + * structure. + * + * The fields of this struct can be classified as either local or + * shared. + * + * Local: This field is only accessed by the thread bound to this + * worker struct. Local fields can be freely accessed without + * acquiring locks. + * + * Shared: This field may be accessed by multiple worker threads. + * Accesses to shared fields usually requires locks, except in + * special situations where one can prove that locks are + * unnecessary. + * + * The fields of this can also be classified as "read-only" if the + * field does not change after it is initialized. Otherwise, the + * field is "read/write". Read-only fields do not require locks to + * access (ignoring the synchronization that might be needed for + * initialization if this can occur in parallel). + * + * Finally, we explicitly classify some fields as "synchronization" + * fields if they are used as part of a synchronization protocol in + * the runtime. These variables are generally shared and read/write. + * Mostly, this category includes lock variables and other variables + * that are involved in synchronization protocols (i.e., the THE + * protocol). 
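A minimal sketch of the access discipline described above (not part of this commit; lock_worker()/unlock_worker() are hypothetical stand-ins for whatever the runtime actually does with w->l->lock, and the two fields used are the local and shared examples from the struct below):

#include <internal/abi.h>
#include "local_state.h"

/* Hypothetical placeholders for the real worker locking. */
static void lock_worker(__cilkrts_worker *w)   { (void)w; }
static void unlock_worker(__cilkrts_worker *w) { (void)w; }

static void example_field_discipline(__cilkrts_worker *self,
                                     __cilkrts_worker *victim)
{
    /* [local read/write] field: only the bound thread touches it, no lock. */
    self->l->steal_failure_count++;

    /* [shared read/write] field of another worker: hold that worker's
       lock while reading or writing it. */
    lock_worker(victim);
    struct full_frame *ff = victim->l->frame_ff;
    unlock_worker(victim);
    (void)ff;
}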
+ */ +struct local_state /* COMMON_PORTABLE */ +{ + /** This value should be in the first field in any local_state */ +# define WORKER_MAGIC_0 ((ls_magic_t)0xe0831a4a940c60b8ULL) + + /** + * Should be WORKER_MAGIC_0 or the local_state has been corrupted + * This magic field is shared because it is read on lock acquisitions. + * + * [shared read-only] + */ + ls_magic_t worker_magic_0; + + /** + * Mutex used to serialize access to the local_state + * Synchronization field. [shared read/write] + */ + struct mutex lock; + + /** + * Flag that indicates that the worker is interested in grabbing + * LOCK, and thus thieves should leave the worker alone. + * Written only by self, may be read by others. + * + * Synchronization field. [shared read/write] + */ + int do_not_steal; + + /** + * Lock that all thieves grab in order to compete for the right + * to disturb this worker. + * + * Synchronization field. [shared read/write] + */ + struct mutex steal_lock; + + /** + * Full frame that the worker is working on. + * + * While a worker w is executing, a thief may change + * w->l->frame_ff (on a successful steal) after acquiring w's + * lock. + * + * Unlocked accesses to w->l->frame_ff are safe (by w itself) when + * w's deque is empty, or when stealing from w has been disabled. + * + * [shared read/write] + */ + struct full_frame *frame_ff; + + /** + * Full frame that the worker will be working on next + * + * This field is normally local for a worker w. Another worker v + * may modify w->l->next_frame_ff, however, in the special case + * when v is returning a frame to a user thread w since w is the + * team leader. + * + * [shared read/write] + */ + struct full_frame *next_frame_ff; + + /** + * This is set iff this is a WORKER_USER and there has been a steal. It + * points to the first frame that was stolen since the team was last fully + * sync'd. Only this worker may continue past a sync in this function. + * + * This field is set by a thief for a victim that is a user + * thread, while holding the victim's lock. + * It can be cleared without a lock by the worker that will + * continue exuecting past the sync. + * + * [shared read/write] + */ + struct full_frame *last_full_frame; + + /** + * Team on which this worker is a participant. When a user worker enters, + * its team is its own worker struct and it can never change teams. When a + * system worker steals, it adopts the team of its victim. + * + * When a system worker w steals, it reads victim->l->team and + * joins this team. w->l->team is constant until the next time w + * returns control to the runtime. + * We must acquire the worker lock to change w->l->team. + * + * @note This field is 64-byte aligned because it is the first in + * the group of shared read-only fields. We want this group to + * fall on a different cache line from the previous group, which + * is shared read-write. + * + * [shared read-only] + */ + __attribute__((aligned(64))) + __cilkrts_worker *team; + + /** + * Type of this worker + * + * This field changes only when a worker binds or unbinds. + * Otherwise, the field is read-only while the worker is bound. + * + * [shared read-only] + */ + cilk_worker_type type; + + /** + * Lazy task queue of this worker - an array of pointers to stack frames. + * + * Read-only because deques are a fixed size in the current + * implementation. + * + * @note This field is 64-byte aligned because it is the first in + * the group of local fields. 
We want this group to fall on a + * different cache line from the previous group, which is shared + * read-only. + * + * [local read-only] + */ + __attribute__((aligned(64))) + __cilkrts_stack_frame **ltq; + + /** + * Pool of fibers waiting to be reused. + * [local read/write] + */ + cilk_fiber_pool fiber_pool; + + /** + * The fiber for the scheduling stacks. + * [local read/write] + */ + cilk_fiber* scheduling_fiber; + + /** + * Saved pointer to the leaf node in thread-local storage, when a + * user thread is imported. This pointer gets set to a + * meaningful value when binding a user thread, and cleared on + * unbind. + * + * [local read/write] + */ + __cilkrts_pedigree* original_pedigree_leaf; + + /** + * State of the random number generator + * + * [local read/write] + */ + unsigned rand_seed; + + /** + * Function to execute after transferring onto the scheduling stack. + * + * [local read/write] + */ + scheduling_stack_fcn_t post_suspend; + + /** + * __cilkrts_stack_frame we suspended when we transferred onto the + * scheduling stack. + * + * [local read/write] + */ + __cilkrts_stack_frame *suspended_stack; + + /** + * cilk_fiber that should be freed after returning from a + * spawn with a stolen parent or after stalling at a sync. + + * We calculate the stack to free when executing a reduction on + * the user stack, but we can not actually release the stack + * until control longjmps onto a runtime scheduling stack. + * + * This field is used to pass information to the runtime across + * the longjmp onto the scheduling stack. + * + * [local read/write] + */ + cilk_fiber* fiber_to_free; + + /** + * Saved exception object for an exception that is being passed to + * our parent + * + * [local read/write] + */ + struct pending_exception_info *pending_exception; + + /** + * Buckets for the memory allocator + * + * [local read/write] + */ + struct free_list *free_list[FRAME_MALLOC_NBUCKETS]; + + /** + * Potential function for the memory allocator + * + * [local read/write] + */ + size_t bucket_potential[FRAME_MALLOC_NBUCKETS]; + + /** + * Support for statistics + * + * Useful only when CILK_PROFIlE is compiled in. + * [local read/write] + */ + statistics* stats; + + /** + * Count indicates number of failures since last successful steal. This is + * used by the scheduler to reduce contention on shared flags. + * + * [local read/write] + */ + unsigned int steal_failure_count; + + /** + * 1 if work was stolen from another worker. When true, this will flag + * setup_for_execution_pedigree to increment the pedigree when we resume + * execution to match the increment that would have been done on a return + * from a spawn helper. + * + * [local read/write] + */ + int work_stolen; + + /** + * File pointer for record or replay + * Does FILE * work on Windows? + * During record, the file will be opened in write-only mode. + * During replay, the file will be opened in read-only mode. + * + * [local read/write] + */ + FILE *record_replay_fptr; + + /** + * Root of array of replay entries - NULL if we're not replaying a log + * + * [local read/write] + */ + replay_entry_t *replay_list_root; + + /** + * Current replay entry - NULL if we're not replaying a log + * + * [local read/write] + */ + replay_entry_t *replay_list_entry; + + /** + * Separate the signal_node from other things in the local_state by the + * sizeof a cache line for performance reasons. + * + * unused + */ + char buf[64]; + + /** + * Signal object for waking/sleeping the worker. 
This should be a pointer + * to avoid the possibility of caching problems. + * + * [shared read-only] + */ + signal_node_t *signal_node; + + /** This value should be in the last field in any local_state */ +# define WORKER_MAGIC_1 ((ls_magic_t)0x16164afb0ea0dff9ULL) + + /** + * Should be WORKER_MAGIC_1 or the local_state has been corrupted + * This magic field is shared because it is read on lock acquisitions. + * [shared read-only] + */ + ls_magic_t worker_magic_1; +}; + +/** + * Perform cleanup according to the function set before the longjmp(). + * + * Call this after longjmp() has completed and the worker is back on a + * scheduling stack. + * + * @param w __cilkrts_worker currently executing. + */ +void run_scheduling_stack_fcn(__cilkrts_worker *w); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_LOCAL_STATE_DOT_H) diff --git a/libcilkrts/runtime/mac-symbols.txt b/libcilkrts/runtime/mac-symbols.txt new file mode 100644 index 00000000000..38d83a8675d --- /dev/null +++ b/libcilkrts/runtime/mac-symbols.txt @@ -0,0 +1,318 @@ +# Exported symbol list: +___cilkrts_bind_thread +___cilkrts_bind_thread_1 +___cilkrts_bump_loop_rank +___cilkrts_bump_loop_rank_internal +___cilkrts_bump_worker_rank +___cilkrts_bump_worker_rank_internal +___cilkrts_cilk_for_32 +___cilkrts_cilk_for_64 +___cilkrts_debugger_notification +___cilkrts_dump_stats +___cilkrts_end_cilk +___cilkrts_enter_frame +___cilkrts_enter_frame_1 +___cilkrts_enter_frame_fast +___cilkrts_enter_frame_fast_1 +___cilkrts_get_force_reduce +___cilkrts_get_nworkers +___cilkrts_get_pedigree_info +___cilkrts_get_pedigree_internal +___cilkrts_get_sf +___cilkrts_get_stack_size +___cilkrts_get_tls_worker +___cilkrts_get_tls_worker_fast +___cilkrts_get_total_workers +___cilkrts_get_worker_number +___cilkrts_get_worker_rank +___cilkrts_global_state +___cilkrts_hyper_create +___cilkrts_hyper_destroy +___cilkrts_hyper_lookup +___cilkrts_hyperobject_alloc +___cilkrts_hyperobject_dealloc +___cilkrts_hyperobject_noop_destroy +___cilkrts_init +___cilkrts_irml_version +___cilkrts_leave_frame +___cilkrts_metacall +___cilkrts_rethrow +___cilkrts_return_exception +___cilkrts_save_fp_ctrl_state +___cilkrts_set_param +___cilkrts_stack_alloc +___cilkrts_stack_free +___cilkrts_sync +___cilkrts_synched +___cilkrts_watch_stack +___cilkrts_worker_stub +_cilk_c_reducer_max_identity_char +_cilk_c_reducer_max_identity_double +_cilk_c_reducer_max_identity_float +_cilk_c_reducer_max_identity_int +_cilk_c_reducer_max_identity_long +_cilk_c_reducer_max_identity_longdouble +_cilk_c_reducer_max_identity_longlong +_cilk_c_reducer_max_identity_schar +_cilk_c_reducer_max_identity_short +_cilk_c_reducer_max_identity_uchar +_cilk_c_reducer_max_identity_uint +_cilk_c_reducer_max_identity_ulong +_cilk_c_reducer_max_identity_ulonglong +_cilk_c_reducer_max_identity_unsigned +_cilk_c_reducer_max_identity_ushort +_cilk_c_reducer_max_identity_wchar_t +_cilk_c_reducer_max_index_identity_char +_cilk_c_reducer_max_index_identity_double +_cilk_c_reducer_max_index_identity_float +_cilk_c_reducer_max_index_identity_int +_cilk_c_reducer_max_index_identity_long +_cilk_c_reducer_max_index_identity_longdouble +_cilk_c_reducer_max_index_identity_longlong +_cilk_c_reducer_max_index_identity_schar +_cilk_c_reducer_max_index_identity_short +_cilk_c_reducer_max_index_identity_uchar +_cilk_c_reducer_max_index_identity_uint +_cilk_c_reducer_max_index_identity_ulong +_cilk_c_reducer_max_index_identity_ulonglong +_cilk_c_reducer_max_index_identity_unsigned 
+_cilk_c_reducer_max_index_identity_ushort +_cilk_c_reducer_max_index_identity_wchar_t +_cilk_c_reducer_max_index_reduce_char +_cilk_c_reducer_max_index_reduce_double +_cilk_c_reducer_max_index_reduce_float +_cilk_c_reducer_max_index_reduce_int +_cilk_c_reducer_max_index_reduce_long +_cilk_c_reducer_max_index_reduce_longdouble +_cilk_c_reducer_max_index_reduce_longlong +_cilk_c_reducer_max_index_reduce_schar +_cilk_c_reducer_max_index_reduce_short +_cilk_c_reducer_max_index_reduce_uchar +_cilk_c_reducer_max_index_reduce_uint +_cilk_c_reducer_max_index_reduce_ulong +_cilk_c_reducer_max_index_reduce_ulonglong +_cilk_c_reducer_max_index_reduce_unsigned +_cilk_c_reducer_max_index_reduce_ushort +_cilk_c_reducer_max_index_reduce_wchar_t +_cilk_c_reducer_max_reduce_char +_cilk_c_reducer_max_reduce_double +_cilk_c_reducer_max_reduce_float +_cilk_c_reducer_max_reduce_int +_cilk_c_reducer_max_reduce_long +_cilk_c_reducer_max_reduce_longdouble +_cilk_c_reducer_max_reduce_longlong +_cilk_c_reducer_max_reduce_schar +_cilk_c_reducer_max_reduce_short +_cilk_c_reducer_max_reduce_uchar +_cilk_c_reducer_max_reduce_uint +_cilk_c_reducer_max_reduce_ulong +_cilk_c_reducer_max_reduce_ulonglong +_cilk_c_reducer_max_reduce_unsigned +_cilk_c_reducer_max_reduce_ushort +_cilk_c_reducer_max_reduce_wchar_t +_cilk_c_reducer_min_identity_char +_cilk_c_reducer_min_identity_double +_cilk_c_reducer_min_identity_float +_cilk_c_reducer_min_identity_int +_cilk_c_reducer_min_identity_long +_cilk_c_reducer_min_identity_longdouble +_cilk_c_reducer_min_identity_longlong +_cilk_c_reducer_min_identity_schar +_cilk_c_reducer_min_identity_short +_cilk_c_reducer_min_identity_uchar +_cilk_c_reducer_min_identity_uint +_cilk_c_reducer_min_identity_ulong +_cilk_c_reducer_min_identity_ulonglong +_cilk_c_reducer_min_identity_unsigned +_cilk_c_reducer_min_identity_ushort +_cilk_c_reducer_min_identity_wchar_t +_cilk_c_reducer_min_index_identity_char +_cilk_c_reducer_min_index_identity_double +_cilk_c_reducer_min_index_identity_float +_cilk_c_reducer_min_index_identity_int +_cilk_c_reducer_min_index_identity_long +_cilk_c_reducer_min_index_identity_longdouble +_cilk_c_reducer_min_index_identity_longlong +_cilk_c_reducer_min_index_identity_schar +_cilk_c_reducer_min_index_identity_short +_cilk_c_reducer_min_index_identity_uchar +_cilk_c_reducer_min_index_identity_uint +_cilk_c_reducer_min_index_identity_ulong +_cilk_c_reducer_min_index_identity_ulonglong +_cilk_c_reducer_min_index_identity_unsigned +_cilk_c_reducer_min_index_identity_ushort +_cilk_c_reducer_min_index_identity_wchar_t +_cilk_c_reducer_min_index_reduce_char +_cilk_c_reducer_min_index_reduce_double +_cilk_c_reducer_min_index_reduce_float +_cilk_c_reducer_min_index_reduce_int +_cilk_c_reducer_min_index_reduce_long +_cilk_c_reducer_min_index_reduce_longdouble +_cilk_c_reducer_min_index_reduce_longlong +_cilk_c_reducer_min_index_reduce_schar +_cilk_c_reducer_min_index_reduce_short +_cilk_c_reducer_min_index_reduce_uchar +_cilk_c_reducer_min_index_reduce_uint +_cilk_c_reducer_min_index_reduce_ulong +_cilk_c_reducer_min_index_reduce_ulonglong +_cilk_c_reducer_min_index_reduce_unsigned +_cilk_c_reducer_min_index_reduce_ushort +_cilk_c_reducer_min_index_reduce_wchar_t +_cilk_c_reducer_min_reduce_char +_cilk_c_reducer_min_reduce_double +_cilk_c_reducer_min_reduce_float +_cilk_c_reducer_min_reduce_int +_cilk_c_reducer_min_reduce_long +_cilk_c_reducer_min_reduce_longdouble +_cilk_c_reducer_min_reduce_longlong +_cilk_c_reducer_min_reduce_schar +_cilk_c_reducer_min_reduce_short 
+_cilk_c_reducer_min_reduce_uchar +_cilk_c_reducer_min_reduce_uint +_cilk_c_reducer_min_reduce_ulong +_cilk_c_reducer_min_reduce_ulonglong +_cilk_c_reducer_min_reduce_unsigned +_cilk_c_reducer_min_reduce_ushort +_cilk_c_reducer_min_reduce_wchar_t +_cilk_c_reducer_opadd_identity_char +_cilk_c_reducer_opadd_identity_double +_cilk_c_reducer_opadd_identity_float +_cilk_c_reducer_opadd_identity_int +_cilk_c_reducer_opadd_identity_long +_cilk_c_reducer_opadd_identity_longdouble +_cilk_c_reducer_opadd_identity_longlong +_cilk_c_reducer_opadd_identity_schar +_cilk_c_reducer_opadd_identity_short +_cilk_c_reducer_opadd_identity_uchar +_cilk_c_reducer_opadd_identity_uint +_cilk_c_reducer_opadd_identity_ulong +_cilk_c_reducer_opadd_identity_ulonglong +_cilk_c_reducer_opadd_identity_unsigned +_cilk_c_reducer_opadd_identity_ushort +_cilk_c_reducer_opadd_identity_wchar_t +_cilk_c_reducer_opadd_reduce_char +_cilk_c_reducer_opadd_reduce_double +_cilk_c_reducer_opadd_reduce_float +_cilk_c_reducer_opadd_reduce_int +_cilk_c_reducer_opadd_reduce_long +_cilk_c_reducer_opadd_reduce_longdouble +_cilk_c_reducer_opadd_reduce_longlong +_cilk_c_reducer_opadd_reduce_schar +_cilk_c_reducer_opadd_reduce_short +_cilk_c_reducer_opadd_reduce_uchar +_cilk_c_reducer_opadd_reduce_uint +_cilk_c_reducer_opadd_reduce_ulong +_cilk_c_reducer_opadd_reduce_ulonglong +_cilk_c_reducer_opadd_reduce_unsigned +_cilk_c_reducer_opadd_reduce_ushort +_cilk_c_reducer_opadd_reduce_wchar_t +_cilk_c_reducer_opand_identity_char +_cilk_c_reducer_opand_identity_int +_cilk_c_reducer_opand_identity_long +_cilk_c_reducer_opand_identity_longlong +_cilk_c_reducer_opand_identity_schar +_cilk_c_reducer_opand_identity_short +_cilk_c_reducer_opand_identity_uchar +_cilk_c_reducer_opand_identity_uint +_cilk_c_reducer_opand_identity_ulong +_cilk_c_reducer_opand_identity_ulonglong +_cilk_c_reducer_opand_identity_unsigned +_cilk_c_reducer_opand_identity_ushort +_cilk_c_reducer_opand_identity_wchar_t +_cilk_c_reducer_opand_reduce_char +_cilk_c_reducer_opand_reduce_int +_cilk_c_reducer_opand_reduce_long +_cilk_c_reducer_opand_reduce_longlong +_cilk_c_reducer_opand_reduce_schar +_cilk_c_reducer_opand_reduce_short +_cilk_c_reducer_opand_reduce_uchar +_cilk_c_reducer_opand_reduce_uint +_cilk_c_reducer_opand_reduce_ulong +_cilk_c_reducer_opand_reduce_ulonglong +_cilk_c_reducer_opand_reduce_unsigned +_cilk_c_reducer_opand_reduce_ushort +_cilk_c_reducer_opand_reduce_wchar_t +_cilk_c_reducer_opmul_identity_char +_cilk_c_reducer_opmul_identity_double +_cilk_c_reducer_opmul_identity_float +_cilk_c_reducer_opmul_identity_int +_cilk_c_reducer_opmul_identity_long +_cilk_c_reducer_opmul_identity_longdouble +_cilk_c_reducer_opmul_identity_longlong +_cilk_c_reducer_opmul_identity_schar +_cilk_c_reducer_opmul_identity_short +_cilk_c_reducer_opmul_identity_uchar +_cilk_c_reducer_opmul_identity_uint +_cilk_c_reducer_opmul_identity_ulong +_cilk_c_reducer_opmul_identity_ulonglong +_cilk_c_reducer_opmul_identity_unsigned +_cilk_c_reducer_opmul_identity_ushort +_cilk_c_reducer_opmul_identity_wchar_t +_cilk_c_reducer_opmul_reduce_char +_cilk_c_reducer_opmul_reduce_double +_cilk_c_reducer_opmul_reduce_float +_cilk_c_reducer_opmul_reduce_int +_cilk_c_reducer_opmul_reduce_long +_cilk_c_reducer_opmul_reduce_longdouble +_cilk_c_reducer_opmul_reduce_longlong +_cilk_c_reducer_opmul_reduce_schar +_cilk_c_reducer_opmul_reduce_short +_cilk_c_reducer_opmul_reduce_uchar +_cilk_c_reducer_opmul_reduce_uint +_cilk_c_reducer_opmul_reduce_ulong +_cilk_c_reducer_opmul_reduce_ulonglong 
+_cilk_c_reducer_opmul_reduce_unsigned +_cilk_c_reducer_opmul_reduce_ushort +_cilk_c_reducer_opmul_reduce_wchar_t +_cilk_c_reducer_opor_identity_char +_cilk_c_reducer_opor_identity_int +_cilk_c_reducer_opor_identity_long +_cilk_c_reducer_opor_identity_longlong +_cilk_c_reducer_opor_identity_schar +_cilk_c_reducer_opor_identity_short +_cilk_c_reducer_opor_identity_uchar +_cilk_c_reducer_opor_identity_uint +_cilk_c_reducer_opor_identity_ulong +_cilk_c_reducer_opor_identity_ulonglong +_cilk_c_reducer_opor_identity_unsigned +_cilk_c_reducer_opor_identity_ushort +_cilk_c_reducer_opor_identity_wchar_t +_cilk_c_reducer_opor_reduce_char +_cilk_c_reducer_opor_reduce_int +_cilk_c_reducer_opor_reduce_long +_cilk_c_reducer_opor_reduce_longlong +_cilk_c_reducer_opor_reduce_schar +_cilk_c_reducer_opor_reduce_short +_cilk_c_reducer_opor_reduce_uchar +_cilk_c_reducer_opor_reduce_uint +_cilk_c_reducer_opor_reduce_ulong +_cilk_c_reducer_opor_reduce_ulonglong +_cilk_c_reducer_opor_reduce_unsigned +_cilk_c_reducer_opor_reduce_ushort +_cilk_c_reducer_opor_reduce_wchar_t +_cilk_c_reducer_opxor_identity_char +_cilk_c_reducer_opxor_identity_int +_cilk_c_reducer_opxor_identity_long +_cilk_c_reducer_opxor_identity_longlong +_cilk_c_reducer_opxor_identity_schar +_cilk_c_reducer_opxor_identity_short +_cilk_c_reducer_opxor_identity_uchar +_cilk_c_reducer_opxor_identity_uint +_cilk_c_reducer_opxor_identity_ulong +_cilk_c_reducer_opxor_identity_ulonglong +_cilk_c_reducer_opxor_identity_unsigned +_cilk_c_reducer_opxor_identity_ushort +_cilk_c_reducer_opxor_identity_wchar_t +_cilk_c_reducer_opxor_reduce_char +_cilk_c_reducer_opxor_reduce_int +_cilk_c_reducer_opxor_reduce_long +_cilk_c_reducer_opxor_reduce_longlong +_cilk_c_reducer_opxor_reduce_schar +_cilk_c_reducer_opxor_reduce_short +_cilk_c_reducer_opxor_reduce_uchar +_cilk_c_reducer_opxor_reduce_uint +_cilk_c_reducer_opxor_reduce_ulong +_cilk_c_reducer_opxor_reduce_ulonglong +_cilk_c_reducer_opxor_reduce_unsigned +_cilk_c_reducer_opxor_reduce_ushort +_cilk_c_reducer_opxor_reduce_wchar_t diff --git a/libcilkrts/runtime/metacall_impl.c b/libcilkrts/runtime/metacall_impl.c new file mode 100644 index 00000000000..ce1c51a202b --- /dev/null +++ b/libcilkrts/runtime/metacall_impl.c @@ -0,0 +1,167 @@ +/* metacall_impl.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "metacall_impl.h" + +NOINLINE +CILK_API_VOID +__cilkrts_metacall(unsigned int tool, unsigned int code, void *data) +{ +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC + // The metacall type, code and data are packed together into a single + // struct which will be interpreted by the tool. This function is the + // one and only use of a "cilkscreen_metacall" annotation + metacall_data_t d = { tool, code, data }; + + // Note that Inspector uses probe mode, and is implementing the metacall + // interface to force the runtime to run with a single worker. So + // __cilkrts_metacall must use __notify_intrinsic instead of + // __notify_zc_intrinsic + __notify_intrinsic("cilkscreen_metacall", &d); +#endif // ENABLE_NOTIFY_ZC_INTRINSIC +} + +int __cilkrts_running_under_sequential_ptool(void) +{ + static int running_under_sequential_ptool = -1; + volatile char c = ~0; + + // If we haven't been called before, see if we're running under Cilkscreen + // or Cilkview + if (-1 == running_under_sequential_ptool) + { + // metacall #2 writes 0 in C if we are running under + // a p-tools that requires serial execution, and is a + // no-op otherwise + // + // Note that removing the volatile is required to prevent the compiler + // from assuming that the value has not changed + __cilkrts_metacall(METACALL_TOOL_SYSTEM, + HYPER_ZERO_IF_SEQUENTIAL_PTOOL, (void *)&c); + + running_under_sequential_ptool = (0 == c); + } + + return running_under_sequential_ptool; +} + +/* + * __cilkrts_cilkscreen_establish_c_stack + * + * Notify Cilkscreen of the extent of the stack + */ + +void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end) +{ + char *limits[2] = {begin, end}; + + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ESTABLISH_C_STACK, limits); +} + +#ifdef WORKSPAN // Workspan stuff - remove when we're sure what we can drop + +void __cilkview_workspan_start(void) { + __cilkrts_metacall(HYPER_WORKSPAN_START, 0); +} + +void __cilkview_workspan_stop(void) { + __cilkrts_metacall(HYPER_WORKSPAN_STOP, 0); +} + +void __cilkview_workspan_dump(const char *str) { + __cilkrts_metacall(HYPER_WORKSPAN_DUMP, (void*)str); +} + + +void __cilkview_workspan_reset(void) { + __cilkrts_metacall(HYPER_WORKSPAN_RESET, 0); +} + + +void __cilkview_use_default_grain(void) { + __cilkrts_metacall(HYPER_USE_DEFAULT_GRAIN, 0); +} + +void __cilkview_get_workspan_data(unsigned long long *values, int size) +{ + void *data[2]; + + /* reset counters to zero in case we are not running under + a p-tool */ + + values[0] = 0; + + data[0] = (void*) values; + data[1] = (void*) &size; + __cilkrts_metacall(HYPER_WORKSPAN_QUERY, &data); +} + +void __cilkview_workspan_connected (int *flag) { + *flag = 0; + __cilkrts_metacall(HYPER_WORKSPAN_CONNECTED, (void *)flag); +} + +void __cilkview_workspan_suspend() { + __cilkrts_metacall(HYPER_WORKSPAN_SUSPEND, 0); +} + +void __cilkview_workspan_resume() { + __cilkrts_metacall(HYPER_WORKSPAN_RESUME, 
0); +} + +/* depreciated interfaces */ +void __cilkometer_workspan_start(void) { + __cilkrts_metacall(HYPER_WORKSPAN_START, 0); +} + +void __cilkometer_workspan_stop(void) { + __cilkrts_metacall(HYPER_WORKSPAN_STOP, 0); +} + +void __cilkometer_workspan_dump(const char *str) { + __cilkrts_metacall(HYPER_WORKSPAN_DUMP, (void*)str); +} + + +void __cilkometer_workspan_reset(void) { + __cilkrts_metacall(HYPER_WORKSPAN_RESET, 0); +} + +#endif // WORKSPAN + +/* End metacall_impl.c */ diff --git a/libcilkrts/runtime/metacall_impl.h b/libcilkrts/runtime/metacall_impl.h new file mode 100644 index 00000000000..90cc7f95168 --- /dev/null +++ b/libcilkrts/runtime/metacall_impl.h @@ -0,0 +1,123 @@ +/* metacall_impl.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file metacall_impl.h + * + * @brief Meta-function calls to be used within the Cilk runtime system. + * + * These differ from the macros in cilkscreen.h and cilkview.h because they go + * through the __cilkrts_metacall interface, which ensures that the operation + * is performed even when instrumentation is disabled. + */ + +#ifndef INCLUDED_CILKRTS_METACALL_H +#define INCLUDED_CILKRTS_METACALL_H + +#include "rts-common.h" +#include <internal/metacall.h> +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** + * This function is effectively an unconditional call from the runtime into + * a tool. It is used for operations that must be performed by the tool, + * even when the tool is not instrumenting. For example, Cilkscreen always + * recognizes the address of this function and performs the action specified + * in the contained metadata. + * + * Note that this function MUST NOT BE INLINED within the runtime. This must + * be the ONLY instance of the cilkscreen_metacall metadata. 
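As a usage sketch (hypothetical, not from this commit), runtime code can bracket work that tools should ignore with the enable/disable macros declared later in this header; both expand to __cilkrts_metacall() calls, so they take effect even when instrumentation is otherwise disabled:

#include <stddef.h>
#include <string.h>
#include "metacall_impl.h"

/* Hypothetical helper: hide internal buffer scrubbing from Cilkscreen. */
static void scrub_internal_buffer(void *p, size_t n)
{
    __cilkrts_cilkscreen_disable_instrumentation();
    memset(p, 0, n);   /* races on this buffer are not worth reporting */
    __cilkrts_cilkscreen_enable_instrumentation();
}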
+ */ +CILK_API_VOID +__cilkrts_metacall(unsigned int tool, unsigned int code, void *data); + +/** + * Return non-zero if running under Cilkscreen or Cilkview + */ +COMMON_PORTABLE +int __cilkrts_running_under_sequential_ptool(void); + +/** + * Disable Cilkscreen implementation + */ +#define __cilkrts_cilkscreen_disable_instrumentation() \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_DISABLE_INSTRUMENTATION, 0) + +/** + * Enable Cilkscreen implementation + */ +#define __cilkrts_cilkscreen_enable_instrumentation() \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ENABLE_INSTRUMENTATION, 0) + +/** + * Set the worker on entering runtime. + * + * @attention Deprecated in favor of __cilkrts_cilkscreen_ignore_block. The + * begin/enter pairs in the current metadata mean Cilkscreen no longer has to + * have improper knowledge of the __cilkrts_worker or __cilkrts_stack_frame + * structures. + */ +#define __cilkrts_cilkscreen_establish_worker(w) \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ESTABLISH_WORKER, w) + +/** + * Notify Cilkscreen of the extent of the stack. + * + * @param[in] begin Start (low address) of stack + * @param[in] end One past high address of stack + */ +void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end); + +/** + * Tell tools to ignore a block of memory - currently the global state and + * memory allocated for workers. + */ +#define __cilkrts_cilkscreen_ignore_block(_begin, _end) \ +{ \ + void *block[2] = {_begin, _end}; \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, \ + HYPER_IGNORE_MEMORY_BLOCK, \ + block); \ +} + +__CILKRTS_END_EXTERN_C + +#endif /* ! defined(INCLUDED_CILKRTS_METACALL_H) */ diff --git a/libcilkrts/runtime/os-unix.c b/libcilkrts/runtime/os-unix.c new file mode 100644 index 00000000000..b48fd623c6e --- /dev/null +++ b/libcilkrts/runtime/os-unix.c @@ -0,0 +1,508 @@ +/* os-unix.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifdef __linux__ + // define _GNU_SOURCE before *any* #include. + // Even <stdint.h> will break later #includes if this macro is not + // already defined when it is #included. +# define _GNU_SOURCE +#endif + +#include "os.h" +#include "bug.h" +#include "cilk_malloc.h" +#include <internal/abi.h> + +#if defined __linux__ +# include <sys/sysinfo.h> +# include <sys/syscall.h> +#elif defined __APPLE__ +# include <sys/sysctl.h> + // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output +#elif defined __FreeBSD__ +// No additional include files +#elif defined __CYGWIN__ +// Cygwin on Windows - no additional include files +#elif defined __VXWORKS__ +# include <vxWorks.h> +# include <vxCpuLib.h> +# include <taskLib.h> +#else +# error "Unsupported OS" +#endif + +#include <stdarg.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/types.h> + + + +// /* Thread-local storage */ +// #ifdef _WIN32 +// typedef unsigned cilkos_tls_key_t; +// #else +// typedef pthread_key_t cilkos_tls_key_t; +// #endif +// cilkos_tls_key_t cilkos_allocate_tls_key(); +// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); +// void* cilkos_get_tls_pointer(cilkos_tls_key_t key); + +#if !defined CILK_WORKER_TLS +static int cilk_keys_defined; +static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key; + +#if SUPPORT_GET_CURRENT_FIBER > 0 +static pthread_key_t fiber_key; +#endif + +static void *serial_worker; + + +// This destructor is called when a pthread dies to deallocate the +// pedigree node. +static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr) +{ + __cilkrts_pedigree* pedigree_tls + = (__cilkrts_pedigree*)pedigree_tls_ptr; + if (pedigree_tls) { + // Assert that we have either one or two nodes + // left in the pedigree chain. + // If we have more, then something is going wrong... + CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent); + __cilkrts_free(pedigree_tls); + } +} + +void __cilkrts_init_tls_variables(void) +{ + int status; + /* This will be called once in serial execution before any + Cilk parallelism so we do not need to worry about races + on cilk_keys_defined. 
*/ + if (cilk_keys_defined) + return; + status = pthread_key_create(&worker_key, NULL); + CILK_ASSERT (status == 0); + status = pthread_key_create(&pedigree_leaf_key, + __cilkrts_pedigree_leaf_destructor); + CILK_ASSERT (status == 0); + status = pthread_key_create(&tbb_interop_key, NULL); + CILK_ASSERT (status == 0); + +#if SUPPORT_GET_CURRENT_FIBER > 0 + status = pthread_key_create(&fiber_key, NULL); + CILK_ASSERT (status == 0); +#endif + cilk_keys_defined = 1; + return; +} + +COMMON_SYSDEP +void* cilkos_get_current_thread_id(void) +{ + return (void*)pthread_self(); +} + + +CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker() +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (__cilkrts_worker *)pthread_getspecific(worker_key); + else + return serial_worker; + +} + +CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast() +{ + return (__cilkrts_worker *)pthread_getspecific(worker_key); +} + +COMMON_SYSDEP +__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void) +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (__cilk_tbb_stack_op_thunk *) + pthread_getspecific(tbb_interop_key); + else + return 0; +} + +// This counter should be updated atomically. +static int __cilkrts_global_pedigree_tls_counter = -1; + +COMMON_SYSDEP +__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new) +{ + __cilkrts_pedigree *pedigree_tls; + if (__builtin_expect(cilk_keys_defined, 1)) { + pedigree_tls = + (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key); + } + else { + return 0; + } + + if (!pedigree_tls && create_new) { + // This call creates two nodes, X and Y. + // X == pedigree_tls[0] is the leaf node, which gets copied + // in and out of a user worker w when w binds and unbinds. + // Y == pedigree_tls[1] is the root node, + // which is a constant node that represents the user worker + // thread w. + pedigree_tls = (__cilkrts_pedigree*) + __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree)); + + // This call sets the TLS pointer to the new node. + __cilkrts_set_tls_pedigree_leaf(pedigree_tls); + + pedigree_tls[0].rank = 0; + pedigree_tls[0].parent = &pedigree_tls[1]; + + // Create Y, whose rank begins as the global counter value. 
+ pedigree_tls[1].rank = + __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1); + + pedigree_tls[1].parent = NULL; + CILK_ASSERT(pedigree_tls[1].rank != -1); + } + return pedigree_tls; +} + +#if SUPPORT_GET_CURRENT_FIBER > 0 +COMMON_SYSDEP +cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void) +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key); + else + return NULL; +} +#endif + +COMMON_SYSDEP +void __cilkrts_set_tls_worker(__cilkrts_worker *w) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(worker_key, w); + CILK_ASSERT (status == 0); + return; + } + else + { + serial_worker = w; + } +} + +COMMON_SYSDEP +void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(tbb_interop_key, t); + CILK_ASSERT (status == 0); + return; + } + abort(); +} + +COMMON_SYSDEP +void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf); + CILK_ASSERT (status == 0); + return; + } + abort(); +} + +#if SUPPORT_GET_CURRENT_FIBER > 0 +COMMON_SYSDEP +void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(fiber_key, fiber); + CILK_ASSERT (status == 0); + return; + } + abort(); +} +#endif + +#else +void __cilkrts_init_tls_variables(void) +{ +} +#endif + +#if defined (__linux__) && ! defined(ANDROID) +/* + * Get the thread id, rather than the pid. In the case of MIC offload, it's + * possible that we have multiple threads entering Cilk, and each has a + * different affinity. + */ +static pid_t linux_gettid(void) +{ + return syscall(SYS_gettid); +} + +/* + * On Linux we look at the thread affinity mask and restrict ourself to one + * thread for each of the hardware contexts to which we are bound. + * Therefore if user does + * % taskset 0-1 cilkProgram + * # restrict execution to hardware contexts zero and one + * the Cilk program will only use two threads even if it is running on a + * machine that has 32 hardware contexts. + * This is the right thing to do, because the threads are restricted to two + * hardware contexts by the affinity mask set by taskset, and if we were to + * create extra threads they would simply oversubscribe the hardware resources + * we can use. + * This is particularly important on MIC in offload mode, where the affinity + * mask is set by the offload library to force the offload code away from + * cores that have offload support threads running on them. + */ +static int linux_get_affinity_count (int tid) +{ + cpu_set_t process_mask; + + // Extract the thread affinity mask + int err = sched_getaffinity (tid, sizeof(process_mask),&process_mask); + + if (0 != err) + { + return 0; + } + + // We have extracted the mask OK, so now we can count the number of threads + // in it. This is linear in the maximum number of CPUs available, We + // could do a logarithmic version, if we assume the format of the mask, + // but it's not really worth it. We only call this at thread startup + // anyway. 
+ int available_procs = 0; + int i; + for (i = 0; i < CPU_SETSIZE; i++) + { + if (CPU_ISSET(i, &process_mask)) + { + available_procs++; + } + } + + return available_procs; +} +#endif + +/* + * __cilkrts_hardware_cpu_count + * + * Returns the number of available CPUs on this hardware. This is architecture- + * specific. + */ + +COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void) +{ +#if defined ANDROID + return sysconf (_SC_NPROCESSORS_ONLN); +#elif defined __MIC__ + /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial + /// on KNC. Also, ignore the last core. + int P = sysconf (_SC_NPROCESSORS_ONLN); + return P/2 - 2; +#elif defined __linux__ + int affinity_count = linux_get_affinity_count(linux_gettid()); + + return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN); +#elif defined __APPLE__ + int count = 0; + int cmd[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof count; + int status = sysctl(cmd, 2, &count, &len, 0, 0); + assert(status >= 0); + assert((unsigned)count == count); + + return count; +#elif defined __FreeBSD__ || defined __CYGWIN__ + int ncores = sysconf(_SC_NPROCESSORS_ONLN); + + return ncores; + // Just get the number of processors +// return sysconf(_SC_NPROCESSORS_ONLN); +#elif defined __VXWORKS__ + return __builtin_popcount( vxCpuEnabledGet() ); +#else +#error "Unknown architecture" +#endif +} + +COMMON_SYSDEP void __cilkrts_sleep(void) +{ +#ifdef __VXWORKS__ + taskDelay(1); +#else + usleep(1); +#endif +} + +COMMON_SYSDEP void __cilkrts_yield(void) +{ +#if __APPLE__ || __FreeBSD__ || __VXWORKS__ + // On MacOS, call sched_yield to yield quantum. I'm not sure why we + // don't do this on Linux also. + sched_yield(); +#elif defined(__MIC__) + // On MIC, pthread_yield() really trashes things. Arch's measurements + // showed that calling _mm_delay_32() (or doing nothing) was a better + // option. Delaying 1024 clock cycles is a reasonable compromise between + // giving up the processor and latency starting up when work becomes + // available + _mm_delay_32(1024); +#elif defined(ANDROID) + // On Android, call sched_yield to yield quantum. I'm not sure why we + // don't do this on Linux also. + sched_yield(); +#else + // On Linux, call pthread_yield (which in turn will call sched_yield) + // to yield quantum. + pthread_yield(); +#endif +} + +COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, + const char* varname) +{ + CILK_ASSERT(value); + CILK_ASSERT(varname); + + const char* envstr = getenv(varname); + if (envstr) + { + size_t len = strlen(envstr); + if (len > vallen - 1) + return len + 1; + + strcpy(value, envstr); + return len; + } + else + { + value[0] = '\0'; + return 0; + } +} + +/* + * Unrecoverable error: Print an error message and abort execution. + */ +COMMON_SYSDEP void cilkos_error(const char *fmt, ...) +{ + va_list l; + fflush(NULL); + fprintf(stderr, "Cilk error: "); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fprintf(stderr, "Exiting.\n"); + fflush(stderr); + + abort(); +} + +/* + * Print a warning message and return. + */ +COMMON_SYSDEP void cilkos_warning(const char *fmt, ...) 
+{ + va_list l; + fflush(NULL); + fprintf(stderr, "Cilk warning: "); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fflush(stderr); +} + +static void __attribute__((constructor)) init_once() +{ + /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/ + __cilkrts_init_tls_variables(); +} + + +#define PAGE 4096 +#define CILK_MIN_STACK_SIZE (4*PAGE) +// Default size for the stacks that we create in Cilk for Unix. +#define CILK_DEFAULT_STACK_SIZE 0x100000 + +/* + * Convert the user's specified stack size into a "reasonable" value + * for this OS. + */ +size_t cilkos_validate_stack_size(size_t specified_stack_size) { + // Convert any negative value to the default. + if (specified_stack_size == 0) { + CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0); + return CILK_DEFAULT_STACK_SIZE; + } + // Round values in between 0 and CILK_MIN_STACK_SIZE up to + // CILK_MIN_STACK_SIZE. + if (specified_stack_size <= CILK_MIN_STACK_SIZE) { + return CILK_MIN_STACK_SIZE; + } + if ((specified_stack_size % PAGE) > 0) { + // Round the user's stack size value up to nearest page boundary. + return (PAGE * (1 + specified_stack_size / PAGE)); + } + return specified_stack_size; +} + +long cilkos_atomic_add(volatile long* p, long x) +{ + return __sync_add_and_fetch(p, x); +} + +/* End os-unix.c */ diff --git a/libcilkrts/runtime/os.h b/libcilkrts/runtime/os.h new file mode 100644 index 00000000000..8066f0313c2 --- /dev/null +++ b/libcilkrts/runtime/os.h @@ -0,0 +1,236 @@ +/* os.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file os.h + * + * @brief Low-level operating-system dependent facilities, not dependent on + * any Cilk facilities. 
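A short worked example of the stack-size rounding implemented in os-unix.c above, using the constants defined there (PAGE of 4096, a minimum of four pages, a default of 0x100000); this is a sanity check sketch, not part of the commit:

#include <assert.h>
#include <stddef.h>

size_t cilkos_validate_stack_size(size_t specified_stack_size); /* from os-unix.c */

static void check_stack_size_rounding(void)
{
    assert(cilkos_validate_stack_size(0)     == 0x100000); /* 0 selects the default */
    assert(cilkos_validate_stack_size(1)     == 4 * 4096); /* clamped up to the minimum */
    assert(cilkos_validate_stack_size(20000) == 5 * 4096); /* rounded up to a page boundary */
    assert(cilkos_validate_stack_size(32768) == 32768);    /* already page-aligned */
}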
+ */ + +#ifndef INCLUDED_OS_DOT_H +#define INCLUDED_OS_DOT_H + +#include "rts-common.h" +#include "cilk/common.h" +#include "cilk-tbb-interop.h" + +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + + +// /* Thread-local storage */ +// #ifdef _WIN32 +// typedef unsigned cilkos_tls_key_t; +// #else +// typedef pthread_key_t cilkos_tls_key_t; +// #endif +// cilkos_tls_key_t cilkos_allocate_tls_key(); +// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); +// void* cilkos_get_tls_pointer(cilkos_tls_key_t key); + +/* The RTS assumes that some thread-local state exists that stores the + worker and reducer map currently associated with a thread. These routines + manipulate this state. */ + +/** @brief Thread-local state for cilk fibers. */ +typedef struct cilk_fiber_sysdep cilk_fiber_sysdep; + +/** @brief Initialize all TLS variables for Cilk. */ +COMMON_SYSDEP void __cilkrts_init_tls_variables(void); + +/** @brief Set worker struct in TLS. */ +COMMON_SYSDEP +void __cilkrts_set_tls_worker(__cilkrts_worker *w) cilk_nothrow; + +/** @brief Get stack_op for TBB-interop structures from TLS. */ +COMMON_SYSDEP +__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void); + +/** @brief Set stack_op for TBB-interop structures in TLS. */ +COMMON_SYSDEP +void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t); + +/** + * @brief Get the pointer to the pedigree leaf node from TLS. + * + * Function to get a pointer to the thread's pedigree leaf node. This + * pointer can be NULL. + */ +COMMON_SYSDEP +__cilkrts_pedigree * __cilkrts_get_tls_pedigree_leaf(int create_new); + +/** + * @brief Sets the pointer to the pedigree leaf node in TLS. + * + * If the previous pointer value was not NULL, it is the caller's + * responsibility to ensure that previous pointer value is saved and + * freed. + * + * @param pedigree_leaf The leaf node to store into TLS. + */ +COMMON_SYSDEP +void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf); + + +#if SUPPORT_GET_CURRENT_FIBER > 0 +/** + * @brief Get the cilk_fiber from TLS. + */ +COMMON_SYSDEP +cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void); + +/** + * @brief Set the cilk_fiber in TLS. + * + * @param fiber The fiber to store into TLS. + */ +COMMON_SYSDEP +void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber); +#endif + +/** + * @brief Function for returning the current thread id. + * @warning This function is useful for debugging purposes only. + */ +COMMON_SYSDEP +void* cilkos_get_current_thread_id(void); + +/** @brief Return number of CPUs supported by this hardware, using whatever definition + of CPU is considered appropriate. */ +COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void); + +/** @brief Get current value of timer */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void); + +/* Machine instructions */ + +/// Stall execution for a few cycles. +COMMON_SYSDEP void __cilkrts_short_pause(void); +/// Wrapper for xchg instruction +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x); + +// Defines __cilkrts_fence - A macro for x86, a function call for other +// architectures +#include "os-fence.h" + +COMMON_SYSDEP void __cilkrts_sleep(void); ///< Sleep briefly +COMMON_SYSDEP void __cilkrts_yield(void); ///< Yield quantum + +/** + * @brief Gets environment variable 'varname' and copy its value into 'value'. + * + * If the entire value, including the null terminator fits into 'vallen' + * bytes, then returns the length of the value excluding the null. 
Otherwise, + * leaves the contents of 'value' undefined and returns the number of + * characters needed to store the environment variable's value, *including* + * the null terminator. + * + * @param value Buffer to store value. + * @param vallen Length of value buffer + * @param varname Name of the environment variable. + * @return Length of value buffer (excluding the null). + */ +COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, + const char* varname); + +/** + * @brief Unrecoverable error: Print an error message and abort execution. + */ +COMMON_SYSDEP void cilkos_error(const char *fmt, ...); + +/** + * @brief Print a warning message and return. + */ +COMMON_SYSDEP void cilkos_warning(const char *fmt, ...); + +/** + * @brief Convert the user's specified stack size into a "reasonable" + * value for the current OS. + * + * @param specified_stack_size User-specified stack size. + * @return New stack size value, modified for the OS. + */ +COMMON_SYSDEP size_t cilkos_validate_stack_size(size_t specified_stack_size); + +/** + * @brief Atomic addition: computes *p += x. + * + * @param p Pointer to value to update + * @param x Value of x. + */ +COMMON_SYSDEP long cilkos_atomic_add(volatile long* p, long x); + +#ifdef _WIN32 + +/** + * @brief Windows-only low-level functions for processor groups. + */ +typedef struct _GROUP_AFFINITY GROUP_AFFINITY; + +/** + * @brief Probe the executing OS to see if it supports processor + * groups. These functions are expected to be available in Windows 7 + * or later. + */ +void win_init_processor_groups(void); + +unsigned long win_get_active_processor_count(unsigned short GroupNumber); +unsigned short win_get_active_processor_group_count(void); +int win_set_thread_group_affinity(/*HANDLE*/ void* hThread, + const GROUP_AFFINITY *GroupAffinity, + GROUP_AFFINITY* PreviousGroupAffinity); + +/** + * @brief Cleans up any state allocated in TLS. + * + * Only defined for Windows because Linux calls destructors for each + * thread-local variable. + */ +void __cilkrts_per_thread_tls_cleanup(void); + +#endif // _WIN32 + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_OS_DOT_H) diff --git a/libcilkrts/runtime/os_mutex-unix.c b/libcilkrts/runtime/os_mutex-unix.c new file mode 100644 index 00000000000..af398cdd089 --- /dev/null +++ b/libcilkrts/runtime/os_mutex-unix.c @@ -0,0 +1,193 @@ +/* os_mutex-unix.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "os_mutex.h" +#include "bug.h" + +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +// contains notification macros for VTune. +#include "cilk-ittnotify.h" + +/* + * OS Mutex functions. + * + * Not to be confused with the spinlock mutexes implemented in cilk_mutex.c + */ + +struct os_mutex { + pthread_mutex_t mutex; ///< On Linux, os_mutex is implemented with a pthreads mutex +}; + +// Unix implementation of the global OS mutex. This will be created by the +// first call to global_os_mutex_lock() and *NEVER* destroyed. On gcc-based +// systems there's no way to guarantee the ordering of constructors and +// destructors, so we can't be guaranteed that our destructor for a static +// object will be called *after* any static destructors that may use Cilk +// in the user's application +static struct os_mutex *global_os_mutex = NULL; + +/* Sometimes during shared library load malloc doesn't work. + To handle that case, preallocate space for one mutex. 
*/ +static struct os_mutex static_mutex; +static int static_mutex_used; + +struct os_mutex *__cilkrts_os_mutex_create(void) +{ + int status; + struct os_mutex *mutex = (struct os_mutex *)malloc(sizeof(struct os_mutex)); + pthread_mutexattr_t attr; + + ITT_SYNC_CREATE(mutex, "OS Mutex"); + + if (!mutex) { + if (static_mutex_used) { + __cilkrts_bug("Cilk RTS library initialization failed"); + } else { + static_mutex_used = 1; + mutex = &static_mutex; + } + } + + status = pthread_mutexattr_init(&attr); + CILK_ASSERT (status == 0); +#if defined DEBUG || CILK_LIB_DEBUG +#ifdef PTHREAD_MUTEX_ERRORCHECK + status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); +#else + status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); +#endif + CILK_ASSERT (status == 0); +#endif + status = pthread_mutex_init (&mutex->mutex, &attr); + CILK_ASSERT (status == 0); + pthread_mutexattr_destroy(&attr); + + return mutex; +} + +void __cilkrts_os_mutex_lock(struct os_mutex *p) +{ + int status; + status = pthread_mutex_lock (&p->mutex); + ITT_SYNC_ACQUIRED(p); + if (__builtin_expect(status, 0) == 0) + return; + if (status == EDEADLK) + __cilkrts_bug("Cilk runtime error: deadlock acquiring mutex %p\n", + p); + else + __cilkrts_bug("Cilk runtime error %d acquiring mutex %p\n", + status, p); +} + +int __cilkrts_os_mutex_trylock(struct os_mutex *p) +{ + int status; + status = pthread_mutex_trylock (&p->mutex); + return (status == 0); +} + +void __cilkrts_os_mutex_unlock(struct os_mutex *p) +{ + int status; + ITT_SYNC_RELEASING(p); + status = pthread_mutex_unlock (&p->mutex); + CILK_ASSERT(status == 0); +} + +void __cilkrts_os_mutex_destroy(struct os_mutex *p) +{ + pthread_mutex_destroy (&p->mutex); + if (p == &static_mutex) { + static_mutex_used = 0; + } else { + free(p); + } +} + +/* + * create_global_os_mutex + * + * Function used with pthread_once to initialize the global OS mutex. Since + * pthread_once requires a function which takes no parameters and has no + * return value, the global OS mutex will be stored in the static (global + * to the compilation unit) variable "global_os_mutex." + * + * + * global_os_mutex will never be destroyed. + */ +static void create_global_os_mutex(void) +{ + CILK_ASSERT(NULL == global_os_mutex); + global_os_mutex = __cilkrts_os_mutex_create(); +} + +void global_os_mutex_lock(void) +{ + // pthread_once_t used with pthread_once to guarantee that + // create_global_os_mutex() is only called once + static pthread_once_t global_os_mutex_is_initialized = PTHREAD_ONCE_INIT; + + // Execute create_global_os_mutex once in a thread-safe manner + // Note that create_global_os_mutex returns the mutex in the static + // (global to the module) variable "global_os_mutex" + pthread_once(&global_os_mutex_is_initialized, + create_global_os_mutex); + + // We'd better have allocated a global_os_mutex + CILK_ASSERT(NULL != global_os_mutex); + + // Acquire the global OS mutex + __cilkrts_os_mutex_lock(global_os_mutex); +} + +void global_os_mutex_unlock(void) +{ + // We'd better have allocated a global_os_mutex. This means you should + // have called global_os_mutex_lock() before calling + // global_os_mutex_unlock(), but this is the only check for it. 
+ CILK_ASSERT(NULL != global_os_mutex); + + // Release the global OS mutex + __cilkrts_os_mutex_unlock(global_os_mutex); +} + +/* End os_mutex-unix.c */ diff --git a/libcilkrts/runtime/os_mutex.h b/libcilkrts/runtime/os_mutex.h new file mode 100644 index 00000000000..71d9eb14e51 --- /dev/null +++ b/libcilkrts/runtime/os_mutex.h @@ -0,0 +1,135 @@ +/* os_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file os_mutex.h + * + * @brief Portable interface to operating-system mutexes. + * + * Do not confuse os_mutex with Cilk runtime-specific spinlock mutexes. + */ + +#ifndef INCLUDED_OS_MUTEX_DOT_H +#define INCLUDED_OS_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/// Opaque type +typedef struct os_mutex os_mutex; + +/** + * Allocate and initialize an os_mutex + * + * @return A pointer to the initialized os_mutex + */ +COMMON_SYSDEP os_mutex* __cilkrts_os_mutex_create(void); + +/** + * Acquire the os_mutex for exclusive use + * + * @param m The os_mutex that is to be acquired. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_lock(os_mutex *m); + +/** + * Try to acquire the os_mutex. + * + * @param m The os_mutex to try to acquire + * @return 0 if the lock acquire failed + * @return nonzero if the lock was acquired + */ +COMMON_SYSDEP int __cilkrts_os_mutex_trylock(os_mutex *m); + +/** + * Release the os_mutex + * + * @param m The os_mutex that is to be released. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_unlock(os_mutex *m); + +/** + * Release any resources and deallocate the os_mutex. + * + * @param m The os_mutex that is to be deallocated. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_destroy(os_mutex *m); + +/** + * Acquire the global os_mutex for exclusive use. 
The global os_mutex + * will be initialized the first time this function is called in a + * thread-safe manner. + */ +COMMON_SYSDEP void global_os_mutex_lock(); + +/** + * Release the global os_mutex. global_os_mutex_lock() must have been + * called first. + */ +COMMON_SYSDEP void global_os_mutex_unlock(); + + +#ifdef _MSC_VER + +/** + * @brief Create the global OS mutex - Windows only. + * + * On Windows we use DllMain() to create the global OS mutex when cilkrts20.dll + * is loaded. As opposed to Linux/MacOS where we use pthread_once to implement + * a singleton since there are no guarantees about constructor or destructor + * ordering between shared objects. + */ +NON_COMMON void global_os_mutex_create(); + +/** + * @brief Destroy the global OS mutex - Windows only + * + * On Windows we use DllMain() to destroy the global OS mutex when + * cilkrts20.dll is unloaded. As opposed to Linux/MacOS where we cannot + * know when it's safe to destroy the global OS mutex since there are no + * guarantees about constructor or destructor ordering. + */ +NON_COMMON void global_os_mutex_destroy(); + +#endif // _MSC_VER + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_OS_MUTEX_DOT_H) diff --git a/libcilkrts/runtime/pedigrees.c b/libcilkrts/runtime/pedigrees.c new file mode 100644 index 00000000000..dee4d9cb411 --- /dev/null +++ b/libcilkrts/runtime/pedigrees.c @@ -0,0 +1,112 @@ +/* pedigrees.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2007-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "pedigrees.h" +#include "local_state.h" + +/************************************************************* + Pedigree API code. 
+*************************************************************/ + +/* + * C99 requires that every inline function with external linkage have one + * extern declaration in the program (with the inline definition in scope). + */ +COMMON_PORTABLE +extern void update_pedigree_on_leave_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +void __cilkrts_set_pedigree_leaf(__cilkrts_pedigree *leaf) +{ + __cilkrts_set_tls_pedigree_leaf(leaf); +} + +void load_pedigree_leaf_into_user_worker(__cilkrts_worker *w) +{ + __cilkrts_pedigree *pedigree_leaf; + CILK_ASSERT(w->l->type == WORKER_USER); + pedigree_leaf = __cilkrts_get_tls_pedigree_leaf(1); + w->pedigree = *pedigree_leaf; + + // Save a pointer to the old leaf. + // We'll need to restore it later. + CILK_ASSERT(w->l->original_pedigree_leaf == NULL); + w->l->original_pedigree_leaf = pedigree_leaf; + + __cilkrts_set_tls_pedigree_leaf(&w->pedigree); + + // Check that this new pedigree root has at least two values. + CILK_ASSERT(w->pedigree.parent); + CILK_ASSERT(w->pedigree.parent->parent == NULL); +} + +void save_pedigree_leaf_from_user_worker(__cilkrts_worker *w) +{ + CILK_ASSERT(w->l->type == WORKER_USER); + + // Existing leaf in tls should be for the current worker. + // This assert is expensive to check though. + // CILK_ASSERT(&w->pedigree == __cilkrts_get_tls_pedigree_leaf(0)); + CILK_ASSERT(w->l->original_pedigree_leaf); + + // w should finish with a pedigree node that points to + // the same root that we just looked up. + + // TODO: This assert should be valid. + // But we are removing it now to make exceptions (without pedigrees) work. + // Currently, reading the pedigree after an exception is caught + // fails because the pedigree chain not restored correctly. + // CILK_ASSERT(w->l->original_pedigree_leaf->next == w->pedigree.parent); + w->l->original_pedigree_leaf->rank = w->pedigree.rank; + + // Save that leaf pointer back into tls. + __cilkrts_set_tls_pedigree_leaf(w->l->original_pedigree_leaf); + // Null out worker's leaf for paranoia. + w->l->original_pedigree_leaf = NULL; +} + + + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/pedigrees.h b/libcilkrts/runtime/pedigrees.h new file mode 100644 index 00000000000..3f6ebb977f9 --- /dev/null +++ b/libcilkrts/runtime/pedigrees.h @@ -0,0 +1,130 @@ +/* pedigrees.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_PEDIGREES_DOT_H +#define INCLUDED_PEDIGREES_DOT_H + + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "rts-common.h" +#include "global_state.h" +#include "os.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * @file pedigrees.h + * + * @brief pedigrees.h declares common routines related to pedigrees + * and the pedigree API. + */ + + +/** + * @brief Sets the leaf pedigree node for the current user thread. + * + * A typical implementation stores this pedigree node in thread-local + * storage. + * + * Preconditions: + * - Current thread should be a user thread. + * + * @param leaf The pedigree node to store as a leaf. + */ +COMMON_PORTABLE +void __cilkrts_set_pedigree_leaf(__cilkrts_pedigree* leaf); + + +/** + * Load the pedigree leaf node from thread-local storage into the + * current user worker. This method should execute as a part of + * binding the user thread to a worker. + * + * Preconditions: + * + * - w should be the worker for the current thread + * - w should be a user thread. + */ +COMMON_PORTABLE +void load_pedigree_leaf_into_user_worker(__cilkrts_worker *w); + +/** + * Save the pedigree leaf node from the worker into thread-local + * storage. This method should execute as part of unbinding a user + * thread from a worker. + * + * Preconditions: + * + * - w should be the worker for the current thread + * - w should be a user thread. + */ +COMMON_PORTABLE +void save_pedigree_leaf_from_user_worker(__cilkrts_worker *w); + + + +/** + * Update pedigree for a worker when leaving a frame. + * + * If this is the frame of a spawn helper (indicated by the + * CILK_FRAME_DETACHED flag) we must update the pedigree. The + * pedigree points to nodes allocated on the stack. Failing to + * update it will result in a accvio/segfault if the pedigree is + * walked. This must happen for all spawn helper frames, even if + * we're processing an exception. + */ +COMMON_PORTABLE +inline void update_pedigree_on_leave_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + // Update the worker's pedigree information if this is an ABI 1 or later + // frame + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + w->pedigree.rank = sf->spawn_helper_pedigree.rank + 1; + w->pedigree.parent = sf->spawn_helper_pedigree.parent; + } +} + + + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_PEDIGREES_DOT_H) diff --git a/libcilkrts/runtime/record-replay.cpp b/libcilkrts/runtime/record-replay.cpp new file mode 100644 index 00000000000..bc5a79f2411 --- /dev/null +++ b/libcilkrts/runtime/record-replay.cpp @@ -0,0 +1,770 @@ +/* record-replay.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of the record/replay functionality for Cilk Plus + */ + +#include <cstring> +#include <vector> +#include <stdlib.h> + +// clang is really strict about printf formats, so use the annoying integer +// printf macros. Unfortunately they're not avaiable on Windows +#ifdef _WIN32 +#define PRIu64 "llu" +#else +#define __STDC_FORMAT_MACROS 1 +#include <inttypes.h> +#endif + +#include "record-replay.h" +#include "bug.h" +#include "internal/abi.h" +#include "local_state.h" +#include "full_frame.h" +#include "global_state.h" +#include "cilk_malloc.h" +#include "os.h" // for cilkos_error() + +#if RECORD_ON_REPLAY +#pragma message ("*** Record on Replay is enabled!") +#endif + +// Defined to write sequence number to the logs. Note that you cannot +// diff logs with sequence numbers because the numbers may increment in +// different orders. +//#define INCLUDE_SEQUENCE_NUMBER 1 + +const int PED_VERSION = 1; // Log recording version + +// Log types +enum ped_type_t +{ + ped_type_unknown, + ped_type_steal, + ped_type_sync, + ped_type_orphaned, + ped_type_last // Flags end of the list +}; + +// Log type strings +#define PED_TYPE_STR_STEAL "Steal" +#define PED_TYPE_STR_SYNC "Sync" +#define PED_TYPE_STR_WORKERS "Workers" +#define PED_TYPE_STR_ORPHANED "Orphaned" + +#define PED_TYPE_SIZE 16 // Buffer size for the type of pedigree. Must + // hold largest pedigree record type string. 
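/*
 * Editorial sketch (not part of the original commit): each replay log record
 * written by write_to_replay_log() below is one text line of the form
 * "<type> <pedigree> <i1> <i2>", with the pedigree terms joined by
 * underscores.  With purely illustrative values, a steal of a frame whose
 * pedigree is 0_3_1 from worker 2 would appear in the thief's log as
 *
 *     Steal 0_3_1 2 -1
 *
 * i.e. the equivalent of fprintf(log, "%s %s %d %d\n", "Steal", "0_3_1", 2, -1),
 * where the trailing -1 marks the unused second integer field for this
 * record type.
 */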
+#define PEDIGREE_BUFF_SIZE 512 // Buffer size for the string representation + // of a pedigree. + +/** + * Data we store for a replay log entry + */ +typedef struct replay_entry_t +{ + uint64_t *m_reverse_pedigree; /**< Reverse pedigree for replay log entry */ + ped_type_t m_type; /**< Type of replay log entry */ + int16_t m_pedigree_len; /**< Number of terms in reverse pedigree */ + int16_t m_value; /**< Victim for STEALs, 0 if matching steal found for ORPHANs */ + + /** + * Load data read from the log into the entry + */ + bool load(const char *type, const char *pedigee_str, int32_t value1, int32_t value2) + { + // Convert the type into an enum + if (0 == strcmp(type, PED_TYPE_STR_STEAL)) + { + m_type = ped_type_steal; + m_value = (int16_t)value1; // Victim + } + else + { + m_value = -1; // Victim not valid + if (0 == strcmp(type, PED_TYPE_STR_SYNC)) + m_type = ped_type_sync; + else if (0 == strcmp(type, PED_TYPE_STR_ORPHANED)) + m_type = ped_type_orphaned; + else + { + m_type = ped_type_unknown; + return false; + } + } + + // Parse the pedigree + m_pedigree_len = 0; + + const char *p = pedigee_str; + char *end; + + uint64_t temp_pedigree[PEDIGREE_BUFF_SIZE/2]; + + while(1) + { + temp_pedigree[m_pedigree_len++] = (uint64_t)strtol(p, &end, 10); + if ('\0' == *end) + break; + p = end + 1; + } + + // Allocate memory to hold the pedigree. + // Copy the pedigree in reverse order since that's the order we'll + // traverse it + m_reverse_pedigree = + (uint64_t *)__cilkrts_malloc(sizeof(int64_t) * m_pedigree_len); + for (int n = 0; n < m_pedigree_len; n++) + m_reverse_pedigree[n] = temp_pedigree[(m_pedigree_len - 1) - n]; + + return true; + } + + /** + * Match this entry against the data supplied. This includes walking the + * pedigree from the specified node. + */ + bool match (ped_type_t type, const __cilkrts_pedigree *node, int victim = -1) + { + int i = 0; + + // If the type isn't what they're seeking, we don't have a match + if (type != m_type) + return false; + + // If we're looking for a STEAL, then the victim must match + if ((type == ped_type_steal) && (victim != m_value)) + return false; + + // Compare the current pedigree against what was recorded + while ((NULL != node) && (i < m_pedigree_len)) + { + // If we've got a pedigree rank difference, then we don't have + // a match + if (node->rank != m_reverse_pedigree[i]) + return false; + node = node->parent; + i++; + } + + // Make sure we exhausted both the pedigree chain and the recorded + // pedigree + return ((NULL == node) && (i == m_pedigree_len)); + } + + /** + * Advance to the next entry, skipping any ORPHANED records we didn't see + * a matching STEAL for + */ + replay_entry_t *next_entry() + { + replay_entry_t *entry = this; + + // You can't go beyond the end + if (ped_type_last == entry->m_type) + return entry; + + // Advance to the next entry + entry++; + + // Skip any ORPHANED records that don't have a matching steal. We + // initialized the value field to -1 for ORPHANED. After loading all + // the log data, we iterated through all the STEAL records setting the + // matching ORPHANED record's value field to 0. So if an ORPHANED + // record's value field is still -1, it doesn't have a matching STEAL + // record, and I don't know why we chose not to return from the + // spawned function. 
+ while ((ped_type_orphaned == entry->m_type) && (-1 == entry->m_value)) + { + entry++; + } + + return entry; + } + + /** + * Release any allocated resources + */ + void unload() + { + __cilkrts_free(m_reverse_pedigree); + m_reverse_pedigree = NULL; + } + +} replay_entry_t; + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Walk the pedigree and generate a string representation with underscores + * between terms. Currently does a recursive walk to generate a forward + * pedigree. + * + * @param p The buffer that is to be filled. Assumed to be PEDIGREE_BUFF_SIZE + * characters long + * @param pnode The initial pedigree term to be written. + * + * @return A pointer into the pedigree string buffer after a term has been + * written. + */ +static +char * walk_pedigree_nodes(char *p, const __cilkrts_pedigree *pnode) +{ + CILK_ASSERT(pnode); + if (pnode->parent) + { + p = walk_pedigree_nodes(p, pnode->parent); + p += sprintf(p, "_"); + } + + return p + sprintf(p, "%" PRIu64, pnode->rank); +} + +/** + * Write a record to a replay log file. + * + * @param w The worker we're writing the pedigree for. + * @param type The type of the pedigree record, as a string + * @param initial_node The initial pedigree node to be written, or NULL if + * there is no pedigree for this record type. + * @param i1 First integer value to be written to the record. + * @param i2 Second integer value to be written to the record. Only applies + * to STEAL records. Defaults to -1 (unused). The second value is always + * written to make parsing easier. + */ +static +void write_to_replay_log (__cilkrts_worker *w, const char *type, + const __cilkrts_pedigree *initial_node, + int i1 = -1, int i2 = -1) +{ + char pedigree[PEDIGREE_BUFF_SIZE]; + + // If we don't have an initial pedigree node, just use "0" to fill the slot + if (NULL == initial_node) + strcpy(pedigree, "0"); + else + walk_pedigree_nodes(pedigree, initial_node); + +#ifndef INCLUDE_SEQUENCE_NUMBER + // Simply write the record + fprintf(w->l->record_replay_fptr, "%s %s %d %d\n", + type, pedigree, i1, i2); +#else + // Write the record with a sequence number. The sequence number should + // always be the last term, and ignored on read + + static long volatile seq_num = 0; + long write_num; + + // Atomic increment functions are compiler/OS-specific +#ifdef _WIN32 + write_num = _InterlockedIncrement(&seq_num); +#else /* GCC */ + write_num = __sync_add_and_fetch(&seq_num, 1); +#endif // _WIN32 + + fprintf(w->l->record_replay_fptr, "%s %s %d %d %ld\n", + type, pedigree, i1, i2, write_num); +#endif // INCLUDE_SEQUENCE_NUMBER + + fflush(w->l->record_replay_fptr); +} + +/** + * Record data for a successful steal. + * + * The pedigree for a STEAL record is the pedigree of the stolen frame. + * + * @note It's assumed that replay_record_steal() has already checked that we're + * recording a log and that the record/replay functionality has not been + * compiled out. + * + * @param w The worker stealing a frame. + * @param victim_id The ID of the worker which had it's frame stolen. 
+ */ +void replay_record_steal_internal(__cilkrts_worker *w, int32_t victim_id) +{ + // Follow the pedigree chain using worker's stack frame + CILK_ASSERT(w->l->next_frame_ff); + CILK_ASSERT(w->l->next_frame_ff->call_stack); + + // Record steal: STEAL pedigree victim_id thief_id + write_to_replay_log (w, PED_TYPE_STR_STEAL, + &(w->l->next_frame_ff->call_stack->parent_pedigree), + victim_id); +} + +/** + * Record data for the worker that continues from a sync + * + * The pedigree for a SYNC record is the pedigree at the sync. + * + * @note It's assumed that replay_record_sync() has already checked that we're + * recording a log and that the record/replay functionality has not been + * compiled out. + * + * @param w The worker continuing from a sync. + */ +void replay_record_sync_internal(__cilkrts_worker *w) +{ + // Record sync: SYNC pedigree last_worker_id + write_to_replay_log (w, PED_TYPE_STR_SYNC, &w->pedigree); +} + +/** + * Record the pedigree of an attempt to return to a stolen parent + * + * The pedigree for an ORPHANED record is the pedigree of our parent + * + * @note It's assumed that replay_record_orphaned() has already checked that + * we're recording a log and that the record/replay functionality has not + * been compiled out. + * + * @param w The worker continuing noting that it has been orphaned. + */ +void replay_record_orphaned_internal(__cilkrts_worker *w) +{ + // Record steal: ORPHANED pedigree self + write_to_replay_log (w, PED_TYPE_STR_ORPHANED, w->pedigree.parent); +} + +/** + * Attempt to match a SYNC record. We have a match when this worker was + * recorded returning from the current call to __cilkrts_sync() with the + * same pedigree and this was the worker that continued from the sync, since + * it was the last to sync. + * + * If we find a match, the caller is expected to stall it is the last worker + * to reach a sync so it will be the worker to continue from the sync. + * + * @note It's assumed that replay_match_sync_pedigree() has already returned + * if we're not replaying a log, or if record/replay functionality has + * been compiled out. + * + * @param w The worker we're checking to see if we've got a match + */ +int replay_match_sync_pedigree_internal(__cilkrts_worker *w) +{ + // Return true if we have a match + if (w->l->replay_list_entry->match(ped_type_sync, &w->pedigree)) + return 1; + else + return 0; +} + +/** + * Advance to the next log entry from a SYNC record. Consume the current + * SYNC record on this worker and advance to the next one. + * + * @note It's assumed that replay_advance_from_sync() has already returned if + * we're not replaying a log, or if record/replay functionality has been + * compiled out. + * + * @param w The worker whose replay log we're advancing. + */ +void replay_advance_from_sync_internal (__cilkrts_worker *w) +{ + // The current replay entry must be a SYNC + CILK_ASSERT(ped_type_sync == w->l->replay_list_entry->m_type); + + // Advance to the next entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); +} + +/** + * Called from random_steal() to override the ID of the randomly chosen victim + * worker which this worker will attempt to steal from. Returns the worker id + * of the next victim this worker was recorded stealing from, or -1 if the + * next record in the log is not a STEAL. + * + * @note This call does NOT attempt to match the pedigree. That will be done + * by replay_match_victim_pedigree() after random_steal() has locked the victim + * worker. 
 + * + * @param w The __cilkrts_worker we're executing on. The worker's replay log + * is checked for a STEAL record. If we've got one, the stolen worker ID is + * returned. + * + * @return -1 if the next record is not a STEAL + * @return recorded stolen worker ID if we've got a matching STEAL record + */ +int replay_get_next_recorded_victim_internal(__cilkrts_worker *w) +{ + // If the next record isn't a STEAL, abort the attempt to steal work + if (ped_type_steal != w->l->replay_list_entry->m_type) + return -1; + + // Return the victim's worker ID from the STEAL record. We'll check + // the pedigree after random_steal has locked the victim worker. + return w->l->replay_list_entry->m_value; +} + +/** + * Called from random_steal() to determine if we have a STEAL record that + * matches the pedigree at the head of the victim worker. If we do have a + * match, the STEAL record is consumed. + * + * @note It's assumed that replay_match_victim_pedigree() has already returned if + * we're not replaying a log, or if record/replay functionality has been + * compiled out. + * + * @return 1 if we have a match + * @return 0 if the current replay record isn't a STEAL record, or the victim + * isn't correct, or the pedigree doesn't match. + */ +int replay_match_victim_pedigree_internal(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // If we don't have a match, return 0 + if (! w->l->replay_list_entry->match(ped_type_steal, + &((*victim->head)->parent_pedigree), + victim->self)) + return 0; + + // Consume this entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); + + // Return success + return 1; +} + +/** + * If the frame we're about to return to was recorded as being stolen, + * stall until it is. + * + * @note It's assumed that replay_wait_for_steal_if_parent_was_stolen() has + * already returned if we're not replaying a log, or if record/replay + * functionality has been compiled out. + * + * @param w The worker we're executing on. + */ +void replay_wait_for_steal_if_parent_was_stolen_internal(__cilkrts_worker *w) +{ + // If our parent wasn't recorded as orphaned, return now + if (! w->l->replay_list_entry->match (ped_type_orphaned, + w->pedigree.parent)) + return; + + // Stall until our parent is stolen. Note that we're comparing head + // and tail, not head and exc. The steal is not completed until tail + // is modified. + while (!((w->tail - 1) < w->head)) + __cilkrts_sleep(); + + // Consume the entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); +} + +/** + * Allocate memory for the list of logged events. + * + * This function will read through the file and count the number of records + * so it can estimate how big a buffer to allocate for the array of replay + * entries. It will then rewind the file to the beginning so it can be + * loaded into memory. + * + * @param w The worker we're loading the file for. + * @param f The file of replay data we're scanning. + */ +static +void allocate_replay_list(__cilkrts_worker *w, FILE *f) +{ + // Count the number of entries - yeah, it's a hack, but it lets me + // allocate the space all at once instead of in chunks + char buf[1024]; + int entries = 1; // Include "LAST" node + + while (! 
feof(f)) + { + if (fgets(buf, 1024, f)) + { + // Skip the Workers record - should only be in file for Worker 0 + if (0 != strncmp(PED_TYPE_STR_WORKERS, buf, sizeof(PED_TYPE_STR_WORKERS)-1)) + entries++; + } + } + + w->l->replay_list_root = + (replay_entry_t *)__cilkrts_malloc(entries * sizeof(replay_entry_t)); + w->l->replay_list_root[entries - 1].m_type = ped_type_last; + + // Reset the file to the beginning + rewind(f); +} + +/** + * Load the replay log for a worker into memory. + * + * @param w The worker we're loading the replay for. + */ +static +void load_recorded_log(__cilkrts_worker *w) +{ + char ped_type[PED_TYPE_SIZE]; + char ped_str[PEDIGREE_BUFF_SIZE]; + int32_t i1 = -1, i2 = -1; + int fret; + char local_replay_file_name[512]; + FILE *f; + + // Open the log for reading + sprintf(local_replay_file_name, "%s%d.cilklog", w->g->record_replay_file_name, w->self); + f = fopen(local_replay_file_name, "r"); + + // Make sure we found a log! + CILK_ASSERT (NULL != f); + + // Initialize the replay_list + allocate_replay_list(w, f); + replay_entry_t *entry = w->l->replay_list_root; + + // Read the data out and add it to our tables + while (! feof(f)) + { +#ifndef INCLUDE_SEQUENCE_NUMBER + fret = fscanf(f, "%s %s %d %d\n", ped_type, ped_str, &i1, &i2); + if(EOF == fret) + break; + + // We must have read 4 fields + CILK_ASSERT(4 == fret); +#else + int32_t write_num; + fret = fscanf(f, "%s %s %d %d %d\n", ped_type, ped_str, + &i1, &i2, &write_num); + if(EOF == fret) + break; + + // We must have read 5 fields + CILK_ASSERT(5 == fret); +#endif // INCLUDE_SEQUENCE_NUMBER + + // Load the data into the entry + if (0 == strcmp(ped_type, PED_TYPE_STR_WORKERS)) + { + // Verify we're replaying with the same number of workers we recorded with + if (i1 != w->g->P) + { + // Fatal error - does not return + cilkos_error("Cannot continue replay: number of workers(%d) doesn't match " + "that from the recording(%d).\n", w->g->P, i1); + } + + // Verify that we understand this version of the pedigree file + if (PED_VERSION != i2) + { + // Fatal error - does not return + cilkos_error("Pedigree file version %d doesn't match current " + "version %d - cannot continue.\n", + i2, PED_VERSION); + } + } + else + { + entry->load(ped_type, ped_str, i1, i2); + entry++; + } + } + + // Make sure we've filled the allocated memory. We initialized the last + // entry in allocate_replay_list(). + CILK_ASSERT(ped_type_last == entry->m_type); + w->l->replay_list_entry = w->l->replay_list_root; + + // Close the log and return + fclose(f); +} + +/** + * Scan a recorded log to match STEALs against ORPHANED records. + * + * @param g Cilk Runtime global state. Passed to access the worker array so + * we can scan a worker's ORPHANED entries for one that matches a STEAL entry. + * @param entry The root of a replay_list for a worker. + */ +static +void scan_for_matching_steals(global_state_t *g, replay_entry_t *entry) +{ + // Iterate over all of the entries + while (ped_type_last != entry->m_type) + { + // Look for STEALs. That will tell us which worker the frame was + // stolen from + if (ped_type_steal == entry->m_type) + { + bool found = false; + + // Validate the worker ID and make sure we've got a list + CILK_ASSERT((entry->m_value >= 0) && (entry->m_value < g->total_workers)); + replay_entry_t *victim_entry = g->workers[entry->m_value]->l->replay_list_root; + CILK_ASSERT(NULL != victim_entry); + + // Scan the victim's list for the matching ORPHANED record + while ((ped_type_last != victim_entry->m_type) && ! 
found) + { + if (ped_type_orphaned == victim_entry->m_type) + { + if (entry->m_pedigree_len == victim_entry->m_pedigree_len) + { + if (0 == memcmp(entry->m_reverse_pedigree, + victim_entry->m_reverse_pedigree, + entry->m_pedigree_len * sizeof(int64_t))) + { + // Note that this ORPHANED record has a matching steal + victim_entry->m_value = 0; + found = true; + } + } + } + victim_entry++; + } + } + entry++; + } +} + + +/* + * Initialize per-worker data for record or replay - See record-replay.h + * for full routine header. + */ +void replay_init_workers(global_state_t *g) +{ + int i; + char worker_file_name[512]; + + // If we're not recording or replaying a log, we're done. All of the + // fields in the global_state_t or local_state_t are already initialized + // to default values. + if (RECORD_REPLAY_NONE == g->record_or_replay) + return; + + // If we're replaying a log, read each worker's log and construct the + // in-memory log + if (REPLAY_LOG == g->record_or_replay) + { + // Read all of the data + for (i = 0; i < g->total_workers; ++i) + { + // This function will also initialize and fill the worker's + // replay list + load_recorded_log(g->workers[i]); + } + + // Scan for orphans with no matching steal. Mark them so they'll be + // skipped as we advance through the log. + for (i = 0; i < g->total_workers; ++i) + { + scan_for_matching_steals(g, g->workers[i]->l->replay_list_root); + } + + // If we're recording the logs while replaying, create the log files. + // This will only be used for debugging. Create the logs in the + // current directory. It should be as good a place as any... +#if RECORD_ON_REPLAY + for(i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + sprintf(worker_file_name, "replay_log_%d.cilklog", w->self); + w->l->record_replay_fptr = fopen(worker_file_name, "w+"); + CILK_ASSERT(NULL != w->l->record_replay_fptr); + } + + // Record the number of workers, file version in Worker 0's file + write_to_replay_log (g->workers[0], PED_TYPE_STR_WORKERS, NULL, g->P, PED_VERSION); +#endif // RECORD_ON_REPLAY + } + + // If we're recording, create the log files + if (RECORD_LOG == g->record_or_replay) + { + for(i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + sprintf(worker_file_name, "%s%d.cilklog", + g->record_replay_file_name, + w->self); + w->l->record_replay_fptr = fopen(worker_file_name, "w+"); + CILK_ASSERT(NULL != w->l->record_replay_fptr); + } + + // Record the number of workers, file version in Worker 0's file + write_to_replay_log (g->workers[0], PED_TYPE_STR_WORKERS, NULL, g->P, PED_VERSION); + } +} + +/* + * Do any necessary cleanup for the logs - See record-replay.h for full + * routine header. 
+ */ +void replay_term(global_state_t *g) +{ + // Free memory for the record/replay log file name, if we've got one + if (g->record_replay_file_name) + __cilkrts_free(g->record_replay_file_name); + + // Per-worker cleanup + for(int i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + + // Close the log files, if we've opened them + if(w->l->record_replay_fptr) + fclose(w->l->record_replay_fptr); + + if (w->l->replay_list_root) + { + // We should have consumed the entire list + CILK_ASSERT(ped_type_last == w->l->replay_list_entry->m_type); + + replay_entry_t *entry = w->l->replay_list_root; + while (ped_type_last != entry->m_type) + { + // Free the pedigree memory for each entry + entry->unload(); + entry++; + } + __cilkrts_free(w->l->replay_list_root); + w->l->replay_list_root = NULL; + w->l->replay_list_entry = NULL; + } + } +} + +__CILKRTS_END_EXTERN_C diff --git a/libcilkrts/runtime/record-replay.h b/libcilkrts/runtime/record-replay.h new file mode 100644 index 00000000000..c1c5a68f579 --- /dev/null +++ b/libcilkrts/runtime/record-replay.h @@ -0,0 +1,432 @@ +/* record_replay.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file record-replay.h + * + * @brief record-replay.h and .cpp encapsulate most of the functionality to + * record and play back a Cilk Plus application. + * + * Recording is directed by the setting of the CILK_RECORD_LOG environment + * variable. If it's defined, the value specifies the root we'll use to + * generate files for each worker using the following format string: + * "%s%d.cilklog", where the integer is the value of w->self. + * + * Replay is directed by the setting of the CILK_REPLAY_LOG environment + * variable, interpreted the same way as CILK_RECORD_LOG. 
If both + * CILK_RECORD_LOG and CILK_REPLAY_LOG are defined, a warning will be given + * and the attempt to record a log will be ignored. + * + * Recording is relatively straightforward. We write all information about a + * worker to a per-worker file. + * + * Each pedigree record consists of the following fields. All fields must be + * present in every record to make parsing easy. + * - Type - A string identifying the pedigree record. See the PED_TYPE_STR_ + * macros for the currently defined values. + * - Pedigree - A string of pedigree values, with underscores between + * adjacent values. + * - i1 - Record type-specific value. -1 if not used. + * - i2 - Record type-specific value. -1 if not used. + * + * WORKERS record - only written to the file for worker 0. Note that this is + * the first worker in the workers array. Worker 0 is the first system worker, + * *NOT* a user worker. + * - Type: "Workers" + * - Pedigree: Always "0" - ignored + * - i1: Number of workers (g->P) when we recorded the log. A mismatch when + * we attempt to replay the log will result in aborting the execution. + * - i2: Log version number - Specified by PED_VERSION in record-replay.cpp + * + * STEAL record - written after a successful steal. + * - Type: "Steal" + * - Pedigree: Pedigree of stolen frame + * - i1: Worker the frame was stolen from + * - i2: -1 + * + * SYNC record - written after a worker continues from a sync. + * - Type: "Sync" + * - Pedigree: Pedigree of sync. Note that this is the pedigree *before* + * the pedigree is incremented in setup_for_execution_pedigree(). + * - i1: -1 + * - i2: -1 + * + * ORPHANED record - saved on a return to a stolen parent. + * - Type: "Orphaned" + * - Pedigree: Pedigree of the parent frame *before* the pedigree is + * incremented by the return + * - i1: -1 + * - i2: -1 + * + * On replay, the data is loaded into a per-worker array, and the data is + * consumed in order as needed. + */ + +#ifndef INCLUDED_RECORD_REPLAY_DOT_H +#define INCLUDED_RECORD_REPLAY_DOT_H + +#include "cilk/common.h" +#include "global_state.h" + +/** + * Define CILK_RECORD_REPLAY to enable record/replay functionality. If + * CILK_RECORD_REPLAY is not defined, all of the record/replay functions in + * record-replay.h will be stubbed out. Since they're declared as inline + * functions, the resulting build should have no performance impact due to + * the implementation of record/replay. + */ + #define CILK_RECORD_REPLAY 1 + +/** + * Define RECORD_ON_REPLAY=1 to write logs when we're replaying a log. This + * should only be needed when debugging the replay functionality. This should + * always be defined as 0 when record-replay.h is checked in. + */ +#define RECORD_ON_REPLAY 0 + +__CILKRTS_BEGIN_EXTERN_C + +#ifdef CILK_RECORD_REPLAY +// Declarations of internal record/replay functions. 
The inlined versions +// further down do some preliminary testing (like if we're not recording or +// replaying) and will stub out the functionality if we've compiled out the +// record/replay feature +int replay_match_sync_pedigree_internal(__cilkrts_worker *w); +void replay_wait_for_steal_if_parent_was_stolen_internal(__cilkrts_worker *w); +void replay_record_steal_internal(__cilkrts_worker *w, int32_t victim_id); +void replay_record_sync_internal(__cilkrts_worker *w); +void replay_record_orphaned_internal(__cilkrts_worker *w); +int replay_match_victim_pedigree_internal(__cilkrts_worker *w, __cilkrts_worker *victim); +void replay_advance_from_sync_internal (__cilkrts_worker *w); +int replay_get_next_recorded_victim_internal(__cilkrts_worker *w); +#endif // CILK_RECORD_REPLAY + +// Publically defined record/replay API + +/** + * If we're replaying a log, wait for our parent to be stolen if it was when + * the log was recorded. If record/replay is compiled out, this is a noop. + * + * @param w The __cilkrts_worker we're executing on. The worker's replay + * list will be checked for a ORPHANED record with a matching pedigree. If + * there is a match, the ORPHANED record will be consumed. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_wait_for_steal_if_parent_was_stolen(__cilkrts_worker *w) +{ + // Only check if we're replaying a log + if (REPLAY_LOG == w->g->record_or_replay) + replay_wait_for_steal_if_parent_was_stolen_internal(w); +} +#else +__CILKRTS_INLINE +void replay_wait_for_steal_if_parent_was_stolen(__cilkrts_worker *w) +{ + // If record/replay is disabled, we never wait +} +#endif // CILK_RECORD_REPLAY + +/** + * Called from random_steal() to override the ID of the randomly chosen victim + * worker which this worker will attempt to steal from. Returns the worker id + * of the next victim this worker was recorded stealing from, or -1 if the + * next record in the log is not a STEAL. + * + * @note This call does NOT attempt to match the pedigree. That will be done + * by replay_match_victim_pedigree() after random_steal() has locked the victim + * worker. + * + * @param w The __cilkrts_worker we're executing on. The worker's replay log + * is checked for a STEAL record. If we've got one, the stolen worker ID is + * returned. + * @param id The randomly chosen victim worker ID. If we're not replaying a + * log, or if record/replay has been compiled out, this is the value that + * will be returned. + * + * @return id if we're not replaying a log + * @return -1 if the next record is not a STEAL + * @return recorded stolen worker ID if we've got a matching STEAL record + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_get_next_recorded_victim(__cilkrts_worker *w, int id) +{ + // Only check if we're replaying a log + if (REPLAY_LOG == w->g->record_or_replay) + return replay_get_next_recorded_victim_internal(w); + else + return id; +} +#else +__CILKRTS_INLINE +int replay_get_next_recorded_victim(__cilkrts_worker *w, int id) +{ + // Record/replay is disabled. Always return the original worker id + return id; +} +#endif // CILK_RECORD_REPLAY + +/** + * Initialize per-worker data for record/replay. A noop if record/replay + * is disabled, or if we're not recording or replaying anything. + * + * If we're recording a log, this will ready us to create the per-worker + * logs. + * + * If we're replaying a log, this will read the logs into the per-worker + * structures. 
 + * + * @param g Cilk runtime global state + */ +void replay_init_workers(global_state_t *g); + +/** + * Record a record on a successful steal. A noop if record/replay is + * disabled, or if we're not recording anything. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of + * the stolen frame will be walked to generate the STEAL record. + * + * @param victim_id The worker ID of the worker w stole from. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_steal(__cilkrts_worker *w, int32_t victim_id) +{ +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_steal_internal(w, victim_id); +} +#else +__CILKRTS_INLINE +void replay_record_steal(__cilkrts_worker *w, int32_t victim_id) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Record a record when continuing after a sync. A noop if record/replay is + * disabled, or if we're not recording anything, or if the sync was abandoned, + * meaning this isn't the worker that continues from the sync. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of + * the sync-ing frame will be walked to generate the SYNC record. + * + * @param continuing True if this worker will be continuing from the + * cilk_sync. A SYNC record will only be generated if continuing is true. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_sync(__cilkrts_worker *w, int continuing) +{ + // If this was not the last worker to the sync, return + if (! continuing) + return; + +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_sync_internal(w); +} +#else +__CILKRTS_INLINE +void replay_record_sync(__cilkrts_worker *w, int continuing) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Record a record on a return to a stolen parent. A noop if record/replay is + * disabled, or if we're not recording anything. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of the + * frame that has discovered that its parent has been stolen will be walked + * to generate the ORPHANED record. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_orphaned(__cilkrts_worker *w) +{ +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_orphaned_internal(w); +} +#else +__CILKRTS_INLINE +void replay_record_orphaned(__cilkrts_worker *w) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Test whether the frame at the head of the victim matches the pedigree of + * the frame that was recorded being stolen. Called in random steal to verify + * that we're about to steal the correct frame. + * + * @param w The __cilkrts_worker we're executing on. The current worker + * is needed to find the replay entry to be checked. + * + * @param victim The __cilkrts_worker we're proposing to steal a frame + * from. 
The victim's head entry + * is needed to find the replay entry to be checked. + * + * @return 0 if we're replaying a log and the victim's pedigree does NOT match + * the next frame the worker is expected to steal. + * + * @return 1 in all other cases to indicate that the steal attempt should + * continue + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_match_victim_pedigree(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // We're not replaying a log. The victim is always acceptable + if (REPLAY_LOG != w->g->record_or_replay) + return 1; + + // Return 1 if the victim's pedigree matches the frame the worker stole + // when we recorded the log + return replay_match_victim_pedigree_internal(w, victim); +} +#else +__CILKRTS_INLINE +int replay_match_victim_pedigree(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // Record/replay is disabled. The victim is always acceptable + return 1; +} +#endif // CILK_RECORD_REPLAY + +/** + * Test whether the current replay entry is a sync record matching the + * worker's pedigree. + * + * @param w The __cilkrts_worker we're executing on. + * + * @return 1 if the current replay entry matches the current pedigree. + * @return 0 if there's no match, or if we're not replaying a log. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_match_sync_pedigree(__cilkrts_worker *w) +{ + // If we're not replaying, assume no match + if (REPLAY_LOG != w->g->record_or_replay) + return 0; + + return replay_match_sync_pedigree_internal(w); +} +#else +__CILKRTS_INLINE +int replay_match_sync_pedigree(__cilkrts_worker *w) +{ + // Record/replay is disabled. Assume no match + return 0; +} +#endif + +/** + * Marks a sync record seen, advancing to the next record in the replay list. + * + * This function will only advance to the next record if: + * - Record/replay hasn't been compiled out AND + * - We're replaying a log AND + * - A match was found AND + * - The sync is not being abandoned + * + * @param w The __cilkrts_worker we're executing on. + * @param match_found The value returned by replay_match_sync_pedigree(). If + * match_found is false, nothing is done. + * @param continuing Flag indicating whether this worker will continue from + * the sync (it's the last worker to the sync) or if it will abandon the work + * and go to the scheduling loop to look for more work it can steal. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_advance_from_sync(__cilkrts_worker *w, int match_found, int continuing) +{ + // If we're replaying a log, and the current sync wasn't abandoned, and we + // found a match in the log, mark the sync record seen. + if ((REPLAY_LOG == w->g->record_or_replay) && match_found && continuing) + replay_advance_from_sync_internal(w); +} +#else +__CILKRTS_INLINE +void replay_advance_from_sync(__cilkrts_worker *w, int match_found, int continuing) +{ +} +#endif + +/** + * Release any resources used to read or write a replay log. + * + * @param g Cilk runtime global state + */ +void replay_term(global_state_t *g); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_RECORD_REPLAY_DOT_H) diff --git a/libcilkrts/runtime/reducer_impl.cpp b/libcilkrts/runtime/reducer_impl.cpp new file mode 100644 index 00000000000..f20b9bc4592 --- /dev/null +++ b/libcilkrts/runtime/reducer_impl.cpp @@ -0,0 +1,1012 @@ +/* reducer_impl.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Patents Pending, Intel Corporation. + **************************************************************************/ + +/** + * Support for reducers + */ + +// ICL: Don't complain about conversion from pointer to same-sized integral type +// in hashfun. That's why we're using size_t +#ifdef _WIN32 +# pragma warning(disable: 1684) +#endif + +#include "reducer_impl.h" +#include "scheduler.h" +#include "bug.h" +#include "os.h" +#include "global_state.h" +#include "frame_malloc.h" + +#include "cilk/hyperobject_base.h" +#include "cilktools/cilkscreen.h" +#include "internal/abi.h" + +#if REDPAR_DEBUG > 0 +#include <stdio.h> +#include <stdlib.h> +#endif + + +#define DBG if(0) // if(1) enables some internal checks + +// Check that w is the currently executing worker. This method is a +// no-op unless the debug level is set high enough. +static inline void verify_current_wkr(__cilkrts_worker *w) +{ +#if REDPAR_DEBUG >= 5 + __cilkrts_worker* tmp = __cilkrts_get_tls_worker(); + if (w != tmp) { + fprintf(stderr, "W=%d, actual=%d... missing a refresh....\n", + w->self, + tmp->self); + } + CILK_ASSERT(w == tmp); // __cilkrts_get_tls_worker()); +#endif +} + +// Suppress clang warning that the expression result is unused +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-value" +#endif // __clang__ + +/// Helper class to disable and re-enable Cilkscreen +struct DisableCilkscreen +{ + DisableCilkscreen () { __cilkscreen_disable_checking(); } + ~DisableCilkscreen () { __cilkscreen_enable_checking(); } +}; + +/// Helper class to enable and re-disable Cilkscreen +struct EnableCilkscreen +{ + EnableCilkscreen () { __cilkscreen_enable_checking(); } + ~EnableCilkscreen () { __cilkscreen_disable_checking(); } +}; + +#if defined(__clang__) && (! 
defined(__INTEL_COMPILER)) +# pragma clang diagnostic pop +#endif // __clang__ + +/** + * @brief Element for a hyperobject + */ +struct elem { + void *key; ///< Shared key for this hyperobject + __cilkrts_hyperobject_base *hb; ///< Base of the hyperobject. + void *view; ///< Strand-private view of this hyperobject + /// Destroy and deallocate the view object for this element and set view to + /// null. + void destroy(); + + /// Returns true if this element contains a leftmost view. + bool is_leftmost() const; +}; + +/** Bucket containing at most NMAX elements */ +struct bucket { + /// Size of the array of elements for this bucket + size_t nmax; + + /** + * We use the ``struct hack'' to allocate an array of variable + * dimension at the end of the struct. However, we allocate a + * total of NMAX+1 elements instead of NMAX. The last one always + * has key == 0, which we use as a termination criterion + */ + elem el[1]; +}; + +/** + * Class that implements the map for reducers so we can find the + * view for a strand. + */ +struct cilkred_map { + /** Handy pointer to the global state */ + global_state_t *g; + + /** Number of elements in table */ + size_t nelem; + + /** Number of buckets */ + size_t nbuckets; + + /** Array of pointers to buckets */ + bucket **buckets; + + /** Set true if merging (for debugging purposes) */ + bool merging; + + /** Set true for leftmost reducer map */ + bool is_leftmost; + + /** @brief Return element mapped to 'key' or null if not found. */ + elem *lookup(void *key); + + /** + * @brief Insert key/value element into hash map without rehashing. + * Does not check for duplicate key. + */ + elem *insert_no_rehash(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *value); + + /** + * @brief Insert key/value element into hash map, rehashing if necessary. + * Does not check for duplicate key. + */ + inline elem *rehash_and_insert(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *value); + + /** @brief Grow bucket by one element, reallocating bucket if necessary */ + static elem *grow(__cilkrts_worker *w, bucket **bp); + + /** @brief Rehash a worker's reducer map */ + void rehash(__cilkrts_worker *); + + /** + * @brief Returns true if a rehash is needed due to the number of elements that + * have been inserted. + */ + inline bool need_rehash_p() const; + + /** @brief Allocate and initialize the buckets */ + void make_buckets(__cilkrts_worker *w, size_t nbuckets); + + /** + * Specify behavior when the same key is present in both maps passed + * into merge(). + */ + enum merge_kind + { + MERGE_UNORDERED, ///< Assertion fails + MERGE_INTO_LEFT, ///< Merges the argument from the right into the left + MERGE_INTO_RIGHT ///< Merges the argument from the left into the right + }; + + /** + * @brief Merge another reducer map into this one, destroying the other map in + * the process. 
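+     *
+     * For illustration only: when the two maps are actually combined
+     * (MERGE_INTO_LEFT or MERGE_INTO_RIGHT), the element-level reduction
+     * performed by merge() is, in effect,
+     *
+     *     reduce_fn((void*)hb, left_view, right_view);  // left <- left OP right
+     *
+     * MERGE_INTO_RIGHT merely swaps the two view pointers first, so that
+     * the surviving (right-hand) map ends up holding the reduced left view.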
+ */ + __cilkrts_worker* merge(__cilkrts_worker *current_wkr, + cilkred_map *other_map, + enum merge_kind kind); + + /** @brief check consistency of a reducer map */ + void check(bool allow_null_view); + + /** @brief Test whether the cilkred_map is empty */ + bool is_empty() { return nelem == 0; } +}; + +static inline struct cilkred_map* install_new_reducer_map(__cilkrts_worker *w) { + cilkred_map *h; + h = __cilkrts_make_reducer_map(w); + w->reducer_map = h; + return h; +} + +static size_t sizeof_bucket(size_t nmax) +{ + bucket *b = 0; + return (sizeof(*b) + nmax * sizeof(b->el[0])); +} + +static bucket *alloc_bucket(__cilkrts_worker *w, size_t nmax) +{ + bucket *b = (bucket *) + __cilkrts_frame_malloc(w, sizeof_bucket(nmax)); + b->nmax = nmax; + return b; +} + +static void free_bucket(__cilkrts_worker *w, bucket **bp) +{ + bucket *b = *bp; + if (b) { + __cilkrts_frame_free(w, b, sizeof_bucket(b->nmax)); + *bp = 0; + } +} + +/* round up nmax to fill a memory allocator block completely */ +static size_t roundup(size_t nmax) +{ + size_t sz = sizeof_bucket(nmax); + + /* round up size to a full malloc block */ + sz = __cilkrts_frame_malloc_roundup(sz); + + /* invert sizeof_bucket() */ + nmax = ((sz - sizeof(bucket)) / sizeof(elem)); + + return nmax; +} + +static bool is_power_of_2(size_t n) +{ + return (n & (n - 1)) == 0; +} + +void cilkred_map::make_buckets(__cilkrts_worker *w, + size_t new_nbuckets) +{ + nbuckets = new_nbuckets; + + CILK_ASSERT(is_power_of_2(nbuckets)); +#if defined __GNUC__ && defined __ICC + /* bug workaround -- suppress calls to _intel_fast_memset */ + bucket *volatile*new_buckets = (bucket *volatile*) +#else + bucket **new_buckets = (bucket **) +#endif + __cilkrts_frame_malloc(w, nbuckets * sizeof(*(buckets))); + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, desc=make_buckets, new_buckets=%p, new_nbuckets=%zd\n", + w->self, new_buckets, new_nbuckets); +#endif + + for (size_t i = 0; i < new_nbuckets; ++i) + new_buckets[i] = 0; +#if defined __GNUC__ && defined __ICC + buckets = (bucket **)new_buckets; +#else + buckets = new_buckets; +#endif + nelem = 0; +} + +static void free_buckets(__cilkrts_worker *w, + bucket **buckets, + size_t nbuckets) +{ + size_t i; + +#if REDPAR_DEBUG >= 1 + verify_current_wkr(w); + fprintf(stderr, "W=%d, desc=free_buckets, buckets=%p, size=%zd\n", + w->self, buckets, + nbuckets * sizeof(*buckets)); +#endif + + for (i = 0; i < nbuckets; ++i) + free_bucket(w, buckets + i); + + __cilkrts_frame_free(w, buckets, nbuckets * sizeof(*buckets)); +} + +static size_t minsz(size_t nelem) +{ + return 1U + nelem + nelem / 8U; +} + +static size_t nextsz(size_t nelem) +{ + return 2 * nelem; +} + +bool cilkred_map::need_rehash_p() const +{ + return minsz(nelem) > nbuckets; +} + +static inline size_t hashfun(const cilkred_map *h, void *key) +{ + size_t k = (size_t) key; + + k ^= k >> 21; + k ^= k >> 8; + k ^= k >> 3; + + return k & (h->nbuckets - 1); +} + +// Given a __cilkrts_hyperobject_base, return the key to that hyperobject in +// the reducer map. +static inline void* get_hyperobject_key(__cilkrts_hyperobject_base *hb) +{ + // The current implementation uses the address of the lefmost view as the + // key. + return reinterpret_cast<char*>(hb) + hb->__view_offset; +} + +// Given a hyperobject key, return a pointer to the leftmost object. In the +// current implementation, the address of the leftmost object IS the key, so +// this function is an effective noop. 
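+// For illustration (hypothetical layout, not a type defined by the runtime):
+// if a reducer object were declared as
+//
+//     struct my_reducer {
+//         __cilkrts_hyperobject_base base;
+//         long                       sum;   /* the leftmost view */
+//     };
+//
+// then base.__view_offset would be the offset of 'sum' from 'base', so
+// get_hyperobject_key(&r.base) returns &r.sum, and get_leftmost_view()
+// hands that same address back unchanged.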
+static inline void* get_leftmost_view(void *key) +{ + return key; +} + +/* debugging support: check consistency of a reducer map */ +void cilkred_map::check(bool allow_null_view) +{ + size_t count = 0; + + CILK_ASSERT(buckets); + for (size_t i = 0; i < nbuckets; ++i) { + bucket *b = buckets[i]; + if (b) + for (elem *el = b->el; el->key; ++el) { + CILK_ASSERT(allow_null_view || el->view); + ++count; + } + } + CILK_ASSERT(nelem == count); + /*global_reducer_map::check();*/ +} + +/* grow bucket by one element, reallocating bucket if necessary */ +elem *cilkred_map::grow(__cilkrts_worker *w, + bucket **bp) +{ + size_t i, nmax, nnmax; + bucket *b, *nb; + + b = *bp; + if (b) { + nmax = b->nmax; + /* find empty element if any */ + for (i = 0; i < nmax; ++i) + if (b->el[i].key == 0) + return &(b->el[i]); + /* do not use the last one even if empty */ + } else { + nmax = 0; + } + + verify_current_wkr(w); + /* allocate a new bucket */ + nnmax = roundup(2 * nmax); + nb = alloc_bucket(w, nnmax); + + + /* copy old bucket into new */ + for (i = 0; i < nmax; ++i) + nb->el[i] = b->el[i]; + + free_bucket(w, bp); *bp = nb; + + /* zero out extra elements */ + for (; i < nnmax; ++i) + nb->el[i].key = 0; + + /* zero out the last one */ + nb->el[i].key = 0; + + return &(nb->el[nmax]); +} + +elem *cilkred_map::insert_no_rehash(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *view) +{ + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=insert_no_rehash, this_map=%p]\n", + w->self, this); + verify_current_wkr(w); +#endif + + CILK_ASSERT((w == 0 && g == 0) || w->g == g); + CILK_ASSERT(key != 0); + CILK_ASSERT(view != 0); + + elem *el = grow(w, &(buckets[hashfun(this, key)])); + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, this=%p, inserting key=%p, view=%p, el = %p]\n", + w->self, this, key, view, el); +#endif + + el->key = key; + el->hb = hb; + el->view = view; + ++nelem; + + return el; +} + +void cilkred_map::rehash(__cilkrts_worker *w) +{ +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "[W=%d, desc=rehash, this_map=%p, g=%p, w->g=%p]\n", + w->self, this, g, w->g); + verify_current_wkr(w); +#endif + CILK_ASSERT((w == 0 && g == 0) || w->g == g); + + size_t onbuckets = nbuckets; + size_t onelem = nelem; + bucket **obuckets = buckets; + size_t i; + bucket *b; + + make_buckets(w, nextsz(nbuckets)); + + for (i = 0; i < onbuckets; ++i) { + b = obuckets[i]; + if (b) { + elem *oel; + for (oel = b->el; oel->key; ++oel) + insert_no_rehash(w, oel->key, oel->hb, oel->view); + } + } + + CILK_ASSERT(nelem == onelem); + + free_buckets(w, obuckets, onbuckets); +} + +elem *cilkred_map::rehash_and_insert(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *view) +{ + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, this_map =%p, inserting key=%p, view=%p\n", + w->self, this, key, view); + verify_current_wkr(w); +#endif + + if (need_rehash_p()) + rehash(w); + + return insert_no_rehash(w, key, hb, view); +} + + +elem *cilkred_map::lookup(void *key) +{ + bucket *b = buckets[hashfun(this, key)]; + + if (b) { + elem *el; + for (el = b->el; el->key; ++el) { + if (el->key == key) { + CILK_ASSERT(el->view); + return el; + } + } + } + + return 0; +} + +void elem::destroy() +{ + if (! is_leftmost()) { + + // Call destroy_fn and deallocate_fn on the view, but not if it's the + // leftmost view. 
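+        // (For example, a reducer built on the helpers defined later in
+        // this file could supply __cilkrts_hyperobject_noop_destroy as its
+        // destroy_fn and __cilkrts_hyperobject_dealloc as its deallocate_fn.)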
+ cilk_c_monoid *monoid = &(hb->__c_monoid); + cilk_c_reducer_destroy_fn_t destroy_fn = monoid->destroy_fn; + cilk_c_reducer_deallocate_fn_t deallocate_fn = monoid->deallocate_fn; + + destroy_fn((void*)hb, view); + deallocate_fn((void*)hb, view); + } + + view = 0; +} + +inline +bool elem::is_leftmost() const +{ + // implementation uses the address of the leftmost view as the key, so if + // key == view, then this element refers to the leftmost view. + return key == view; +} + +/* remove the reducer from the current reducer map. If the reducer + exists in maps other than the current one, the behavior is + undefined. */ +extern "C" +CILK_EXPORT void __CILKRTS_STRAND_STALE( + __cilkrts_hyper_destroy(__cilkrts_hyperobject_base *hb)) +{ + // Disable Cilkscreen for the duration of this call. The destructor for + // this class will re-enable Cilkscreen when the method returns. This + // will prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen x; + + __cilkrts_worker* w = __cilkrts_get_tls_worker(); + if (! w) { + // If no worker, then Cilk is not running and there is no reducer + // map. Do nothing. The reducer's destructor will take care of + // destroying the leftmost view. + return; + } + +const char *UNSYNCED_REDUCER_MSG = + "Destroying a reducer while it is visible to unsynced child tasks, or\n" + "calling CILK_C_UNREGISTER_REDUCER() on an unregistered reducer.\n" + "Did you forget a _Cilk_sync or CILK_C_REGISTER_REDUCER()?"; + + cilkred_map* h = w->reducer_map; + if (NULL == h) + cilkos_error(UNSYNCED_REDUCER_MSG); // Does not return + + if (h->merging) { + verify_current_wkr(w); + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + } + + void* key = get_hyperobject_key(hb); + elem *el = h->lookup(key); + + // Verify that the reducer is being destroyed from the leftmost strand for + // which the reducer is defined. + if (! (el && el->is_leftmost())) + cilkos_error(UNSYNCED_REDUCER_MSG); + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, key=%p, lookup in map %p, found el=%p, about to destroy]\n", + w->self, key, h, el); +#endif + + // Remove the element from the hash bucket. Do not bother shrinking + // the bucket. Note that the destroy() function does not actually + // call the destructor for the leftmost view. + el->destroy(); + do { + el[0] = el[1]; + ++el; + } while (el->key); + --h->nelem; + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=hyper_destroy_finish, key=%p, w->reducer_map=%p]\n", + w->self, key, w->reducer_map); +#endif +} + +extern "C" +CILK_EXPORT +void __cilkrts_hyper_create(__cilkrts_hyperobject_base *hb) +{ + // This function registers the specified hyperobject in the current + // reducer map and registers the initial value of the hyperobject as the + // leftmost view of the reducer. + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (! w) { + // If there is no worker, then there is nothing to do: The iniitial + // value will automatically be used as the left-most view when we + // enter Cilk. + return; + } + + // Disable Cilkscreen for the duration of this call. The destructor for + // this class will re-enable Cilkscreen when the method returns. 
This + // will prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen x; + + void* key = get_hyperobject_key(hb); + void* view = get_leftmost_view(key); + cilkred_map *h = w->reducer_map; + + if (__builtin_expect(!h, 0)) { + h = install_new_reducer_map(w); +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, hb=%p, hyper_create, isntalled new map %p, view=%p]\n", + w->self, hb, h, view); +#endif + } + + /* Must not exist. */ + CILK_ASSERT(h->lookup(key) == NULL); + +#if REDPAR_DEBUG >= 3 + verify_current_wkr(w); + fprintf(stderr, "[W=%d, hb=%p, lookup in map %p of view %p, should be null]\n", + w->self, hb, h, view); + fprintf(stderr, "W=%d, h=%p, inserting key %p, view%p\n", + w->self, + h, + &(hb->__c_monoid), + view); +#endif + + if (h->merging) + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + + CILK_ASSERT(w->reducer_map == h); + // The address of the leftmost value is the same as the key for lookup. + (void) h->rehash_and_insert(w, view, hb, view); +} + +extern "C" +CILK_EXPORT void* __CILKRTS_STRAND_PURE( + __cilkrts_hyper_lookup(__cilkrts_hyperobject_base *hb)) +{ + __cilkrts_worker* w = __cilkrts_get_tls_worker_fast(); + void* key = get_hyperobject_key(hb); + if (! w) + return get_leftmost_view(key); + + // Disable Cilkscreen for the duration of this call. This will + // prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen dguard; + + if (__builtin_expect(w->g->force_reduce, 0)) + __cilkrts_promote_own_deque(w); + cilkred_map* h = w->reducer_map; + + if (__builtin_expect(!h, 0)) { + h = install_new_reducer_map(w); + } + + if (h->merging) + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + elem* el = h->lookup(key); + if (! el) { + /* lookup failed; insert a new default element */ + void *rep; + + { + /* re-enable cilkscreen while calling the constructor */ + EnableCilkscreen eguard; + if (h->is_leftmost) + { + // This special case is called only if the reducer was not + // registered using __cilkrts_hyper_create, e.g., if this is a + // C reducer in global scope or if there is no bound worker. + rep = get_leftmost_view(key); + } + else + { + rep = hb->__c_monoid.allocate_fn((void*)hb, + hb->__view_size); + // TBD: Handle exception on identity function + hb->__c_monoid.identity_fn((void*)hb, rep); + } + } + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "W=%d, h=%p, inserting key %p, view%p\n", + w->self, + h, + &(hb->__c_monoid), + rep); + CILK_ASSERT(w->reducer_map == h); +#endif + el = h->rehash_and_insert(w, key, hb, rep); + } + + return el->view; +} + +extern "C" CILK_EXPORT +void* __cilkrts_hyperobject_alloc(void* ignore, std::size_t bytes) +{ + return std::malloc(bytes); +} + +extern "C" CILK_EXPORT +void __cilkrts_hyperobject_dealloc(void* ignore, void* view) +{ + std::free(view); +} + +/* No-op destroy function */ +extern "C" CILK_EXPORT +void __cilkrts_hyperobject_noop_destroy(void* ignore, void* ignore2) +{ +} + +cilkred_map *__cilkrts_make_reducer_map(__cilkrts_worker *w) +{ + CILK_ASSERT(w); + + cilkred_map *h; + size_t nbuckets = 1; /* default value */ + + h = (cilkred_map *)__cilkrts_frame_malloc(w, sizeof(*h)); +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "[W=%d, desc=make_reducer_frame_malloc_reducer_map, h=%p]\n", + w->self, h); +#endif + + h->g = w ? w->g : 0; + h->make_buckets(w, nbuckets); + h->merging = false; + h->is_leftmost = false; + + return h; +} + +/* Destroy a reducer map. 
The map must have been allocated + from the worker's global context and should have been + allocated from the same worker. */ +void __cilkrts_destroy_reducer_map(__cilkrts_worker *w, cilkred_map *h) +{ + CILK_ASSERT((w == 0 && h->g == 0) || w->g == h->g); + verify_current_wkr(w); + + /* the reducer map is allowed to contain el->view == NULL here (and + only here). We set el->view == NULL only when we know that the + map will be destroyed immediately afterwards. */ + DBG h->check(/*allow_null_view=*/true); + + bucket *b; + size_t i; + + for (i = 0; i < h->nbuckets; ++i) { + b = h->buckets[i]; + if (b) { + elem *el; + for (el = b->el; el->key; ++el) { + if (el->view) + el->destroy(); + } + } + } + + free_buckets(w, h->buckets, h->nbuckets); + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, destroy_red_map, freeing map h=%p, size=%zd\n", + w->self, h, sizeof(*h)); +#endif + + __cilkrts_frame_free(w, h, sizeof(*h)); +} + +/* Set the specified reducer map as the leftmost map if is_leftmost is true, + otherwise, set it to not be the leftmost map. */ +void __cilkrts_set_leftmost_reducer_map(cilkred_map *h, int is_leftmost) +{ + h->is_leftmost = is_leftmost; +} + + +__cilkrts_worker* cilkred_map::merge(__cilkrts_worker *w, + cilkred_map *other_map, + enum merge_kind kind) +{ + // Disable Cilkscreen while the we merge the maps. The destructor for + // the guard class will re-enable Cilkscreen when it goes out of scope. + // This will prevent Cilkscreen from reporting apparent races in between + // the reduce function and the reducer operations. The Cilk runtime + // guarantees that a pair of reducer maps will only be merged when no + // other strand will access them. + DisableCilkscreen guard; + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=merge, this_map=%p, other_map=%p]\n", + w->self, + this, other_map); +#endif + // Remember the current stack frame. + __cilkrts_stack_frame *current_sf = w->current_stack_frame; + merging = true; + other_map->merging = true; + + // Merging to the leftmost view is a special case because every leftmost + // element must be initialized before the merge. + CILK_ASSERT(!other_map->is_leftmost /* || kind == MERGE_UNORDERED */); + bool merge_to_leftmost = (this->is_leftmost + /* && !other_map->is_leftmost */); + + DBG check(/*allow_null_view=*/false); + DBG other_map->check(/*allow_null_view=*/false); + + for (size_t i = 0; i < other_map->nbuckets; ++i) { + bucket *b = other_map->buckets[i]; + if (b) { + for (elem *other_el = b->el; other_el->key; ++other_el) { + /* Steal the value from the other map, which will be + destroyed at the end of this operation. */ + void *other_view = other_el->view; + CILK_ASSERT(other_view); + + void *key = other_el->key; + __cilkrts_hyperobject_base *hb = other_el->hb; + elem *this_el = lookup(key); + + if (this_el == 0 && merge_to_leftmost) { + /* Initialize leftmost view before merging. */ + void* leftmost = get_leftmost_view(key); + // leftmost == other_view can be true if the initial view + // was created in other than the leftmost strand of the + // spawn tree, but then made visible to subsequent strands + // (E.g., the reducer was allocated on the heap and the + // pointer was returned to the caller.) In such cases, + // parallel semantics says that syncing with earlier + // strands will always result in 'this_el' being null, + // thus propagating the initial view up the spawn tree + // until it reaches the leftmost strand. 
When synching + // with the leftmost strand, leftmost == other_view will be + // true and we must avoid reducing the initial view with + // itself. + if (leftmost != other_view) + this_el = rehash_and_insert(w, key, hb, leftmost); + } + + if (this_el == 0) { + /* move object from other map into this one */ + rehash_and_insert(w, key, hb, other_view); + other_el->view = 0; + continue; /* No element-level merge necessary */ + } + + /* The same key is present in both maps with values + A and B. Three choices: fail, A OP B, B OP A. */ + switch (kind) + { + case MERGE_UNORDERED: + __cilkrts_bug("TLS Reducer race"); + break; + case MERGE_INTO_RIGHT: + /* Swap elements in order to preserve object + identity */ + other_el->view = this_el->view; + this_el->view = other_view; + /* FALL THROUGH */ + case MERGE_INTO_LEFT: { + /* Stealing should be disabled during reduce + (even if force-reduce is enabled). */ + +#if DISABLE_PARALLEL_REDUCERS + __cilkrts_stack_frame * volatile *saved_protected_tail; + saved_protected_tail = __cilkrts_disallow_stealing(w, NULL); +#endif + + { + CILK_ASSERT(current_sf->worker == w); + CILK_ASSERT(w->current_stack_frame == current_sf); + + /* TBD: if reduce throws an exception we need to stop it + here. */ + hb->__c_monoid.reduce_fn((void*)hb, + this_el->view, + other_el->view); + w = current_sf->worker; + +#if REDPAR_DEBUG >= 2 + verify_current_wkr(w); + CILK_ASSERT(w->current_stack_frame == current_sf); +#endif + } + +#if DISABLE_PARALLEL_REDUCERS + /* Restore stealing */ + __cilkrts_restore_stealing(w, saved_protected_tail); +#endif + + } break; + } + } + } + } + this->is_leftmost = this->is_leftmost || other_map->is_leftmost; + merging = false; + other_map->merging = false; + verify_current_wkr(w); + __cilkrts_destroy_reducer_map(w, other_map); + return w; +} + + +/** + * Print routine for debugging the merging of reducer maps. + * A no-op unless REDPAR_DEBUG set high enough. + */ +static inline +void debug_map_merge(__cilkrts_worker *w, + cilkred_map *left_map, + cilkred_map *right_map, + __cilkrts_worker **final_wkr) +{ +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=finish_merge, left_map=%p, right_map=%p, w->reducer_map=%p, right_ans=%p, final_wkr=%d]\n", + w->self, left_map, right_map, w->reducer_map, right_map, (*final_wkr)->self); +#endif +} + + +/** + * merge RIGHT into LEFT; + * return whichever map allows for faster merge, and destroy the other one. + * + * *w_ptr should be the currently executing worker. + * *w_ptr may change during execution if the reduction is parallel. + */ +cilkred_map* +merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map) +{ + __cilkrts_worker *w = *w_ptr; + if (!left_map) { + debug_map_merge(w, left_map, right_map, w_ptr); + return right_map; + } + + if (!right_map) { + debug_map_merge(w, left_map, right_map, w_ptr); + return left_map; + } + + /* Special case, if left_map is leftmost, then always merge into it. + For C reducers this forces lazy creation of the leftmost views. */ + if (left_map->is_leftmost || left_map->nelem > right_map->nelem) { + *w_ptr = left_map->merge(w, right_map, cilkred_map::MERGE_INTO_LEFT); + debug_map_merge(*w_ptr, left_map, right_map, w_ptr); + return left_map; + } else { + *w_ptr = right_map->merge(w, left_map, cilkred_map::MERGE_INTO_RIGHT); + debug_map_merge(*w_ptr, left_map, right_map, w_ptr); + return right_map; + } +} + +/** + * Merges RIGHT into LEFT, and then repeatedly calls + * merge_reducer_maps_helper() until (*w_ptr)->reducer_map is NULL. 
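+ *
+ * The repetition is needed because a user-supplied reduce_fn may itself
+ * use reducers or run in parallel, which can leave a freshly created
+ * reducer map installed on the returning worker; that map, too, must be
+ * folded into LEFT before the merge is complete.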
+ * + * *w_ptr may change as reductions execute. + */ +cilkred_map* +repeated_merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map) +{ + // Note: if right_map == NULL but w->reducer_map != NULL, then + // this loop will reduce w->reducer_map into left_map. + do { + left_map = merge_reducer_maps(w_ptr, left_map, right_map); + verify_current_wkr(*w_ptr); + + // Pull any newly created reducer map and loop around again. + right_map = (*w_ptr)->reducer_map; + (*w_ptr)->reducer_map = NULL; + } while (right_map); + return left_map; +} + +/* End reducer_impl.cpp */ diff --git a/libcilkrts/runtime/reducer_impl.h b/libcilkrts/runtime/reducer_impl.h new file mode 100644 index 00000000000..3425967ad8d --- /dev/null +++ b/libcilkrts/runtime/reducer_impl.h @@ -0,0 +1,128 @@ +/* reducer_impl.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file reducer_impl.h + * + * @brief Functions to implement reducers in the runtime. + */ + +#ifndef INCLUDED_REDUCER_IMPL_DOT_H +#define INCLUDED_REDUCER_IMPL_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Construct an empty reducer map from the memory pool associated with the + * given worker. This reducer map must be destroyed before the worker's + * associated global context is destroyed. + * + * @param w __cilkrts_worker the cilkred_map is being created for. + * + * @return Pointer to the initialized cilkred_map. + */ +COMMON_SYSDEP +cilkred_map *__cilkrts_make_reducer_map(__cilkrts_worker *w); + +/** + * Destroy a reducer map. The map must have been allocated from the worker's + * global context and should have been allocated from the same worker. 
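+ *
+ * Illustrative pairing (a sketch, not a complete program):
+ *
+ *     cilkred_map *h = __cilkrts_make_reducer_map(w);
+ *     ...                                   // h accumulates views for w
+ *     __cilkrts_destroy_reducer_map(w, h);  // same w, before g is destroyed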
+ * + * @param w __cilkrts_worker the cilkred_map was created for. + * @param h The cilkred_map to be deallocated. + */ +COMMON_SYSDEP +void __cilkrts_destroy_reducer_map(__cilkrts_worker *w, + cilkred_map *h); + +/** + * Set the specified reducer map as the leftmost map if is_leftmost is true, + * otherwise, set it to not be the leftmost map. + * + * @param h The cilkred_map to be modified. + * @param is_leftmost true if the reducer map is leftmost. + */ +COMMON_SYSDEP +void __cilkrts_set_leftmost_reducer_map(cilkred_map *h, + int is_leftmost); + +/** + * Merge reducer map RIGHT_MAP into LEFT_MAP and return the result of the + * merge. Both maps must be allocated from the global context associated + * with the specified worker. The returned reducer map must be destroyed + * before the worker's associated global context is destroyed. + * + * If two cilkred_maps are specified, one will be destroyed and the other + * one will be returned as the merged cilkred_map. + * + * When reducers can contain nested parallelism, execution can return + * on a different worker than when it started (but still using the + * same stack). + * + * Upon return, *w_ptr stores the pointer to the worker that execution + * returns on. + * + * @param w_ptr Pointer to the currently executing worker. + * @param left_map The left cilkred_map. + * @param right_map The right cilkred_map. + * + * @return pointer to merged cilkred_map. + */ +extern +cilkred_map *merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map); + +/** + * Similar to merge_reducer_maps(), except that after merging + * RIGHT_MAP into LEFT_MAP, it repeatedly merges (*w_ptr)->reducer_map + * into LEFT_MAP. This procedure ensures that any new reducers + * created by the reductions themselves also get merged into LEFT_MAP. + */ +extern +cilkred_map *repeated_merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_REDUCER_IMPL_DOT_H) diff --git a/libcilkrts/runtime/rts-common.h b/libcilkrts/runtime/rts-common.h new file mode 100644 index 00000000000..4ffde7ccb1e --- /dev/null +++ b/libcilkrts/runtime/rts-common.h @@ -0,0 +1,132 @@ +/* rts-common.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_RTS_COMMON_DOT_H +#define INCLUDED_RTS_COMMON_DOT_H + +/* Abbreviations API functions returning different types. By using these + * abbreviations instead of using CILK_API(ret) directly, etags and other + * tools can more easily recognize function signatures. + */ +#define CILK_API_VOID CILK_API(void) +#define CILK_API_VOID_PTR CILK_API(void*) +#define CILK_API_INT CILK_API(int) +#define CILK_API_SIZET CILK_API(size_t) +#define CILK_API_TBB_RETCODE CILK_API(__cilk_tbb_retcode) +#define CILK_API_PEDIGREE CILK_API(__cilkrts_pedigree) + +/* Abbreviations ABI functions returning different types. By using these + * abbreviations instead of using CILK_ABI(ret) directly, etags and other + * tools can more easily recognize function signatures. + */ +#define CILK_ABI_VOID CILK_ABI(void) +#define CILK_ABI_WORKER_PTR CILK_ABI(__cilkrts_worker_ptr) +#define CILK_ABI_THROWS_VOID CILK_ABI_THROWS(void) + +/* documentation aid to identify portable vs. nonportable + parts of the runtime. See README for definitions. */ +#define COMMON_PORTABLE +#define COMMON_SYSDEP +#define NON_COMMON + +#if !(defined __GNUC__ || defined __ICC) +# define __builtin_expect(a_, b_) a_ +#endif + +#ifdef __cplusplus +# define cilk_nothrow throw() +#else +# define cilk_nothrow /*empty in C*/ +#endif + +#ifdef __GNUC__ +# define NORETURN void __attribute__((noreturn)) +#else +# define NORETURN void __declspec(noreturn) +#endif + +#ifdef __GNUC__ +# define NOINLINE __attribute__((noinline)) +#else +# define NOINLINE __declspec(noinline) +#endif + +#ifndef __GNUC__ +# define __attribute__(X) +#endif + +/* Microsoft CL accepts "inline" for C++, but not for C. It accepts + * __inline for both. Intel ICL accepts inline for C of /Qstd=c99 + * is set. The Cilk runtime is assumed to be compiled with /Qstd=c99 + */ +#if defined(_MSC_VER) && ! defined(__INTEL_COMPILER) +# error define inline +# define inline __inline +#endif + +/* Compilers that build the Cilk runtime are assumed to know about zero-cost + * intrinsics (__notify_intrinsic()). For those that don't, #undef the + * following definition: + */ +//#define ENABLE_NOTIFY_ZC_INTRINSIC 1 + +#if defined(__INTEL_COMPILER) +/* The notify intrinsic was introduced in ICC 12.0. */ +# if __INTEL_COMPILER <= 1200 +# undef ENABLE_NOTIFY_ZC_INTRINSIC +# endif +#elif defined(__VXWORKS__) +# undef ENABLE_NOTIFY_ZC_INTRINSIC +#elif defined(__clang__) +# if !defined(__has_extension) || !__has_extension(notify_zc_intrinsic) +# undef ENABLE_NOTIFY_ZC_INTRINSIC +# endif +#elif defined(__arm__) +// __notify_zc_intrinsic not yet supported by gcc for ARM +# undef ENABLE_NOTIFY_ZC_INTRINSIC +#endif + +// If ENABLE_NOTIFY_ZC_INTRINSIC is defined, use __notify_zc_intrisic +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC +# define NOTIFY_ZC_INTRINSIC(annotation, data) \ + __notify_zc_intrinsic(annotation, data) +#else +# define NOTIFY_ZC_INTRINSIC(annotation, data) +#endif + +#endif // ! 
defined(INCLUDED_RTS_COMMON_DOT_H) diff --git a/libcilkrts/runtime/scheduler.c b/libcilkrts/runtime/scheduler.c new file mode 100644 index 00000000000..bab6430d9db --- /dev/null +++ b/libcilkrts/runtime/scheduler.c @@ -0,0 +1,3940 @@ +/* scheduler.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2007-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Cilk scheduler + */ + +#include "scheduler.h" +#include "bug.h" +#include "os.h" +#include "os_mutex.h" +#include "local_state.h" +#include "signal_node.h" +#include "full_frame.h" +#include "sysdep.h" +#include "except.h" +#include "cilk_malloc.h" +#include "pedigrees.h" +#include "record-replay.h" + +#include <limits.h> +#include <string.h> /* memcpy */ +#include <stdio.h> // sprintf +#include <stdlib.h> // malloc, free, abort + +#ifdef _WIN32 +# pragma warning(disable:1786) // disable warning: sprintf is deprecated +# include "sysdep-win.h" +# include "except-win32.h" +#endif // _WIN32 + +// ICL: Don't complain about conversion from pointer to same-sized integral +// type in __cilkrts_put_stack. 
That's why we're using ptrdiff_t +#ifdef _WIN32 +# pragma warning(disable: 1684) +#endif + +#include "cilk/cilk_api.h" +#include "frame_malloc.h" +#include "metacall_impl.h" +#include "reducer_impl.h" +#include "cilk-tbb-interop.h" +#include "cilk-ittnotify.h" +#include "stats.h" + +// ICL: Don't complain about loss of precision in myrand +// I tried restoring the warning after the function, but it didn't +// suppress it +#ifdef _WIN32 +# pragma warning(disable: 2259) +#endif + +#ifndef _WIN32 +# include <unistd.h> +#endif + +#ifdef __VXWORKS__ +// redeclare longjmp() with noreturn to stop warnings +extern __attribute__((noreturn)) + void longjmp(jmp_buf, int); +#endif + +//#define DEBUG_LOCKS 1 +#ifdef DEBUG_LOCKS +// The currently executing worker must own this worker's lock +# define ASSERT_WORKER_LOCK_OWNED(w) \ + { \ + __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \ + CILK_ASSERT((w)->l->lock.owner == tls_worker); \ + } +#else +# define ASSERT_WORKER_LOCK_OWNED(w) +#endif // DEBUG_LOCKS + +// Options for the scheduler. +enum schedule_t { SCHEDULE_RUN, + SCHEDULE_WAIT, + SCHEDULE_EXIT }; + +// Return values for provably_good_steal() +enum provably_good_steal_t +{ + ABANDON_EXECUTION, // Not the last child to the sync - attempt to steal work + CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker + WAIT_FOR_CONTINUE // The replay log indicates that this was the worker + // which continued. Loop until we are the last worker + // to the sync. +}; + + +// Verify that "w" is the worker we are currently executing on. +// Because this check is expensive, this method is usually a no-op. +static inline void verify_current_wkr(__cilkrts_worker *w) +{ +#if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1)) + // Lookup the worker from TLS and compare to w. + __cilkrts_worker* tmp = __cilkrts_get_tls_worker(); + if (w != tmp) { + fprintf(stderr, "Error. W=%d, actual worker =%d...\n", + w->self, + tmp->self); + } + CILK_ASSERT(w == tmp); +#endif +} + +static enum schedule_t worker_runnable(__cilkrts_worker *w); + +// Scheduling-fiber functions: +static void do_return_from_spawn (__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf); +static void do_sync (__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf); + +// max is defined on Windows and VxWorks +#if (! defined(_WIN32)) && (! defined(__VXWORKS__)) + // TBD: definition of max() for Linux. +# define max(a, b) ((a) < (b) ? (b) : (a)) +#endif + +void __cilkrts_dump_stats_to_stderr(global_state_t *g) +{ +#ifdef CILK_PROFILE + int i; + for (i = 0; i < g->total_workers; ++i) { + // Print out statistics for each worker. We collected them, + // so why not print them out? + fprintf(stderr, "Stats for worker %d\n", i); + dump_stats_to_file(stderr, g->workers[i]->l->stats); + __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats); + } + + // Also print out aggregate statistics. 
+ dump_stats_to_file(stderr, &g->stats); +#endif + fprintf(stderr, + "CILK PLUS Thread Info: P=%d, Q=%d\n", + g->P, + g->Q); + fprintf(stderr, + "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes", + (long long)g->frame_malloc.allocated_from_os); +#ifdef CILK_PROFILE + if (g->stats.stack_hwm) + fprintf(stderr, ", %ld stacks", g->stats.stack_hwm); +#endif + fputc('\n', stderr); +} + +static void validate_worker(__cilkrts_worker *w) +{ + /* check the magic numbers, for debugging purposes */ + if (w->l->worker_magic_0 != WORKER_MAGIC_0 || + w->l->worker_magic_1 != WORKER_MAGIC_1) + abort_because_rts_is_corrupted(); +} + +static void double_link(full_frame *left_ff, full_frame *right_ff) +{ + if (left_ff) + left_ff->right_sibling = right_ff; + if (right_ff) + right_ff->left_sibling = left_ff; +} + +/* add CHILD to the right of all children of PARENT */ +static void push_child(full_frame *parent_ff, full_frame *child_ff) +{ + double_link(parent_ff->rightmost_child, child_ff); + double_link(child_ff, 0); + parent_ff->rightmost_child = child_ff; +} + +/* unlink CHILD from the list of all children of PARENT */ +static void unlink_child(full_frame *parent_ff, full_frame *child_ff) +{ + double_link(child_ff->left_sibling, child_ff->right_sibling); + + if (!child_ff->right_sibling) { + /* this is the rightmost child -- update parent link */ + CILK_ASSERT(parent_ff->rightmost_child == child_ff); + parent_ff->rightmost_child = child_ff->left_sibling; + } + child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */ +} + +static void incjoin(full_frame *ff) +{ + ++ff->join_counter; +} + +static int decjoin(full_frame *ff) +{ + CILK_ASSERT(ff->join_counter > 0); + return (--ff->join_counter); +} + +static int simulate_decjoin(full_frame *ff) +{ + CILK_ASSERT(ff->join_counter > 0); + return (ff->join_counter - 1); +} + +/* + * Pseudo-random generator defined by the congruence S' = 69070 * S + * mod (2^32 - 5). Marsaglia (CACM July 1993) says on page 107 that + * this is a ``good one''. There you go. + * + * The literature makes a big fuss about avoiding the division, but + * for us it is not worth the hassle. + */ +static const unsigned RNGMOD = ((1ULL << 32) - 5); +static const unsigned RNGMUL = 69070U; + +static unsigned myrand(__cilkrts_worker *w) +{ + unsigned state = w->l->rand_seed; + state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD); + w->l->rand_seed = state; + return state; +} + +static void mysrand(__cilkrts_worker *w, unsigned seed) +{ + seed %= RNGMOD; + seed += (seed == 0); /* 0 does not belong to the multiplicative + group. Use 1 instead */ + w->l->rand_seed = seed; +} + +/* W grabs its own lock */ +void __cilkrts_worker_lock(__cilkrts_worker *w) +{ + validate_worker(w); + CILK_ASSERT(w->l->do_not_steal == 0); + + /* tell thieves to stay out of the way */ + w->l->do_not_steal = 1; + __cilkrts_fence(); /* probably redundant */ + + __cilkrts_mutex_lock(w, &w->l->lock); +} + +void __cilkrts_worker_unlock(__cilkrts_worker *w) +{ + __cilkrts_mutex_unlock(w, &w->l->lock); + CILK_ASSERT(w->l->do_not_steal == 1); + /* The fence is probably redundant. Use a release + operation when supported (gcc and compatibile); + that is faster on x86 which serializes normal stores. 
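+       (For illustration, the release below is roughly equivalent to the
+       C11 form
+           atomic_store_explicit(&w->l->do_not_steal, 0,
+                                 memory_order_release);
+       assuming an atomic flag; this code uses the older GCC
+       __sync_lock_release() builtin instead.)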
*/ +#if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110) + __sync_lock_release(&w->l->do_not_steal); +#else + w->l->do_not_steal = 0; + __cilkrts_fence(); /* store-store barrier, redundant on x86 */ +#endif +} + +/* try to acquire the lock of some *other* worker */ +static int worker_trylock_other(__cilkrts_worker *w, + __cilkrts_worker *other) +{ + int status = 0; + + validate_worker(other); + + /* This protocol guarantees that, after setting the DO_NOT_STEAL + flag, worker W can enter its critical section after waiting for + the thief currently in the critical section (if any) and at + most one other thief. + + This requirement is overly paranoid, but it should protect us + against future nonsense from OS implementors. + */ + + /* compete for the right to disturb OTHER */ + if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) { + if (other->l->do_not_steal) { + /* leave it alone */ + } else { + status = __cilkrts_mutex_trylock(w, &other->l->lock); + } + __cilkrts_mutex_unlock(w, &other->l->steal_lock); + } + + + return status; +} + +static void worker_unlock_other(__cilkrts_worker *w, + __cilkrts_worker *other) +{ + __cilkrts_mutex_unlock(w, &other->l->lock); +} + + +/* Lock macro Usage: + BEGIN_WITH_WORKER_LOCK(w) { + statement; + statement; + BEGIN_WITH_FRAME_LOCK(w, ff) { + statement; + statement; + } END_WITH_FRAME_LOCK(w, ff); + } END_WITH_WORKER_LOCK(w); + */ +#define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do +#define END_WITH_WORKER_LOCK(w) while (__cilkrts_worker_unlock(w), 0) + +// TBD(jsukha): These are worker lock acquistions on +// a worker whose deque is empty. My conjecture is that we +// do not need to hold the worker lock at these points. +// I have left them in for now, however. +// +// #define REMOVE_POSSIBLY_OPTIONAL_LOCKS +#ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS + #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do + #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (0) +#else + #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do + #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (__cilkrts_worker_unlock(w), 0) +#endif + + +#define BEGIN_WITH_FRAME_LOCK(w, ff) \ + do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do + +#define END_WITH_FRAME_LOCK(w, ff) \ + while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0) + +/* W becomes the owner of F and F can be stolen from W */ +static void make_runnable(__cilkrts_worker *w, full_frame *ff) +{ + w->l->frame_ff = ff; + + /* CALL_STACK is invalid (the information is stored implicitly in W) */ + ff->call_stack = 0; +} + +/* + * The worker parameter is unused, except for print-debugging purposes. + */ +static void make_unrunnable(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int is_loot, + const char *why) +{ + /* CALL_STACK becomes valid again */ + ff->call_stack = sf; + + if (sf) { +#if CILK_LIB_DEBUG + if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0)) + __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf); +#endif + sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED; + sf->worker = 0; + + if (is_loot) + __cilkrts_put_stack(ff, sf); + + /* perform any system-dependent action, such as saving the + state of the stack */ + __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why); + } +} + + +/* Push the next full frame to be made active in this worker and increment its + * join counter. __cilkrts_push_next_frame and pop_next_frame work on a + * one-element queue. 
This queue is used to communicate across the runtime + * from the code that wants to activate a frame to the code that can actually + * begin execution on that frame. They are asymetrical in that push + * increments the join counter but pop does not decrement it. Rather, a + * single push/pop combination makes a frame active and increments its join + * counter once. */ +void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(ff); + CILK_ASSERT(!w->l->next_frame_ff); + incjoin(ff); + w->l->next_frame_ff = ff; +} + +/* Get the next full-frame to be made active in this worker. The join count + * of the full frame will have been incremented by the corresponding push + * event. See __cilkrts_push_next_frame, above. + */ +static full_frame *pop_next_frame(__cilkrts_worker *w) +{ + full_frame *ff; + ff = w->l->next_frame_ff; + // Remove the frame from the next_frame field. + // + // If this is a user worker, then there is a chance that another worker + // from our team could push work into our next_frame (if it is the last + // worker doing work for this team). The other worker's setting of the + // next_frame could race with our setting of next_frame to NULL. This is + // the only possible race condition on next_frame. However, if next_frame + // has a non-NULL value, then it means the team still has work to do, and + // there is no chance of another team member populating next_frame. Thus, + // it is safe to set next_frame to NULL, if it was populated. There is no + // need for an atomic op. + if (NULL != ff) { + w->l->next_frame_ff = NULL; + } + return ff; +} + +/* + * Identify the single worker that is allowed to cross a sync in this frame. A + * thief should call this function when it is the first to steal work from a + * user worker. "First to steal work" may mean that there has been parallelism + * in the user worker before, but the whole team sync'd, and this is the first + * steal after that. + * + * This should happen while holding the worker and frame lock. + */ +static void set_sync_master(__cilkrts_worker *w, full_frame *ff) +{ + w->l->last_full_frame = ff; + ff->sync_master = w; +} + +/* + * The sync that ends all parallelism for a particular user worker is about to + * be crossed. Decouple the worker and frame. + * + * No locks need to be held since the user worker isn't doing anything, and none + * of the system workers can steal from it. But unset_sync_master() should be + * called before the user worker knows about this work (i.e., before it is + * inserted into the w->l->next_frame_ff is set). + */ +static void unset_sync_master(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(WORKER_USER == w->l->type); + CILK_ASSERT(ff->sync_master == w); + ff->sync_master = NULL; + w->l->last_full_frame = NULL; +} + +/******************************************************************** + * THE protocol: + ********************************************************************/ +/* + * This is a protocol for work stealing that minimizes the overhead on + * the victim. + * + * The protocol uses three shared pointers into the worker's deque: + * - T - the "tail" + * - H - the "head" + * - E - the "exception" NB: In this case, "exception" has nothing to do + * with C++ throw-catch exceptions -- it refers only to a non-normal return, + * i.e., a steal or similar scheduling exception. + * + * with H <= E, H <= T. + * + * Stack frames SF, where H <= E < T, are available for stealing. + * + * The worker operates on the T end of the stack. 
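+ * For example, a sketch of the operations described above, in terms of the
+ * worker fields this file manipulates:
+ *
+ *     *w->tail++ = sf;        // spawn: expose the continuation (T++)
+ *     sf = *--w->tail;        // return: try to reclaim it (--T)
+ *     if (w->exc > w->tail)   // E > T: a thief (or a signal) intervened;
+ *         take_the_slow_path();   // (hypothetical name for the THE handler)
+ *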
The frame being + * worked on is not on the stack. To make a continuation available for + * stealing the worker pushes a from onto the stack: stores *T++ = SF. + * To return, it pops the frame off the stack: obtains SF = *--T. + * + * After decrementing T, the condition E > T signals to the victim that + * it should invoke the runtime system's "THE" exception handler. The + * pointer E can become INFINITY, in which case the victim must invoke + * the THE exception handler as soon as possible. + * + * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998, + * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information + * on the THE protocol. + */ + +/* the infinity value of E */ +#define EXC_INFINITY ((__cilkrts_stack_frame **) (-1)) + +static void increment_E(__cilkrts_worker *victim) +{ + __cilkrts_stack_frame *volatile *tmp; + + // The currently executing worker must own the worker lock to touch + // victim->exc + ASSERT_WORKER_LOCK_OWNED(victim); + + tmp = victim->exc; + if (tmp != EXC_INFINITY) { + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + victim->exc = tmp + 1; + __cilkrts_fence(); + } +} + +static void decrement_E(__cilkrts_worker *victim) +{ + __cilkrts_stack_frame *volatile *tmp; + + // The currently executing worker must own the worker lock to touch + // victim->exc + ASSERT_WORKER_LOCK_OWNED(victim); + + tmp = victim->exc; + if (tmp != EXC_INFINITY) { + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + victim->exc = tmp - 1; + __cilkrts_fence(); /* memory fence not really necessary */ + } +} + +#if 0 +/* for now unused, will be necessary if we implement abort */ +static void signal_THE_exception(__cilkrts_worker *wparent) +{ + wparent->exc = EXC_INFINITY; + __cilkrts_fence(); +} +#endif + +static void reset_THE_exception(__cilkrts_worker *w) +{ + // The currently executing worker must own the worker lock to touch + // w->exc + ASSERT_WORKER_LOCK_OWNED(w); + + w->exc = w->head; + __cilkrts_fence(); +} + +/* conditions under which victim->head can be stolen: */ +static int can_steal_from(__cilkrts_worker *victim) +{ + return ((victim->head < victim->tail) && + (victim->head < victim->protected_tail)); +} + +/* Return TRUE if the frame can be stolen, false otherwise */ +static int dekker_protocol(__cilkrts_worker *victim) +{ + // increment_E and decrement_E are going to touch victim->exc. 
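+    // (Illustration of the handshake: a thief first bumps E past H while
+    //  a returning victim decrements T; if both race for the same frame,
+    //  the victim then sees E > T and diverts to the runtime's THE
+    //  exception path, so at most one of them keeps the frame.)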
The + // currently executing worker must own victim's lock before they can + // modify it + ASSERT_WORKER_LOCK_OWNED(victim); + + /* ASSERT(E >= H); */ + + increment_E(victim); + + /* ASSERT(E >= H + 1); */ + if (can_steal_from(victim)) { + /* success, we can steal victim->head and set H <- H + 1 + in detach() */ + return 1; + } else { + /* failure, restore previous state */ + decrement_E(victim); + return 0; + } +} + + +/* Link PARENT and CHILD in the spawn tree */ +static full_frame *make_child(__cilkrts_worker *w, + full_frame *parent_ff, + __cilkrts_stack_frame *child_sf, + cilk_fiber *fiber) +{ + full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf); + + child_ff->parent = parent_ff; + push_child(parent_ff, child_ff); + + //DBGPRINTF("%d- make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n" + // " parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n" + // " child - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n", + // w->self, child, parent, child_sf, + // parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child, + // child->parent, child->left_sibling, child->right_sibling, child->rightmost_child); + CILK_ASSERT(parent_ff->call_stack); + child_ff->is_call_child = (fiber == NULL); + + /* PLACEHOLDER_FIBER is used as non-null marker indicating that + child should be treated as a spawn child even though we have not + yet assigned a real fiber to its parent. */ + if (fiber == PLACEHOLDER_FIBER) + fiber = NULL; /* Parent actually gets a null fiber, for now */ + + /* perform any system-dependent actions, such as capturing + parameter passing information */ + /*__cilkrts_make_child_sysdep(child, parent);*/ + + /* Child gets reducer map and stack of parent. + Parent gets a new map and new stack. */ + child_ff->fiber_self = parent_ff->fiber_self; + child_ff->sync_master = NULL; + + if (child_ff->is_call_child) { + /* Cause segfault on any attempted access. The parent gets + the child map and stack when the child completes. */ + parent_ff->fiber_self = 0; + } else { + parent_ff->fiber_self = fiber; + } + + incjoin(parent_ff); + return child_ff; +} + +static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf) +{ + __cilkrts_stack_frame *p = sf->call_parent; + sf->call_parent = 0; + return p; +} + +/* w should be the currently executing worker. + * loot_sf is the youngest stack frame in the call stack being + * unrolled (i.e., the most deeply nested stack frame.) + * + * When this method is called for a steal, loot_sf should be on a + * victim worker which is different from w. + * For CILK_FORCE_REDUCE, the victim worker will equal w. + * + * Before execution, the __cilkrts_stack_frame's have pointers from + * older to younger, i.e., a __cilkrts_stack_frame points to parent. + * + * This method creates a full frame for each __cilkrts_stack_frame in + * the call stack, with each full frame also pointing to its parent. + * + * The method returns the full frame created for loot_sf, i.e., the + * youngest full frame. + */ +static full_frame *unroll_call_stack(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *const loot_sf) +{ + __cilkrts_stack_frame *sf = loot_sf; + __cilkrts_stack_frame *rev_sf = 0; + __cilkrts_stack_frame *t_sf; + + CILK_ASSERT(sf); + /*CILK_ASSERT(sf->call_parent != sf);*/ + + /* The leafmost frame is unsynched. 
*/ + if (sf->worker != w) + sf->flags |= CILK_FRAME_UNSYNCHED; + + /* Reverse the call stack to make a linked list ordered from parent + to child. sf->call_parent points to the child of SF instead of + the parent. */ + do { + t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent; + sf->call_parent = rev_sf; + rev_sf = sf; + sf = t_sf; + } while (sf); + sf = rev_sf; + + /* Promote each stack frame to a full frame in order from parent + to child, following the reversed list we just built. */ + make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1"); + /* T is the *child* of SF, because we have reversed the list */ + for (t_sf = __cilkrts_advance_frame(sf); t_sf; + sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) { + ff = make_child(w, ff, t_sf, NULL); + make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2"); + } + + /* XXX What if the leafmost frame does not contain a sync + and this steal is from promote own deque? */ + /*sf->flags |= CILK_FRAME_UNSYNCHED;*/ + + CILK_ASSERT(!sf->call_parent); + return ff; +} + +/* detach the top of the deque frame from the VICTIM and install a new + CHILD frame in its place */ +static void detach_for_steal(__cilkrts_worker *w, + __cilkrts_worker *victim, + cilk_fiber* fiber) +{ + /* ASSERT: we own victim->lock */ + + full_frame *parent_ff, *child_ff, *loot_ff; + __cilkrts_stack_frame *volatile *h; + __cilkrts_stack_frame *sf; + + w->l->team = victim->l->team; + + CILK_ASSERT(w->l->frame_ff == 0 || w == victim); + + h = victim->head; + + CILK_ASSERT(*h); + + victim->head = h + 1; + + parent_ff = victim->l->frame_ff; + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + /* parent no longer referenced by victim */ + decjoin(parent_ff); + + /* obtain the victim call stack */ + sf = *h; + + /* perform system-dependent normalizations */ + /*__cilkrts_normalize_call_stack_on_steal(sf);*/ + + /* unroll PARENT_FF with call stack SF, adopt the youngest + frame LOOT. If loot_ff == parent_ff, then we hold loot_ff->lock, + otherwise, loot_ff is newly created and we can modify it without + holding its lock. */ + loot_ff = unroll_call_stack(w, parent_ff, sf); + + #if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n", + w->self, victim->self, + parent_ff, loot_ff); + #endif + + if (WORKER_USER == victim->l->type && + NULL == victim->l->last_full_frame) { + // Mark this looted frame as special: only the original user worker + // may cross the sync. + // + // This call is a shared access to + // victim->l->last_full_frame. + set_sync_master(victim, loot_ff); + } + + /* LOOT is the next frame that the thief W is supposed to + run, unless the thief is stealing from itself, in which + case the thief W == VICTIM executes CHILD and nobody + executes LOOT. */ + if (w == victim) { + /* Pretend that frame has been stolen */ + loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED; + loot_ff->simulated_stolen = 1; + } + else + __cilkrts_push_next_frame(w, loot_ff); + + // After this "push_next_frame" call, w now owns loot_ff. + child_ff = make_child(w, loot_ff, 0, fiber); + + BEGIN_WITH_FRAME_LOCK(w, child_ff) { + /* install child in the victim's work queue, taking + the parent_ff's place */ + /* child is referenced by victim */ + incjoin(child_ff); + + // With this call, w is bestowing ownership of the newly + // created frame child_ff to the victim, and victim is + // giving up ownership of parent_ff. 
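/* [Editorial sketch] The detach performed by detach_for_steal() is only legal
 * because dekker_protocol() already won the THE handshake against the victim's
 * own pop path (and the thief additionally holds the victim's worker lock).
 * Below is a minimal, simplified model of both sides of that handshake, using
 * the E/H/T names of the PLDI'98 paper cited earlier; it is not the runtime's
 * real code. */
#if 0  /* illustrative only; not part of this commit */
typedef struct simple_deque {
    volatile long E;    /* exception pointer, advanced by thieves        */
    volatile long H;    /* head: next frame a thief would steal          */
    volatile long T;    /* tail: next free slot for the victim's pushes  */
} simple_deque;

/* Thief side: corresponds to increment_E() + can_steal_from() above. */
static int thief_try_steal(simple_deque *d)
{
    d->E++;                      /* announce the steal attempt           */
    /* a memory fence is required here on real hardware                  */
    if (d->H < d->T) {           /* something to steal                   */
        d->H++;                  /* take the oldest frame                */
        return 1;
    }
    d->E--;                      /* lost the race: undo                  */
    return 0;
}

/* Victim side: every pop checks E against T, as described in the THE
   comment near the top of this file. */
static int victim_try_pop(simple_deque *d)
{
    d->T--;                      /* speculatively pop                    */
    /* a memory fence is required here on real hardware                  */
    if (d->E > d->T) {           /* a thief may be after the same frame  */
        d->T++;                  /* restore, then take the slow,         */
        return 0;                /* lock-protected exception path        */
    }
    return 1;                    /* uncontended fast path                */
}
#endif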
+ // + // Worker w will either take ownership of parent_ff + // if parent_ff == loot_ff, or parent_ff will be + // suspended. + // + // Note that this call changes the victim->frame_ff + // while the victim may be executing. + make_runnable(victim, child_ff); + } END_WITH_FRAME_LOCK(w, child_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); +} + +/** + * @brief cilk_fiber_proc that resumes user code after a successful + * random steal. + + * This function longjmps back into the user code whose state is + * stored in cilk_fiber_get_data(fiber)->resume_sf. The stack pointer + * is adjusted so that the code resumes on the specified fiber stack + * instead of its original stack. + * + * This method gets executed only on a fiber freshly allocated from a + * pool. + * + * @param fiber The fiber being used to resume user code. + * @param arg Unused. + */ +static +void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber) +{ + cilk_fiber_data *data = cilk_fiber_get_data(fiber); + __cilkrts_stack_frame* sf = data->resume_sf; + full_frame *ff; + + CILK_ASSERT(sf); + + // When we pull the resume_sf out of the fiber to resume it, clear + // the old value. + data->resume_sf = NULL; + CILK_ASSERT(sf->worker == data->owner); + ff = sf->worker->l->frame_ff; + + // For Win32, we need to overwrite the default exception handler + // in this function, so that when the OS exception handling code + // walks off the top of the current Cilk stack, it reaches our stub + // handler. + + // Also, this function needs to be wrapped into a try-catch block + // so the compiler generates the appropriate exception information + // in this frame. + + // TBD: IS THIS HANDLER IN THE WRONG PLACE? Can we longjmp out of + // this function (and does it matter?) +#if defined(_WIN32) && !defined(_WIN64) + install_exception_stub_handler(); + __try +#endif + { + char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf); + + // Notify the Intel tools that we're stealing code + ITT_SYNC_ACQUIRED(sf->worker); + NOTIFY_ZC_INTRINSIC("cilk_continue", sf); + + // TBD: We'd like to move TBB-interop methods into the fiber + // eventually. + cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); + + sf->flags &= ~CILK_FRAME_SUSPENDED; + + // longjmp to user code. Don't process exceptions here, + // because we are resuming a stolen frame. + sysdep_longjmp_to_sf(new_sp, sf, NULL); + /*NOTREACHED*/ + // Intel's C compiler respects the preceding lint pragma + } +#if defined(_WIN32) && !defined(_WIN64) + __except (CILK_ASSERT(!"should not execute the the stub filter"), + EXCEPTION_EXECUTE_HANDLER) + { + // If we are here, that means something very wrong + // has happened in our exception processing... + CILK_ASSERT(! "should not be here!"); + } +#endif +} + +static void random_steal(__cilkrts_worker *w) +{ + __cilkrts_worker *victim = NULL; + cilk_fiber *fiber = NULL; + int n; + int success = 0; + int32_t victim_id; + + // Nothing's been stolen yet. When true, this will flag + // setup_for_execution_pedigree to increment the pedigree + w->l->work_stolen = 0; + + /* If the user has disabled stealing (using the debugger) we fail */ + if (__builtin_expect(w->g->stealing_disabled, 0)) + return; + + CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w); + + /* If there is only one processor work can still be stolen. + There must be only one worker to prevent stealing. 
*/ + CILK_ASSERT(w->g->total_workers > 1); + + /* pick random *other* victim */ + n = myrand(w) % (w->g->total_workers - 1); + if (n >= w->self) + ++n; + + // If we're replaying a log, override the victim. -1 indicates that + // we've exhausted the list of things this worker stole when we recorded + // the log so just return. If we're not replaying a log, + // replay_get_next_recorded_victim() just returns the victim ID passed in. + n = replay_get_next_recorded_victim(w, n); + if (-1 == n) + return; + + victim = w->g->workers[n]; + + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) { + /* Verify that we can get a stack. If not, no need to continue. */ + fiber = cilk_fiber_allocate(&w->l->fiber_pool); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE); + + + if (NULL == fiber) { +#if FIBER_DEBUG >= 2 + fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n", + w->self); +#endif + return; + } + + /* do not steal from self */ + CILK_ASSERT (victim != w); + + /* Execute a quick check before engaging in the THE protocol. + Avoid grabbing locks if there is nothing to steal. */ + if (!can_steal_from(victim)) { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); + // Fibers we use when trying to steal should not be active, + // and thus should not have any other references. + CILK_ASSERT(0 == ref_count); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + return; + } + + /* Attempt to steal work from the victim */ + if (worker_trylock_other(w, victim)) { + if (w->l->type == WORKER_USER && victim->l->team != w) { + + // Fail to steal if this is a user worker and the victim is not + // on this team. If a user worker were allowed to steal work + // descended from another user worker, the former might not be + // done with its work by the time it was needed to resume and + // unbind. Therefore, user workers are not permitted to change + // teams. + + // There is no race on the victim's team because the victim cannot + // change its team until it runs out of work to do, at which point + // it will try to take out its own lock, and this worker already + // holds it. + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER); + + } else if (victim->l->frame_ff) { + // A successful steal will change victim->frame_ff, even + // though the victim may be executing. Thus, the lock on + // the victim's deque is also protecting victim->frame_ff. + if (dekker_protocol(victim)) { + int proceed_with_steal = 1; // optimistic + + // If we're replaying a log, verify that this the correct frame + // to steal from the victim + if (! replay_match_victim_pedigree(w, victim)) + { + // Abort the steal attempt. decrement_E(victim) to + // counter the increment_E(victim) done by the + // dekker protocol + decrement_E(victim); + proceed_with_steal = 0; + } + + if (proceed_with_steal) + { + START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) { + success = 1; + detach_for_steal(w, victim, fiber); + victim_id = victim->self; + + #if REDPAR_DEBUG >= 1 + fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n", + w->self, victim->self, fiber); + #endif + + // The use of victim->self contradicts our + // classification of the "self" field as + // local. But since this code is only for + // debugging, it is ok. 
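/* [Editorial sketch] The victim-selection step near the top of random_steal()
 * draws a uniformly random worker other than ourselves: a value in
 * [0, total_workers-2] is drawn and every value >= w->self is shifted up by
 * one, so w->self can never come out.  A standalone illustration of that
 * trick (pick_random_other and the use of rand() are illustrative, not
 * runtime APIs): */
#if 0  /* illustrative only; not part of this commit */
#include <assert.h>
#include <stdlib.h>

static int pick_random_other(int self, int total_workers)
{
    int n = rand() % (total_workers - 1);   /* 0 .. total_workers-2 */
    if (n >= self)
        ++n;                                /* skip our own id      */
    return n;
}

static void check_pick_random_other(void)
{
    int self, i;
    for (self = 0; self < 8; self++)
        for (i = 0; i < 1000; i++) {
            int v = pick_random_other(self, 8);
            assert(v != self && 0 <= v && v < 8);
        }
}
#endif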
+ DBGPRINTF ("%d-%p: Stealing work from worker %d\n" + " sf: %p, call parent: %p\n", + w->self, GetCurrentFiber(), victim->self, + w->l->next_frame_ff->call_stack, + w->l->next_frame_ff->call_stack->call_parent); + } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS); + } // end if(proceed_with_steal) + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER); + } + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); + } + worker_unlock_other(w, victim); + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK); + } + + // Record whether work was stolen. When true, this will flag + // setup_for_execution_pedigree to increment the pedigree + w->l->work_stolen = success; + + if (0 == success) { + // failed to steal work. Return the fiber to the pool. + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); + // Fibers we use when trying to steal should not be active, + // and thus should not have any other references. + CILK_ASSERT(0 == ref_count); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + } + else + { + // Since our steal was successful, finish initialization of + // the fiber. + cilk_fiber_reset_state(fiber, + fiber_proc_to_resume_user_code_for_random_steal); + // Record the pedigree of the frame that w has stolen. + // record only if CILK_RECORD_LOG is set + replay_record_steal(w, victim_id); + } +} + + + +/** + * At a provably good steal, we need to transfer the child reducer map + * from ff->children_reducer_map into v->reducer_map, where v is the + * worker that resumes execution of ff. + * + * Normally, we have v == w, where w is the currently executing + * worker. In the case where we are resuming a team leader on a user + * worker, however, v might differ from w. + + * Thus, this, operation is a no-op, since we can't really move + * ff->children_reducer_map into w here. + * + * Instead, this work is done in setup_for_execution_reducers(). + */ +static inline void provably_good_steal_reducers(__cilkrts_worker *w, + full_frame *ff) +{ + // No-op. +} + +/* at a provably good steal, incorporate the accumulated exceptions of + children into the parent's exception */ +static void provably_good_steal_exceptions(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we own ff->lock + ff->pending_exception = + __cilkrts_merge_pending_exceptions(w, + ff->child_pending_exception, + ff->pending_exception); + ff->child_pending_exception = NULL; +} + +/* At sync discard the frame's old stack and take the leftmost child's. */ +static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(NULL == ff->fiber_self); + ff->fiber_self = ff->fiber_child; + ff->fiber_child = NULL; +} + +static void __cilkrts_mark_synched(full_frame *ff) +{ + ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED; + ff->simulated_stolen = 0; +} + +static +enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold w->lock and ff->lock + + enum provably_good_steal_t result = ABANDON_EXECUTION; + + // If the current replay entry is a sync record matching the worker's + // pedigree, AND this isn't the last child to the sync, return + // WAIT_FOR_CONTINUE to indicate that the caller should loop until + // we find the right frame to steal and CONTINUE_EXECUTION is returned. 
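/* [Editorial sketch] A "provably good steal" resumes a frame at its sync only
 * when the join counter maintained by incjoin()/decjoin() drops to zero,
 * i.e. when the last outstanding reference (in particular the last returning
 * child) is released.  A rough model of that discipline, with illustrative
 * names; the runtime's real accounting is more involved: */
#if 0  /* illustrative only; not part of this commit */
struct frame_model {
    int join_counter;   /* outstanding references: children, owning worker */
};

static void model_add_reference(struct frame_model *ff)
{
    ff->join_counter++;                   /* e.g. a child was spawned      */
}

static int model_drop_reference(struct frame_model *ff)
{
    /* Nonzero means this was the last reference: the frame is now synched
       and whoever released it may resume the frame past its sync.          */
    return --ff->join_counter == 0;
}
#endif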
+ int match_found = replay_match_sync_pedigree(w); + if (match_found && (0 != simulate_decjoin(ff))) + return WAIT_FOR_CONTINUE; + + START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) { + if (decjoin(ff) == 0) { + provably_good_steal_reducers(w, ff); + provably_good_steal_exceptions(w, ff); + provably_good_steal_stacks(w, ff); + __cilkrts_mark_synched(ff); + + // If the original owner wants this frame back (to resume + // it on its original thread) pass it back now. + if (NULL != ff->sync_master) { + // The frame wants to go back and be executed by the original + // user thread. We can throw caution to the wind and push the + // frame straight onto its queue because the only way we have + // gotten to this point of being able to continue execution of + // the frame is if the original user worker is spinning without + // work. + + unset_sync_master(w->l->team, ff); + __cilkrts_push_next_frame(w->l->team, ff); + + // If this is the team leader we're not abandoning the work + if (w == w->l->team) + result = CONTINUE_EXECUTION; + } else { + __cilkrts_push_next_frame(w, ff); + result = CONTINUE_EXECUTION; // Continue working on this thread + } + + // The __cilkrts_push_next_frame() call changes ownership + // of ff to the specified worker. + } + } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL); + + // Only write a SYNC record if: + // - We're recording a log *AND* + // - We're the worker continuing from this sync + replay_record_sync(w, result == CONTINUE_EXECUTION); + + // If we're replaying a log, and matched a sync from the log, mark the + // sync record seen if the sync isn't going to be abandoned. + replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION); + + return result; +} + +static void unconditional_steal(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold ff->lock + + START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) { + decjoin(ff); + __cilkrts_push_next_frame(w, ff); + } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL); +} + + +/* CHILD is about to die. Give its exceptions to a sibling or to the + parent. */ +static inline void splice_exceptions_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + // ASSERT: We own parent_ff->lock + CILK_ASSERT(child_ff->is_call_child); + CILK_ASSERT(NULL == child_ff->right_pending_exception); + CILK_ASSERT(NULL == parent_ff->pending_exception); + + parent_ff->pending_exception = child_ff->pending_exception; + child_ff->pending_exception = NULL; +} + +/** + * Merge exceptions for a dying child. + * + * @param w The currently executing worker. + * @param ff The child frame that is dying. + * @param left_exception_ptr Pointer to the exception that is to our left. + */ +static inline +void splice_exceptions_for_spawn(__cilkrts_worker *w, + full_frame *ff, + struct pending_exception_info **left_exception_ptr) +{ + // ASSERT: parent_ff == child_ff->parent. + // ASSERT: We own parent_ff->lock + + // Merge current exception into the slot where the left + // exception should go. + *left_exception_ptr = + __cilkrts_merge_pending_exceptions(w, + *left_exception_ptr, + ff->pending_exception); + ff->pending_exception = NULL; + + + // Merge right exception into the slot where the left exception + // should go. 
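/* [Editorial sketch] Both merges in this function funnel into
 * *left_exception_ptr so that, of all exceptions raised by this frame and by
 * its right siblings, the one that would have been raised first in a serial
 * execution (the leftmost) survives to the sync.  A minimal leftmost-wins
 * merge with a hypothetical record type; this is not
 * __cilkrts_merge_pending_exceptions: */
#if 0  /* illustrative only; not part of this commit */
struct exc_model {
    int valid;          /* plus whatever payload a real exception carries */
};

/* Keep the left exception if there is one; otherwise adopt the right one. */
static struct exc_model *leftmost_wins(struct exc_model *left,
                                       struct exc_model *right)
{
    if (left && left->valid)
        return left;    /* left is older in serial order: it wins         */
    return right;       /* otherwise the right-hand exception survives    */
}
#endif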
+ *left_exception_ptr = + __cilkrts_merge_pending_exceptions(w, + *left_exception_ptr, + ff->right_pending_exception); + ff->right_pending_exception = NULL; +} + + +static inline void splice_stacks_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ +#if CILK_LIB_DEBUG + if (parent_ff->call_stack) + CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ)); +#endif + + /* A synched frame does not have accumulated child reducers. */ + CILK_ASSERT(!child_ff->fiber_child); + CILK_ASSERT(child_ff->is_call_child); + + /* An attached parent has no self fiber. It may have + accumulated child fibers or child owners, which should be + ignored until sync. */ + CILK_ASSERT(!parent_ff->fiber_self); + parent_ff->fiber_self = child_ff->fiber_self; + child_ff->fiber_self = NULL; +} + +static void finalize_child_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + // ASSERT: we hold w->lock and parent_ff->lock + + START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) { + CILK_ASSERT(child_ff->is_call_child); + CILK_ASSERT(child_ff->join_counter == 0); + CILK_ASSERT(!child_ff->rightmost_child); + CILK_ASSERT(child_ff == parent_ff->rightmost_child); + + // CHILD is about to die. + // Splicing out reducers is a no-op for a call since + // w->reducer_map should already store the correct + // reducer map. + + // ASSERT there are no maps left to reduce. + CILK_ASSERT(NULL == child_ff->children_reducer_map); + CILK_ASSERT(NULL == child_ff->right_reducer_map); + + splice_exceptions_for_call(w, parent_ff, child_ff); + + splice_stacks_for_call(w, parent_ff, child_ff); + + /* remove CHILD from list of children of PARENT */ + unlink_child(parent_ff, child_ff); + + /* continue with the parent. */ + unconditional_steal(w, parent_ff); + __cilkrts_destroy_full_frame(w, child_ff); + } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD); +} + + +/** + * The invariant on ff->children_reducer_map is that when ff is + * synched and when we are about to resume execution of ff, at least + * one of ff->children_reducer_map and w->reducer_map must be NULL. + * + * Consider the two possibilities before resuming execution of ff: + * + * 1. Suppose ff is synched and suspended. Then either + * + * (a) ff->children_reducer_map stores the reducer map that w + * should use, where w is the worker resuming execution of ff, + * OR + * (b) w already has a user map, and ff->children_reducer_map is NULL. + * + * Case (a) happens when we are resuming execution of ff as a + * provably good steal. In this case, w->reducer_map should be + * NULL and ff->children_reducer_map is valid. To resume + * execution of ff on w, set w->reducer_map to + * ff->children_reducer_map. + * + * Case (b) occurs when we resume execution of ff because ff is a + * called child. Then, ff->children_reducer_map should be NULL, + * and w should already have a valid reducer map when resuming + * execution of ff. We resume execution of ff without changing + * w->reducer_map. + * + * 2. Suppose frame ff is not synched (i.e., it is active and might have + * active children). Then ff->children_reducer_map is the slot for + * storing the reducer map from ff's leftmost child, as in the reducer + * protocol. The runtime may resume execution of ff while it is not + * synched only because of a steal. + * In this case, while we are resuming ff, ff->children_reducer_map + * may be non-NULL (because one of ff's children has completed). + * We resume execution of ff without changing w->reducer_map. 
+ */ +static void setup_for_execution_reducers(__cilkrts_worker *w, + full_frame *ff) +{ + // We only need to move ff->children_reducer_map into + // w->reducer_map in case 1(a). + // + // First check whether ff is synched. + __cilkrts_stack_frame *sf = ff->call_stack; + if (!(sf->flags & CILK_FRAME_UNSYNCHED)) { + // In this case, ff is synched. (Case 1). + CILK_ASSERT(!ff->rightmost_child); + + // Test whether we are in case 1(a) and have + // something to do. Note that if both + // ff->children_reducer_map and w->reducer_map are NULL, we + // can't distinguish between cases 1(a) and 1(b) here. + if (ff->children_reducer_map) { + // We are in Case 1(a). + CILK_ASSERT(!w->reducer_map); + w->reducer_map = ff->children_reducer_map; + ff->children_reducer_map = NULL; + } + } +} + +static void setup_for_execution_exceptions(__cilkrts_worker *w, + full_frame *ff) +{ + CILK_ASSERT(NULL == w->l->pending_exception); + w->l->pending_exception = ff->pending_exception; + ff->pending_exception = NULL; +} + +#if 0 /* unused */ +static void setup_for_execution_stack(__cilkrts_worker *w, + full_frame *ff) +{ +} +#endif + +/* + * setup_for_execution_pedigree + * + * Copies the pedigree information from the frame we're resuming to the + * worker. Increments the pedigree if this is work that has been stolen + * to match the increment on a return from a spawn helper. + */ +static void setup_for_execution_pedigree(__cilkrts_worker *w) +{ + int pedigree_unsynched; + __cilkrts_stack_frame *sf = w->current_stack_frame; + + CILK_ASSERT(NULL != sf); + + // If this isn't an ABI 1 or later frame, there's no pedigree information + if (0 == CILK_FRAME_VERSION_VALUE(sf->flags)) + return; + + // Note whether the pedigree is unsynched and clear the flag before + // we forget + pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + + // If we're just marshalling onto this worker, do not increment + // the rank since that wouldn't happen in a sequential execution + if (w->l->work_stolen || pedigree_unsynched) + { + if (w->l->work_stolen) + w->pedigree.rank = sf->parent_pedigree.rank + 1; + else + w->pedigree.rank = sf->parent_pedigree.rank; + } + + w->pedigree.parent = sf->parent_pedigree.parent; + w->l->work_stolen = 0; +} + +static void setup_for_execution(__cilkrts_worker *w, + full_frame *ff, + int is_return_from_call) +{ + // ASSERT: We own w->lock and ff->lock || P == 1 + + setup_for_execution_reducers(w, ff); + setup_for_execution_exceptions(w, ff); + /*setup_for_execution_stack(w, ff);*/ + + ff->call_stack->worker = w; + w->current_stack_frame = ff->call_stack; + + // If this is a return from a call, leave the pedigree alone + if (! is_return_from_call) + setup_for_execution_pedigree(w); + + __cilkrts_setup_for_execution_sysdep(w, ff); + + w->head = w->tail = w->l->ltq; + reset_THE_exception(w); + + make_runnable(w, ff); +} + + +/* + * Called by the scheduling fiber, right before + * resuming a sf/ff for user code. + * + * This method associates the specified sf with the worker. + * + * It also asserts that w, ff, and sf all have the expected properties + * for resuming user code. + */ +void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ + w->current_stack_frame = sf; + sf->worker = w; + + // Lots of debugging checks on the state of the fiber we might be + // resuming. +#if FIBER_DEBUG >= 1 +# if FIBER_DEBUG >= 3 + { + fprintf(stderr, "w=%d: ff=%p, sf=%p. 
about to resume user code\n", + w->self, ff, sf); + } +# endif + + const int flags = sf->flags; + CILK_ASSERT(flags & CILK_FRAME_SUSPENDED); + CILK_ASSERT(!sf->call_parent); + CILK_ASSERT(w->head == w->tail); + + /* A frame can not be resumed unless it was suspended. */ + CILK_ASSERT(ff->sync_sp != NULL); + + /* The leftmost frame has no allocated stack */ + if (ff->simulated_stolen) + CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED); + else if (flags & CILK_FRAME_UNSYNCHED) + /* XXX By coincidence sync_sp could be null. */ + CILK_ASSERT(ff->fiber_self != NULL); + else + /* XXX This frame could be resumed unsynched on the leftmost stack */ + CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w)); + CILK_ASSERT(w->l->frame_ff == ff); +#endif +} + + +/** + * This method is the first method that should execute after we've + * switched to a scheduling fiber from user code. + * + * @param fiber The scheduling fiber for the current worker. + * @param wptr The current worker. + */ +static void enter_runtime_transition_proc(cilk_fiber *fiber) +{ + // We can execute this method for one of three reasons: + // 1. Undo-detach finds parent stolen. + // 2. Sync suspends frame. + // 3. Return from Cilk entry point. + // + // + // In cases 1 and 2, the frame may be truly suspended or + // may be immediately executed by this worker after provably_good_steal. + // + // + // There is a fourth case, which can, but does not need to execute + // this function: + // 4. Starting up the scheduling loop on a user or + // system worker. In this case, we won't have + // a scheduling stack function to run. + __cilkrts_worker* w = cilk_fiber_get_owner(fiber); + if (w->l->post_suspend) { + // Run the continuation function passed to longjmp_into_runtime + run_scheduling_stack_fcn(w); + + // After we have jumped into the runtime and run the + // scheduling function, any reducer map the worker had before entering the runtime + // should have already been saved into the appropriate full + // frame. + CILK_ASSERT(NULL == w->reducer_map); + + // There shouldn't be any uncaught exceptions. + // + // In Windows, the OS catches any exceptions not caught by the + // user code. Thus, we are omitting the check on Windows. + // + // On Android, calling std::uncaught_exception with the stlport + // library causes a seg fault. Since we're not supporting + // exceptions there at this point, just don't do the check + // + // TBD: Is this check also safe to do on Windows? + CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); + } +} + + +/** + * Method called to jump back to executing user code. + * + * A normal return from the runtime back to resuming user code calls + * this method. A computation executed using force_reduce also calls + * this method to return to user code. + * + * This function should not contain any code that depends on a fiber. + * In a force-reduce case, the user worker may not have a fiber. In + * the force-reduce case, we call this method directly instead of + * calling @c user_code_resume_after_switch_into_runtime. + */ +static inline NORETURN +cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff) +{ + // Save the sync stack pointer, and do the bookkeeping + char* sync_sp = ff->sync_sp; + __cilkrts_take_stack(ff, sync_sp); // leaves ff->sync_sp null + + sf->flags &= ~CILK_FRAME_SUSPENDED; + // Actually longjmp to the user code. + // We may have exceptions to deal with, since we are resuming + // a previous-suspended frame. 
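/* [Editorial sketch] sysdep_longjmp_to_sf() below behaves like a longjmp()
 * whose target also has its stack pointer redirected to sync_sp, so the frame
 * resumes on the fiber that now owns its stack.  Plain ISO C cannot retarget
 * the stack, but the control-transfer half of the idea is the ordinary
 * setjmp/longjmp pattern sketched here (standalone, not runtime code): */
#if 0  /* illustrative only; not part of this commit */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf resume_point;

static void runtime_like_code(void)
{
    /* Jump back to the point captured below; the 1 is an arbitrary
       nonzero value delivered as setjmp()'s return value. */
    longjmp(resume_point, 1);
}

int main(void)
{
    if (setjmp(resume_point) == 0) {
        printf("suspending into the runtime\n");
        runtime_like_code();          /* does not return normally */
    } else {
        printf("resumed by the runtime\n");
    }
    return 0;
}
#endif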
+ sysdep_longjmp_to_sf(sync_sp, sf, ff); +} + + +/** + * Called by the user-code fiber right before resuming a full frame + * (sf/ff). + * + * This method pulls sf/ff out of the worker, and then calls + * cilkrts_resume to jump to user code. + */ +static NORETURN +user_code_resume_after_switch_into_runtime(cilk_fiber *fiber) +{ + __cilkrts_worker *w = cilk_fiber_get_owner(fiber); + __cilkrts_stack_frame *sf; + full_frame *ff; + sf = w->current_stack_frame; + ff = sf->worker->l->frame_ff; + +#if FIBER_DEBUG >= 1 + CILK_ASSERT(ff->fiber_self == fiber); + cilk_fiber_data *fdata = cilk_fiber_get_data(fiber); + DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n", + w->self, w, fiber); + CILK_ASSERT(sf == fdata->resume_sf); +#endif + + // Notify the Intel tools that we're stealing code + ITT_SYNC_ACQUIRED(sf->worker); + NOTIFY_ZC_INTRINSIC("cilk_continue", sf); + cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); + + // Actually jump to user code. + cilkrts_resume(sf, ff); + } + + +/* The current stack is about to either be suspended or destroyed. This + * function will switch to the stack on which the scheduler is suspended and + * resume running the scheduler within function do_work(). Upon waking up, + * the scheduler will run the 'cont' function, using the supplied worker and + * frame. + */ +static NORETURN +longjmp_into_runtime(__cilkrts_worker *w, + scheduling_stack_fcn_t fcn, + __cilkrts_stack_frame *sf) +{ + full_frame *ff, *ff2; + + CILK_ASSERT(!w->l->post_suspend); + ff = w->l->frame_ff; + + // If we've got only one worker, stealing shouldn't be possible. + // Assume that this is a steal or return from spawn in a force-reduce case. + // We don't have a scheduling stack to switch to, so call the continuation + // function directly. + if (1 == w->g->P) { + fcn(w, ff, sf); + + /* The call to function c() will have pushed ff as the next frame. If + * this were a normal (non-forced-reduce) execution, there would have + * been a pop_next_frame call in a separate part of the runtime. We + * must call pop_next_frame here to complete the push/pop cycle. */ + ff2 = pop_next_frame(w); + + setup_for_execution(w, ff2, 0); + scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame); + cilkrts_resume(w->current_stack_frame, ff2); + +// Suppress clang warning that the expression result is unused +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-value" +#endif // __clang__ + /* no return */ + CILK_ASSERT(((void)"returned from __cilkrts_resume", 0)); +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic pop +#endif // __clang__ + } + + w->l->post_suspend = fcn; + w->l->suspended_stack = sf; + + ITT_SYNC_RELEASING(w); + ITT_SYNC_PREPARE(w); + +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n", + cilkos_get_current_thread_id(), + w->self, w->l->frame_ff, + sf); +#endif + + // Current fiber is either the (1) one we are about to free, + // or (2) it has been passed up to the parent. + cilk_fiber *current_fiber = ( w->l->fiber_to_free ? + w->l->fiber_to_free : + w->l->frame_ff->parent->fiber_child ); + cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber); + CILK_ASSERT(NULL == w->l->frame_ff->fiber_self); + + // Clear the sf in the current fiber for cleanliness, to prevent + // us from accidentally resuming a bad sf. 
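/* [Editorial sketch] The two exits from longjmp_into_runtime() differ only in
 * what happens to current_fiber: case 1 drops the last reference so the fiber
 * can be recycled, case 2 merely suspends it because it has been passed to
 * the parent frame and will be resumed later.  A rough model of that decision
 * (field and function names are illustrative, not the cilk_fiber API): */
#if 0  /* illustrative only; not part of this commit */
struct fiber_model {
    int refcount;
    int suspended;
};

static void leave_user_fiber(struct fiber_model *f, int we_hold_last_reference)
{
    if (we_hold_last_reference) {
        if (--f->refcount == 0) {
            /* case 1: nobody will ever resume this fiber; recycle it */
        }
    } else {
        /* case 2: the parent still references the fiber; just park it */
        f->suspended = 1;
    }
    /* ...and in either case switch to the worker's scheduling fiber.  */
}
#endif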
+ // Technically, resume_sf gets overwritten for a fiber when + // we are about to resume it anyway. + fdata->resume_sf = NULL; + CILK_ASSERT(fdata->owner == w); + + // Set the function to execute immediately after switching to the + // scheduling fiber, but before freeing any fibers. + cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber, + enter_runtime_transition_proc); + cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN); + + if (w->l->fiber_to_free) { + // Case 1: we are freeing this fiber. We never + // resume this fiber again after jumping into the runtime. + w->l->fiber_to_free = NULL; + + // Extra check. Normally, the fiber we are about to switch to + // should have a NULL owner. + CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner); +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, w->l->scheduling_fiber); +#endif + cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE); + NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER); + cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber, + &w->l->fiber_pool, + w->l->scheduling_fiber); + // We should never come back here! + CILK_ASSERT(0); + } + else { + // Case 2: We are passing the fiber to our parent because we + // are leftmost. We should come back later to + // resume execution of user code. + // + // If we are not freeing a fiber, there we must be + // returning from a spawn or processing an exception. The + // "sync" path always frees a fiber. + // + // We must be the leftmost child, and by left holder logic, we + // have already moved the current fiber into our parent full + // frame. +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, w->l->scheduling_fiber); +#endif + + NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); + + cilk_fiber_suspend_self_and_resume_other(current_fiber, + w->l->scheduling_fiber); + // Resuming this fiber returns control back to + // this function because our implementation uses OS fibers. + // + // On Unix, we could have the choice of passing the + // user_code_resume_after_switch_into_runtime as an extra "resume_proc" + // that resumes execution of user code instead of the + // jumping back here, and then jumping back to user code. +#if FIBER_DEBUG >= 2 + CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker()); +#endif + user_code_resume_after_switch_into_runtime(current_fiber); + } +} + +/* + * Send a message to the children of the specified worker: run or wait. + */ +static void notify_children(__cilkrts_worker *w, unsigned int msg) +{ + int child_num; + __cilkrts_worker *child; + int num_sys_workers = w->g->P - 1; + + // If worker is "n", then its children are 2n + 1, and 2n + 2. + child_num = (w->self << 1) + 1; + if (child_num < num_sys_workers) { + child = w->g->workers[child_num]; + CILK_ASSERT(child->l->signal_node); + signal_node_msg(child->l->signal_node, msg); + child_num++; + if (child_num < num_sys_workers) { + child = w->g->workers[child_num]; + CILK_ASSERT(child->l->signal_node); + signal_node_msg(child->l->signal_node, msg); + } + } +} + +/* + * Notify this worker's children that they need to wait. 
+ */ +static void notify_children_wait(__cilkrts_worker *w) +{ + notify_children(w, 0); +} + +/* + * Notify this worker's children to run and start trying to steal. + */ +static void notify_children_run(__cilkrts_worker *w) +{ + notify_children(w, 1); +} + +/** + * A single "check" to find work, either on our queue or through a + * steal attempt. This method checks our local queue once, and + * performs one steal attempt. + */ +static full_frame* check_for_work(__cilkrts_worker *w) +{ + full_frame *ff = NULL; + ff = pop_next_frame(w); + // If there is no work on the queue, try to steal some. + if (NULL == ff) { + START_INTERVAL(w, INTERVAL_STEALING) { + if (w->l->type != WORKER_USER && w->l->team != NULL) { + // At this point, the worker knows for certain that it has run + // out of work. Therefore, it loses its team affiliation. User + // workers never change teams, of course. + __cilkrts_worker_lock(w); + w->l->team = NULL; + __cilkrts_worker_unlock(w); + } + + // If we are about to do a random steal, we should have no + // full frame... + CILK_ASSERT(NULL == w->l->frame_ff); + random_steal(w); + } STOP_INTERVAL(w, INTERVAL_STEALING); + + // If the steal was successful, then the worker has populated its next + // frame with the work to resume. + ff = pop_next_frame(w); + if (NULL == ff) { + // Punish the worker for failing to steal. + // No quantum for you! + __cilkrts_yield(); + w->l->steal_failure_count++; + } else { + // Reset steal_failure_count since there is obviously still work to + // be done. + w->l->steal_failure_count = 0; + } + } + return ff; +} + +/** + * Keep stealing or looking on our queue. + * + * Returns either when a full frame is found, or NULL if the + * computation is done. + */ +static full_frame* search_until_work_found_or_done(__cilkrts_worker *w) +{ + full_frame *ff = NULL; + // Find a full frame to execute (either through random stealing, + // or because we pull it off w's 1-element queue). + while (!ff) { + // Check worker state to figure out our next action. + switch (worker_runnable(w)) + { + case SCHEDULE_RUN: // One attempt at checking for work. + ff = check_for_work(w); + break; + case SCHEDULE_WAIT: // go into wait-mode. + CILK_ASSERT(WORKER_SYSTEM == w->l->type); + // If we are about to wait, then we better not have + // a frame that we should execute... + CILK_ASSERT(NULL == w->l->next_frame_ff); + notify_children_wait(w); + signal_node_wait(w->l->signal_node); + // ... + // Runtime is waking up. + notify_children_run(w); + w->l->steal_failure_count = 0; + break; + case SCHEDULE_EXIT: // exit the scheduler. + CILK_ASSERT(WORKER_USER != w->l->type); + return NULL; + default: + CILK_ASSERT(0); + abort(); + } + } + return ff; +} + +/** + * The proc method for a scheduling fiber on a user worker. + * + * When a user worker jumps into the runtime, it jumps into this + * method by either starting it if the scheduling fiber has never run + * before, or resuming the fiber if it was previously suspended. + */ +COMMON_PORTABLE +void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber) +{ + __cilkrts_worker* w = cilk_fiber_get_owner(fiber); + CILK_ASSERT(w); + + // This must be a user worker + CILK_ASSERT(WORKER_USER == w->l->type); + + // If we aren't the current worker, then something is very wrong + // here.. + verify_current_wkr(w); + + __cilkrts_run_scheduler_with_exceptions(w); +} + + +/** + * The body of the runtime scheduling loop. This function executes in + * 4 stages: + * + * 1. 
Transitions from the user code into the runtime by + * executing any scheduling-stack functions. + * 2. Looks for a full frame enqueued from a successful provably + * good steal. + * 3. If no full frame is found in step 2, steal until + * a frame is found or we are done. If we are done, finish + * the scheduling loop. + * 4. When a frame is found, setup to resume user code. + * In particular, suspend the current fiber and resume the + * user fiber to execute the frame. + * + * Returns a fiber object that we should switch to after completing + * the body of the loop, or NULL if we should continue executing on + * this fiber. + * + * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber + * + * @param current_fiber The currently executing (scheduling_ fiber + * @param wptr The currently executing worker. + * @param return The next fiber we should switch to. + */ +static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber, + void* wptr) +{ + __cilkrts_worker *w = (__cilkrts_worker*) wptr; + CILK_ASSERT(current_fiber == w->l->scheduling_fiber); + + // Stage 1: Transition from executing user code to the runtime code. + // We don't need to do this call here any more, because + // every switch to the scheduling fiber should make this call + // using a post_switch_proc on the fiber. + // + // enter_runtime_transition_proc(w->l->scheduling_fiber, wptr); + + // After Stage 1 is complete, w should no longer have + // an associated full frame. + CILK_ASSERT(NULL == w->l->frame_ff); + + // Stage 2. First do a quick check of our 1-element queue. + full_frame *ff = pop_next_frame(w); + + if (!ff) { + // Stage 3. We didn't find anything from our 1-element + // queue. Now go through the steal loop to find work. + ff = search_until_work_found_or_done(w); + if (!ff) { + CILK_ASSERT(w->g->work_done); + return NULL; + } + } + + // Stage 4. Now that we have found a full frame to work on, + // actually execute it. + __cilkrts_stack_frame *sf; + + // There shouldn't be any uncaught exceptions. + // + // In Windows, the OS catches any exceptions not caught by the + // user code. Thus, we are omitting the check on Windows. + // + // On Android, calling std::uncaught_exception with the stlport + // library causes a seg fault. Since we're not supporting + // exceptions there at this point, just don't do the check + CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); + + BEGIN_WITH_WORKER_LOCK(w) { + CILK_ASSERT(!w->l->frame_ff); + BEGIN_WITH_FRAME_LOCK(w, ff) { + sf = ff->call_stack; + CILK_ASSERT(sf && !sf->call_parent); + setup_for_execution(w, ff, 0); + } END_WITH_FRAME_LOCK(w, ff); + } END_WITH_WORKER_LOCK(w); + + /* run it */ + // + // Prepare to run the full frame. To do so, we need to: + // (a) Execute some code on this fiber (the scheduling + // fiber) to set up data structures, and + // (b) Suspend the scheduling fiber, and resume the + // user-code fiber. + + // Part (a). Set up data structures. + scheduling_fiber_prepare_to_resume_user_code(w, ff, sf); + + cilk_fiber *other = w->l->frame_ff->fiber_self; + cilk_fiber_data* other_data = cilk_fiber_get_data(other); + cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber); + + // I believe two cases are possible here, both of which + // should have other_data->resume_sf as NULL. + // + // 1. Resuming a fiber that was previously executing + // user code (i.e., a provably-good-steal). + // In this case, resume_sf should have been + // set to NULL when it was suspended. + // + // 2. Resuming code on a steal. 
In this case, since we + // grabbed a new fiber, resume_sf should be NULL. + CILK_ASSERT(NULL == other_data->resume_sf); + +#if FIBER_DEBUG >= 2 + fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n", + w->self, other, other_data->resume_sf); +#endif + // Update our own fiber's data. + current_fiber_data->resume_sf = NULL; + // The scheduling fiber should have the right owner from before. + CILK_ASSERT(current_fiber_data->owner == w); + other_data->resume_sf = sf; + + +#if FIBER_DEBUG >= 3 + fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, other, + current_fiber_data->resume_sf, + other_data->resume_sf); +#endif + return other; +} + + +/** + * This function is executed once by each worker, to initialize its + * scheduling loop. + */ +static void worker_scheduler_init_function(__cilkrts_worker *w) +{ + // First, execute the startup tasks that must happen for all + // worker types. + ITT_SYNC_PREPARE(w); + /* Notify tools about the new worker. Inspector needs this, but we + don't want to confuse Cilkscreen with system threads. User threads + do this notification in bind_thread */ + if (! w->g->under_ptool) + __cilkrts_cilkscreen_establish_worker(w); + + // Seed the initial random number generator. + // If we forget to do this, then the worker always steals from 0. + // Programs will still execute correctly, but + // you may see a subtle performance bug... + mysrand(w, (w->self + 1)); + + // The startup work varies, depending on the worker type. + switch (w->l->type) { + case WORKER_USER: + // Stop working once we've entered the scheduler. + // For user workers, INTERVAL_IN_SCHEDULER counts the time + // since we called bind_thread. + break; + + case WORKER_SYSTEM: + // If a system worker is starting, we must also be starting + // the runtime. + + // Runtime begins in a wait-state and is woken up by the first user + // worker when the runtime is ready. + signal_node_wait(w->l->signal_node); + // ... + // Runtime is waking up. + notify_children_run(w); + w->l->steal_failure_count = 0; + + // For system threads, count all the time this thread is + // alive in the scheduling loop. + START_INTERVAL(w, INTERVAL_IN_SCHEDULER); + START_INTERVAL(w, INTERVAL_WORKING); + break; + default: + __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n", + w, w->l->type); + } +} + +/** + * This function is executed once by each worker, to finish its + * scheduling loop. + * + * @note Currently, only system workers finish their loops. User + * workers will jump away to user code without exiting their + * scheduling loop. + */ +static void worker_scheduler_terminate_function(__cilkrts_worker *w) +{ + // A user worker should never finish by falling through the + // scheduling loop. + CILK_ASSERT(WORKER_USER != w->l->type); + STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); + STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); +} + +/** + * The main scheduler function executed by a worker's scheduling + * fiber. + * + * This method is started by either a new system worker, or a user + * worker that has stalled and just been imported into the runtime. + */ +static void worker_scheduler_function(__cilkrts_worker *w) +{ + worker_scheduler_init_function(w); + + // The main scheduling loop body. + + while (!w->g->work_done) { + // Set intervals. Now we are in the runtime instead of working. 
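/* [Editorial sketch] Stripped of interval statistics, the loop below has a
 * very small shape: run the loop body to find work, then either hand the CPU
 * to the user fiber that will execute it or keep looking.  A condensed
 * restatement of worker_scheduler_function() for readability (a schematic,
 * not a replacement): */
#if 0  /* illustrative only; not part of this commit */
static void scheduler_loop_shape(__cilkrts_worker *w)
{
    worker_scheduler_init_function(w);
    while (!w->g->work_done) {
        cilk_fiber *next =
            worker_scheduling_loop_body(w->l->scheduling_fiber, w);
        if (next) {
            /* Hand the CPU to user code; control returns here only when
               that code re-enters the runtime. */
            cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
                                                     next);
        }
    }
    worker_scheduler_terminate_function(w);
}
#endif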
+ START_INTERVAL(w, INTERVAL_IN_RUNTIME); + STOP_INTERVAL(w, INTERVAL_WORKING); + + // Execute the "body" of the scheduling loop, and figure + // out the fiber to jump to next. + cilk_fiber* fiber_to_resume + = worker_scheduling_loop_body(w->l->scheduling_fiber, w); + + if (fiber_to_resume) { + // Suspend the current fiber and resume next one. + NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); + STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); + START_INTERVAL(w, INTERVAL_WORKING); + cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber, + fiber_to_resume); + + // Return here only when this (scheduling) fiber is + // resumed (i.e., this worker wants to reenter the runtime). + } + } + + // Finish the scheduling loop. + worker_scheduler_terminate_function(w); +} + + +/************************************************************* + Forward declarations for reduction protocol. +*************************************************************/ + +static __cilkrts_worker* +execute_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf_at_sync); + +static __cilkrts_worker* +execute_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf); + + + +/************************************************************* + Scheduler functions that are callable by client code +*************************************************************/ +static full_frame *disown(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + const char *why) +{ + CILK_ASSERT(ff); + make_unrunnable(w, ff, sf, sf != 0, why); + w->l->frame_ff = 0; + return ff->parent; +} + +/** + * Called when ff is returning from a spawn, and we need to execute a + * reduction. + * + * @param w The currently executing worker. + * @param ff The full frame for w. + * @param returning_sf The stack frame for the spawn helper that is returning. + * + * Normally, by the time we gain control in the runtime, the worker + * has already popped off the __cilkrts_stack_frame "returning_sf" + * from its call chain. + * + * When we have only serial reductions, w->current_stack_frame is not + * needed any more, because w is about to enter the runtime scheduling + * loop anyway. Similarly, the frame "ff" is slated to be destroyed + * after the runtime finishes the return from spawn and splices ff out + * of the tree of full frames. + * + * To execute a parallel reduction, however, we still want + * w->current_stack_frame == returning_sf, and we are going to use the + * frame ff for a little bit longer. + * + * This method: + * + * 1. Puts returning_sf back as w's current stack frame. + * 2. Makes "ff" runnable again on w. + */ +static inline +void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf) { +#if REDPAR_DEBUG >= 2 + CILK_ASSERT(returning_sf); + CILK_ASSERT(returning_sf->worker == w); +#endif + // Change w's current stack frame back to "returning_sf". + // + // Intuitively, w->current_stack_frame should be + // returning_sf->call_parent at this point. + // + // We can not assert this, however, because the pop of + // returning_sf from the call chain has already cleared + // returning_sf->call_parent. We don't want to restore the call + // parent of returning_sf, because its parent has been stolen, and + // the runtime assumes that steals break this link. 
+ + // We cannot assert call_parent is NULL either, since that's not true for + // Win64 exception handling +// CILK_ASSERT(returning_sf->call_parent == NULL); + w->current_stack_frame = returning_sf; + + // Make the full frame "ff" runnable again, in preparation for + // executing the reduction. + make_runnable(w, ff); +} + + +NORETURN __cilkrts_c_sync(__cilkrts_worker *w, + __cilkrts_stack_frame *sf_at_sync) +{ + full_frame *ff; + + // Claim: This read of w->l->frame_ff can occur without + // holding the worker lock because when w has reached a sync + // and entered the runtime (because it stalls), w's deque is empty + // and no one else can steal and change w->l->frame_ff. + + ff = w->l->frame_ff; +#ifdef _WIN32 + __cilkrts_save_exception_state(w, ff); +#else + // Move any pending exceptions into the full frame + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +#endif + + w = execute_reductions_for_sync(w, ff, sf_at_sync); + +#if FIBER_DEBUG >= 3 + fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n", + cilkos_get_current_thread_id(), w->self, ff); +#endif + + longjmp_into_runtime(w, do_sync, sf_at_sync); +} + +static void do_sync(__cilkrts_worker *w, full_frame *ff, + __cilkrts_stack_frame *sf) +{ + //int abandoned = 1; + enum provably_good_steal_t steal_result = ABANDON_EXECUTION; + + START_INTERVAL(w, INTERVAL_SYNC_CHECK) { + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + + CILK_ASSERT(ff); + BEGIN_WITH_FRAME_LOCK(w, ff) { + CILK_ASSERT(sf->call_parent == 0); + CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED); + + // Before switching into the scheduling fiber, we should have + // already taken care of deallocating the current + // fiber. + CILK_ASSERT(NULL == ff->fiber_self); + + // Update the frame's pedigree information if this is an ABI 1 + // or later frame + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + sf->parent_pedigree.rank = w->pedigree.rank; + sf->parent_pedigree.parent = w->pedigree.parent; + + // Note that the pedigree rank needs to be updated + // when setup_for_execution_pedigree runs + sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + } + + /* the decjoin() occurs in provably_good_steal() */ + steal_result = provably_good_steal(w, ff); + + } END_WITH_FRAME_LOCK(w, ff); + // set w->l->frame_ff = NULL after checking abandoned + if (WAIT_FOR_CONTINUE != steal_result) { + w->l->frame_ff = NULL; + } + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK); + + // Now, if we are in a replay situation and provably_good_steal() returned + // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call + // provably_good_steal(), and release locks until we get a value other + // than WAIT_FOR_CONTINUE from the function. +#ifdef CILK_RECORD_REPLAY + // We don't have to explicitly check for REPLAY_LOG below because + // steal_result can only be set to WAIT_FOR_CONTINUE during replay + while(WAIT_FOR_CONTINUE == steal_result) + { + __cilkrts_sleep(); + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) + { + ff = w->l->frame_ff; + BEGIN_WITH_FRAME_LOCK(w, ff) + { + steal_result = provably_good_steal(w, ff); + } END_WITH_FRAME_LOCK(w, ff); + if (WAIT_FOR_CONTINUE != steal_result) + w->l->frame_ff = NULL; + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } +#endif // CILK_RECORD_REPLAY + +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC + // If we can't make any further progress on this thread, tell Inspector + // that we're abandoning the work and will go find something else to do. 
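/* [Editorial sketch] What the caller of provably_good_steal() does with each
 * outcome of the sync attempt above, stated schematically (the enum values
 * are the ones defined by this runtime; the switch itself is illustrative): */
#if 0  /* illustrative only; not part of this commit */
static void after_sync_attempt(enum provably_good_steal_t r)
{
    switch (r) {
    case CONTINUE_EXECUTION:
        /* The last child has already returned: ff was pushed back onto
           this worker's 1-element queue and will be resumed past the sync. */
        break;
    case ABANDON_EXECUTION:
        /* Either children are still outstanding or the frame was handed
           back to the original user worker: ff is not resumed here, and
           this worker returns to the scheduling loop to steal other work.  */
        break;
    case WAIT_FOR_CONTINUE:
        /* Replay mode only: sleep and retry until one of the cases above
           applies, as in the loop earlier in do_sync().                    */
        break;
    }
}
#endif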
+ if (ABANDON_EXECUTION == steal_result) + { + NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0); + } +#endif // defined ENABLE_NOTIFY_ZC_INTRINSIC + + return; /* back to scheduler loop */ +} + +/* worker W completely promotes its own deque, simulating the case + where the whole deque is stolen. We use this mechanism to force + the allocation of new storage for reducers for race-detection + purposes. */ +void __cilkrts_promote_own_deque(__cilkrts_worker *w) +{ + // Remember the fiber we start this method on. + CILK_ASSERT(w->l->frame_ff); + cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self; + + BEGIN_WITH_WORKER_LOCK(w) { + while (dekker_protocol(w)) { + /* PLACEHOLDER_FIBER is used as non-null marker to tell detach() + and make_child() that this frame should be treated as a spawn + parent, even though we have not assigned it a stack. */ + detach_for_steal(w, w, PLACEHOLDER_FIBER); + } + } END_WITH_WORKER_LOCK(w); + + + // TBD: The management of full frames and fibers is a bit + // sketchy here. We are promoting stack frames into full frames, + // and pretending they are stolen away, but no other worker is + // actually working on them. Some runtime invariants + // may be broken here. + // + // Technically, if we are simulating a steal from w + // w should get a new full frame, but + // keep the same fiber. A real thief would be taking the + // loot frame away, get a new fiber, and starting executing the + // loot frame. + // + // What should a fake thief do? Where does the frame go? + + // In any case, we should be finishing the promotion process with + // the same fiber with. + CILK_ASSERT(w->l->frame_ff); + CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber); +} + + + +/* the client code calls this function after a spawn when the dekker + protocol fails. The function may either return or longjmp + into the rts + + This function takes in a "returning_sf" argument which corresponds + to the __cilkrts_stack_frame that we are finishing (i.e., the + argument to __cilkrts_leave_frame). + */ +void __cilkrts_c_THE_exception_check(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf) +{ + full_frame *ff; + int stolen_p; + __cilkrts_stack_frame *saved_sf = NULL; + + START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); + + BEGIN_WITH_WORKER_LOCK(w) { + ff = w->l->frame_ff; + CILK_ASSERT(ff); + /* This code is called only upon a normal return and never + upon an exceptional return. Assert that this is the + case. */ + CILK_ASSERT(!w->l->pending_exception); + + reset_THE_exception(w); + stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was + speculatively + decremented by the + compiled code */ + + if (stolen_p) { + /* XXX This will be charged to THE for accounting purposes */ + __cilkrts_save_exception_state(w, ff); + + // Save the value of the current stack frame. + saved_sf = w->current_stack_frame; + + // Reverse the decrement from undo_detach. + // This update effectively resets the deque to be + // empty (i.e., changes w->tail back to equal w->head). + // We need to reset the deque to execute parallel + // reductions. When we have only serial reductions, it + // does not matter, since serial reductions do not + // change the deque. + w->tail++; +#if REDPAR_DEBUG > 1 + // ASSERT our deque is empty. + CILK_ASSERT(w->head == w->tail); +#endif + } + } END_WITH_WORKER_LOCK(w); + + STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); + + if (stolen_p) + { + w = execute_reductions_for_spawn_return(w, ff, returning_sf); + + // "Mr. Policeman? 
My parent always told me that if I was in trouble + // I should ask a nice policeman for help. I can't find my parent + // anywhere..." + // + // Write a record to the replay log for an attempt to return to a stolen parent + replay_record_orphaned(w); + + // Update the pedigree only after we've finished the + // reductions. + update_pedigree_on_leave_frame(w, returning_sf); + + // Notify Inspector that the parent has been stolen and we're + // going to abandon this work and go do something else. This + // will match the cilk_leave_begin in the compiled code + NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf); + + DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self); + longjmp_into_runtime(w, do_return_from_spawn, 0); + DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self); + } + else + { + NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS); + return; + } +} + +/* Return an exception to a stolen parent. */ +NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf) +{ + full_frame *ff = w->l->frame_ff; + // This is almost the same as THE_exception_check, except + // the detach didn't happen, we don't need to undo the tail + // update. + CILK_ASSERT(w->head == w->tail); + w = execute_reductions_for_spawn_return(w, ff, returning_sf); + + longjmp_into_runtime(w, do_return_from_spawn, 0); + CILK_ASSERT(0); +} + +static void do_return_from_spawn(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ + full_frame *parent_ff; + enum provably_good_steal_t steal_result = ABANDON_EXECUTION; + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + CILK_ASSERT(ff); + CILK_ASSERT(!ff->is_call_child); + CILK_ASSERT(sf == NULL); + parent_ff = ff->parent; + + BEGIN_WITH_FRAME_LOCK(w, ff) { + decjoin(ff); + } END_WITH_FRAME_LOCK(w, ff); + + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + if (parent_ff->simulated_stolen) + unconditional_steal(w, parent_ff); + else + steal_result = provably_good_steal(w, parent_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + // Loop here in replay mode +#ifdef CILK_RECORD_REPLAY + // We don't have to explicitly check for REPLAY_LOG below because + // steal_result can only get set to WAIT_FOR_CONTINUE during replay. + // We also don't have to worry about the simulated_stolen flag + // because steal_result can only be set to WAIT_FOR_CONTINUE by + // provably_good_steal(). + while(WAIT_FOR_CONTINUE == steal_result) + { + __cilkrts_sleep(); + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) + { + BEGIN_WITH_FRAME_LOCK(w, parent_ff) + { + steal_result = provably_good_steal(w, parent_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } +#endif // CILK_RECORD_REPLAY + + // Cleanup the child frame. + __cilkrts_destroy_full_frame(w, ff); + return; +} + +#ifdef _WIN32 +/* migrate an exception across fibers. Call this function when an exception has + * been thrown and has to traverse across a steal. The exception has already + * been wrapped up, so all that remains is to longjmp() into the continuation, + * sync, and re-raise it. + */ +void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) { + + __cilkrts_worker *w = sf->worker; + full_frame *ff; + + BEGIN_WITH_WORKER_LOCK(w) { + ff = w->l->frame_ff; + reset_THE_exception(w); + /* there is no need to check for a steal because we wouldn't be here if + there weren't a steal. 
*/ + __cilkrts_save_exception_state(w, ff); + + CILK_ASSERT(w->head == w->tail); + } END_WITH_WORKER_LOCK(w); + + { + // TBD(jsukha): This function emulates the + // the "do_return_from_spawn" path. + w = execute_reductions_for_spawn_return(w, ff, sf); + } + + longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */ + CILK_ASSERT(! "Shouldn't be here..."); +} +#endif + + +/* Pop a call stack from TAIL. Return the call stack, or NULL if the + queue is empty */ +__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w) +{ + __cilkrts_stack_frame *sf; + BEGIN_WITH_WORKER_LOCK(w) { + __cilkrts_stack_frame *volatile *tail = w->tail; + if (w->head < tail) { + --tail; + sf = *tail; + w->tail = tail; + } else { + sf = 0; + } + } END_WITH_WORKER_LOCK(w); + return sf; +} + +#ifdef CILK_RECORD_REPLAY +__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w) +{ + __cilkrts_stack_frame *sf; + BEGIN_WITH_WORKER_LOCK(w) { + if (w->head < w->tail) { + sf = *(w->tail-1); + } else { + sf = 0; + } + } END_WITH_WORKER_LOCK(w); + return sf; +} +#endif + + +/* Return from a call, not a spawn. */ +void __cilkrts_return(__cilkrts_worker *w) +{ + full_frame *ff, *parent_ff; + START_INTERVAL(w, INTERVAL_RETURNING); + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + ff = w->l->frame_ff; + CILK_ASSERT(ff); + CILK_ASSERT(ff->join_counter == 1); + /* This path is not used to return from spawn. */ + CILK_ASSERT(ff->is_call_child); + + BEGIN_WITH_FRAME_LOCK(w, ff) { + // After this call, w->l->frame_ff != ff. + // Technically, w will "own" ff until ff is freed, + // however, because ff is a dying leaf full frame. + parent_ff = disown(w, ff, 0, "return"); + decjoin(ff); + +#ifdef _WIN32 + __cilkrts_save_exception_state(w, ff); +#else + // Move the pending exceptions into the full frame + // This should always be NULL if this isn't a + // return with an exception + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +#endif // _WIN32 + + } END_WITH_FRAME_LOCK(w, ff); + + __cilkrts_fence(); /* redundant */ + + CILK_ASSERT(parent_ff); + + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + finalize_child_for_call(w, parent_ff, ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + + ff = pop_next_frame(w); + /* ff will be non-null except when the parent frame is owned + by another worker. + CILK_ASSERT(ff) + */ + CILK_ASSERT(!w->l->frame_ff); + if (ff) { + BEGIN_WITH_FRAME_LOCK(w, ff) { + __cilkrts_stack_frame *sf = ff->call_stack; + CILK_ASSERT(sf && !sf->call_parent); + setup_for_execution(w, ff, 1); + } END_WITH_FRAME_LOCK(w, ff); + } + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + STOP_INTERVAL(w, INTERVAL_RETURNING); +} + +static void __cilkrts_unbind_thread() +{ + int stop_cilkscreen = 0; + global_state_t *g; + + // Take out the global OS mutex to protect accesses to the table of workers + global_os_mutex_lock(); + + if (cilkg_is_published()) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w) { + g = w->g; + + // If there's only 1 worker, the counts will be stopped in + // __cilkrts_scheduler + if (g->P > 1) + { + STOP_INTERVAL(w, INTERVAL_WORKING); + STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); + } + + __cilkrts_set_tls_worker(0); + + if (w->self == -1) { + // This worker is an overflow worker. I.e., it was created on- + // demand when the global pool ran out of workers. 
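+            // A self index of -1 means this worker was allocated from the
+            // heap rather than from the contiguous worker array (see the
+            // make_worker() documentation in scheduler.h), so destroy it and
+            // free its memory here instead of returning it to the pool as a
+            // WORKER_FREE worker.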
+ destroy_worker(w); + __cilkrts_free(w); + } else { + // This is a normal user worker and needs to be counted by the + // global state for the purposes of throttling system workers. + w->l->type = WORKER_FREE; + __cilkrts_leave_cilk(g); + } + + stop_cilkscreen = (0 == g->Q); + } + } + global_os_mutex_unlock(); + + /* Turn off Cilkscreen. This needs to be done when we are NOT holding the + * os mutex. */ + if (stop_cilkscreen) + __cilkrts_cilkscreen_disable_instrumentation(); +} + +/* special return from the initial frame */ + +void __cilkrts_c_return_from_initial(__cilkrts_worker *w) +{ + struct cilkred_map *rm; + + /* This is only called on a user thread worker. */ + CILK_ASSERT(w->l->type == WORKER_USER); + + #if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n", + w->self, w->l->frame_ff); + #endif + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(ff); + CILK_ASSERT(ff->join_counter == 1); + w->l->frame_ff = 0; + + CILK_ASSERT(ff->fiber_self); + // Save any TBB interop data for the next time this thread enters Cilk + cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self); + + // Deallocate cilk_fiber that mapped to the user stack. The stack + // itself does not get deallocated (of course) but our data + // structure becomes divorced from it. + +#if FIBER_DEBUG >= 1 + fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n", + cilkos_get_current_thread_id(), + w->self, + ff->fiber_self, + w->l->scheduling_fiber, + w->l->type); +#endif + // The fiber in ff is a user-code fiber. The fiber in + // w->l->scheduling_fiber is a scheduling fiber. These fibers should + // never be equal. When a user worker returns (and will unbind), we + // should destroy only the fiber in ff. The scheduling fiber will be + // re-used. + + CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber); + + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + // This fiber might not be deallocated here if there + // is a pending exception on Windows that refers + // to this fiber. + // + // First "suspend" the fiber, and then try to delete it. + cilk_fiber_deallocate_from_thread(ff->fiber_self); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + ff->fiber_self = NULL; + + /* Save reducer map into global_state object */ + rm = w->reducer_map; + w->reducer_map = NULL; + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n", + w->self, + rm, + ff); +#endif + __cilkrts_destroy_full_frame(w, ff); + + + /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */ + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + + save_pedigree_leaf_from_user_worker(w); + + // Workers can have NULL reducer maps now. + if (rm) { + __cilkrts_destroy_reducer_map(w, rm); + } + + +#if FIBER_DEBUG >= 1 + __cilkrts_worker* tmp = w; + int tmp_id = w->self; + fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n", + w->self, + w); +#endif + + w = NULL; + + __cilkrts_unbind_thread(); + +#if FIBER_DEBUG >= 1 + + fprintf(stderr, "w=%p, %d: Finished unbind\n", + tmp, tmp_id); +#endif + + /* Other workers will stop trying to steal if this was the last worker. */ + + return; +} + + +/* + * __cilkrts_restore_stealing + * + * Restore the protected_tail to a previous state, possibly allowing frames + * to be stolen. The dekker_protocol has been extended to steal only if + * head+1 is < protected_tail. 
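+ *
+ * As a rough illustration only (not the runtime's own steal path), the
+ * thief-side test can be pictured as:
+ *
+ *     if (victim->head + 1 < victim->protected_tail)
+ *         ... the normal Dekker head/tail handshake may proceed ...
+ *     else
+ *         ... stealing from this deque is currently disallowed ...
+ *
+ * __cilkrts_disallow_stealing() below saves the old protected_tail and,
+ * if needed, lowers it; this function writes the saved value back and
+ * issues a fence so the change becomes visible to potential thieves.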
+ */ + +void __cilkrts_restore_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *saved_protected_tail) +{ + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + w->protected_tail = saved_protected_tail; + __cilkrts_fence(); +} + +/* + * __cilkrts_disallow_stealing + * + * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any + * frames from being stolen. If NEW_PROTECTED_TAIL is NULL, prevent + * stealing from the whole queue. The dekker_protocol has been + * extended to only steal if head+1 is also < protected_tail. + */ + +__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *new_protected_tail) +{ + __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail; + + if (!new_protected_tail) + new_protected_tail = w->l->ltq; + + if (w->protected_tail > new_protected_tail) { + w->protected_tail = new_protected_tail; + /* Issue a store-store barrier. The update to protected_tail + here must precede the update to tail in the next spawn. + On x86 this is probably not needed. */ +#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__) + _mm_sfence(); +#else + __cilkrts_fence(); +#endif + } + + return saved_protected_tail; +} + +/************************************************************* + Initialization and startup +*************************************************************/ + +__cilkrts_worker *make_worker(global_state_t *g, + int self, __cilkrts_worker *w) +{ + w->self = self; + w->g = g; + + w->pedigree.rank = 0; // Initial rank is 0 + w->pedigree.parent = NULL; + + w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l)); + + __cilkrts_frame_malloc_per_worker_init(w); + + w->reducer_map = NULL; + w->current_stack_frame = NULL; + w->reserved = NULL; + + w->l->worker_magic_0 = WORKER_MAGIC_0; + w->l->team = NULL; + w->l->type = WORKER_FREE; + + __cilkrts_mutex_init(&w->l->lock); + __cilkrts_mutex_init(&w->l->steal_lock); + w->l->do_not_steal = 0; + w->l->frame_ff = 0; + w->l->next_frame_ff = 0; + w->l->last_full_frame = NULL; + + w->l->ltq = (__cilkrts_stack_frame **) + __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq)); + w->ltq_limit = w->l->ltq + g->ltqsize; + w->head = w->tail = w->l->ltq; + + cilk_fiber_pool_init(&w->l->fiber_pool, + &g->fiber_pool, + g->stack_size, + g->fiber_pool_size, + 0, // alloc_max is 0. We don't allocate from the heap directly without checking the parent pool. + 0); +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n", + cilkos_get_current_thread_id(), + w->self, w, + &w->l->fiber_pool); +#endif + w->l->scheduling_fiber = NULL; + w->l->original_pedigree_leaf = NULL; + w->l->rand_seed = 0; /* the scheduler will overwrite this field */ + + w->l->post_suspend = 0; + w->l->suspended_stack = 0; + w->l->fiber_to_free = NULL; + w->l->pending_exception = NULL; + +#if CILK_PROFILE + w->l->stats = __cilkrts_malloc(sizeof(statistics)); + __cilkrts_init_stats(w->l->stats); +#else + w->l->stats = NULL; +#endif + w->l->steal_failure_count = 0; + + w->l->work_stolen = 0; + + // Initialize record/replay assuming we're doing neither + w->l->record_replay_fptr = NULL; + w->l->replay_list_root = NULL; + w->l->replay_list_entry = NULL; + w->l->signal_node = NULL; + // Nothing's been stolen yet + w->l->worker_magic_1 = WORKER_MAGIC_1; + + /*w->parallelism_disabled = 0;*/ + + // Allow stealing all frames. 
Sets w->saved_protected_tail + __cilkrts_restore_stealing(w, w->ltq_limit); + + __cilkrts_init_worker_sysdep(w); + + reset_THE_exception(w); + + return w; +} + +void destroy_worker(__cilkrts_worker *w) +{ + CILK_ASSERT (NULL == w->l->pending_exception); + + // Deallocate the scheduling fiber + if (NULL != w->l->scheduling_fiber) + { + // The scheduling fiber is the main fiber for system workers and must + // be deallocated by the thread that created it. Thus, we can + // deallocate only free workers' (formerly user workers) scheduling + // fibers here. + CILK_ASSERT(WORKER_FREE == w->l->type); + +#if FIBER_DEBUG >=1 + fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n", + cilkos_get_current_thread_id(), + w, + w->self, + w->l->scheduling_fiber); +#endif + int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL); + // Scheduling fiber should never have extra references because of exceptions. + CILK_ASSERT(0 == ref_count); + w->l->scheduling_fiber = NULL; + } + +#if CILK_PROFILE + if (w->l->stats) { + __cilkrts_free(w->l->stats); + } +#else + CILK_ASSERT(NULL == w->l->stats); +#endif + + /* Free any cached fibers. */ + cilk_fiber_pool_destroy(&w->l->fiber_pool); + + __cilkrts_destroy_worker_sysdep(w); + + if (w->l->signal_node) { + CILK_ASSERT(WORKER_SYSTEM == w->l->type); + signal_node_destroy(w->l->signal_node); + } + + __cilkrts_free(w->l->ltq); + __cilkrts_mutex_destroy(0, &w->l->lock); + __cilkrts_mutex_destroy(0, &w->l->steal_lock); + __cilkrts_frame_malloc_per_worker_cleanup(w); + + __cilkrts_free(w->l); + + // The caller is responsible for freeing the worker memory +} + +/* + * Make a worker into a system worker. + */ +static void make_worker_system(__cilkrts_worker *w) { + CILK_ASSERT(WORKER_FREE == w->l->type); + w->l->type = WORKER_SYSTEM; + w->l->signal_node = signal_node_create(); +} + +void __cilkrts_deinit_internal(global_state_t *g) +{ + int i; + __cilkrts_worker *w; + + // If there's no global state then we're done + if (NULL == g) + return; + +#ifdef CILK_PROFILE + __cilkrts_dump_stats_to_stderr(g); +#endif + + w = g->workers[0]; + if (w->l->frame_ff) { + __cilkrts_destroy_full_frame(w, w->l->frame_ff); + w->l->frame_ff = 0; + } + + // Release any resources used for record/replay + replay_term(g); + + // Destroy any system dependent global state + __cilkrts_destroy_global_sysdep(g); + + for (i = 0; i < g->total_workers; ++i) + destroy_worker(g->workers[i]); + + // Free memory for all worker blocks which were allocated contiguously + __cilkrts_free(g->workers[0]); + + __cilkrts_free(g->workers); + + cilk_fiber_pool_destroy(&g->fiber_pool); + __cilkrts_frame_malloc_global_cleanup(g); + + cilkg_deinit_global_state(); +} + +/* + * Wake the runtime by notifying the system workers that they can steal. The + * first user worker into the runtime should call this. + */ +static void wake_runtime(global_state_t *g) +{ + __cilkrts_worker *root; + if (g->P > 1) { + // Send a message to the root node. The message will propagate. + root = g->workers[0]; + CILK_ASSERT(root->l->signal_node); + signal_node_msg(root->l->signal_node, 1); + } +} + +/* + * Put the runtime to sleep. The last user worker out of the runtime should + * call this. Like Dad always said, turn out the lights when nobody's in the + * room. + */ +static void sleep_runtime(global_state_t *g) +{ + __cilkrts_worker *root; + if (g->P > 1) { + // Send a message to the root node. The message will propagate. 
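+        // A zero message clears the "run" flag on the root's signal node;
+        // once that state propagates, system workers polling their own
+        // signal nodes through worker_runnable() see signal_node_should_wait()
+        // return true, after which the scheduler can park them in
+        // signal_node_wait() instead of spending cycles on futile steal
+        // attempts.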
+ root = g->workers[0]; + CILK_ASSERT(root->l->signal_node); + signal_node_msg(root->l->signal_node, 0); + } +} + +/* Called when a user thread joins Cilk. + Global lock must be held. */ +void __cilkrts_enter_cilk(global_state_t *g) +{ + if (g->Q++ == 0) { + // If this is the first user thread to enter Cilk wake + // up all the workers. + wake_runtime(g); + } +} + +/* Called when a user thread leaves Cilk. + Global lock must be held. */ +void __cilkrts_leave_cilk(global_state_t *g) +{ + if (--g->Q == 0) { + // Put the runtime to sleep. + sleep_runtime(g); + } +} + +/* + * worker_runnable + * + * Return true if the worker should continue to try to steal. False, otherwise. + */ + +NOINLINE +static enum schedule_t worker_runnable(__cilkrts_worker *w) +{ + global_state_t *g = w->g; + + /* If this worker has something to do, do it. + Otherwise the work would be lost. */ + if (w->l->next_frame_ff) + return SCHEDULE_RUN; + + // If Cilk has explicitly (by the user) been told to exit (i.e., by + // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return 0. + if (g->work_done) + return SCHEDULE_EXIT; + + if (0 == w->self) { + // This worker is the root node and is the only one that may query the + // global state to see if there are still any user workers in Cilk. + if (w->l->steal_failure_count > g->max_steal_failures) { + if (signal_node_should_wait(w->l->signal_node)) { + return SCHEDULE_WAIT; + } else { + // Reset the steal_failure_count since we have verified that + // user workers are still in Cilk. + w->l->steal_failure_count = 0; + } + } + } else if (WORKER_SYSTEM == w->l->type && + signal_node_should_wait(w->l->signal_node)) { + // This worker has been notified by its parent that it should stop + // trying to steal. + return SCHEDULE_WAIT; + } + + return SCHEDULE_RUN; +} + + + +// Initialize the worker structs, but don't start the workers themselves. +static void init_workers(global_state_t *g) +{ + int total_workers = g->total_workers; + int i; + struct CILK_ALIGNAS(256) buffered_worker { + __cilkrts_worker w; + char buf[64]; + } *workers_memory; + + /* not needed if only one worker */ + cilk_fiber_pool_init(&g->fiber_pool, + NULL, + g->stack_size, + g->global_fiber_pool_size, // buffer_size + g->max_stacks, // maximum # to allocate + 1); + + cilk_fiber_pool_set_fiber_limit(&g->fiber_pool, + (g->max_stacks ? g->max_stacks : INT_MAX)); + + g->workers = (__cilkrts_worker **) + __cilkrts_malloc(total_workers * sizeof(*g->workers)); + + // Allocate 1 block of memory for workers to make life easier for tools + // like Inspector which run multithreaded and need to know the memory + // range for all the workers that will be accessed in a user's program + workers_memory = (struct buffered_worker*) + __cilkrts_malloc(sizeof(*workers_memory) * total_workers); + + // Notify any tools that care (Cilkscreen and Inspector) that they should + // ignore memory allocated for the workers + __cilkrts_cilkscreen_ignore_block(&workers_memory[0], + &workers_memory[total_workers]); + + // Initialize worker structs, including unused worker slots. + for (i = 0; i < total_workers; ++i) { + g->workers[i] = make_worker(g, i, &workers_memory[i].w); + } + + // Set the workers in the first P - 1 slots to be system workers. + // Remaining worker structs already have type == 0. 
+ for (i = 0; i < g->system_workers; ++i) { + make_worker_system(g->workers[i]); + } +} + +void __cilkrts_init_internal(int start) +{ + global_state_t *g = NULL; + + if (cilkg_is_published()) { + g = cilkg_init_global_state(); + } + else { + + // We think the state has not been published yet. + // Grab the lock and try to initialize/publish. + global_os_mutex_lock(); + + if (cilkg_is_published()) { + // Some other thread must have snuck in and published. + g = cilkg_init_global_state(); + } + else { + // Initialize and retrieve global state + g = cilkg_init_global_state(); + + // Set the scheduler pointer + g->scheduler = worker_scheduler_function; + + // If we're running under a sequential P-Tool (Cilkscreen or + // Cilkview) then there's only one worker and we need to tell + // the tool about the extent of the stack + if (g->under_ptool) + __cilkrts_establish_c_stack(); + init_workers(g); + + // Initialize per-work record/replay logging + replay_init_workers(g); + + // Initialize any system dependent global state + __cilkrts_init_global_sysdep(g); + + + cilkg_publish_global_state(g); + } + + global_os_mutex_unlock(); + } + + CILK_ASSERT(g); + + if (start && !g->workers_running) + { + // Acquire the global OS mutex while we're starting the workers + global_os_mutex_lock(); + if (!g->workers_running) + // Start P - 1 system workers since P includes the first user + // worker. + __cilkrts_start_workers(g, g->P - 1); + global_os_mutex_unlock(); + } +} + + +/************************************************************************ + Methods for reducer protocol. + + Reductions occur in two places: + A. A full frame "ff" is returning from a spawn with a stolen parent. + B. A full frame "ff" is stalling at a sync. + + To support parallel reductions, reduction functions need to be + executed while control is on a user stack, before jumping into the + runtime. These reductions can not occur while holding a worker or + frame lock. + + Before a worker w executes a reduction in either Case A or B, w's + deque is empty. + + Since parallel reductions push work onto the deque, we must do extra + work to set up runtime data structures properly before reductions + begin to allow stealing. ( Normally, when we have only serial + reductions, once a worker w starts a reduction, its deque remains + empty until w either steals another frame or resumes a suspended + frame. Thus, we don't care about the state of the deque, since w + will reset its deque when setting up execution of a frame. ) + + To allow for parallel reductions, we coerce the runtime data + structures so that, from their perspective, it looks as though we + have spliced in an "execute_reductions()" function. Consider the + two cases for reductions: + + Case A: Return from a spawn with a stolen parent. + Consider a spawned function g is returning on a worker w. + Assume: + - g was spawned from a parent function f. + - ff is the full frame for g's spawn helper + - sf be the __cilkrts_stack_frame for g's spawn helper. + + We are conceptually splicing "execute_reductions()" so that it + occurs immediately before the spawn helper of g returns to f. + + We do so by creating two different world views --- one for the + runtime data structures, and one for the actual control flow. + + - Before reductions begin, the runtime data structures should + look as though the spawn helper of g is calling + "execute_reductions()", in terms of both the user stack and + worker deque. 
More precisely, w should satisfy the + following properties: + + (a) w has ff as its full frame, + (b) w has sf as its __cilkrts_stack_frame, and + (c) w has an empty deque. + + If the runtime satisfies these properties, then if w + encounters a spawn in a parallel reduction, it can push onto + a valid deque. Also, when a steal from w occurs, it will + build the correct tree of full frames when w is stolen from. + + - In actual control flow, however, once the + "execute_reductions()" function returns, it is actually + returning to runtime code instead of g's spawn helper. + + At the point a worker w began executing reductions, the + control flow / compiled code had already finished g's spawn + helper, and w was about to enter the runtime. With parallel + reductions, some worker v (which might be different from w) + is the one returning to the runtime. + + + The reduction logic consists of 4 steps: + + A1. Restore runtime data structures to make it look as though + the spawn helper of g() is still the currently executing + frame for w. + + A2. Execute reductions on the user stack. Reductions also + includes the logic for exceptions and stacks. Note that + reductions start on w, but may finish on a different + worker if there is parallelism in the reduce. + + A3. Splice out ff from the tree of full frames. + + A4. Jump into the runtime/scheduling stack and execute + "do_return_from_spawn". This method + + (a) Frees the user stack we were just on if it is no longer needed. + (b) Decrement the join counter on ff->parent, and tries to do a + provably good steal. + (c) Clean up the full frame ff. + + + Case B: Stalling at a sync. + + Consider a function g(), with full frame ff and + __cilkrts_stack_frame sf. Suppose g() stalls at a sync, and we + are executing reductions. + + Conceptually, we are splicing in an "execute_reductions()" + function into g() as the last action that g() takes immediately + before it executes the cilk_sync. + + The reduction logic for this case is similar to Case A. + + B1. Restore the runtime data structures. + + The main difference from Case A is that ff/sf is still a + frame that needs to be executed later (since it is stalling + at a cilk_sync). Thus, we also need to save the current + stack information into "ff" so that we can correctly resume + execution of "ff" after the sync. + + B2. Execute reductions on the user stack. + + B3. No frame to splice out of the tree. + + B4. Jump into the runtime/scheduling stack and execute "do_sync". + This method: + (a) Frees the user stack we were just on if it is no longer needed. + (b) Tries to execute a provably good steal. + + Finally, for the reducer protocol, we consider two reduction paths, + namely a "fast" and "slow" path. On a fast path, only trivial + merges of reducer maps happen (i.e., one or both of the maps are + NULL). Otherwise, on the slow path, a reduction actually needs to + happen. + +*****************************************************************/ + +/** + * @brief Locations to store the result of a reduction. + * + * Struct storing pointers to the fields in our "left" sibling that we + * should update when splicing out a full frame or stalling at a sync. + */ +typedef struct { + /** A pointer to the location of our left reducer map. */ + struct cilkred_map **map_ptr; + + /** A pointer to the location of our left exception. 
*/ + struct pending_exception_info **exception_ptr; +} splice_left_ptrs; + +/** + * For a full frame returning from a spawn, calculate the pointers to + * the maps and exceptions to my left. + * + * @param w The currently executing worker. + * @param ff Full frame that is dying + * @return Pointers to our "left" for reducers and exceptions. + */ +static inline +splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold the lock on ff->parent + + splice_left_ptrs left_ptrs; + if (ff->left_sibling) { + left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map; + left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception; + } + else { + full_frame *parent_ff = ff->parent; + left_ptrs.map_ptr = &parent_ff->children_reducer_map; + left_ptrs.exception_ptr = &parent_ff->child_pending_exception; + } + return left_ptrs; +} + +/** + * For a full frame at a sync, calculate the pointers to the maps and + * exceptions to my left. + * + * @param w The currently executing worker. + * @param ff Full frame that is stalling at a sync. + * @return Pointers to our "left" for reducers and exceptions. + */ +static inline +splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold the lock on ff + splice_left_ptrs left_ptrs; + + // Figure out which map to the left we should merge into. + if (ff->rightmost_child) { + CILK_ASSERT(ff->rightmost_child->parent == ff); + left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map); + left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception); + } + else { + // We have no children. Then, we should be the last + // worker at the sync... "left" is our child map. + left_ptrs.map_ptr = &(ff->children_reducer_map); + left_ptrs.exception_ptr = &(ff->child_pending_exception); + } + return left_ptrs; +} + +/** + * After we have completed all reductions on a spawn return, call this + * method to finish up before jumping into the runtime. + * + * 1. Perform the "reduction" on stacks, i.e., execute the left + * holder logic to pass the leftmost stack up. + * + * w->l->fiber_to_free holds any stack that needs to be freed + * when control switches into the runtime fiber. + * + * 2. Unlink and remove child_ff from the tree of full frames. + * + * @param w The currently executing worker. + * @param parent_ff The parent of child_ff. + * @param child_ff The full frame returning from a spawn. + */ +static inline +void finish_spawn_return_on_user_stack(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + CILK_ASSERT(w->l->fiber_to_free == NULL); + + // Execute left-holder logic for stacks. + if (child_ff->left_sibling || parent_ff->fiber_child) { + // Case where we are not the leftmost stack. + CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self); + + // Remember any fiber we need to free in the worker. + // After we jump into the runtime, we will actually do the + // free. + w->l->fiber_to_free = child_ff->fiber_self; + } + else { + // We are leftmost, pass stack/fiber up to parent. + // Thus, no stack/fiber to free. + parent_ff->fiber_child = child_ff->fiber_self; + w->l->fiber_to_free = NULL; + } + + child_ff->fiber_self = NULL; + + unlink_child(parent_ff, child_ff); +} + + +/** + * Executes any fast reductions necessary to splice ff out of the tree + * of full frames. + * + * This "fast" path performs only trivial merges of reducer maps, + * i.e,. when one of them is NULL. 
+ * (See slow_path_reductions_for_spawn_return() for slow path.) + * + * Returns: 1 if we finished all reductions. + * Returns: 0 if there are still reductions to execute, and + * we should execute the slow path. + * + * This method assumes w holds the frame lock on parent_ff. + * After this method completes: + * 1. We have spliced ff out of the tree of full frames. + * 2. The reducer maps of child_ff have been deposited + * "left" according to the reducer protocol. + * 3. w->l->stack_to_free stores the stack + * that needs to be freed once we jump into the runtime. + * + * We have not, however, decremented the join counter on ff->parent. + * This prevents any other workers from resuming execution of the parent. + * + * @param w The currently executing worker. + * @param ff The full frame returning from a spawn. + * @return NULL if we finished all reductions. + * @return The address where the left map is stored (which should be passed to + * slow_path_reductions_for_spawn_return()) if there are + * still reductions to execute. + */ +struct cilkred_map** +fast_path_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold ff->parent->lock. + splice_left_ptrs left_ptrs; + + CILK_ASSERT(NULL == w->l->pending_exception); + + // Figure out the pointers to the left where I want + // to put reducers and exceptions. + left_ptrs = compute_left_ptrs_for_spawn_return(w, ff); + + // Go ahead and merge exceptions while holding the lock. + splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr); + + // Now check if we have any reductions to perform. + // + // Consider all the cases of left, middle and right maps. + // 0. (-, -, -) : finish and return 1 + // 1. (L, -, -) : finish and return 1 + // 2. (-, M, -) : slide over to left, finish, and return 1. + // 3. (L, M, -) : return 0 + // 4. (-, -, R) : slide over to left, finish, and return 1. + // 5. (L, -, R) : return 0 + // 6. (-, M, R) : return 0 + // 7. (L, M, R) : return 0 + // + // In terms of code: + // L == *left_ptrs.map_ptr + // M == w->reducer_map + // R == f->right_reducer_map. + // + // The goal of the code below is to execute the fast path with + // as few branches and writes as possible. + + int case_value = (*(left_ptrs.map_ptr) != NULL); + case_value += ((w->reducer_map != NULL) << 1); + case_value += ((ff->right_reducer_map != NULL) << 2); + + // Fastest path is case_value == 0 or 1. + if (case_value >=2) { + switch (case_value) { + case 2: + *(left_ptrs.map_ptr) = w->reducer_map; + w->reducer_map = NULL; + return NULL; + break; + case 4: + *(left_ptrs.map_ptr) = ff->right_reducer_map; + ff->right_reducer_map = NULL; + return NULL; + default: + // If we have to execute the slow path, then + // return the pointer to the place to deposit the left + // map. + return left_ptrs.map_ptr; + } + } + + // Do nothing + return NULL; +} + + +/** + * Executes any reductions necessary to splice "ff" frame out of + * the steal tree. + * + * This method executes the "slow" path for reductions on a spawn + * return, i.e., there are non-NULL maps that need to be merged + * together. + * + * This method should execute only if + * fast_path_reductions_for_spawn_return() returns a non-NULL + * left_map_ptr. + * + * Upon entry, left_map_ptr should be the location of the left map + * at the start of the reduction, as calculated by + * fast_path_reductions_for_spawn_return(). + * + * After this method completes: + * 1. We have spliced ff out of the tree of full frames. + * 2. 
The reducer maps of child_ff have been deposited + * "left" according to the reducer protocol. + * 3. w->l->stack_to_free stores the stack + * that needs to be freed once we jump into the runtime. + * We have not, however, decremented the join counter on ff->parent, + * so no one can resume execution of the parent yet. + * + * WARNING: + * This method assumes the lock on ff->parent is held upon entry, and + * Upon exit, the worker that returns still holds a lock on ff->parent + * This method can, however, release and reacquire the lock on ff->parent. + * + * @param w The currently executing worker. + * @param ff The full frame returning from a spawn. + * @param left_map_ptr Pointer to our initial left map. + * @return The worker that this method returns on. + */ +static __cilkrts_worker* +slow_path_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + struct cilkred_map **left_map_ptr) +{ + + // CILK_ASSERT: w is holding frame lock on parent_ff. +#if REDPAR_DEBUG > 0 + CILK_ASSERT(!ff->rightmost_child); + CILK_ASSERT(!ff->is_call_child); +#endif + + // Loop invariant: + // When beginning this loop, we should + // 1. Be holding the lock on ff->parent. + // 2. left_map_ptr should be the address of the pointer to the left map. + // 3. All maps should be slid over left by one, if possible. + // 4. All exceptions should be merged so far. + while (1) { + + // Slide middle map left if possible. + if (!(*left_map_ptr)) { + *left_map_ptr = w->reducer_map; + w->reducer_map = NULL; + } + // Slide right map to middle if possible. + if (!w->reducer_map) { + w->reducer_map = ff->right_reducer_map; + ff->right_reducer_map = NULL; + } + + // Since we slid everything left by one, + // we are finished if there is no middle map. + if (!w->reducer_map) { + verify_current_wkr(w); + return w; + } + else { + struct cilkred_map* left_map; + struct cilkred_map* middle_map; + struct cilkred_map* right_map; + + // Take all the maps from their respective locations. + // We can't leave them in place and execute a reduction because these fields + // might change once we release the lock. + left_map = *left_map_ptr; + *left_map_ptr = NULL; + middle_map = w->reducer_map; + w->reducer_map = NULL; + right_map = ff->right_reducer_map; + ff->right_reducer_map = NULL; + + // WARNING!!! Lock release here. + // We have reductions to execute (and we can't hold locks). + __cilkrts_frame_unlock(w, ff->parent); + + // Merge all reducers into the left map. + left_map = repeated_merge_reducer_maps(&w, + left_map, + middle_map); + verify_current_wkr(w); + left_map = repeated_merge_reducer_maps(&w, + left_map, + right_map); + verify_current_wkr(w); + CILK_ASSERT(NULL == w->reducer_map); + // Put the final answer back into w->reducer_map. + w->reducer_map = left_map; + + // Save any exceptions generated because of the reduction + // process from the returning worker. These get merged + // the next time around the loop. + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + + // Lock ff->parent for the next loop around. + __cilkrts_frame_lock(w, ff->parent); + + // Once we have the lock again, recompute who is to our + // left. + splice_left_ptrs left_ptrs; + left_ptrs = compute_left_ptrs_for_spawn_return(w, ff); + + // Update the pointer for the left map. + left_map_ptr = left_ptrs.map_ptr; + // Splice the exceptions for spawn. + splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr); + } + } + // We should never break out of this loop. 
+ + CILK_ASSERT(0); + return NULL; +} + + + +/** + * Execute reductions when returning from a spawn whose parent has + * been stolen. + * + * Execution may start on w, but may finish on a different worker. + * This method acquires/releases the lock on ff->parent. + * + * @param w The currently executing worker. + * @param ff The full frame of the spawned function that is returning. + * @param returning_sf The __cilkrts_stack_frame for this returning function. + * @return The worker returning from this method. + */ +static __cilkrts_worker* +execute_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf) +{ + // Step A1 from reducer protocol described above. + // + // Coerce the runtime into thinking that + // ff/returning_sf are still on the bottom of + // w's deque. + restore_frame_for_spawn_return_reduction(w, ff, returning_sf); + + // Step A2 and A3: Execute reductions on user stack. + BEGIN_WITH_FRAME_LOCK(w, ff->parent) { + struct cilkred_map **left_map_ptr; + left_map_ptr = fast_path_reductions_for_spawn_return(w, ff); + + // Pointer will be non-NULL if there are + // still reductions to execute. + if (left_map_ptr) { + // WARNING: This method call may release the lock + // on ff->parent and re-acquire it (possibly on a + // different worker). + // We can't hold locks while actually executing + // reduce functions. + w = slow_path_reductions_for_spawn_return(w, + ff, + left_map_ptr); + verify_current_wkr(w); + } + + finish_spawn_return_on_user_stack(w, ff->parent, ff); + // WARNING: the use of this lock macro is deceptive. + // The worker may have changed here. + } END_WITH_FRAME_LOCK(w, ff->parent); + return w; +} + + + +/** + * Execute fast "reductions" when ff stalls at a sync. + * + * @param w The currently executing worker. + * @param ff The full frame stalling at a sync. + * @return 1 if we are finished with all reductions after calling this method. + * @return 0 if we still need to execute the slow path reductions. + */ +static inline +int fast_path_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff) { + // Return 0 if there is some reduction that needs to happen. + return !(w->reducer_map || ff->pending_exception); +} + +/** + * Executes slow reductions when ff stalls at a sync. + * This method should execute only if + * fast_path_reductions_for_sync(w, ff) returned 0. + * + * After this method completes: + * 1. ff's current reducer map has been deposited into + * right_reducer_map of ff's rightmost child, or + * ff->children_reducer_map if ff has no children. + * 2. Similarly for ff's current exception. + * 3. Nothing to calculate for stacks --- if we are stalling + * we will always free a stack. + * + * This method may repeatedly acquire/release the lock on ff. + * + * @param w The currently executing worker. + * @param ff The full frame stalling at a sync. + * @return The worker returning from this method. + */ +static __cilkrts_worker* +slow_path_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff) +{ + struct cilkred_map *left_map; + struct cilkred_map *middle_map; + +#if (REDPAR_DEBUG > 0) + CILK_ASSERT(ff); + CILK_ASSERT(w->head == w->tail); +#endif + + middle_map = w->reducer_map; + w->reducer_map = NULL; + + // Loop invariant: middle_map should be valid (the current map to reduce). + // left_map is junk. + // w->reducer_map == NULL. 
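+    //
+    // Each pass of the loop below grabs the current "left" slot under the
+    // frame lock, slides the maps left, and merges exceptions in place.
+    // Any nontrivial reducer merge is then done by
+    // repeated_merge_reducer_maps() outside the lock, since reduce
+    // functions run user code and may leave us on a different worker.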
+ while (1) { + BEGIN_WITH_FRAME_LOCK(w, ff) { + splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff); + + // Grab the "left" map and store pointers to those locations. + left_map = *(left_ptrs.map_ptr); + *(left_ptrs.map_ptr) = NULL; + + // Slide the maps in our struct left as far as possible. + if (!left_map) { + left_map = middle_map; + middle_map = NULL; + } + + *(left_ptrs.exception_ptr) = + __cilkrts_merge_pending_exceptions(w, + *left_ptrs.exception_ptr, + ff->pending_exception); + ff->pending_exception = NULL; + + // If there is no middle map, then we are done. + // Deposit left and return. + if (!middle_map) { + *(left_ptrs).map_ptr = left_map; + #if (REDPAR_DEBUG > 0) + CILK_ASSERT(NULL == w->reducer_map); + #endif + // Sanity check upon leaving the loop. + verify_current_wkr(w); + // Make sure to unlock before we return! + __cilkrts_frame_unlock(w, ff); + return w; + } + } END_WITH_FRAME_LOCK(w, ff); + + // If we get here, we have a nontrivial reduction to execute. + middle_map = repeated_merge_reducer_maps(&w, + left_map, + middle_map); + verify_current_wkr(w); + + // Save any exceptions generated because of the reduction + // process. These get merged the next time around the + // loop. + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } + + // We should never break out of the loop above. + CILK_ASSERT(0); + return NULL; +} + + +/** + * Execute reductions when ff stalls at a sync. + * + * Execution starts on w, but may finish on a different worker. + * This method may acquire/release the lock on ff. + * + * @param w The currently executing worker. + * @param ff The full frame of the spawned function at the sync + * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync + * @return The worker returning from this method. + */ +static __cilkrts_worker* +execute_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf_at_sync) +{ + int finished_reductions; + // Step B1 from reducer protocol above: + // Restore runtime invariants. + // + // The following code for this step is almost equivalent to + // the following sequence: + // 1. disown(w, ff, sf_at_sync, "sync") (which itself + // calls make_unrunnable(w, ff, sf_at_sync)) + // 2. make_runnable(w, ff, sf_at_sync). + // + // The "disown" will mark the frame "sf_at_sync" + // as stolen and suspended, and save its place on the stack, + // so it can be resumed after the sync. + // + // The difference is, that we don't want the disown to + // break the following connections yet, since we are + // about to immediately make sf/ff runnable again anyway. + // sf_at_sync->worker == w + // w->l->frame_ff == ff. + // + // These connections are needed for parallel reductions, since + // we will use sf / ff as the stack frame / full frame for + // executing any potential reductions. + // + // TBD: Can we refactor the disown / make_unrunnable code + // to avoid the code duplication here? + + ff->call_stack = NULL; + + // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and + // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of + // the stack so that a worker can resume the frame in the correct + // place. + // + // But on this path, CILK_FRAME_STOLEN should already be set. + // Also, we technically don't want to suspend the frame until + // the reduction finishes. 
+ // We do, however, need to save the stack before + // we start any reductions, since the reductions might push more + // data onto the stack. + CILK_ASSERT(sf_at_sync->flags | CILK_FRAME_STOLEN); + + __cilkrts_put_stack(ff, sf_at_sync); + __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1, + "execute_reductions_for_sync"); + CILK_ASSERT(w->l->frame_ff == ff); + + // Step B2: Execute reductions on user stack. + // Check if we have any "real" reductions to do. + finished_reductions = fast_path_reductions_for_sync(w, ff); + + if (!finished_reductions) { + // Still have some real reductions to execute. + // Run them here. + + // This method may acquire/release the lock on ff. + w = slow_path_reductions_for_sync(w, ff); + + // The previous call may return on a different worker. + // than what we started on. + verify_current_wkr(w); + } + +#if REDPAR_DEBUG >= 0 + CILK_ASSERT(w->l->frame_ff == ff); + CILK_ASSERT(ff->call_stack == NULL); +#endif + + // Now we suspend the frame ff (since we've + // finished the reductions). Roughly, we've split apart the + // "make_unrunnable" call here --- we've already saved the + // stack info earlier before the reductions execute. + // All that remains is to restore the call stack back into the + // full frame, and mark the frame as suspended. + ff->call_stack = sf_at_sync; + sf_at_sync->flags |= CILK_FRAME_SUSPENDED; + + // At a nontrivial sync, we should always free the current fiber, + // because it can not be leftmost. + w->l->fiber_to_free = ff->fiber_self; + ff->fiber_self = NULL; + return w; +} + + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/scheduler.h b/libcilkrts/runtime/scheduler.h new file mode 100644 index 00000000000..543adaf68e0 --- /dev/null +++ b/libcilkrts/runtime/scheduler.h @@ -0,0 +1,421 @@ +/* scheduler.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file scheduler.h + * + * @brief scheduler.h declares routines for the Intel Cilk Plus scheduler, + * making it the heart of the Intel Cilk Plus implementation. + */ + +#ifndef INCLUDED_SCHEDULER_DOT_H +#define INCLUDED_SCHEDULER_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "rts-common.h" +#include "full_frame.h" +#include "reducer_impl.h" +#include "global_state.h" + +#ifdef CILK_RECORD_REPLAY +#include "record-replay.h" +#endif + +__CILKRTS_BEGIN_EXTERN_C + + +/** + * @brief Flag to disable parallel reductions. + * + * Set to 0 to allow parallel reductions. + */ +#define DISABLE_PARALLEL_REDUCERS 0 + +/** + * @brief Debugging level for parallel reductions. + * + * Print debugging messages and assertions for parallel reducers. 0 is + * no debugging. A higher value generates more output. + */ +#define REDPAR_DEBUG 0 + +/** + * @brief Lock the worker mutex to allow exclusive access to the + * values in the @c __cilkrts_worker and local_state structures. + * + * @pre @c w->l->do_not_steal must not be set. Essentially this + * condition asserts that the worker is not locked recursively. + * + * @param w The worker to lock. + */ +COMMON_PORTABLE +void __cilkrts_worker_lock(__cilkrts_worker *w); + +/** + * @brief Unlock the worker mutex. + * + * @pre @c w->l->do_not_steal must be set. Essentially this condition + * asserts that the worker has been previously locked. + * + * @param w The worker to unlock. + */ +COMMON_PORTABLE +void __cilkrts_worker_unlock(__cilkrts_worker *w); + +/** + * @brief Push the next full frame to be made active in this worker + * and increment its join counter. + * + * __cilkrts_push_next_frame and pop_next_frame work on a one-element queue. + * This queue is used to communicate across the runtime from the code that + * wants to activate a frame to the code that can actually begin execution + * on that frame. They are asymetrical in that push increments the join + * counter but pop does not decrement it. Rather, a single push/pop + * combination makes a frame active and increments its join counter once. + * + * @note A system worker may chose to push work onto a user worker if + * the work is the continuation from a sync which only the user worker + * may complete. + * + * @param w The worker which the frame is to be pushed onto. + * @param ff The full_frame which is to be continued by the worker. + */ +COMMON_PORTABLE +void __cilkrts_push_next_frame(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief Sync on this worker. + * + * If this worker is the last to reach the sync, execution may resume + * on this worker after the sync. + * + * If this worker is not the last spawned child to reach the sync, + * then execution is suspended and the worker will re-enter the + * scheduling loop, looking for work it can steal. 
+ * + * This function will jump into the runtime to switch to the scheduling + * stack to implement most of its logic. + * + * @param w The worker which is executing the sync. + * @param sf The __cilkrts_stack_frame containing the sync. + */ +COMMON_PORTABLE +NORETURN __cilkrts_c_sync(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +/** + * @brief Worker @c w completely promotes its own deque, simulating the case + * where the whole deque is stolen. + * + * We use this mechanism to force the allocation of new storage for + * reducers for race-detection purposes. + * + * This method is called from the reducer lookup logic when + * @c g->force_reduce is set. + * + * @warning Use of "force_reduce" is known to have bugs when run with + * more than 1 worker. + * + * @param w The worker which is to have all entries in its deque + * promoted to full frames. + */ +COMMON_PORTABLE +void __cilkrts_promote_own_deque(__cilkrts_worker *w); + +/** + * Called when a spawned function attempts to return and + * __cilkrts_undo_detach() fails. This can happen for two reasons: + * + * @li If another worker is considering stealing our parent, it bumps the + * exception pointer while it did so, which will cause __cilkrts_undo_detach() + * to fail. If the other worker didn't complete the steal of our parent, we + * still may be able to return to it, either because the steal attempt failed, + * or we won the race for the tail pointer. + * + * @li If the function's parent has been stolen then we cannot return. Instead + * we'll longjmp into the runtime to switch onto the scheduling stack to + * execute do_return_from_spawn() and determine what to do. Either this + * worker is the last one to the sync, in which case we need to jump to the + * sync, or this worker is not the last one to the sync, in which case we'll + * abandon this work and jump to the scheduling loop to search for more work + * we can steal. + * + * @param w The worker which attempting to return from a spawn to + * a stolen parent. + * @param returning_sf The stack frame which is returning. + */ +COMMON_PORTABLE +void __cilkrts_c_THE_exception_check(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf); + +/** + * @brief Return an exception to an stolen parent. + * + * Used by the gcc implementation of exceptions to return an exception + * to a stolen parent + * + * @param w The worker which attempting to return from a spawn with an + * exception to a stolen parent. + * @param returning_sf The stack frame which is returning. + */ +COMMON_PORTABLE +NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf); + +/** + * @brief Used by the Windows implementations of exceptions to migrate an exception + * across fibers. + * + * Call this function when an exception has been thrown and has to + * traverse across a steal. The exception has already been wrapped + * up, so all that remains is to longjmp() into the continuation, + * sync, and re-raise it. + * + * @param sf The __cilkrts_stack_frame for the frame that is attempting to + * return an exception to a stolen parent. + */ +void __cilkrts_migrate_exception (__cilkrts_stack_frame *sf); + +/** + * @brief Return from a call, not a spawn, where this frame has ever + * been stolen. + * + * @param w The worker that is returning from a frame which was ever stolen. + */ +COMMON_PORTABLE +void __cilkrts_return(__cilkrts_worker *w); + +/** + * @brief Special return from the initial frame. 
+ * + * This method will be called from @c __cilkrts_leave_frame if + * @c CILK_FRAME_LAST is set. + * + * This function will do the things necessary to cleanup, and unbind the + * thread from the Intel Cilk Plus runtime. If this is the last user + * worker unbinding from the runtime, all system worker threads will be + * suspended. + * + * @pre @c w must be the currently executing worker, and must be a user + * worker. + * + * @param w The worker that's returning from the initial frame. + */ +COMMON_PORTABLE +void __cilkrts_c_return_from_initial(__cilkrts_worker *w); + +/** + * @brief Used by exception handling code to pop an entry from the + * worker's deque. + * + * @param w Worker to pop the entry from + * + * @return __cilkrts_stack_frame of parent call + * @return NULL if the deque is empty + */ +COMMON_PORTABLE +__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w); + +/** + * @brief Modifies the worker's protected_tail to prevent frames from + * being stolen. + * + * The Dekker protocol has been extended to only steal if head+1 is also + * less than protected_tail. + * + * @param w The worker to be modified. + * @param new_protected_tail The new setting for protected_tail, or NULL if the + * entire deque is to be protected + * + * @return Previous value of protected tail. + */ +COMMON_PORTABLE +__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *new_protected_tail); + +/** + * @brief Restores the protected tail to a previous state, possibly + * allowing frames to be stolen. + * + * @param w The worker to be modified. + * @param saved_protected_tail A previous setting for protected_tail that is + * to be restored + */ +COMMON_PORTABLE +void __cilkrts_restore_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *saved_protected_tail); + +/** + * @brief Initialize a @c __cilkrts_worker. + * + * @note The memory for the worker must have been allocated outside + * this call. + * + * @param g The global_state_t. + * @param self The index into the global_state's array of workers for this + * worker, or -1 if this worker was allocated from the heap and cannot be + * stolen from. + * @param w The worker to be initialized. + * + * @return The initialized __cilkrts_worker. + */ +COMMON_PORTABLE +__cilkrts_worker *make_worker(global_state_t *g, + int self, + __cilkrts_worker *w); + +/** + * @brief Free up any resources allocated for a worker. + * + * @note The memory for the @c __cilkrts_worker itself must be + * deallocated outside this call. + * + * @param w The worker to be destroyed. + */ +COMMON_PORTABLE +void destroy_worker (__cilkrts_worker *w); + +/** + * @brief Initialize the runtime. + * + * If necessary, allocates and initializes the global state. If + * necessary, unsuspends the system workers. + * + * @param start Specifies whether the workers are to be unsuspended if + * they are suspended. Allows __cilkrts_init() to start up the runtime without + * releasing the system threads. + */ +COMMON_PORTABLE +void __cilkrts_init_internal(int start); + +/** + * @brief Part of the sequence to shutdown the runtime. + * + * Specifically, this call frees the @c global_state_t for the runtime. + * + * @param g The global_state_t. + */ +COMMON_PORTABLE +void __cilkrts_deinit_internal(global_state_t *g); + +/** + * Obsolete. We no longer need to import or export reducer maps. 
+ */ +COMMON_PORTABLE +cilkred_map *__cilkrts_xchg_reducer( + __cilkrts_worker *w, cilkred_map *newmap) cilk_nothrow; + +/** + * @brief Called when a user thread is bound to the runtime. + * + * If this action increments the count of bound user threads from 0 to + * 1, the system worker threads are unsuspended. + * + * If this action increments the count of bound user threads from 0 to + * 1, the system worker threads are unsuspended. + * + * @pre Global lock must be held. + * @param g The runtime global state. + */ +COMMON_PORTABLE +void __cilkrts_enter_cilk(global_state_t *g); + +/** + * @brief Called when a user thread is unbound from the runtime. + * + * If this action decrements the count of bound user threads to 0, the + * system worker threads are suspended. + * + * + * @pre Global lock must be held. + * + * @param g The runtime global state. + */ +COMMON_PORTABLE +void __cilkrts_leave_cilk(global_state_t *g); + + +/** + * @brief cilk_fiber_proc that runs the main scheduler loop on a + * user worker. + * + * @pre fiber's owner field should be set to the correct __cilkrts_worker + * @pre fiber must be a user worker. + * + * @param fiber The scheduling fiber object. + */ +void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber); + + +/** + * @brief Prints out Cilk runtime statistics. + * + * @param g The runtime global state. + * + * This method is useful only for debugging purposes. No guarantees + * are made as to the validity of this data. :) + */ +COMMON_PORTABLE +void __cilkrts_dump_stats_to_stderr(global_state_t *g); + +#ifdef CILK_RECORD_REPLAY +COMMON_PORTABLE +char * walk_pedigree_nodes(char *p, const __cilkrts_pedigree *pnode); + +/** + * @brief Used by exception handling code to simulate the popping of + * an entry from the worker's deque. + * + * @param w Worker whose deque we want to check + * + * @return @c __cilkrts_stack_frame of parent call + * @return NULL if the deque is empty + */ +COMMON_PORTABLE +__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w); + +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_SCHEDULER_DOT_H) diff --git a/libcilkrts/runtime/signal_node.c b/libcilkrts/runtime/signal_node.c new file mode 100644 index 00000000000..92c404b482c --- /dev/null +++ b/libcilkrts/runtime/signal_node.c @@ -0,0 +1,241 @@ +/* signal_node.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2011-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "signal_node.h" +#include <stdlib.h> + +/* Define cilk_semaphore_t for all of the respective systems. */ +#if defined __APPLE__ +# include <mach/mach_init.h> +# include <mach/semaphore.h> +# include <mach/task.h> + typedef semaphore_t cilk_semaphore_t; +#elif defined _WIN32 +# include "windows-clean.h" + typedef HANDLE cilk_semaphore_t; +#else // Linux/MIC +# include <errno.h> +# include <semaphore.h> +# include <stdio.h> + typedef sem_t cilk_semaphore_t; +#endif // Linux/MIC + +#include "bug.h" +#include "cilk_malloc.h" +#include "signal_node.h" + +/** + * Interface within the tree to notify workers to wait without consuming cycles + * to expend cycles trying to steal. + * + * cilk_semaphore_t is implemented as an auto-reset event on Windows, and + * as a semaphore_t on Linux and MacOS. + */ +struct signal_node_t +{ + /** 0 if the worker should wait, 1 if it should be running. */ + volatile unsigned int run; + + /** OS-specific semaphore on which the worker can wait. */ + cilk_semaphore_t sem; +}; + +/******************************************************************************/ +/* Semaphore-abstraction functions */ +/******************************************************************************/ + +/* + * All of these functions are simple wrappers for the system-specific semaphore + * functions. This keeps the rest of the code reasonably clean and readable. + */ + +#if defined __APPLE__ +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus + = semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, 0); + assert(kstatus == KERN_SUCCESS); +} +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_destroy(mach_task_self(), *sem); + assert(kstatus == KERN_SUCCESS); +} +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_wait(*sem); + assert(kstatus == KERN_SUCCESS); +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_signal(*sem); + assert(kstatus == KERN_SUCCESS); +} +#elif defined _WIN32 +// Note: Windows only provides counting semaphores, and we don't really +// care about the count. 
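All three platform variants expose the same binary "sleep until poked" semantics through the initialize/wait/signal wrappers; the count of a counting semaphore is never relied upon. A standalone POSIX sketch of that usage pattern (illustrative only, not part of the runtime; error checking and the EINTR retry loop used in this file are omitted):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>
    #include <unistd.h>

    static sem_t demo_sem;

    static void *sleeper(void *arg)
    {
        (void)arg;
        sem_wait(&demo_sem);          /* block without burning cycles */
        printf("worker woken\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        sem_init(&demo_sem, 0, 0);    /* initially "asleep" */
        pthread_create(&t, NULL, sleeper, NULL);
        sleep(1);
        sem_post(&demo_sem);          /* wake the sleeping worker */
        pthread_join(t, NULL);
        sem_destroy(&demo_sem);
        return 0;
    }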
So this is implemented using an auto-reset +// event which will automatically reset after the WaitForSingleObject +// call +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + // Create an auto-reset event + *sem = CreateEvent(NULL, // Security attributes + FALSE, // Manual reset + FALSE, // Initial state (initially reset) + NULL); // Name (anonymous) + CILK_ASSERT (NULL != *sem); +} + +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + BOOL result = CloseHandle(*sem); + CILK_ASSERT (0 != result); +} + +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + // WaitForSingleObject will reset the event + DWORD result = WaitForSingleObject (*sem, INFINITE); + CILK_ASSERT (WAIT_OBJECT_0 == result); +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + BOOL result = SetEvent (*sem); + CILK_ASSERT (0 != result); +} +#else // Linux/MIC +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status = sem_init(sem, 0, 0); + assert(0 == status); +} +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status = sem_destroy(sem); + assert(0 == status); +} +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status; + + do { + status = sem_wait(sem); + } while (status != 0 && errno == EINTR); + + if (status != 0) { + perror("sem_wait"); + abort(); + } +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + sem_post(sem); +} +#endif // Linux/MIC + +/******************************************************************************/ +/* Runtime interface functions */ +/******************************************************************************/ + +/* + * Return a newly malloc'd and initialized signal_node_t. + */ +COMMON_SYSDEP +signal_node_t *signal_node_create(void) +{ + signal_node_t *node; + + node = ( signal_node_t*) + __cilkrts_malloc(sizeof( signal_node_t)); + node->run = 0; + initialize_cilk_semaphore(&node->sem); + + return node; +} + +/* + * Clean and free a signal_node_t. + */ +void signal_node_destroy(signal_node_t *node) +{ + CILK_ASSERT(node); + deinitialize_cilk_semaphore(&node->sem); + __cilkrts_free(node); +} + +/* + * Return 1 if the node thinks the worker should go to sleep, 0 otherwise. + */ +unsigned int signal_node_should_wait(signal_node_t *node) +{ + CILK_ASSERT(node); + return !node->run; +} + +/* + * Send a message to the node that the worker will eventually read. + */ +void signal_node_msg(signal_node_t *node, unsigned int msg) +{ + CILK_ASSERT(node); + switch (msg) { + case 0: // worker should go to sleep. + node->run = msg; + break; + case 1: // worker should be awake. + node->run = msg; + signal_cilk_semaphore(&node->sem); + break; + default: // error. + CILK_ASSERT(0 == "Bad signal_node_t message."); + } +} + +/* + * The current worker will wait on the semaphore. + */ +void signal_node_wait(signal_node_t *node) +{ + CILK_ASSERT(node); + while (signal_node_should_wait(node)) { + // The loop is here to consume extra semaphore signals that might have + // accumulated. No point in passing on the accumulation. + wait_on_cilk_semaphore(&node->sem); + } +} diff --git a/libcilkrts/runtime/signal_node.h b/libcilkrts/runtime/signal_node.h new file mode 100644 index 00000000000..0a1fe200201 --- /dev/null +++ b/libcilkrts/runtime/signal_node.h @@ -0,0 +1,109 @@ +/* signal_node.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file signal_node.h + * + * @brief Signal nodes allow coordinated waking and sleeping of the runtime + * without hammering on a single location in memory. + * + * The workers are logically arranged in a binary tree and propagate messages + * leaf-ward. User workers notify the root about waking and sleeping, so only + * that one node need share a cache line with a user worker. + */ + +#ifndef INCLUDED_SIGNAL_NODE_DOT_H +#define INCLUDED_SIGNAL_NODE_DOT_H + +#include "rts-common.h" +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** Opaque type. */ +typedef struct signal_node_t signal_node_t; + +/** + * Allocate and initialize a signal_node_t + * + * @return The initialized signal_node_t + */ +COMMON_SYSDEP +signal_node_t *signal_node_create(void); + +/** + * Free any resources and deallocate a signal_node_t + * + * @param node The node to be deallocated. + */ +COMMON_SYSDEP void signal_node_destroy(signal_node_t *node); + +/** + * Test whether the node thinks the worker should go to sleep + * + * @param node The node to be tested. + * + * @return 1 If the worker should go to sleep + * @return 0 If the worker should not go to sleep + */ +COMMON_SYSDEP +unsigned int signal_node_should_wait(signal_node_t *node); + +/** + * Specify whether the worker should go to sleep + * + * @param node The node to be set. + * @param msg The value to be set. Valid values are: + * - 0 - the worker should go to sleep + * - 1 - the worker should stay active + */ +COMMON_SYSDEP +void signal_node_msg(signal_node_t *node, unsigned int msg); + + +/** + * Wait for the node to be set + * + * @param node The node to wait on + */ +COMMON_SYSDEP +void signal_node_wait(signal_node_t *node); + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_SIGNAL_NODE_DOT_H) diff --git a/libcilkrts/runtime/spin_mutex.c b/libcilkrts/runtime/spin_mutex.c new file mode 100644 index 00000000000..03908f26322 --- /dev/null +++ b/libcilkrts/runtime/spin_mutex.c @@ -0,0 +1,109 @@ +/* spin_mutex.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "spin_mutex.h" +#include "bug.h" +#include "os.h" +#include "stats.h" + +// TBD (11/30/12): We should be doing a conditional test-xchg instead +// of an unconditional xchg operation for the spin mutex. + +/* m->lock == 1 means that mutex M is locked */ +#define TRY_ACQUIRE(m) (__cilkrts_xchg(&(m)->lock, 1) == 0) + +/* ICC 11.1+ understands release semantics and generates an + ordinary store with a software memory barrier. */ +#if __ICC >= 1110 +#define RELEASE(m) __sync_lock_release(&(m)->lock) +#else +#define RELEASE(m) __cilkrts_xchg(&(m)->lock, 0) +#endif + + +spin_mutex* spin_mutex_create() +{ + spin_mutex* mutex = (spin_mutex*)__cilkrts_malloc(sizeof(spin_mutex)); + spin_mutex_init(mutex); + return mutex; +} + +void spin_mutex_init(struct spin_mutex *m) +{ + // Use a simple assignment so Inspector doesn't bug us about the + // interlocked exchange doing a read of an uninitialized variable. + // By definition there can't be a race when we're initializing the + // lock... 
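The TRY_ACQUIRE and RELEASE macros above boil down to an atomic exchange on acquire and a store with release semantics on release. A self-contained illustration of the same pair using the GCC __sync builtins, as stand-ins for the runtime's own __cilkrts_xchg (a sketch, not the runtime implementation):

    typedef struct { volatile int lock; } demo_mutex;

    static int demo_trylock(demo_mutex *m)
    {
        /* Nonzero if the exchange observed 0, i.e. we took the lock. */
        return __sync_lock_test_and_set(&m->lock, 1) == 0;
    }

    static void demo_unlock(demo_mutex *m)
    {
        __sync_lock_release(&m->lock);   /* store 0 with release semantics */
    }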
+ m->lock = 0; +} + +void spin_mutex_lock(struct spin_mutex *m) +{ + int count; + const int maxspin = 1000; /* SWAG */ + if (!TRY_ACQUIRE(m)) { + count = 0; + do { + do { + __cilkrts_short_pause(); + if (++count >= maxspin) { + /* let the OS reschedule every once in a while */ + __cilkrts_yield(); + count = 0; + } + } while (m->lock != 0); + } while (!TRY_ACQUIRE(m)); + } +} + +int spin_mutex_trylock(struct spin_mutex *m) +{ + return TRY_ACQUIRE(m); +} + +void spin_mutex_unlock(struct spin_mutex *m) +{ + RELEASE(m); +} + +void spin_mutex_destroy(struct spin_mutex *m) +{ + __cilkrts_free(m); +} + +/* End spin_mutex.c */ diff --git a/libcilkrts/runtime/spin_mutex.h b/libcilkrts/runtime/spin_mutex.h new file mode 100644 index 00000000000..b0045ab9313 --- /dev/null +++ b/libcilkrts/runtime/spin_mutex.h @@ -0,0 +1,129 @@ +/* spin_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file spin_mutex.h + * + * @brief Support for Cilk runtime mutexes. + * + * Cilk runtime mutexes are implemented as simple spin loops. + * + * This file is similar to a worker_mutex, except it does not have an + * owner field. + * + * TBD: This class, worker_mutex, and os_mutex overlap quite a bit in + * functionality. Can we unify these mutexes somehow? + */ +#ifndef INCLUDED_SPIN_MUTEX_DOT_H +#define INCLUDED_SPIN_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" +#include "cilk_malloc.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Mutexes are treated as an abstract data type within the Cilk + * runtime system. They are implemented as simple spin loops. + */ +typedef struct spin_mutex { + /** Mutex spin loop variable. 0 if unowned, 1 if owned. */ + volatile int lock; + + /** Padding so the mutex takes up a cache line. 
*/ + char pad[64/sizeof(int) - 1]; +} spin_mutex; + + +/** + * @brief Create a new Cilk spin_mutex. + * + * @return Returns an initialized spin mutex. + */ +COMMON_PORTABLE +spin_mutex* spin_mutex_create(); + +/** + * @brief Initialize a Cilk spin_mutex. + * + * @param m Spin_Mutex to be initialized. + */ +COMMON_PORTABLE +void spin_mutex_init(spin_mutex *m); + +/** + * @brief Acquire a Cilk spin_mutex. + * + * If statistics are being gathered, the time spent + * acquiring the spin_mutex will be attributed to the specified worker. + * + * @param m Spin_Mutex to be initialized. + */ +COMMON_PORTABLE +void spin_mutex_lock(struct spin_mutex *m); +/** + * @brief Attempt to lock a Cilk spin_mutex and fail if it isn't available. + * + * @param m Spin_Mutex to be acquired. + * + * @return 1 if the spin_mutex was acquired. + * @return 0 if the spin_mutex was not acquired. + */ +COMMON_PORTABLE +int spin_mutex_trylock(struct spin_mutex *m); + +/** + * @brief Release a Cilk spin_mutex. + * + * @param m Spin_Mutex to be released. + */ +COMMON_PORTABLE +void spin_mutex_unlock(struct spin_mutex *m); + +/** + * @brief Deallocate a Cilk spin_mutex. Currently does nothing. + * + * @param m Spin_Mutex to be deallocated. + */ +COMMON_PORTABLE +void spin_mutex_destroy(struct spin_mutex *m); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_SPIN_MUTEX_DOT_H) diff --git a/libcilkrts/runtime/stats.c b/libcilkrts/runtime/stats.c new file mode 100644 index 00000000000..3a420745039 --- /dev/null +++ b/libcilkrts/runtime/stats.c @@ -0,0 +1,172 @@ +/* stats.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************/ + +#include "stats.h" +#include "bug.h" +#include "os.h" +#include "local_state.h" + +#include <stdio.h> + +#define INVALID_START (0ULL - 1ULL) + +#ifdef CILK_PROFILE +/* MSVC does not support designated initializers, grrrr... */ +static const char *names[] = { + /*[INTERVAL_IN_SCHEDULER]*/ "in scheduler", + /*[INTERVAL_WORKING]*/ " of which: working", + /*[INTERVAL_IN_RUNTIME]*/ " of which: in runtime", + /*[INTERVAL_STEALING]*/ " of which: stealing", + /*[INTERVAL_STEAL_SUCCESS]*/ "steal success: detach", + /*[INTERVAL_STEAL_FAIL_EMPTYQ]*/ "steal fail: empty queue", + /*[INTERVAL_STEAL_FAIL_LOCK]*/ "steal fail: victim locked", + /*[INTERVAL_STEAL_FAIL_USER_WORKER]*/ "steal fail: user worker", + /*[INTERVAL_STEAL_FAIL_DEKKER]*/ "steal fail: dekker", + /*[INTERVAL_SYNC_CHECK]*/ "sync check", + /*[INTERVAL_THE_EXCEPTION_CHECK]*/ "THE exception check", + /*[INTERVAL_THE_EXCEPTION_CHECK_USELESS]*/ " of which: useless", + /*[INTERVAL_RETURNING]*/ "returning", + /*[INTERVAL_FINALIZE_CHILD]*/ "finalize child", + /*[INTERVAL_PROVABLY_GOOD_STEAL]*/ "provably good steal", + /*[INTERVAL_UNCONDITIONAL_STEAL]*/ "unconditional steal", + /*[INTERVAL_ALLOC_FULL_FRAME]*/ "alloc full frame", + /*[INTERVAL_FRAME_ALLOC_LARGE]*/ "large frame alloc", + /*[INTERVAL_FRAME_ALLOC]*/ "small frame alloc", + /*[INTERVAL_FRAME_ALLOC_GLOBAL]*/ " of which: to global pool", + /*[INTERVAL_FRAME_FREE_LARGE]*/ "large frame free", + /*[INTERVAL_FRAME_FREE]*/ "small frame free", + /*[INTERVAL_FRAME_FREE_GLOBAL]*/ " of which: to global pool", + /*[INTERVAL_MUTEX_LOCK]*/ "mutex lock", + /*[INTERVAL_MUTEX_LOCK_SPINNING]*/ " spinning", + /*[INTERVAL_MUTEX_LOCK_YIELDING]*/ " yielding", + /*[INTERVAL_MUTEX_TRYLOCK]*/ "mutex trylock", + /*[INTERVAL_FIBER_ALLOCATE]*/ "fiber_allocate", + /*[INTERVAL_FIBER_DEALLOCATE]*/ "fiber_deallocate", + /*[INTERVAL_FIBER_ALLOCATE_FROM_THREAD]*/ "fiber_allocate_from_thread", + /*[INTERVAL_FIBER_DEALLOCATE_FROM_THREAD]*/ "fiber_deallocate (thread)", + /*[INTERVAL_SUSPEND_RESUME_OTHER]*/ "fiber suspend self + resume", + /*[INTERVAL_DEALLOCATE_RESUME_OTHER]*/ "fiber deallocate self + resume", +}; +#endif + +void __cilkrts_init_stats(statistics *s) +{ + int i; + for (i = 0; i < INTERVAL_N; ++i) { + s->start[i] = INVALID_START; + s->accum[i] = 0; + s->count[i] = 0; + } + + s->stack_hwm = 0; +} + +#ifdef CILK_PROFILE +void __cilkrts_accum_stats(statistics *to, statistics *from) +{ + int i; + + for (i = 0; i < INTERVAL_N; ++i) { + to->accum[i] += from->accum[i]; + to->count[i] += from->count[i]; + from->accum[i] = 0; + from->count[i] = 0; + } + + if (from->stack_hwm > to->stack_hwm) + to->stack_hwm = from->stack_hwm; + from->stack_hwm = 0; +} + +void __cilkrts_note_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] == INVALID_START); + s->count[i]++; + } +} + +void __cilkrts_start_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] == INVALID_START); + s->start[i] = __cilkrts_getticks(); + s->count[i]++; + } +} + +void __cilkrts_stop_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] != INVALID_START); + s->accum[i] += __cilkrts_getticks() - s->start[i]; + s->start[i] = INVALID_START; + } +} + +void dump_stats_to_file(FILE *stat_file, statistics *s) +{ + int i; + fprintf(stat_file, "\nCILK PLUS RUNTIME SYSTEM STATISTICS:\n\n"); + + 
fprintf(stat_file, + " %-32s: %15s %10s %12s %10s\n", + "event", + "count", + "ticks", + "ticks/count", + "%total" + ); + for (i = 0; i < INTERVAL_N; ++i) { + fprintf(stat_file, " %-32s: %15llu", names[i], s->count[i]); + if (s->accum[i]) { + fprintf(stat_file, " %10.3g %12.3g %10.2f", + (double)s->accum[i], + (double)s->accum[i] / (double)s->count[i], + 100.0 * (double)s->accum[i] / + (double)s->accum[INTERVAL_IN_SCHEDULER]); + } + fprintf(stat_file, "\n"); + } +} +#endif // CILK_PROFILE + +/* End stats.c */ diff --git a/libcilkrts/runtime/stats.h b/libcilkrts/runtime/stats.h new file mode 100644 index 00000000000..aaa99274765 --- /dev/null +++ b/libcilkrts/runtime/stats.h @@ -0,0 +1,208 @@ +/* stats.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file stats.h + * + * @brief Support for gathering and reporting statistics on Cilk applications. + * + * Note that stats are normally NOT compiled in because it increases the + * overhead of stealing. To compile in profiling support, define CILK_PROFILE. + */ + +#ifndef INCLUDED_STATS_DOT_H +#define INCLUDED_STATS_DOT_H + +/* #define CILK_PROFILE 1 */ +// @note The CILK_PROFILE flag and intervals is known to be broken +// in at least programs with Windows exceptions. +// Enable this flag at your own peril. :) + +#include <cilk/common.h> +#include "rts-common.h" +#include "internal/abi.h" + +#ifdef CILK_PROFILE +#include <stdio.h> // Define FILE * +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/** @brief Events that we measure. 
*/ +enum interval +{ + INTERVAL_IN_SCHEDULER, ///< Time threads spend "bound" to Cilk + INTERVAL_WORKING, ///< Time spent working + INTERVAL_IN_RUNTIME, ///< Time spent executing runtime scheduling loop + INTERVAL_STEALING, ///< Time spent stealing work + INTERVAL_STEAL_SUCCESS, ///< Time to do a successful steal + INTERVAL_STEAL_FAIL_EMPTYQ, ///< Count of steal failures due to lack of stealable work + INTERVAL_STEAL_FAIL_LOCK, ///< Count of steal failures due to failure to lock worker + INTERVAL_STEAL_FAIL_USER_WORKER, ///< Count of steal failures by user workers which attempt to steal from another team + INTERVAL_STEAL_FAIL_DEKKER, ///< Count of steal failures due to Dekker protocol failure + INTERVAL_SYNC_CHECK, ///< Time spent processing syncs + INTERVAL_THE_EXCEPTION_CHECK, ///< Time spent performing THE exception checks + INTERVAL_THE_EXCEPTION_CHECK_USELESS, ///< Count of useless THE exception checks + INTERVAL_RETURNING, ///< Time spent returning from calls + INTERVAL_FINALIZE_CHILD, ///< Time spent in finalize_child + INTERVAL_PROVABLY_GOOD_STEAL, ///< Time spent in provably_good_steal + INTERVAL_UNCONDITIONAL_STEAL, ///< Time spent in unconditional_steal + INTERVAL_ALLOC_FULL_FRAME, ///< Time spent in __cilkrts_make_full_frame + INTERVAL_FRAME_ALLOC_LARGE, ///< Count of calls to __cilkrts_frame_malloc for buffers bigger than FRAME_MALLOC_MAX_SIZE or with a NULL worker + INTERVAL_FRAME_ALLOC, ///< Time spent allocating memory from worker buckets + INTERVAL_FRAME_ALLOC_GLOBAL, ///< Time spent calling memory allocator when buckets are empty + INTERVAL_FRAME_FREE_LARGE, ///< Count of calls to __cilkrts_frame_malloc for buffers bigger than FRAME_MALLOC_MAX_SIZE or with a NULL worker + INTERVAL_FRAME_FREE, ///< Time spent freeing memory to worker buckets + INTERVAL_FRAME_FREE_GLOBAL, ///< Time spent calling memory deallocator when buckets are full + INTERVAL_MUTEX_LOCK, ///< Count of calls to __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_LOCK_SPINNING, ///< Time spent spinning in __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_LOCK_YIELDING, ///< Time spent yielding in __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_TRYLOCK, ///< Count of calls to __cilkrts_mutex_trylock + INTERVAL_FIBER_ALLOCATE, ///< Time spent calling cilk_fiber_allocate + INTERVAL_FIBER_DEALLOCATE, ///< Time spent calling cilk_fiber_deallocate (not from thread) + INTERVAL_FIBER_ALLOCATE_FROM_THREAD, ///< Time spent calling cilk_fiber_allocate_from_thread + INTERVAL_FIBER_DEALLOCATE_FROM_THREAD, ///< Time spent calling cilk_fiber_deallocate (from thread) + INTERVAL_SUSPEND_RESUME_OTHER, ///< Count of fiber suspend_self_and_resume_other + INTERVAL_DEALLOCATE_RESUME_OTHER, ///< Count of fiber deallocate_self_and_resume_other + INTERVAL_N ///< Number of intervals, must be last +}; + +/** + * @brief Struct that collects of all runtime statistics. + * + * There is an instance of this structure in each worker's + * local_state, as well as one in the @c global_state_t which will be + * used to accumulate the per-worker stats. + */ +typedef struct statistics +{ + /** Number of times each interval is entered */ + unsigned long long count[INTERVAL_N]; + + /** + * Time when the system entered each interval, in system-dependent + * "ticks" + */ + unsigned long long start[INTERVAL_N]; + + /** Total time spent in each interval, in system-dependent "ticks" */ + unsigned long long accum[INTERVAL_N]; + + /** + * Largest global number of stacks seen by this worker. 
+ * The true maximum at end of execution is the max of the + * worker maxima. + */ + long stack_hwm; +} statistics; + +/** + * Initializes a statistics structure + * + * @param s The statistics structure to be initialized. + */ +COMMON_PORTABLE void __cilkrts_init_stats(statistics *s); + +/** + * @brief Sums statistics from worker to the global struct + * + * @param to The statistics structure that will accumulate the information. + * This structure is usually @c g->stats. + * @param from The statistics structure that will be accumulated. + * This structure is usually statistics kept per worker. + */ +COMMON_PORTABLE +void __cilkrts_accum_stats(statistics *to, statistics *from); + +/** + * @brief Mark the start of an interval by saving the current tick count. + * + * @pre Start time == INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_start_interval(__cilkrts_worker *w, enum interval i); + +/** + * @brief Mark the end of an interval by adding the ticks since the + * start to the accumulated time. + * + * @pre Start time != INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_stop_interval(__cilkrts_worker *w, enum interval i); + +/** + * @brief Start and stop interval I, charging zero time against it + * + * Precondition: + * - Start time == INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_note_interval(__cilkrts_worker *w, enum interval i); + +#ifdef CILK_PROFILE +COMMON_PORTABLE +void dump_stats_to_file(FILE *stat_file, statistics *s); +#endif + + +#ifdef CILK_PROFILE +# define START_INTERVAL(w, i) __cilkrts_start_interval(w, i); +# define STOP_INTERVAL(w, i) __cilkrts_stop_interval(w, i); +# define NOTE_INTERVAL(w, i) __cilkrts_note_interval(w, i); +#else +/** Start an interval. No effect unless CILK_PROFILE is defined. */ +# define START_INTERVAL(w, i) +/** End an interval. No effect unless CILK_PROFILE is defined. */ +# define STOP_INTERVAL(w, i) +/** Increment a counter. No effect unless CILK_PROFILE is defined. */ +# define NOTE_INTERVAL(w, i) +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_STATS_DOT_H) diff --git a/libcilkrts/runtime/symbol_test.c b/libcilkrts/runtime/symbol_test.c new file mode 100644 index 00000000000..1113ecd44cd --- /dev/null +++ b/libcilkrts/runtime/symbol_test.c @@ -0,0 +1,62 @@ +/* symbol_test.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* simple program to verify that there are no undefined symbols in the runtime. + * If the runtime uses any symbols that are not defined, compiling this program + * will cause a linker error. + */ + +extern void* __cilkrts_global_state; +void *volatile p; + +void foo () { } +int main () +{ + int i; + long long j; + + _Cilk_spawn foo(); + _Cilk_for (i = 0; i < 2; ++i) + foo(); + _Cilk_for (j = 0; j < 2; ++j) + foo(); + p = __cilkrts_global_state; + return 0; +} + +/* End symbol_test.c */ diff --git a/libcilkrts/runtime/sysdep-unix.c b/libcilkrts/runtime/sysdep-unix.c new file mode 100644 index 00000000000..194681fffc5 --- /dev/null +++ b/libcilkrts/runtime/sysdep-unix.c @@ -0,0 +1,794 @@ +/* + * sysdep-unix.c + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ************************************************************************** + */ + +#ifdef __linux__ + // define _GNU_SOURCE before *any* #include. 
+ // Even <stdint.h> will break later #includes if this macro is not + // already defined when it is #included. +# define _GNU_SOURCE +#endif + +#include "sysdep.h" +#include "os.h" +#include "bug.h" +#include "local_state.h" +#include "signal_node.h" +#include "full_frame.h" +#include "jmpbuf.h" +#include "cilk_malloc.h" +#include "reducer_impl.h" +#include "metacall_impl.h" + + +// On x86 processors (but not MIC processors), the compiler generated code to +// save the FP state (rounding mode and the like) before calling setjmp. We +// will need to restore that state when we resume. +#ifndef __MIC__ +# if defined(__i386__) || defined(__x86_64) +# define RESTORE_X86_FP_STATE +# endif // defined(__i386__) || defined(__x86_64) +#endif // __MIC__ + +// contains notification macros for VTune. +#include "cilk-ittnotify.h" + +#include <stddef.h> + +#ifdef __CYGWIN__ +// On Cygwin, string.h doesnt declare strcasecmp if __STRICT_ANSI__ is defined +# undef __STRICT_ANSI__ +#endif + +#include <string.h> +#include <pthread.h> +#include <unistd.h> +#include <alloca.h> + +#ifdef __APPLE__ +//# include <scheduler.h> // Angle brackets include Apple's scheduler.h, not ours. +#endif + +#ifdef __linux__ +# include <sys/resource.h> +# include <sys/sysinfo.h> +#endif + +#ifdef __FreeBSD__ +# include <sys/resource.h> +// BSD does not define MAP_ANONYMOUS, but *does* define MAP_ANON. Aren't standards great! +# define MAP_ANONYMOUS MAP_ANON +#endif + +#ifdef __VXWORKS__ +# include <vxWorks.h> +# include <vxCpuLib.h> +#endif + +struct global_sysdep_state +{ + pthread_t *threads; ///< Array of pthreads for system workers + size_t pthread_t_size; ///< for cilk_db +}; + +static void internal_enforce_global_visibility(); + + +COMMON_SYSDEP +void __cilkrts_init_worker_sysdep(struct __cilkrts_worker *w) +{ + ITT_SYNC_CREATE(w, "Scheduler"); +} + +COMMON_SYSDEP +void __cilkrts_destroy_worker_sysdep(struct __cilkrts_worker *w) +{ +} + +COMMON_SYSDEP +void __cilkrts_init_global_sysdep(global_state_t *g) +{ + internal_enforce_global_visibility(); + + __cilkrts_init_tls_variables(); + + CILK_ASSERT(g->total_workers >= g->P - 1); + g->sysdep = __cilkrts_malloc(sizeof (struct global_sysdep_state)); + CILK_ASSERT(g->sysdep); + g->sysdep->pthread_t_size = sizeof (pthread_t); + + // TBD: Should this value be g->total_workers, or g->P? + // Need to check what we are using this field for. + g->sysdep->threads = __cilkrts_malloc(sizeof(pthread_t) * g->total_workers); + CILK_ASSERT(g->sysdep->threads); + + return; +} + +COMMON_SYSDEP +void __cilkrts_destroy_global_sysdep(global_state_t *g) +{ + if (g->sysdep->threads) + __cilkrts_free(g->sysdep->threads); + __cilkrts_free(g->sysdep); +} + +/************************************************************* + Creation of worker threads: +*************************************************************/ + +static void internal_run_scheduler_with_exceptions(__cilkrts_worker *w) +{ + /* We assume the stack grows down. */ + char var; + __cilkrts_cilkscreen_establish_c_stack(&var - 1000000, &var); + + __cilkrts_run_scheduler_with_exceptions(w); +} + + + +/* + * scheduler_thread_proc_for_system_worker + * + * Thread start function called when we start a new worker. + * + */ +NON_COMMON void* scheduler_thread_proc_for_system_worker(void *arg) +{ + /*int status;*/ + __cilkrts_worker *w = (__cilkrts_worker *)arg; + +#ifdef __INTEL_COMPILER +#ifdef USE_ITTNOTIFY + // Name the threads for Advisor. They don't want a worker number. 
+ __itt_thread_set_name("Cilk Worker"); +#endif // defined USE_ITTNOTIFY +#endif // defined __INTEL_COMPILER + + /* Worker startup is serialized + status = pthread_mutex_lock(&__cilkrts_global_mutex); + CILK_ASSERT(status == 0);*/ + CILK_ASSERT(w->l->type == WORKER_SYSTEM); + /*status = pthread_mutex_unlock(&__cilkrts_global_mutex); + CILK_ASSERT(status == 0);*/ + + __cilkrts_set_tls_worker(w); + + // Create a cilk fiber for this worker on this thread. + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE_FROM_THREAD) { + w->l->scheduling_fiber = cilk_fiber_allocate_from_thread(); + cilk_fiber_set_owner(w->l->scheduling_fiber, w); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE_FROM_THREAD); + + internal_run_scheduler_with_exceptions(w); + + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD) { + // Deallocate the scheduling fiber. This operation reverses the + // effect cilk_fiber_allocate_from_thread() and must be done in this + // thread before it exits. + int ref_count = cilk_fiber_deallocate_from_thread(w->l->scheduling_fiber); + // Scheduling fibers should never have extra references to them. + // We only get extra references into fibers because of Windows + // exceptions. + CILK_ASSERT(0 == ref_count); + w->l->scheduling_fiber = NULL; + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD); + + return 0; +} + + +/* + * __cilkrts_user_worker_scheduling_stub + * + * Routine for the scheduling fiber created for an imported user + * worker thread. This method is analogous to + * scheduler_thread_proc_for_system_worker. + * + */ +void __cilkrts_user_worker_scheduling_stub(cilk_fiber* fiber, void* null_arg) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + // Sanity check. + CILK_ASSERT(WORKER_USER == w->l->type); + + // Enter the scheduling loop on the user worker. + // This function will never return. + __cilkrts_run_scheduler_with_exceptions(w); + + // A WORKER_USER, at some point, will resume on the original stack and leave + // Cilk. Under no circumstances do we ever exit off of the bottom of this + // stack. + CILK_ASSERT(0); +} + +/** + * We are exporting a function with this name to Inspector? + * What a confusing name... + * + * This function is exported so Piersol's stack trace displays + * reasonable information. + */ +void* __cilkrts_worker_stub(void* arg) +{ + return scheduler_thread_proc_for_system_worker(arg); +} + + + +// /* Return the lesser of the argument and the operating system +// limit on the number of workers (threads) that may or ought +// to be created. */ +// int sysdep_thread_limit(int n, int physical_cpus) +// { +// /* On Linux thread creation fails somewhere short of the +// number of available processes. */ +// struct rlimit lim; + +// if (n > 256 + 2 * physical_cpus) +// n = 256 + 2 * physical_cpus; + +// if (getrlimit(RLIMIT_NPROC, &lim) == 0 && lim.rlim_cur != RLIM_INFINITY) +// { +// /* If the limit reads 0 or absurdly small, ignore it. */ +// unsigned int maxproc = (lim.rlim_cur * 3 + 3) / 4; +// if (maxproc > 8 + 2 * physical_cpus && maxproc < n) +// n = maxproc; +// } +// return n; +// } + + + +static void write_version_file (global_state_t *, int); + +/* Create n worker threads from base..top-1 + */ +static void create_threads(global_state_t *g, int base, int top) +{ + // TBD(11/30/12): We want to insert code providing the option of + // pinning system workers to cores. 
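One hedged sketch of what such pinning could look like on Linux, where pthread_setaffinity_np requires _GNU_SOURCE (already defined at the top of this file for Linux builds); the helper below is hypothetical and not part of the committed runtime:

    #include <sched.h>
    #include <pthread.h>

    static int pin_thread_to_cpu(pthread_t thread, int cpu)
    {
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        /* Returns 0 on success, an errno value otherwise. */
        return pthread_setaffinity_np(thread, sizeof(set), &set);
    }

A caller could, for example, invoke pin_thread_to_cpu(g->sysdep->threads[i], i) after each successful pthread_create in the loop below.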
+ for (int i = base; i < top; i++) { + int status = pthread_create(&g->sysdep->threads[i], + NULL, + scheduler_thread_proc_for_system_worker, + g->workers[i]); + if (status != 0) + __cilkrts_bug("Cilk runtime error: thread creation (%d) failed: %d\n", i, status); + } +} + +#if PARALLEL_THREAD_CREATE +static int volatile threads_created = 0; + +// Create approximately half of the worker threads, and then become a worker +// ourselves. +static void * create_threads_and_work (void * arg) +{ + global_state_t *g = ((__cilkrts_worker *)arg)->g; + + create_threads(g, g->P/2, g->P-1); + // Let the initial thread know that we're done. + threads_created = 1; + + // Ideally this turns into a tail call that wipes out this stack frame. + return scheduler_thread_proc_for_system_worker(arg); +} +#endif +void __cilkrts_start_workers(global_state_t *g, int n) +{ + g->workers_running = 1; + g->work_done = 0; + + if (!g->sysdep->threads) + return; + + // Do we actually have any threads to create? + if (n > 0) + { +#if PARALLEL_THREAD_CREATE + int status; + // We create (a rounded up) half of the threads, thread one creates the rest + int half_threads = (n+1)/2; + + // Create the first thread passing a different thread function, so that it creates threads itself + status = pthread_create(&g->sysdep->threads[0], NULL, create_threads_and_work, g->workers[0]); + + if (status != 0) + __cilkrts_bug("Cilk runtime error: thread creation (0) failed: %d\n", status); + + // Then the rest of the ones we have to create + create_threads(g, 1, half_threads); + + // Now wait for the first created thread to tell us it's created all of its threads. + // We could maybe drop this a bit lower and overlap with write_version_file. + while (!threads_created) + __cilkrts_yield(); +#else + // Simply create all the threads linearly here. + create_threads(g, 0, n); +#endif + } + // write the version information to a file if the environment is configured + // for it (the function makes the check). + write_version_file(g, n); + + + return; +} + +void __cilkrts_stop_workers(global_state_t *g) +{ + int i; + + // Tell the workers to give up + + g->work_done = 1; + + if (g->workers_running == 0) + return; + + if (!g->sysdep->threads) + return; + + /* Make them all runnable. */ + if (g->P > 1) { + CILK_ASSERT(g->workers[0]->l->signal_node); + signal_node_msg(g->workers[0]->l->signal_node, 1); + } + + for (i = 0; i < g->P - 1; ++i) { + int sc_status; + void *th_status; + + sc_status = pthread_join(g->sysdep->threads[i], &th_status); + if (sc_status != 0) + __cilkrts_bug("Cilk runtime error: thread join (%d) failed: %d\n", i, sc_status); + } + + g->workers_running = 0; + + + return; +} + + +/* + * @brief Returns the stack address for resuming execution of sf. + * + * This method takes in the top of the stack to use, and then returns + * a properly aligned address for resuming execution of sf. + * + * @param sf - The stack frame we want to resume executing. + * @param stack_base - The top of the stack we want to execute sf on. + * + */ +static char* get_sp_for_executing_sf(char* stack_base, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ +// The original calculation that had been done to correct the stack +// pointer when resuming execution. +// +// But this code was never getting called in the eng branch anyway... +// +// TBD(11/30/12): This logic needs to be revisited to make sure that +// we are doing the proper calculation in reserving space for outgoing +// arguments on all platforms and architectures. 
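For reference, the alignment step performed by the live code further below is the standard align-down-to-a-power-of-two computation. A minimal stand-alone version, assuming the same 256-byte boundary:

    #include <stdint.h>

    static inline char *align_down_256(char *p)
    {
        /* Clear the low 8 bits: rounds p down to the next 256-byte boundary. */
        return (char *)((uintptr_t)p & ~(uintptr_t)(256 - 1));
    }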
+#if 0 + /* Preserve outgoing argument space and stack alignment on steal. + Outgoing argument space is bounded by the difference between + stack and frame pointers. Some user code is known to rely on + 16 byte alignment. Maintain 32 byte alignment for future + compatibility. */ +#define SMASK 31 /* 32 byte alignment */ + if (sf) { + char *fp = FP(sf), *sp = SP(sf); + int fp_align = (int)(size_t)fp & SMASK; + ptrdiff_t space = fp - sp; + + fprintf(stderr, "Here: fp = %p, sp = %p\n", fp, sp); + char *top_aligned = (char *)((((size_t)stack_base - SMASK) & ~(size_t)SMASK) | fp_align); + /* Don't allocate an unreasonable amount of stack space. */ + + fprintf(stderr, "Here: stack_base = %p, top_aligned=%p, space=%ld\n", + stack_base, top_aligned, space); + if (space < 32) + space = 32 + (space & SMASK); + else if (space > 40 * 1024) + space = 40 * 1024 + (space & SMASK); + + return top_aligned - space; + } +#endif + +#define PERFORM_FRAME_SIZE_CALCULATION 0 + + char* new_stack_base = stack_base - 256; + +#if PERFORM_FRAME_SIZE_CALCULATION + // If there is a frame size saved, then use that as the + // correction instead of 256. + if (ff->frame_size > 0) { + if (ff->frame_size < 40*1024) { + new_stack_base = stack_base - ff->frame_size; + } + else { + // If for some reason, our frame size calculation is giving us + // a number which is bigger than about 10 pages, then + // there is likely something wrong here? Don't allocate + // an unreasonable amount of space. + new_stack_base = stack_base - 40*1024; + } + } +#endif + + // Whatever correction we choose, align the final stack top. + // This alignment seems to be necessary in particular on 32-bit + // Linux, and possibly Mac. (Is 32-byte alignment is sufficient?) + /* 256-byte alignment. Why not? */ + const uintptr_t align_mask = ~(256 -1); + new_stack_base = (char*)((size_t)new_stack_base & align_mask); + return new_stack_base; +} + +char* sysdep_reset_jump_buffers_for_resume(cilk_fiber* fiber, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p (fiber_proc_to_resume), Fiber %p. sf = %p. ff=%p, ff->sync_sp=%p\n", + cilkos_get_current_thread_id(), + fiber, + sf, + ff, ff->sync_sp); +#endif + + CILK_ASSERT(fiber); + void* sp = (void*)get_sp_for_executing_sf(cilk_fiber_get_stack_base(fiber), ff, sf); + SP(sf) = sp; + + /* Debugging: make sure stack is accessible. */ + ((volatile char *)sp)[-1]; + + // Adjust the saved_sp to account for the SP we're about to run. This will + // allow us to track fluctations in the stack +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p, about to take stack ff=%p, sp=%p, sync_sp=%p\n", + cilkos_get_current_thread_id(), + ff, + sp, + ff->sync_sp); +#endif + __cilkrts_take_stack(ff, sp); + return sp; +} + + +NORETURN sysdep_longjmp_to_sf(char* new_sp, + __cilkrts_stack_frame *sf, + full_frame *ff_for_exceptions /* UNUSED on Unix */) +{ +#if FIBER_DEBUG >= 3 + fprintf(stderr, + "ThreadId=%p. resume user code, sf=%p, new_sp = %p, original SP(sf) = %p, FP(sf) = %p\n", + cilkos_get_current_thread_id(), sf, new_sp, SP(sf), FP(sf)); +#endif + + // Set the stack pointer. + SP(sf) = new_sp; + +#ifdef RESTORE_X86_FP_STATE + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) { + // Restore the floating point state that was set in this frame at the + // last spawn. + // + // This feature is only available in ABI 1 or later frames, and only + // needed on IA64 or Intel64 processors. 
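To make the note above concrete: on x86 the floating point state in question is essentially the SSE control/status register (MXCSR) and the x87 control word, which hold the rounding mode and exception masks. A hedged sketch of the SSE half using the standard intrinsics (illustrative only; the runtime saves this state at spawn time and restore_x86_fp_state() is its restoring counterpart):

    #include <xmmintrin.h>

    static unsigned int saved_mxcsr;

    static void save_sse_control(void)    { saved_mxcsr = _mm_getcsr(); }
    static void restore_sse_control(void) { _mm_setcsr(saved_mxcsr); }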
+ restore_x86_fp_state(sf); + } +#endif + + CILK_LONGJMP(sf->ctx); +} + + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <errno.h> + + +void __cilkrts_make_unrunnable_sysdep(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int is_loot, + const char *why) +{ + (void)w; /* unused */ + sf->except_data = 0; + + if (is_loot) + { + if (ff->frame_size == 0) + ff->frame_size = __cilkrts_get_frame_size(sf); + + // Null loot's sp for debugging purposes (so we'll know it's not valid) + SP(sf) = 0; + } +} + +/* + * __cilkrts_sysdep_is_worker_thread_id + * + * Returns true if the thread ID specified matches the thread ID we saved + * for a worker. + */ + +int __cilkrts_sysdep_is_worker_thread_id(global_state_t *g, + int i, + void *thread_id) +{ +#if defined( __linux__) || defined(__VXWORKS__) + pthread_t tid = *(pthread_t *)thread_id; + if (i < 0 || i > g->total_workers) + return 0; + return g->sysdep->threads[i] == tid; +#else + // Needs to be implemented + return 0; +#endif +} + + + + +/************************************************************* + Version information: +*************************************************************/ + +#include <dlfcn.h> +#include "internal/cilk_version.h" +#include <stdio.h> +#include <sys/utsname.h> + +#ifdef __VXWORKS__ +#include <version.h> +# endif + +/* (Non-static) dummy function is used by get_runtime_path() to find the path + * to the .so containing the Cilk runtime. + */ +void dummy_function() { } + +/* return a string with the path to the Cilk runtime, or "unknown" if the path + * cannot be determined. + */ +static const char *get_runtime_path () +{ +#ifdef __CYGWIN__ + // Cygwin doesn't support dladdr, which sucks + return "unknown"; +#else + Dl_info info; + if (0 == dladdr(dummy_function, &info)) return "unknown"; + return info.dli_fname; +#endif +} + +/* if the environment variable, CILK_VERSION, is defined, writes the version + * information to the specified file. + * g is the global state that was just created, and n is the number of workers + * that were made (or requested from RML) for it. + */ +static void write_version_file (global_state_t *g, int n) +{ + const char *env; // environment variable. + char buf[256]; // print buffer. + time_t t; + FILE *fp; + struct utsname sys_info; + int err; // error code from system calls. + + // if CILK_VERSION is not set, or if the file cannot be opened, fail + // silently. Otherwise open the file for writing (or use stderr or stdout + // if the user specifies). + if (NULL == (env = getenv("CILK_VERSION"))) return; + if (0 == strcasecmp(env, "stderr")) fp = stderr; + else if (0 == strcasecmp(env, "stdout")) fp = stdout; + else if (NULL == (fp = fopen(env, "w"))) return; + + // get a string for the current time. E.g., + // Cilk runtime initialized: Thu Jun 10 13:28:00 2010 + t = time(NULL); + strftime(buf, 256, "%a %b %d %H:%M:%S %Y", localtime(&t)); + fprintf(fp, "Cilk runtime initialized: %s\n", buf); + + // Print runtime info. 
E.g., + // Cilk runtime information + // ======================== + // Cilk version: 2.0.0 Build 9184 + // Built by willtor on host willtor-desktop + // Compilation date: Thu Jun 10 13:27:42 2010 + // Compiled with ICC V99.9.9, ICC build date: 20100610 + + fprintf(fp, "\nCilk runtime information\n"); + fprintf(fp, "========================\n"); + fprintf(fp, "Cilk version: %d.%d.%d Build %d\n", + VERSION_MAJOR, + VERSION_MINOR, + VERSION_REV, + VERSION_BUILD); +#ifdef __VXWORKS__ + char * vxWorksVer = VXWORKS_VERSION; + fprintf(fp, "Cross compiled for %s\n",vxWorksVer); + // user and host not avalible if VxWorks cross compiled on windows build host +#else + + // User and host are not available for GCC builds +#ifdef BUILD_USER + fprintf(fp, "Built by "BUILD_USER" on host "BUILD_HOST"\n"); +#endif // BUILD_USER +#endif // __VXWORKS__ + + // GCC has requested that this be removed for GCC builds +#ifdef BUILD_USER + fprintf(fp, "Compilation date: "__DATE__" "__TIME__"\n"); +#endif // BUILD_USER + +#ifdef __INTEL_COMPILER + // Compiled by the Intel C/C++ compiler. + fprintf(fp, "Compiled with ICC V%d.%d.%d, ICC build date: %d\n", + __INTEL_COMPILER / 100, + (__INTEL_COMPILER / 10) % 10, + __INTEL_COMPILER % 10, + __INTEL_COMPILER_BUILD_DATE); +#else + // Compiled by GCC. + fprintf(fp, "Compiled with GCC V%d.%d.%d\n", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__); +#endif // defined __INTEL_COMPILER + + // Print system info. E.g., + // System information + // ================== + // Cilk runtime path: /opt/icc/64/lib/libcilkrts.so.5 + // System OS: Linux, release 2.6.28-19-generic + // System architecture: x86_64 + + err = uname(&sys_info); + fprintf(fp, "\nSystem information\n"); + fprintf(fp, "==================\n"); + fprintf(fp, "Cilk runtime path: %s\n", get_runtime_path()); + fprintf(fp, "System OS: %s, release %s\n", + err < 0 ? "unknown" : sys_info.sysname, + err < 0 ? "?" : sys_info.release); + fprintf(fp, "System architecture: %s\n", + err < 0 ? "unknown" : sys_info.machine); + + // Print thread info. E.g., + // Thread information + // ================== + // System cores: 8 + // Cilk workers requested: 8 + // Thread creator: Private + + fprintf(fp, "\nThread information\n"); + fprintf(fp, "==================\n"); +#ifdef __VXWORKS__ + fprintf(fp, "System cores: %d\n", (int)__builtin_popcount(vxCpuEnabledGet())); +#else + fprintf(fp, "System cores: %d\n", (int)sysconf(_SC_NPROCESSORS_ONLN)); +#endif + fprintf(fp, "Cilk workers requested: %d\n", n); +#if (PARALLEL_THREAD_CREATE) + fprintf(fp, "Thread creator: Private (parallel)\n"); +#else + fprintf(fp, "Thread creator: Private\n"); +#endif + + if (fp != stderr && fp != stdout) fclose(fp); + else fflush(fp); // flush the handle buffer if it is stdout or stderr. +} + + +/* + * __cilkrts_establish_c_stack + * + * Tell Cilkscreen about the user stack bounds. + * + * Note that the Cilk V1 runtime only included the portion of the stack from + * the entry into Cilk, down. We don't appear to be able to find that, but + * I think this will be sufficient. + */ + +void __cilkrts_establish_c_stack(void) +{ + /* FIXME: Not implemented. 
*/ + + /* TBD: Do we need this */ + /* + void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end); + + size_t r; + MEMORY_BASIC_INFORMATION mbi; + + r = VirtualQuery (&mbi, + &mbi, + sizeof(mbi)); + + __cilkrts_cilkscreen_establish_c_stack((char *)mbi.BaseAddress, + (char *)mbi.BaseAddress + mbi.RegionSize); + */ +} + + +/* + * internal_enforce_global_visibility + * + * Ensure global visibility of public symbols, for proper Cilk-TBB interop. + * + * If Cilk runtime is loaded dynamically, its symbols might remain unavailable + * for global search with dladdr; that might prevent TBB from finding Cilk + * in the process address space and initiating the interop protocol. + * The workaround is for the library to open itself with RTLD_GLOBAL flag. + */ + +static __attribute__((noinline)) +void internal_enforce_global_visibility() +{ + void* handle = dlopen( get_runtime_path(), RTLD_GLOBAL|RTLD_LAZY ); + + /* For proper reference counting, close the handle immediately. */ + if( handle) dlclose(handle); +} + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/sysdep.h b/libcilkrts/runtime/sysdep.h new file mode 100644 index 00000000000..ea939acc124 --- /dev/null +++ b/libcilkrts/runtime/sysdep.h @@ -0,0 +1,285 @@ +/* sysdep.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file sysdep.h + * + * @brief Common system-dependent functions + */ + +#ifndef INCLUDED_SYSDEP_DOT_H +#define INCLUDED_SYSDEP_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "global_state.h" +#include "full_frame.h" +#include "os.h" +#include "os_mutex.h" + +/** + * @brief Default page size for Cilk stacks. 
+ * + * All Cilk stacks should have size that is a multiple of this value. + */ +#define PAGE 4096 + +/** + * @brief Size of a scheduling stack. + * + * A scheduling stack is used to by system workers to execute runtime + * code. Since this stack is only executing runtime functions, we + * don't need it to be a full size stack. + * + * The number "18" should be small since the runtime doesn't require a + * large stack, but large enough to call "printf" for debugging. + */ +#define CILK_SCHEDULING_STACK_SIZE (18*PAGE) + +__CILKRTS_BEGIN_EXTERN_C + + +/** + * Code to initialize the system-dependent portion of the global_state_t + * + * @param g Pointer to the global state. + */ +COMMON_SYSDEP +void __cilkrts_init_global_sysdep(global_state_t *g); + +/** + * Code to clean up the system-dependent portion of the global_state_t + * + * @param g Pointer to the global state. + */ +COMMON_SYSDEP +void __cilkrts_destroy_global_sysdep(global_state_t *g); + +/** + * Passes stack range to Cilkscreen. This functionality should be moved + * into Cilkscreen. + */ +COMMON_SYSDEP +void __cilkrts_establish_c_stack(void); + + +/** + * Save system dependent information in the full_frame and + * __cilkrts_stack_frame. Part of promoting a + * __cilkrts_stack_frame to a full_frame. + * + * @param w The worker the frame was running on. Not used. + * @param ff The full frame that is being created for the + * __cilkrts_stack_frame. + * @param sf The __cilkrts_stack_frame that's being promoted + * to a full frame. + * @param state_valid ? + * @param why A description of why make_unrunnable was called. + * Used for debugging. + */ +COMMON_SYSDEP +void __cilkrts_make_unrunnable_sysdep(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int state_valid, + const char *why); + + +/** + * OS-specific code to spawn worker threads. + * + * @param g The global state. + * @param n Number of worker threads to start. + */ +COMMON_SYSDEP +void __cilkrts_start_workers(global_state_t *g, int n); + +/** + * @brief OS-specific code to stop worker threads. + * + * @param g The global state. + */ +COMMON_SYSDEP +void __cilkrts_stop_workers(global_state_t *g); + +/** + * @brief Imports a user thread the first time it returns to a stolen parent. + * + * The thread has been bound to a worker, but additional steps need to + * be taken to start running a scheduling loop. + * + * @param w The worker bound to the thread. + */ +COMMON_SYSDEP +void __cilkrts_sysdep_import_user_thread(__cilkrts_worker *w); + +/** + * @brief Function to be run for each of the system worker threads. + * + * This declaration also appears in cilk/cilk_undocumented.h -- don't + * change one declaration without also changing the other. + * + * @param arg The context value passed to the thread creation routine for + * the OS we're running on. + * + * @returns OS dependent. + */ +#ifdef _WIN32 +/* Do not use CILK_API because __cilkrts_worker_stub must be __stdcall */ +CILK_EXPORT unsigned __CILKRTS_NOTHROW __stdcall +__cilkrts_worker_stub(void *arg); +#else +/* Do not use CILK_API because __cilkrts_worker_stub have default visibility */ +__attribute__((visibility("default"))) +void* __CILKRTS_NOTHROW __cilkrts_worker_stub(void *arg); +#endif + +/** + * Initialize any OS-depenendent portions of a newly created + * __cilkrts_worker. + * + * Exported for Piersol. Without the export, Piersol doesn't display + * useful information in the stack trace. 
This declaration also appears in + * cilk/cilk_undocumented.h -- do not modify one without modifying the other. + * + * @param w The worker being initialized. + */ +COMMON_SYSDEP +CILK_EXPORT +void __cilkrts_init_worker_sysdep(__cilkrts_worker *w); + +/** + * Deallocate any OS-depenendent portions of a __cilkrts_worker. + * + * @param w The worker being deallocaed. + */ +COMMON_SYSDEP +void __cilkrts_destroy_worker_sysdep(__cilkrts_worker *w); + +/** + * Called to do any OS-dependent setup before starting execution on a + * frame. Mostly deals with exception handling data. + * + * @param w The worker the frame will run on. + * @param ff The full_frame that is about to be resumed. + */ +COMMON_SYSDEP +void __cilkrts_setup_for_execution_sysdep(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief OS-specific implementaton of resetting fiber and frame state + * to resume exeuction. + * + * This method: + * 1. Calculates the value of stack pointer where we should resume + * execution of "sf". This calculation uses info stored in the + * fiber, and takes into account alignment and frame size. + * 2. Updates sf and ff to match the calculated stack pointer. + * + * On Unix, the stack pointer calculation looks up the base of the + * stack from the fiber. + * + * On Windows, this calculation is calls "alloca" to find a stack + * pointer on the currently executing stack. Thus, the Windows code + * assumes @c fiber is the currently executing fiber. + * + * @param fiber fiber to resume execution on. + * @param ff full_frame for the frame we're resuming. + * @param sf __cilkrts_stack_frame that we should resume + * @return The calculated stack pointer. + */ +COMMON_SYSDEP +char* sysdep_reset_jump_buffers_for_resume(cilk_fiber* fiber, + full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief System-dependent longjmp to user code for resuming execution + * of a @c __cilkrts_stack_frame. + * + * This method: + * - Changes the stack pointer in @c sf to @c new_sp. + * - If @c ff_for_exceptions is not NULL, changes fields in @c sf and + * @c ff_for_exceptions for exception processing. + * - Restores any floating point state + * - Finishes with a longjmp to user code, never to return. + * + * @param new_sp stack pointer where we should resume execution + * @param sf @c __cilkrts_stack_frame for the frame we're resuming. + * @param ff_for_exceptions full_frame to safe exception info into, if necessary + */ +COMMON_SYSDEP +NORETURN +sysdep_longjmp_to_sf(char* new_sp, + __cilkrts_stack_frame *sf, + full_frame *ff_for_exceptions); + +/** + * @brief System-dependent code to save floating point control information + * to a @c __cilkrts_stack_frame. This function will be called by compilers + * that cannot inline the code. + * + * Note that this function does *not* save the current floating point + * registers. It saves the floating point control words that control + * precision and rounding and stuff like that. + * + * This function will be a noop for architectures that don't have warts + * like the floating point control words, or where the information is + * already being saved by the setjmp. + * + * @param sf @c __cilkrts_stack_frame for the frame we're + * saving the floating point control information in. + */ +COMMON_SYSDEP +void +sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf); + + +/** + * @brief restore x86 floating point state + * + * Only used for x86 and Intel64 processors + */ +COMMON_SYSDEP +void restore_x86_fp_state(__cilkrts_stack_frame *sf); + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_SYSDEP_DOT_H) diff --git a/libcilkrts/runtime/worker_mutex.c b/libcilkrts/runtime/worker_mutex.c new file mode 100644 index 00000000000..380d6255a0c --- /dev/null +++ b/libcilkrts/runtime/worker_mutex.c @@ -0,0 +1,121 @@ +/* worker_mutex.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "worker_mutex.h" +#include "bug.h" +#include "os.h" +#include "stats.h" + +/* m->lock == 1 means that mutex M is locked */ +#define TRY_ACQUIRE(m) (__cilkrts_xchg(&(m)->lock, 1) == 0) + +/* ICC 11.1+ understands release semantics and generates an + ordinary store with a software memory barrier. */ +#if __ICC >= 1110 +#define RELEASE(m) __sync_lock_release(&(m)->lock) +#else +#define RELEASE(m) __cilkrts_xchg(&(m)->lock, 0) +#endif + +void __cilkrts_mutex_init(struct mutex *m) +{ + m->owner = 0; + + // Use a simple assignment so Inspector doesn't bug us about the + // interlocked exchange doing a read of an uninitialized variable. + // By definition there can't be a race when we're initializing the + // lock... 
+ m->lock = 0; +} + +void __cilkrts_mutex_lock(__cilkrts_worker *w, struct mutex *m) +{ + int count; + const int maxspin = 1000; /* SWAG */ + + NOTE_INTERVAL(w, INTERVAL_MUTEX_LOCK); + if (!TRY_ACQUIRE(m)) { + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + count = 0; + do { + do { + __cilkrts_short_pause(); + if (++count >= maxspin) { + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_YIELDING); + /* let the OS reschedule every once in a while */ + __cilkrts_yield(); + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_YIELDING); + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + count = 0; + } + } while (m->lock != 0); + } while (!TRY_ACQUIRE(m)); + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + } + + CILK_ASSERT(m->owner == 0); + m->owner = w; +} + +int __cilkrts_mutex_trylock(__cilkrts_worker *w, struct mutex *m) +{ + NOTE_INTERVAL(w, INTERVAL_MUTEX_TRYLOCK); + if (TRY_ACQUIRE(m)) { + CILK_ASSERT(m->owner == 0); + m->owner = w; + return 1; + } else { + return 0; + } +} + +void __cilkrts_mutex_unlock(__cilkrts_worker *w, struct mutex *m) +{ + CILK_ASSERT(m->owner == w); + m->owner = 0; + RELEASE(m); +} + +void __cilkrts_mutex_destroy(__cilkrts_worker *w, struct mutex *m) +{ + (void)w; /* unused */ + (void)m; /* unused */ +} + +/* End worker_mutex.c */ diff --git a/libcilkrts/runtime/worker_mutex.h b/libcilkrts/runtime/worker_mutex.h new file mode 100644 index 00000000000..c2c68247e0b --- /dev/null +++ b/libcilkrts/runtime/worker_mutex.h @@ -0,0 +1,131 @@ +/* worker_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file worker_mutex.h + * + * @brief Support for Cilk runtime mutexes. + * + * Cilk runtime mutexes are implemented as simple spin loops. 
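+ *
+ * To acquire a mutex, a worker spins on the lock word with
+ * __cilkrts_short_pause() and, after a bounded number of unsuccessful
+ * iterations, calls __cilkrts_yield() so the OS can reschedule before
+ * it resumes spinning (see __cilkrts_mutex_lock() in worker_mutex.c).
+ * The owning worker is recorded in the mutex so the CILK_ASSERT checks
+ * in worker_mutex.c can verify ownership on unlock.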
+ */ + +#ifndef INCLUDED_WORKER_MUTEX_DOT_H +#define INCLUDED_WORKER_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Mutexes are treated as an abstract data type within the Cilk + * runtime system. They are implemented as simple spin loops and + * owned by a __cilkrts_worker. + */ +typedef struct mutex { + /** Mutex spin loop variable. 0 if unowned, 1 if owned. */ + volatile int lock; + + /** Worker that owns the mutex. Must be 0 if mutex is unowned. */ + __cilkrts_worker *owner; +} mutex; + +/** + * @brief Initialize a Cilk mutex. + * + * @param m Mutex to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_mutex_init(struct mutex *m); + +/** + * @brief Acquire a Cilk mutex. + * + * If statistics are being gathered, the time spent + * acquiring the mutex will be attributed to the specified worker. + * + * @param w Worker that will become the owner of this mutex. + * @param m Mutex to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_mutex_lock(__cilkrts_worker *w, + struct mutex *m); +/** + * @brief Attempt to lock a Cilk mutex and fail if it isn't available. + * + * If statistics are being gathered, the time spent acquiring the + * mutex will be attributed to the specified worker. + * + * @param w Worker that will become the owner of this mutex. + * @param m Mutex to be acquired. + * + * @return 1 if the mutex was acquired. + * @return 0 if the mutex was not acquired. + */ +COMMON_PORTABLE +int __cilkrts_mutex_trylock(__cilkrts_worker *w, + struct mutex *m); + +/** + * @brief Release a Cilk mutex. + * + * If statistics are being gathered, the time spent + * acquiring the mutex will be attributed to the specified worker. + * + * @pre The mutex must be owned by the worker. + * + * @param w Worker that owns this mutex. + * @param m Mutex to be released. + */ +COMMON_PORTABLE +void __cilkrts_mutex_unlock(__cilkrts_worker *w, + struct mutex *m); + +/** + * @brief Deallocate a Cilk mutex. Currently does nothing. + * + * @param w Unused. + * @param m Mutex to be deallocated. + */ +COMMON_PORTABLE +void __cilkrts_mutex_destroy(__cilkrts_worker *w, + struct mutex *m); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_WORKER_MUTEX_DOT_H) |
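
The worker_mutex.h header above documents the mutex API, but the patch itself contains no standalone usage example. As a minimal sketch (not part of the patch; the worker pointer w, the guard mutex, and the protected counter are placeholder names), runtime code running on a __cilkrts_worker would use the API roughly like this:

    #include "worker_mutex.h"

    static struct mutex guard;     /* protects shared_count (hypothetical) */
    static long shared_count;

    /* One-time setup: the lock starts unowned and unlocked. */
    void guard_init(void)
    {
        __cilkrts_mutex_init(&guard);
    }

    /* Blocking update: spins (and eventually yields) until the lock is held. */
    void bump(__cilkrts_worker *w)
    {
        __cilkrts_mutex_lock(w, &guard);
        ++shared_count;
        __cilkrts_mutex_unlock(w, &guard);   /* precondition: w owns guard */
    }

    /* Non-blocking update: returns 0 without touching the counter if
       another worker currently holds the lock. */
    int try_bump(__cilkrts_worker *w)
    {
        if (!__cilkrts_mutex_trylock(w, &guard))
            return 0;
        ++shared_count;
        __cilkrts_mutex_unlock(w, &guard);
        return 1;
    }

Because the lock is a simple spin loop rather than an OS mutex, critical sections guarded this way are expected to be short; the owner field exists only so the CILK_ASSERT checks in worker_mutex.c can catch a worker unlocking a mutex it does not own, and so lock-acquisition time can be attributed to that worker when statistics are enabled.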