| author | bviyer <bviyer@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-10-29 18:37:47 +0000 |
|---|---|---|
| committer | bviyer <bviyer@138bc75d-0d04-0410-961f-82ee72b054a4> | 2013-10-29 18:37:47 +0000 |
| commit | 4710dd5101f8103638ffe082a220f701f592df36 (patch) | |
| tree | 235d812c6202e962d45c0cce844b2afcc5a0596d /libcilkrts/runtime | |
| parent | d037099fed7476ffedb6784a1f544132f258d792 (diff) | |
| download | gcc-4710dd5101f8103638ffe082a220f701f592df36.tar.gz | |
Added Cilk runtime library (libcilkrts) into GCC.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@204173 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libcilkrts/runtime')
67 files changed, 23137 insertions, 0 deletions
diff --git a/libcilkrts/runtime/acknowledgements.dox b/libcilkrts/runtime/acknowledgements.dox
new file mode 100644
index 00000000000..79b5d876f33
--- /dev/null
+++ b/libcilkrts/runtime/acknowledgements.dox
@@ -0,0 +1,51 @@
+/* acknowledgements.dox
+ *
+ *************************************************************************
+ *
+ * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/
+
+/*
+ * This file contains acknowledgements of community contributions to the
+ * Cilk Plus runtime.
+ */
+
+/**
+ * @mainpage
+ *
+ * @section Acknowledgements Acknowledgements
+ *
+ * Modifications to build the Cilk Plus runtime for VxWorks provided by
+ * Brian Kuhl of Wind River.
+ */
diff --git a/libcilkrts/runtime/bug.cpp b/libcilkrts/runtime/bug.cpp new file mode 100644 index 00000000000..dbdf1fd3216 --- /dev/null +++ b/libcilkrts/runtime/bug.cpp @@ -0,0 +1,139 @@ +/* bug.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "bug.h" + +#include <exception> +#include <stdio.h> +#include <stdarg.h> +#include <stdlib.h> +#ifdef _WIN32 +# include "windows-clean.h" +# include "internal/abi.h" +# include "cilktools/cilkscreen.h" +# include <crtdbg.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +COMMON_PORTABLE const char *const __cilkrts_assertion_failed = + "%s:%d: cilk assertion failed: %s\n"; + +COMMON_PORTABLE void __cilkrts_bug(const char *fmt,...) cilk_nothrow +{ +#if defined (_WIN32) && defined(_DEBUG) + _CRTIMP void __cdecl _wassert(__in_z const wchar_t * _Message, + __in_z const wchar_t *_File, + __in unsigned _Line); + char message[256]; + wchar_t wmessage[256]; + va_list l; + va_start(l, fmt); + _vsnprintf_s(message, 256, _TRUNCATE, fmt, l); + va_end(l); + _snwprintf_s(wmessage, 256, _TRUNCATE, _CRT_WIDE("%S"), + message); /* widen */ + + // Force asserts to go to stderr and the debugger. This isn't polite, but + // we're about to kill the app anyway and it will prevent our tests from + // hanging + _CrtSetReportMode(_CRT_ASSERT, _CRTDBG_MODE_FILE| _CRTDBG_MODE_DEBUG); + _CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR); + + _wassert(wmessage, _CRT_WIDE(__FILE__), __LINE__); + + // If there's a debugger attached, give it a chance to look at the failure + if (IsDebuggerPresent()) + DebugBreak(); + + abort(); +/* __asm int 3 */ +#else + /* To reduce user confusion, write all user-generated output + before the system-generated error message. 
*/ + va_list l; + fflush(NULL); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fflush(stderr); + +#ifndef _WIN32 + abort(); +#endif + +#endif + + exit(1); +} + +COMMON_PORTABLE void cilkbug_assert_no_uncaught_exception(void) +{ + bool uncaught = std::uncaught_exception(); + CILK_ASSERT(!uncaught); +} + +COMMON_SYSDEP void abort_because_rts_is_corrupted(void) +{ + __cilkrts_bug("The Cilk Plus runtime system detected a corruption " + "in its data structures. This is most likely caused " + "by an application bug. Aborting execution.\n"); +} + +#ifdef WIN32 +COMMON_SYSDEP void __cilkrts_dbgprintf(const char *fmt,...) +{ + char message[2048]; + va_list l; + + // Cilkscreen shouldn't watch this + __cilkscreen_disable_checking(); + + va_start(l, fmt); + _vsnprintf_s(message, 2048, _TRUNCATE, fmt, l); + va_end(l); + OutputDebugStringA (message); + + // Re-enable Cilkscreen + __cilkscreen_enable_checking(); +} +#endif + +__CILKRTS_END_EXTERN_C + +/* End bug.cpp */ diff --git a/libcilkrts/runtime/bug.h b/libcilkrts/runtime/bug.h new file mode 100644 index 00000000000..bb18913787d --- /dev/null +++ b/libcilkrts/runtime/bug.h @@ -0,0 +1,141 @@ +/* bug.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file bug.h + * + * @brief Support for reporting bugs and debugging. + */ + +#ifndef INCLUDED_BUG_DOT_H +#define INCLUDED_BUG_DOT_H + +#include "rts-common.h" +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Flush all output, write error message to stderr and abort the execution. + * On Windows the error is also written to the debugger. + * + * @param fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +COMMON_PORTABLE NORETURN __cilkrts_bug(const char *fmt,...) 
cilk_nothrow; + +#ifndef CILK_ASSERT + +/** Standard text for failed assertion */ +COMMON_PORTABLE extern const char *const __cilkrts_assertion_failed; + +/** + * Macro to assert an invariant that must be true. If the statement evalutes + * to false, __cilkrts_bug will be called to report the failure and terminate + * the application. + */ +#define CILK_ASSERT(ex) \ + (__builtin_expect((ex) != 0, 1) ? (void)0 : \ + __cilkrts_bug(__cilkrts_assertion_failed, __FILE__, __LINE__, #ex)) + +#define CILK_ASSERT_MSG(ex, msg) \ + (__builtin_expect((ex) != 0, 1) ? (void)0 : \ + __cilkrts_bug(__cilkrts_assertion_failed, __FILE__, __LINE__, \ + #ex "\n " msg)) +#endif // CILK_ASSERT + +/** + * Assert that there is no uncaught exception. + * + * Not valid on Windows or Android. + * + * On Android, calling std::uncaught_exception with the stlport library causes + * a seg fault. Since we're not supporting exceptions there at this point, + * just don't do the check. It works with the GNU STL library, but that's + * GPL V3 licensed. + */ +COMMON_PORTABLE void cilkbug_assert_no_uncaught_exception(void); +#if defined(_WIN32) || defined(ANDROID) +# define CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION() +#else +# define CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION() \ + cilkbug_assert_no_uncaught_exception() +#endif + + +/** + * Call __cilkrts_bug with a standard message that the runtime state is + * corrupted and the application is being terminated. + */ +COMMON_SYSDEP void abort_because_rts_is_corrupted(void); + +// Debugging aids +#ifndef _DEBUG +# define DBGPRINTF(_fmt, ...) +#elif defined(_WIN32) + +/** + * Write debugging output. On windows this is written to the debugger. + * + * @param fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +COMMON_SYSDEP void __cilkrts_dbgprintf(const char *fmt,...) cilk_nothrow; + +/** + * Macro to write debugging output which will be elided if this is not a + * debug build. The macro is currently always elided on non-Windows builds. + * + * @param _fmt printf-style format string. Any remaining parameters will be + * be interpreted based on the format string text. + */ +# define DBGPRINTF(_fmt, ...) __cilkrts_dbgprintf(_fmt, __VA_ARGS__) + +#else /* if _DEBUG && !_WIN32 */ + /* Non-Windows debug logging. Someday we should make GetCurrentFiber() + * and GetWorkerFiber() do something. + */ +# include <stdio.h> + __CILKRTS_INLINE void* GetCurrentFiber() { return 0; } + __CILKRTS_INLINE void* GetWorkerFiber(__cilkrts_worker* w) { return 0; } +# define DBGPRINTF(_fmt, ...) fprintf(stderr, _fmt, __VA_ARGS__) +#endif // _DEBUG + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_BUG_DOT_H) diff --git a/libcilkrts/runtime/c_reducers.c b/libcilkrts/runtime/c_reducers.c new file mode 100644 index 00000000000..52615e93f43 --- /dev/null +++ b/libcilkrts/runtime/c_reducers.c @@ -0,0 +1,57 @@ +/* c_reducers.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* Implementation of C reducers */ + +// Disable warning about integer conversions losing significant bits. +// The code is correct as is. +#ifdef __INTEL_COMPILER +#pragma warning(disable:2259) +#endif + +#define CILK_C_DEFINE_REDUCERS + +#include <cilk/reducer_opadd.h> +#include <cilk/reducer_opand.h> +#include <cilk/reducer_opmul.h> +#include <cilk/reducer_opor.h> +#include <cilk/reducer_opxor.h> +#include <cilk/reducer_min_max.h> + +/* End reducer_opadd.c */ diff --git a/libcilkrts/runtime/cilk-abi-cilk-for.cpp b/libcilkrts/runtime/cilk-abi-cilk-for.cpp new file mode 100644 index 00000000000..4fa6dcec82a --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-cilk-for.cpp @@ -0,0 +1,406 @@ +/* cilk-abi-cilk-for.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2011, 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
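Editor's note: c_reducers.c above relies on the define-one-macro-then-include idiom — defining CILK_C_DEFINE_REDUCERS before including the reducer headers appears to make this single translation unit emit the out-of-line reducer definitions, while every other includer sees only declarations. Below is a minimal sketch of that idiom with hypothetical names (my_ops.h, MY_OPS_DEFINE, my_add); it is not the contents of the actual Cilk headers.

```c
/* my_ops.h -- hypothetical header: declarations for everyone,
 * definitions only where MY_OPS_DEFINE was set before inclusion. */
#ifndef MY_OPS_H
#define MY_OPS_H

int my_add(int a, int b);                        /* seen by every includer */

#ifdef MY_OPS_DEFINE
int my_add(int a, int b) { return a + b; }       /* emitted exactly once   */
#endif

#endif /* MY_OPS_H */

/* my_ops.c -- the single translation unit that emits the definitions,
 * analogous to what c_reducers.c does with CILK_C_DEFINE_REDUCERS. */
#define MY_OPS_DEFINE
#include "my_ops.h"
```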
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* Implementation of cilk_for ABI. + * + * This file must be C++, not C, in order to handle C++ exceptions correctly + * from within the body of the cilk_for loop + */ + +#include "internal/abi.h" +#include "metacall_impl.h" +#include "global_state.h" + +// Icky macros to determine if we're compiled with optimization. Based on +// the declaration of __CILKRTS_ASSERT in common.h +#if defined(_WIN32) +# if defined (_DEBUG) +# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od +# else +# define CILKRTS_OPTIMIZED 1 +# endif // defined(_DEBUG) +#else +# if defined(__OPTIMIZE__) +# define CILKRTS_OPTIMIZED 1 +# else +# define CILKRTS_OPTIMIZED 0 +# endif +#endif + +template <typename count_t> +static inline int grainsize(int req, count_t count) +{ + // A positive requested grain size comes from the user. A very high grain + // size risks losing parallelism, but the user told us what they want for + // grainsize. Who are we to argue? + if (req > 0) + return req; + + // At present, a negative requested grain size is treated the same way as + // a zero grain size, i.e., the runtime computes the actual grainsize + // using a hueristic. In the future, the compiler may give us additional + // information about the size of the cilk_for body by passing a negative + // grain size. + + // Avoid generating a zero grainsize, even for empty loops. + if (count < 1) + return 1; + + global_state_t* g = cilkg_get_global_state(); + if (g->under_ptool) + { + // Grainsize = 1, when running under PIN, and when the grainsize has + // not explicitly been set by the user. + return 1; + } + else + { + // Divide loop count by 8 times the worker count and round up. + const int Px8 = g->P * 8; + count_t n = (count + Px8 - 1) / Px8; + + // 2K should be enough to amortize the cost of the cilk_for. Any + // larger grainsize risks losing parallelism. + if (n > 2048) + return 2048; + return (int) n; // n <= 2048, so no loss of precision on cast to int + } +} + +/* + * call_cilk_for_loop_body + * + * Centralizes the code to call the loop body. 
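Editor's note: for reference, here is a standalone sketch of the grainsize heuristic the comments above describe — honor a positive user request, otherwise divide the trip count by 8×P and clamp to [1, 2048]. The constants 8 and 2048 come from the source; the free-standing function and its parameters are ours, not part of the runtime's API.

```c
#include <stdint.h>

/* Sketch: grain = min(2048, ceil(count / (8 * workers))), but never 0. */
static int sketch_grainsize(int requested, uint64_t count, int workers)
{
    if (requested > 0)                    /* explicit user grainsize wins  */
        return requested;
    if (count < 1)                        /* avoid a zero grain on empty loops */
        return 1;
    uint64_t px8 = (uint64_t)workers * 8;
    uint64_t n = (count + px8 - 1) / px8; /* round up                      */
    return n > 2048 ? 2048 : (int)n;
}
```

For example, with 8 workers a trip count of 10,000 gives ceil(10000/64) = 157, while a trip count of 1,000,000 hits the 2048 cap.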
The compiler should be + * inlining this code + * + * low - Low loop index we're considering in this portion of the algorithm + * high - High loop index we're considering in this portion of the algorithm + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * w - __cilkrts_worker we're currently executing on + * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of + * the cilk_for loop to flatten out the internal nodes + */ +template <typename count_t, typename F> +inline static +void call_cilk_for_loop_body(count_t low, count_t high, + F body, void *data, + __cilkrts_worker *w, + __cilkrts_pedigree *loop_root_pedigree) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // The worker is only valid until the first spawn. Fetch the + // __cilkrts_stack_frame out of the worker, since it will be stable across + // steals. The sf pointer actually points to the *parent's* + // __cilkrts_stack_frame, since this function is a non-spawning function + // and therefore has no cilk stack frame of its own. + __cilkrts_stack_frame *sf = w->current_stack_frame; + + // Save the pedigree node pointed to by the worker. We'll need to restore + // that when we exit since the spawn helpers in the cilk_for call tree + // will assume that it's valid + const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent; + + // Add the leaf pedigree node to the chain. The parent is the root node + // to flatten the tree regardless of the DAG branches in the cilk_for + // divide-and-conquer recursion. + // + // The rank is initialized to the low index. The user is + // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for + // loop body. + __cilkrts_pedigree loop_leaf_pedigree; + + loop_leaf_pedigree.rank = (uint64_t)low; + loop_leaf_pedigree.parent = loop_root_pedigree; + + // The worker's pedigree always starts with a rank of 0 + w->pedigree.rank = 0; + w->pedigree.parent = &loop_leaf_pedigree; + + // Call the compiler generated cilk_for loop body lambda function + body(data, low, high); + + // The loop body may have included spawns, so we must refetch the worker + // from the __cilkrts_stack_frame, which is stable regardless of which + // worker we're executing on. + w = sf->worker; + + // Restore the pedigree chain. It must be valid because the spawn helpers + // generated by the cilk_for implementation will access it. + w->pedigree.parent = saved_next_pedigree_node; +} + +/* capture_spawn_arg_stack_frame + * + * Efficiently get the address of the caller's __cilkrts_stack_frame. The + * preconditons are that 'w' is the worker at the time of the call and + * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the + * spawn helper. This function should be called only within the argument list + * of a function that is being spawned because that is the only situation in + * which these preconditions hold. This function returns the worker + * (unchanged) after storing the captured stack frame pointer is stored in the + * sf argument. + * + * The purpose of this function is to get the caller's stack frame in a + * context where the caller's worker is known but its stack frame is not + * necessarily initialized. The "shrink wrap" optimization delays + * initializing the contents of a spawning function's '__cilkrts_stack_frame' + * as well as the 'current_stack_frame' pointer within the worker. 
By calling + * this function within a spawning function's argument list, we can ensure + * that these initializations have occured but that a detach (which would + * invalidate the worker pointer in the caller) has not yet occured. Once the + * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the + * remainder of the caller's execution, and becomes an efficient way to get + * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'), + * even after a spawn or sync. + */ +inline __cilkrts_worker* +capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w) +{ + // Get current stack frame + sf = w->current_stack_frame; +#ifdef __INTEL_COMPILER +# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101 + // In older compilers 'w->current_stack_frame' points to the + // spawn-helper's stack frame. In newer compiler's however, it points + // directly to the pointer's stack frame. (This change was made to avoid + // having the spawn helper in the frame list when evaluating function + // arguments, thus avoiding corruption when those arguments themselves + // contain cilk_spawns.) + + // w->current_stack_frame is the spawn helper's stack frame. + // w->current_stack_frame->call_parent is the caller's stack frame. + sf = sf->call_parent; +# endif +#endif + return w; +} + +/* + * cilk_for_recursive + * + * Templatized function to implement the recursive divide-and-conquer + * algorithm that's how we implement a cilk_for. + * + * low - Low loop index we're considering in this portion of the algorithm + * high - High loop index we're considering in this portion of the algorithm + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * grain - grain size (0 if it should be computed) + * w - __cilkrts_worker we're currently executing on + * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of + * the cilk_for loop to flatten out the internal nodes + */ +template <typename count_t, typename F> +static +void cilk_for_recursive(count_t low, count_t high, + F body, void *data, int grain, + __cilkrts_worker *w, + __cilkrts_pedigree *loop_root_pedigree) +{ +tail_recurse: + // Cilkscreen should not report this call in a stack trace + // This needs to be done everytime the worker resumes + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + count_t count = high - low; + // Invariant: count > 0, grain >= 1 + if (count > grain) + { + // Invariant: count >= 2 + count_t mid = low + count / 2; + // The worker is valid only until the first spawn and is expensive to + // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The + // '__cilkrts_stack_frame' is more stable, but isn't initialized until + // the first spawn. Thus, we want to grab the address of the + // '__cilkrts_stack_frame' after it is initialized but before the + // spawn detaches. The only place we can do that is within the + // argument list of the spawned function, hence the call to + // capture_spawn_arg_stack_frame(). 
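Editor's note: stripped of the worker and pedigree bookkeeping, the control flow that cilk_for_recursive implements below is plain divide-and-conquer over the index range — spawn the lower half, loop (rather than recurse) on the upper half, and run the body once the range is no larger than the grain. A sequential sketch of that shape, with hypothetical names and no spawns; it is not the runtime's code.

```c
/* Sequential sketch of the range-splitting shape; 'body' runs one grain. */
typedef void (*loop_body_t)(void *data, unsigned long low, unsigned long high);

static void split_range(unsigned long low, unsigned long high,
                        loop_body_t body, void *data, int grain)
{
    while (high - low > (unsigned long)grain) {
        unsigned long mid = low + (high - low) / 2;
        split_range(low, mid, body, data, grain); /* the runtime spawns this half */
        low = mid;                                /* then iterates on the rest    */
    }
    body(data, low, high);                        /* one grain executed directly  */
}
```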
+ __cilkrts_stack_frame *sf; + _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, + capture_spawn_arg_stack_frame(sf, w), + loop_root_pedigree); + w = sf->worker; + low = mid; + + goto tail_recurse; + } + + // Call the cilk_for loop body lambda function passed in by the compiler to + // execute one grain + call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree); +} + +static void noop() { } + +/* + * cilk_for_root + * + * Templatized function to implement the top level of a cilk_for loop. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ +template <typename count_t, typename F> +static void cilk_for_root(F body, void *data, count_t count, int grain) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // Pedigree computation: + // + // If the last pedigree node on entry to the _Cilk_for has value X, + // then at the start of each iteration of the loop body, the value of + // the last pedigree node should be 0, the value of the second-to-last + // node should equal the loop counter, and the value of the + // third-to-last node should be X. On return from the _Cilk_for, the + // value of the last pedigree should be incremented to X+2. The + // pedigree within the loop is thus flattened, such that the depth of + // recursion does not affect the results either inside or outside of + // the loop. Note that the pedigree after the loop exists is the same + // as if a single spawn and sync were executed within this function. + + // TBD: Since the shrink-wrap optimization was turned on in the compiler, + // it is not possible to get the current stack frame without actually + // forcing a call to bind-thread. This spurious spawn is a temporary + // stopgap until the correct intrinsics are added to give us total control + // over frame initialization. + _Cilk_spawn noop(); + + // Fetch the current worker. From that we can get the current stack frame + // which will be constant even if we're stolen + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_stack_frame *sf = w->current_stack_frame; + + // Decrement the rank by one to undo the pedigree change from the + // _Cilk_spawn + --w->pedigree.rank; + + // Save the current worker pedigree into loop_root_pedigree, which will be + // the root node for our flattened pedigree. + __cilkrts_pedigree loop_root_pedigree = w->pedigree; + + // Don't splice the loop_root node in yet. It will be done when we + // call the loop body lambda function +// w->pedigree.rank = 0; +// w->pedigree.next = &loop_root_pedigree; + + /* Spawn is necessary at top-level to force runtime to start up. + * Runtime must be started in order to call the grainsize() function. + */ + int gs = grainsize(grain, count); + cilk_for_recursive((count_t) 0, count, body, data, gs, w, + &loop_root_pedigree); + + // Need to refetch the worker after calling a spawning function. + w = sf->worker; + + // Restore the pedigree in the worker. + w->pedigree = loop_root_pedigree; + + // Bump the worker pedigree. + ++w->pedigree.rank; + + // Implicit sync will increment the pedigree leaf rank again, for a total + // of two increments. 
If the noop spawn above is removed, then we'll need + // to re-enable the following code: +// // If this is an optimized build, then the compiler will have optimized +// // out the increment of the worker's pedigree in the implied sync. We +// // need to add one to make the pedigree_loop test work correctly. +// #if CILKRTS_OPTIMIZED +// ++sf->worker->pedigree.rank; +// #endif +} + +// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and +// __cilkrts_cilk_for_64. +extern "C" { + +/* + * __cilkrts_cilk_for_32 + * + * Implementation of cilk_for for 32-bit trip counts (regardless of processor + * word size). Assumes that the range is 0 - count. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ + +CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data, + cilk32_t count, int grain) +{ + // Cilkscreen should not report this call in a stack trace + NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0); + + // Check for an empty range here as an optimization - don't need to do any + // __cilkrts_stack_frame initialization + if (count > 0) + cilk_for_root(body, data, count, grain); +} + +/* + * __cilkrts_cilk_for_64 + * + * Implementation of cilk_for for 64-bit trip counts (regardless of processor + * word size). Assumes that the range is 0 - count. + * + * body - lambda function for the cilk_for loop body + * data - data used by the lambda function + * count - trip count for loop + * grain - grain size (0 if it should be computed) + */ +CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data, + cilk64_t count, int grain) +{ + // Check for an empty range here as an optimization - don't need to do any + // __cilkrts_stack_frame initialization + if (count > 0) + cilk_for_root(body, data, count, grain); +} + +} // end extern "C" + +/* End cilk-abi-cilk-for.cpp */ diff --git a/libcilkrts/runtime/cilk-abi-vla-internal.c b/libcilkrts/runtime/cilk-abi-vla-internal.c new file mode 100644 index 00000000000..6fb92677ad0 --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-vla-internal.c @@ -0,0 +1,83 @@ +/* cilk-abi-vla-internal.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * These functions are provided in their own compilation unit so I can debug + * them. cilk-abi-vla.c must always be compiled with optimization on so that + * inlining occurs. + */ + +#include "internal/abi.h" +#include "cilk-abi-vla-internal.h" +#include "bug.h" +#include "full_frame.h" +#include "local_state.h" + +#include <stdlib.h> +#include <stdint.h> + +#include "bug.h" + +void *vla_internal_heap_alloc(__cilkrts_stack_frame *sf, + size_t full_size, + uint32_t align) +{ + return malloc(full_size); +} + +void vla_internal_heap_free(void *t, size_t size) +{ + free(t); +} + +void vla_free_from_original_stack(__cilkrts_stack_frame *sf, + size_t full_size) +{ + // The __cilkrts_stack_frame must be initialized + CILK_ASSERT(sf->worker); + +#if 1 + // Add full_size to ff->sync_sp so that when we return, the VLA will no + // longer be allocated on the stack + __cilkrts_adjust_stack(sf->worker->l->frame_ff, full_size); +#else + // Inline __cilkrts_adjust_stack for Kevin + full_frame *ff = sf->worker->l->frame_ff; + ff->sync_sp = ff->sync_sp + full_size; +#endif +} diff --git a/libcilkrts/runtime/cilk-abi-vla-internal.h b/libcilkrts/runtime/cilk-abi-vla-internal.h new file mode 100644 index 00000000000..909f08fa471 --- /dev/null +++ b/libcilkrts/runtime/cilk-abi-vla-internal.h @@ -0,0 +1,90 @@ +/* cilk-abi-vla-internal.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk-abi-vla-internal.h + * + * @brief Allocation/deallocation function for use with Variable Length + * Arrays in spawning functions. + * + * These should be the only functions in the Cilk runtime allocating memory + * from the standard C runtime heap. This memory will be provided to user + * code for use in VLAs, when the memory cannot be allocated from the stack. + * + * While these functions are simply passthroughs to malloc and free at the + * moment, once we've got the basics of VLA allocations working we'll make + * them do fancier tricks. + */ + +/** + * @brief Allocate memory from the heap for use by a Variable Length Array in + * a spawning function. + * + * @param sf The __cilkrts_stack_frame for the spawning function containing + * the VLA. + * @param full_size The number of bytes to be allocated, including any tags + * needed to identify this as allocated from the heap. + * @param align Any alignment necessary for the allocation. + */ + +void *vla_internal_heap_alloc(__cilkrts_stack_frame *sf, + size_t full_size, + uint32_t align); + +/** + * @brief Deallocate memory from the heap used by a Variable Length Array in + * a spawning function. + * + * @param t The address of the memory block to be freed. + * @param size The size of the memory block to be freed. + */ + +void vla_internal_heap_free(void *t, + size_t size); + +/** + * @brief Deallocate memory from the original stack. We'll do this by adding + * full_size to ff->sync_sp. So after the sync, the Variable Length Array + * will no longer be allocated on the stack. + * + * @param sf The __cilkrts_stack_frame for the spawning function that is + * deallocating a VLA. + * @param full_size The size of the VLA, including any alignment and tags. + */ +void vla_free_from_original_stack(__cilkrts_stack_frame *sf, + size_t full_size); diff --git a/libcilkrts/runtime/cilk-abi.c b/libcilkrts/runtime/cilk-abi.c new file mode 100644 index 00000000000..1da05239ebc --- /dev/null +++ b/libcilkrts/runtime/cilk-abi.c @@ -0,0 +1,733 @@ +/* Cilk_abi.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file cilk-abi.c + * + * @brief cilk-abi.c implements all of the entrypoints to the Intel Cilk + * Plus runtime. + */ + +/* + * Define this macro so that compiliation of this file generates the + * non-inlined versions of certain functions in cilk_api.h. + */ +#include "internal/abi.h" +#include "cilk/cilk_api.h" +#include "cilk/cilk_undocumented.h" +#include "cilktools/cilkscreen.h" + +#include "global_state.h" +#include "os.h" +#include "os_mutex.h" +#include "bug.h" +#include "local_state.h" +#include "full_frame.h" +#include "pedigrees.h" +#include "scheduler.h" +#include "sysdep.h" +#include "except.h" +#include "cilk_malloc.h" +#include "record-replay.h" + +#include <errno.h> +#include <string.h> +#include <stdlib.h> + +#ifdef _MSC_VER +/* Some versions of icc don't support limits.h on Linux if + gcc 4.3 or newer is installed. */ +#include <limits.h> + +/* Declare _ReturnAddress compiler intrinsic */ +void * _ReturnAddress(void); +#pragma intrinsic(_ReturnAddress) + +#include "sysdep-win.h" // Needed for sysdep_init_module() +#endif /* _WIN32 */ + +#include "metacall_impl.h" +#include "reducer_impl.h" +#include "cilk-ittnotify.h" +#include "cilk-tbb-interop.h" + +#define TBB_INTEROP_DATA_DELAYED_UNTIL_BIND (void *)-1 + +/** + * __cilkrts_bind_thread is a versioned entrypoint. The runtime should be + * exporting copies of __cilkrts_bind_version for the current and all previous + * versions of the ABI. + * + * This macro should always be set to generate a version to match the current + * version; __CILKRTS_ABI_VERSION. 
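Editor's note: the versioned-entrypoint scheme described in this file keeps the real definition under the newest name and exports the older names as aliases of it; on Linux/BSD this uses GCC's alias attribute (the ALIASED_NAME macro later in cilk-abi.c), while the source notes that Mach-O lacks aliases, so macOS falls back to a forwarding call. A minimal sketch of the mechanism with made-up function names, not the runtime's real symbols:

```c
/* The real, current definition carries the versioned name. */
int do_work_v2(void) { return 2; }

/* Older name kept for binaries built against the previous ABI;
 * on ELF targets it is simply another label for do_work_v2's code. */
int do_work(void) __attribute__((alias("do_work_v2")));
```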
+ */ +#define BIND_THREAD_RTN __cilkrts_bind_thread_1 + +static inline +void enter_frame_internal(__cilkrts_stack_frame *sf, uint32_t version) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w == 0) { /* slow path */ + w = BIND_THREAD_RTN(); + + sf->flags = CILK_FRAME_LAST | (version << 24); + CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == CILK_FRAME_LAST); + } else { + sf->flags = (version << 24); + CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == 0); + } + sf->call_parent = w->current_stack_frame; + sf->worker = w; + w->current_stack_frame = sf; +} + +CILK_ABI_VOID __cilkrts_enter_frame(__cilkrts_stack_frame *sf) +{ + enter_frame_internal(sf, 0); +} + +CILK_ABI_VOID __cilkrts_enter_frame_1(__cilkrts_stack_frame *sf) +{ + enter_frame_internal(sf, 1); + sf->reserved = 0; +} + +static inline +void enter_frame_fast_internal(__cilkrts_stack_frame *sf, uint32_t version) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker_fast(); + sf->flags = version << 24; + sf->call_parent = w->current_stack_frame; + sf->worker = w; + w->current_stack_frame = sf; +} + +CILK_ABI_VOID __cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf) +{ + enter_frame_fast_internal(sf, 0); +} + +CILK_ABI_VOID __cilkrts_enter_frame_fast_1(__cilkrts_stack_frame *sf) +{ + enter_frame_fast_internal(sf, 1); + sf->reserved = 0; +} + +/** + * A component of the THE protocol. __cilkrts_undo_detach checks whether + * this frame's parent has been stolen. If it hasn't, the frame can return + * normally. If the parent has been stolen, of if we suspect it might be, + * then __cilkrts_leave_frame() needs to call into the runtime. + * + * @note __cilkrts_undo_detach() is comparing the exception pointer against + * the tail pointer. The exception pointer is modified when another worker + * is considering whether it can steal a frame. The head pointer is updated + * to match when the worker lock is taken out and the thief is sure that + * it can complete the steal. If the steal cannot be completed, the thief + * will restore the exception pointer. + * + * @return true if undo-detach failed. + */ +static int __cilkrts_undo_detach(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + __cilkrts_stack_frame *volatile *t = w->tail; + +/* DBGPRINTF("%d - __cilkrts_undo_detach - sf %p\n", w->self, sf); */ + + --t; + w->tail = t; + /* On x86 the __sync_fetch_and_<op> family includes a + full memory barrier. In theory the sequence in the + second branch of the #if should be faster, but on + most x86 it is not. */ +#if defined __i386__ || defined __x86_64__ + __sync_fetch_and_and(&sf->flags, ~CILK_FRAME_DETACHED); +#else + __cilkrts_fence(); /* membar #StoreLoad */ + sf->flags &= ~CILK_FRAME_DETACHED; +#endif + + return __builtin_expect(t < w->exc, 0); +} + +CILK_ABI_VOID __cilkrts_leave_frame(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + +/* DBGPRINTF("%d-%p __cilkrts_leave_frame - sf %p, flags: %x\n", w->self, GetWorkerFiber(w), sf, sf->flags); */ + +#ifdef _WIN32 + /* if leave frame was called from our unwind handler, leave_frame should + proceed no further. */ + if (sf->flags & CILK_FRAME_UNWINDING) + { +/* DBGPRINTF("%d - __cilkrts_leave_frame - aborting due to UNWINDING flag\n", w->self); */ + + // If this is the frame of a spawn helper (indicated by the + // CILK_FRAME_DETACHED flag) we must update the pedigree. The pedigree + // points to nodes allocated on the stack. Failing to update it will + // result in a accvio/segfault if the pedigree is walked. 
This must happen + // for all spawn helper frames, even if we're processing an exception + if ((sf->flags & CILK_FRAME_DETACHED)) + { + update_pedigree_on_leave_frame(w, sf); + } + return; + } +#endif + +#if CILK_LIB_DEBUG + /* ensure the caller popped itself */ + CILK_ASSERT(w->current_stack_frame != sf); +#endif + + /* The exiting function should have checked for zero flags, + so there is no check for flags == 0 here. */ + +#if CILK_LIB_DEBUG + if (__builtin_expect(sf->flags & (CILK_FRAME_EXITING|CILK_FRAME_UNSYNCHED), 0)) + __cilkrts_bug("W%u: function exiting with invalid flags %02x\n", + w->self, sf->flags); +#endif + + /* Must return normally if (1) the active function was called + and not spawned, or (2) the parent has never been stolen. */ + if ((sf->flags & CILK_FRAME_DETACHED)) { +/* DBGPRINTF("%d - __cilkrts_leave_frame - CILK_FRAME_DETACHED\n", w->self); */ + +#ifndef _WIN32 + if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) { +// Pedigree will be updated in __cilkrts_leave_frame. We need the +// pedigree before the update for record/replay +// update_pedigree_on_leave_frame(w, sf); + __cilkrts_return_exception(sf); + /* If return_exception returns the caller is attached. + leave_frame is called from a cleanup (destructor) + for the frame object. The caller will reraise the + exception. */ + return; + } +#endif + + // During replay, check whether w was the last worker to continue + replay_wait_for_steal_if_parent_was_stolen(w); + + // Attempt to undo the detach + if (__builtin_expect(__cilkrts_undo_detach(sf), 0)) { + // The update of pedigree for leaving the frame occurs + // inside this call if it does not return. + __cilkrts_c_THE_exception_check(w, sf); + } + + update_pedigree_on_leave_frame(w, sf); + + /* This path is taken when undo-detach wins the race with stealing. + Otherwise this strand terminates and the caller will be resumed + via setjmp at sync. */ + if (__builtin_expect(sf->flags & CILK_FRAME_FLAGS_MASK, 0)) + __cilkrts_bug("W%u: frame won undo-detach race with flags %02x\n", + w->self, sf->flags); + + return; + } + +#if CILK_LIB_DEBUG + sf->flags |= CILK_FRAME_EXITING; +#endif + + if (__builtin_expect(sf->flags & CILK_FRAME_LAST, 0)) + __cilkrts_c_return_from_initial(w); /* does return */ + else if (sf->flags & CILK_FRAME_STOLEN) + __cilkrts_return(w); /* does return */ + +/* DBGPRINTF("%d-%p __cilkrts_leave_frame - returning, StackBase: %p\n", w->self, GetWorkerFiber(w)); */ +} + +/* Caller must have called setjmp. */ +CILK_ABI_VOID __cilkrts_sync(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; +/* DBGPRINTF("%d-%p __cilkrts_sync - sf %p\n", w->self, GetWorkerFiber(w), sf); */ + if (__builtin_expect(!(sf->flags & CILK_FRAME_UNSYNCHED), 0)) + __cilkrts_bug("W%u: double sync %p\n", w->self, sf); +#ifndef _WIN32 + if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) { + __cilkrts_c_sync_except(w, sf); + } +#endif + + __cilkrts_c_sync(w, sf); +} + +/* + * __cilkrts_get_sf + * + * Debugging aid to provide access to the current __cilkrts_stack_frame. + * + * Not documented! + */ + +CILK_API_VOID_PTR +__cilkrts_get_sf(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (0 == w) + return NULL; + + return w->current_stack_frame; +} + +/* Call with global lock held */ +static __cilkrts_worker *find_free_worker(global_state_t *g) +{ + __cilkrts_worker *w = 0; + int i; + + // Scan the non-system workers looking for one which is free so we can + // use it. 
+ for (i = g->P - 1; i < g->total_workers; ++i) { + w = g->workers[i]; + CILK_ASSERT(WORKER_SYSTEM != w->l->type); + if (w->l->type == WORKER_FREE) { + w->l->type = WORKER_USER; + w->l->team = w; + return w; + } + } + + // If we ran out of workers, create a new one. It doesn't actually belong + // to the Cilk global state so nobody will ever try to steal from it. + w = (__cilkrts_worker *)__cilkrts_malloc(sizeof(*w)); + __cilkrts_cilkscreen_ignore_block(w, w+1); + make_worker(g, -1, w); + w->l->type = WORKER_USER; + w->l->team = w; + return w; +} + +/* + * __cilkrts_bind_thread + * + * Exported function to bind a thread to the runtime. + * + * This function name should always have a trailing suffix for the latest ABI + * version. This means that code built with a new compiler will not load + * against an old copy of the runtime. + * + * Symbols for the function called by code compiled with old versions of the + * compiler are created in an OS-specific manner: + * - On Windows the old symbols are defined in the cilk-exports.def linker + * definitions file as aliases of BIND_THREAD_RTN + * - On Linux aliased symbols are created for BIND_THREAD_RTN in this file + * - On MacOS the alternate entrypoints are implemented and simply call + * BIND_THREAD_RTN. + */ +CILK_ABI_WORKER_PTR BIND_THREAD_RTN(void) +{ + __cilkrts_worker *w; + int start_cilkscreen = 0; +#ifdef USE_ITTNOTIFY + static int unique_obj; +#endif + + // Cannot set this pointer until after __cilkrts_init_internal() call: + global_state_t* g; + + ITT_SYNC_CREATE (&unique_obj, "Initialization"); + ITT_SYNC_PREPARE(&unique_obj); + ITT_SYNC_ACQUIRED(&unique_obj); + + + /* 1: Initialize and start the Cilk runtime */ + __cilkrts_init_internal(1); + + /* + * 2: Choose a worker for this thread (fail if none left). The table of + * user workers is protected by the global OS mutex lock. + */ + g = cilkg_get_global_state(); + global_os_mutex_lock(); + if (__builtin_expect(g->work_done, 0)) + __cilkrts_bug("Attempt to enter Cilk while Cilk is shutting down"); + w = find_free_worker(g); + CILK_ASSERT(w); + + __cilkrts_set_tls_worker(w); + __cilkrts_cilkscreen_establish_worker(w); + { + full_frame *ff = __cilkrts_make_full_frame(w, 0); + + ff->fiber_self = cilk_fiber_allocate_from_thread(); + CILK_ASSERT(ff->fiber_self); + + cilk_fiber_set_owner(ff->fiber_self, w); + cilk_fiber_tbb_interop_use_saved_stack_op_info(ff->fiber_self); + + CILK_ASSERT(ff->join_counter == 0); + ff->join_counter = 1; + w->l->frame_ff = ff; + w->reducer_map = __cilkrts_make_reducer_map(w); + __cilkrts_set_leftmost_reducer_map(w->reducer_map, 1); + load_pedigree_leaf_into_user_worker(w); + } + + // Make sure that the head and tail are reset, and saved_protected_tail + // allows all frames to be stolen. + // + // Note that we must NOT check w->exc, since workers that are trying to + // steal from it will be updating w->exc and we don't own the worker lock. + // It's not worth taking out the lock just for an assertion. + CILK_ASSERT(w->head == w->l->ltq); + CILK_ASSERT(w->tail == w->l->ltq); + CILK_ASSERT(w->protected_tail == w->ltq_limit); + + // There may have been an old pending exception which was freed when the + // exception was caught outside of Cilk + w->l->pending_exception = NULL; + + w->reserved = NULL; + + // If we've already created a scheduling fiber for this worker, we'll just + // reuse it. If w->self < 0, it means that this is an ad-hoc user worker + // not known to the global state. 
Thus, we need to create a scheduling + // stack only if we don't already have one and w->self >= 0. + if (NULL == w->l->scheduling_fiber && w->self >= 0) + { + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) { + // Create a scheduling fiber for this worker. + w->l->scheduling_fiber = + cilk_fiber_allocate_from_heap(CILK_SCHEDULING_STACK_SIZE); + cilk_fiber_reset_state(w->l->scheduling_fiber, + scheduler_fiber_proc_for_user_worker); + cilk_fiber_set_owner(w->l->scheduling_fiber, w); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE); + } + + // If the scheduling fiber is NULL, we've either exceeded our quota for + // fibers or workers or we're out of memory, so we should lose parallelism + // by disallowing stealing. + if (NULL == w->l->scheduling_fiber) + __cilkrts_disallow_stealing(w, NULL); + + start_cilkscreen = (0 == w->g->Q); + + if (w->self != -1) { + // w->self != -1, means that w is a normal user worker and must be + // accounted for by the global state since other workers can steal from + // it. + + // w->self == -1, means that w is an overflow worker and was created on + // demand. I.e., it does not need to be accounted for by the global + // state. + + __cilkrts_enter_cilk(w->g); + } + + global_os_mutex_unlock(); + + /* If there's only 1 worker, the counts will be started in + * __cilkrts_scheduler */ + if (g->P > 1) + { + START_INTERVAL(w, INTERVAL_IN_SCHEDULER); + START_INTERVAL(w, INTERVAL_WORKING); + } + + ITT_SYNC_RELEASING(&unique_obj); + + /* Turn on Cilkscreen if this is the first worker. This needs to be done + * when we are NOT holding the os mutex. */ + if (start_cilkscreen) + __cilkrts_cilkscreen_enable_instrumentation(); + + return w; +} + +#ifndef _MSC_VER +/* + * Define old version-specific symbols for binding threads (since they exist in + * all Cilk code). These aliases prohibit newly compiled code from loading an + * old version of the runtime. We can handle old code with a new runtime, but + * new code with an old runtime is verboten! + * + * For Windows, the aliased symbol is exported in cilk-exports.def. + */ +#if defined(_DARWIN_C_SOURCE) || defined(__APPLE__) +/** + * Mac OS X: Unfortunately, Darwin doesn't allow aliasing, so we just make a + * call and hope the optimizer does the right thing. + */ +CILK_ABI_WORKER_PTR __cilkrts_bind_thread (void) { + return BIND_THREAD_RTN(); +} +#else + +/** + * Macro to convert a parameter to a string. Used on Linux or BSD. + */ +#define STRINGIFY(x) #x + +/** + * Macro to generate an __attribute__ for an aliased name + */ +#define ALIASED_NAME(x) __attribute__ ((alias (STRINGIFY(x)))) + +/** + * Linux or BSD: Use the alias attribute to make the labels for the versioned + * functions point to the same place in the code as the original. Using + * the two macros is annoying but required. + */ + +CILK_ABI_WORKER_PTR __cilkrts_bind_thread(void) + ALIASED_NAME(BIND_THREAD_RTN); + +#endif // defined _DARWIN_C_SOURCE || defined __APPLE__ +#endif // !defined _MSC_VER + +CILK_API_SIZET +__cilkrts_get_stack_size(void) { + return cilkg_get_stack_size(); +} + +// Method for debugging. 
+CILK_API_VOID __cilkrts_dump_stats(void) +{ + // While the stats aren't protected by the global OS mutex, the table + // of workers is, so take out the global OS mutex while we're doing this + global_os_mutex_lock(); + if (cilkg_is_published()) { + global_state_t *g = cilkg_get_global_state(); + __cilkrts_dump_stats_to_stderr(g); + } + else { + __cilkrts_bug("Attempting to report Cilk stats before the runtime has started\n"); + } + global_os_mutex_unlock(); +} + +#ifndef _WIN32 +CILK_ABI_THROWS_VOID __cilkrts_rethrow(__cilkrts_stack_frame *sf) +{ + __cilkrts_gcc_rethrow(sf); +} +#endif + +/* + * __cilkrts_unwatch_stack + * + * Callback for TBB to tell us they don't want to watch the stack anymore + */ + +static __cilk_tbb_retcode __cilkrts_unwatch_stack(void *data) +{ + __cilk_tbb_stack_op_thunk o; + + // If the cilk_fiber wasn't available fetch it now + if (TBB_INTEROP_DATA_DELAYED_UNTIL_BIND == data) + { + full_frame *ff; + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (NULL == w) + { + // Free any saved stack op information + cilk_fiber_tbb_interop_free_stack_op_info(); + + return 0; /* Success! */ + } + + __cilkrts_worker_lock(w); + ff = w->l->frame_ff; + __cilkrts_frame_lock(w,ff); + data = ff->fiber_self; + __cilkrts_frame_unlock(w,ff); + __cilkrts_worker_unlock(w); + } + +#if CILK_LIB_DEBUG /* Debug code */ + /* Get current stack */ + full_frame *ff; + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_worker_lock(w); + ff = w->l->frame_ff; + __cilkrts_frame_lock(w,ff); + CILK_ASSERT (data == ff->fiber_self); + __cilkrts_frame_unlock(w,ff); + __cilkrts_worker_unlock(w); +#endif + + /* Clear the callback information */ + o.data = NULL; + o.routine = NULL; + cilk_fiber_set_stack_op((cilk_fiber*)data, o); + + // Note. Do *NOT* free any saved stack information here. If they want to + // free the saved stack op information, they'll do it when the thread is + // unbound + + return 0; /* Success! */ +} + +/* + * __cilkrts_watch_stack + * + * Called by TBB, defined by Cilk. + * + * Requests that Cilk invoke the stack op routine when it orphans a stack. + * Cilk sets *u to a thunk that TBB should call when it is no longer interested + * in watching the stack. + */ + +CILK_API_TBB_RETCODE +__cilkrts_watch_stack(__cilk_tbb_unwatch_thunk *u, + __cilk_tbb_stack_op_thunk o) +{ + cilk_fiber* current_fiber; + __cilkrts_worker *w; + +#ifdef _MSC_VER + // This may be called by TBB *before* the OS has given us our + // initialization call. Make sure the module is initialized. + sysdep_init_module(); +#endif + + // Fetch the __cilkrts_worker bound to this thread + w = __cilkrts_get_tls_worker(); + if (NULL == w) + { + // Save data for later. We'll deal with it when/if this thread binds + // to the runtime + cilk_fiber_tbb_interop_save_stack_op_info(o); + + u->routine = __cilkrts_unwatch_stack; + u->data = TBB_INTEROP_DATA_DELAYED_UNTIL_BIND; + + return 0; + } + + /* Get current stack */ + __cilkrts_worker_lock(w); + current_fiber = w->l->frame_ff->fiber_self; + __cilkrts_worker_unlock(w); + +/* CILK_ASSERT( !sd->stack_op_data ); */ +/* CILK_ASSERT( !sd->stack_op_routine ); */ + + /* Give TBB our callback */ + u->routine = __cilkrts_unwatch_stack; + u->data = current_fiber; + /* Save the callback information */ + cilk_fiber_set_stack_op(current_fiber, o); + + return 0; /* Success! */ +} + + +// This function must be called only within a continuation, within the stack +// frame of the continuation itself. 
+CILK_API_INT __cilkrts_synched(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + // If we don't have a worker, then we're synched by definition :o) + if (NULL == w) + return 1; + + // Check to see if we are in a stolen continuation. If not, then + // we are synched. + uint32_t flags = w->current_stack_frame->flags; + if (0 == (flags & CILK_FRAME_UNSYNCHED)) + return 1; + + // We are in a stolen continutation, but the join counter might have been + // decremented to one, making us synched again. Get the full frame so + // that we can check the join counter. ASSUME: frame_ff is stable (can be + // read without a lock) in a stolen continuation -- it can't be stolen + // while it's currently executing. + full_frame *ff = w->l->frame_ff; + + // Make sure we have a full frame + // TBD: Don't think that we should ever not have a full frame here. + // CILK_ASSERT(NULL != ff); ? + if (NULL == ff) + return 1; + + // We're synched if there are no outstanding children at this instant in + // time. Note that this is a known race, but it's ok since we're only + // reading. We can get false negatives, but not false positives. (I.e., + // we can read a non-one join_counter just before it goes to one, but the + // join_counter cannot go from one to greater than one while we're + // reading.) + return 1 == ff->join_counter; +} + + + + +CILK_API_INT +__cilkrts_bump_loop_rank_internal(__cilkrts_worker* w) +{ + // If we don't have a worker, then the runtime is not bound to this + // thread and there is no rank to increment + if (NULL == w) + return -1; + + // We're at the start of the loop body. Advance the cilk_for loop + // body pedigree by following the parent link and updating its + // rank. + + // Normally, we'd just write "w->pedigree.parent->rank++" + // But we need to cast away the "const". + ((__cilkrts_pedigree*) w->pedigree.parent)->rank++; + + // Zero the worker's pedigree rank since this is the start of a new + // pedigree domain. + w->pedigree.rank = 0; + + return 0; +} + +CILK_ABI_VOID +__cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ + // Pass call onto OS/architecture dependent function + sysdep_save_fp_ctrl_state(sf); +} + +/* end cilk-abi.c */ diff --git a/libcilkrts/runtime/cilk-ittnotify.h b/libcilkrts/runtime/cilk-ittnotify.h new file mode 100644 index 00000000000..ff995db6fbb --- /dev/null +++ b/libcilkrts/runtime/cilk-ittnotify.h @@ -0,0 +1,100 @@ +/* cilk-ittnotify.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_CILK_ITTNOTIFY_DOT_H +#define INCLUDED_CILK_ITTNOTIFY_DOT_H + +#ifdef __INTEL_COMPILER +#endif +#include <stdio.h> + +// ITTNOTIFY does not support ARM at this time +#ifdef __arm__ +#undef USE_ITTNOTIFY +#endif + +#ifdef USE_ITTNOTIFY +#include <ittnotify.h> + +#ifdef _WIN32 +# define ITT_SYNC_CREATE(_address, _description) \ + __itt_sync_createA(_address, \ + "Intel Cilk Plus " _description, \ + "", \ + __itt_attr_barrier) +#else +# define ITT_SYNC_CREATE(_address, _description) \ + __itt_sync_create(_address, \ + "Intel Cilk Plus " _description, \ + "", \ + __itt_attr_barrier) +#endif + +#define ITT_SYNC_PREPARE(_address) __itt_sync_prepare(_address) +#define ITT_SYNC_ACQUIRED(_address) __itt_sync_acquired(_address) +#define ITT_SYNC_RELEASING(_address) __itt_sync_releasing(_address) +#define ITT_SYNC_DESTROY(_address) __itt_sync_destroy(_address) +// Note that we subtract 5 from the return address to find the CALL instruction +// to __cilkrts_sync +#if 1 // Disable renaming for now. Piersol isn't ready yet +#define ITT_SYNC_SET_NAME_AND_PREPARE(_address, _sync_ret_address) __itt_sync_prepare(_address) +#else +#define ITT_SYNC_SET_NAME_AND_PREPARE(_address, _sync_ret_address) \ + if (NULL != __itt_sync_prepare_ptr) { \ + if (0 == _sync_ret_address) \ + __itt_sync_renameA(_address, ""); \ + else \ + { \ + char buf[128]; \ + sprintf_s(buf, 128, "IP:0x%p", (DWORD_PTR)_sync_ret_address - 5); \ + __itt_sync_renameA(_address, buf); \ + _sync_ret_address = 0; \ + } \ + __itt_sync_prepare(_address); \ + } +#endif +#else // USE_ITTNOTIFY not defined, compile out all calls +#define ITT_SYNC_CREATE(_address, _description) +#define ITT_SYNC_PREPARE(_address) +#define ITT_SYNC_ACQUIRED(_address) +#define ITT_SYNC_RELEASING(_addresss) +#define ITT_SYNC_DESTROY(_address) +#define ITT_SYNC_SET_NAME_AND_PREPARE(_sync_address, _wait_address) +#endif + +#endif // ! defined(INCLUDED_CILK_ITTNOTIFY_DOT_H) diff --git a/libcilkrts/runtime/cilk-tbb-interop.h b/libcilkrts/runtime/cilk-tbb-interop.h new file mode 100644 index 00000000000..cc5cff4b57e --- /dev/null +++ b/libcilkrts/runtime/cilk-tbb-interop.h @@ -0,0 +1,192 @@ +/* cilk-tbb-interop.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk-tbb-interop.h + * + * @brief Interface between TBB and Cilk to allow TBB to associate it's + * per-thread data with Cilk workers, and maintain the association as work + * moves between worker threads. This handles the case where TBB calls + * into a Cilk function which may later call back to a function making + * TBB calls. + * + * Each thunk structure has two pointers: \"routine\" and \"data\". + * The caller of the thunk invokes *routine, passing \"data\" as the void* + * parameter. + */ + +#ifndef INCLUDED_CILK_TBB_INTEROP_DOT_H +#define INCLUDED_CILK_TBB_INTEROP_DOT_H + +#include <cilk/common.h> // for CILK_EXPORT + +__CILKRTS_BEGIN_EXTERN_C + +/** A return code. 0 indicates success. */ +typedef int __cilk_tbb_retcode; + +/** + * Enumeration of reasons that Cilk will call the TBB stack operation + * function. + * + * When a non-empty stack is transfered between threads, the first thread must + * orphan it and the second thread must adopt it. + * + * An empty stack can be transfered similarly, or simply released by the first + * thread. + * + * Here is a summary of the actions as transitions on a state machine. +@verbatim + watch ORPHAN + -->--> -->-- + / \ / \ + (freed empty stack) (TBB sees stack running on thread) (stack in limbo) + \ / \ / + --<-- --<-- + RELEASE or ADOPT + unwatch +@endverbatim + */ +typedef enum __cilk_tbb_stack_op { + /** + * Disconnecting stack from a thread. + * + * The thunk must be invoked on the thread disconnecting itself from the + * stack. Must \"happen before\" the stack is adopted elsewhere. + */ + CILK_TBB_STACK_ORPHAN, + + /** + * Reconnecting orphaned stack to a thread. + * + * The thunk must be invoked on the thread adopting the stack. + */ + CILK_TBB_STACK_ADOPT, + + /** + * Releasing stack. + * + * The thunk must be invoked on the thread doing the releasing, Must + * \"happen before\" the stack is used elsewhere. 
+ */ + CILK_TBB_STACK_RELEASE +} __cilk_tbb_stack_op; + +/** + * Function that will be called by the Cilk runtime to inform TBB of a change + * in the stack associated with the current thread. + * + * It does not matter what stack the thunk runs on. + * The thread (not fiber) on which the thunk runs is important. + * + * @param op Enumerated value indicating what type of change is ocurring. + * @param data Context value provided by TBB in the __cilkrts_watch_stack + * call. This data is opaque to Cilk. + * + * @return 0 indicates success. + */ +typedef __cilk_tbb_retcode (*__cilk_tbb_pfn_stack_op)(enum __cilk_tbb_stack_op op, + void* data); + +/** + * Function that will be called by TBB to inform the Cilk runtime that TBB + * is no longer interested in watching the stack bound to the current thread. + * + * @param data Context value provided to TBB by the __cilkrts_watch_stack + * call. This data is opaque to TBB. + * + * @return 0 indicates success. + */ +typedef __cilk_tbb_retcode (*__cilk_tbb_pfn_unwatch_stacks)(void *data); + +/** + * Thunk invoked by Cilk to call back to TBB to tell it about a change in + * the stack bound to the current thread. + */ +typedef struct __cilk_tbb_stack_op_thunk { + /// Function in TBB the Cilk runtime should call when something + // "interesting" happens involving a stack + __cilk_tbb_pfn_stack_op routine; + + /// TBB context data to pass with the call to the stack_op routine + void* data; +} __cilk_tbb_stack_op_thunk; + +/** + * Thunk invoked by TBB when it is no longer interested in watching the stack + * bound to the current thread. + */ +typedef struct __cilk_tbb_unwatch_thunk { + /// Function in Cilk runtime to call when TBB no longer wants to watch + // stacks + __cilk_tbb_pfn_unwatch_stacks routine; + + /// Cilk runtime context data to pass with the call to the unwatch_stacks + /// routine + void* data; +} __cilk_tbb_unwatch_thunk; + +/** + * Requests that Cilk invoke __cilk_tbb_orphan_thunk when it orphans a stack. + * Cilk sets *u to a thunk that TBB should call when it is no longer + * interested in watching the stack. + * + * If the thread is not yet bound to the Cilk runtime, the Cilk runtime should + * save this data in thread-local storage until __cilkrts_bind_thread is called. + * + * Called by TBB, defined by Cilk. This function is exported from the Cilk + * runtime DLL/shared object. This declaration also appears in + * cilk/cilk_undocumented.h -- don't change one declaration without also + * changing the other. + * + * @param u __cilk_tbb_unwatch_thunk. This structure will be filled in by + * the Cilk runtime to allow TBB to register that it is no longer interested + * in watching the stack bound to the current thread. + * @param o __cilk_tbb_stack_op_thunk. This structure specifies the routine + * that the Cilk runtime should call when an "interesting" change in the stack + * associate with the current worker occurs. + * + * @return 0 indicates success. + */ +CILK_EXPORT +__cilk_tbb_retcode __cilkrts_watch_stack(__cilk_tbb_unwatch_thunk* u, + __cilk_tbb_stack_op_thunk o); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_CILK_TBB_INTEROP_DOT_H) diff --git a/libcilkrts/runtime/cilk_api.c b/libcilkrts/runtime/cilk_api.c new file mode 100644 index 00000000000..bbca984bc03 --- /dev/null +++ b/libcilkrts/runtime/cilk_api.c @@ -0,0 +1,255 @@ +/* cilk_api.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * Implementation of functions declared in cilk_api.h + */ + +/* + * Define the COMPILING_CILK_ABI_FUNCTIONS macro, so that + * compilation of this file generates non-inlined definitions for the + * functions marked as CILK_EXPORT_AND_INLINE in cilk_api.h. + * + * We must deal with these functions differently because we need to + * continue to ship nonlined versions of these functions. + * + * CILK_EXPORT_AND_INLINE int __cilkrts_get_worker_rank(uint64_t *rank); + * CILK_EXPORT_AND_INLINE int __cilkrts_bump_worker_rank(); + * CILK_EXPORT_AND_INLINE int __cilkrts_bump_loop_rank(); + */ +#define COMPILING_CILK_API_FUNCTIONS + +#include <internal/abi.h> +#include <cilk/cilk_api.h> + +#include "os.h" +#include "os_mutex.h" +#include "bug.h" +#include "global_state.h" +#include "local_state.h" +#include "scheduler.h" +#include "sysdep.h" + +CILK_API_VOID __cilkrts_init(void) +{ + // Initialize, but don't start, the cilk runtime. 
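    /* A hedged reading of the 0 argument below: it appears to request
     * initialization of the global runtime state without starting the worker
     * threads, which are brought up later when Cilk work actually begins.
     * Calling __cilkrts_init() early (for example at the top of main) is a
     * way to move that one-time setup cost out of the first parallel region.
     */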
+ __cilkrts_init_internal(0); +} + +CILK_API_VOID __cilkrts_end_cilk(void) +{ + // Take out the global OS mutex while we do this to protect against + // another thread attempting to bind while we do this + global_os_mutex_lock(); + + if (cilkg_is_published()) { + global_state_t *g = cilkg_get_global_state(); + if (g->Q || __cilkrts_get_tls_worker()) + __cilkrts_bug("Attempt to shut down Cilk while Cilk is still " + "running"); + __cilkrts_stop_workers(g); + __cilkrts_deinit_internal(g); + } + + global_os_mutex_unlock(); +} + +CILK_API_INT +__cilkrts_get_nworkers() +{ + return cilkg_get_nworkers(); +} + +CILK_API_INT +__cilkrts_get_total_workers() +{ + return cilkg_get_total_workers(); +} + +CILK_API_INT __cilkrts_get_force_reduce(void) +{ + return cilkg_get_force_reduce(); +} + +CILK_API_INT __cilkrts_set_param(const char* param, const char* value) +{ + return cilkg_set_param(param, value); +} + +#ifdef _WIN32 +CILK_API_INT __cilkrts_set_param_w(const wchar_t* param, const wchar_t* value) +{ + return cilkg_set_param_w(param, value); +} +#endif // _WIN32 + +/* Return a small integer indicating which Cilk worker the function is + * currently running on. Each thread started by the Cilk runtime library + * (system worker) has a unique worker number in the range 1..P-1, where P is + * the valued returned by __cilkrts_get_nworkers(). All threads started by + * the user or by other libraries (user workers) share the worker number 0. + * Therefore, the worker number is not unique across multiple user threads. + * + * Implementor's note: The value returned from this function is different from + * the value, w->self, used in most debug messages. + */ +CILK_API_INT +__cilkrts_get_worker_number(void) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + if (0 == w) + /* A non-worker always has a worker number of zero. */ + return 0; + else if (WORKER_USER == w->l->type) + /* User worker was once a non-worker, so its number should still be + * zero. */ + return 0; + else + /* w->self for a system worker is in range 0..(P-1); adjust to 1..P + * to avoid conflicting with the user thread's worker number. */ + return w->self + 1; +} + +/** + * Internal definition of the pedigree context. The size of the + * structure must match __cilkrts_pedigree_context_t defined in abi.i + */ +typedef struct pedigree_context_t +{ + /** Size of the structure, in bytes */ + size_t size; + + /** Next __cilkrts_pedigree to return */ + const __cilkrts_pedigree *pedigree; + + /** Unused. Left over from previous implementation */ + void *unused1; + + /** Unused. Left over from previous implementation */ + void *unused2; + + // // Debugging aid for pedigree-test: + // __cilkrts_stack_frame *expected_sf; +} pedigree_context_t; + +/* + * __cilkrts_get_pedigree_info + * + * Fetch the birthrank for a stack frame. To initialize the walk, both sf_in + * and frame_in should be NULL. parent_sf_ptr and parent_frame_ptr provide + * context for the stackwalk and should be returned as sf_in and frame_in on + * the next call. 
+ * + * Returns: + * 0 - Success - birthrank, parent_sf_out and parent_frame_out are valid + * >1 - Pedigree walk completed + * <1 - Failure - -1: No worker bound to thread, -2: Sanity check failed + */ + +#define PEDIGREE_WALK_COMPLETE (__cilkrts_pedigree *)-1 + +CILK_API_INT +__cilkrts_get_pedigree_info(__cilkrts_pedigree_context_t *external_context, + uint64_t *sf_birthrank) +{ + pedigree_context_t *context = (pedigree_context_t *)external_context; + + CILK_ASSERT(sizeof(__cilkrts_pedigree_context_t) == + sizeof(pedigree_context_t)); + if (context->size != sizeof(pedigree_context_t)) + return -3; // Invalid size + + // If the pointer to the last __cilkrts_pedigree is -1, we've + // finished the walk. We're still done. + if (PEDIGREE_WALK_COMPLETE == context->pedigree) + return 1; + + // The passed in context value contains a pointer to the last + // __cilkrts_pedigree returned, or NULL if we're starting a + // new walk + if (NULL == context->pedigree) + { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + __cilkrts_pedigree* pedigree_node; + if (NULL != w) { + pedigree_node = &w->pedigree; + } + else { + pedigree_node = __cilkrts_get_tls_pedigree_leaf(1); + } + context->pedigree = pedigree_node->parent; + } + else + context->pedigree = context->pedigree->parent; + + // Note: If we want to omit the user root node, + // stop at context->pedigree->parent instead. + if (NULL == context->pedigree) + { + context->pedigree = PEDIGREE_WALK_COMPLETE; + return 1; + } + + *sf_birthrank = context->pedigree->rank; + return 0; +} + +CILK_API_PEDIGREE +__cilkrts_get_pedigree_internal(__cilkrts_worker *w) +{ + if (NULL != w) { + return w->pedigree; + } + else { + const __cilkrts_pedigree *pedigree = + __cilkrts_get_tls_pedigree_leaf(1); + return *pedigree; + } +} + + +CILK_API_INT __cilkrts_bump_worker_rank_internal(__cilkrts_worker *w) +{ + __cilkrts_pedigree *pedigree; + pedigree = (w ? &w->pedigree : __cilkrts_get_tls_pedigree_leaf(1)); + pedigree->rank++; + return 0; +} + +/* End cilk_api.c */ diff --git a/libcilkrts/runtime/cilk_fiber-unix.cpp b/libcilkrts/runtime/cilk_fiber-unix.cpp new file mode 100644 index 00000000000..b9b47e364a5 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber-unix.cpp @@ -0,0 +1,273 @@ +/* cilk_fiber-unix.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "cilk_fiber-unix.h" +#include "cilk_malloc.h" +#include "bug.h" +#include "os.h" + +#include <cstdio> +#include <cstdlib> + +#include <alloca.h> +#include <errno.h> +#include <sys/mman.h> +#include <unistd.h> + +// MAP_ANON is deprecated on Linux, but seems to be required on Mac... +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +// Magic number for sanity checking fiber structure +const unsigned magic_number = 0x5afef00d; + +int cilk_fiber_sysdep::s_page_size = getpagesize(); + +cilk_fiber_sysdep::cilk_fiber_sysdep(std::size_t stack_size) + : cilk_fiber(stack_size) + , m_magic(magic_number) +{ + // Set m_stack and m_stack_base. + make_stack(stack_size); + + // Get high-address of stack, with 32-bytes of spare space, and rounded + // down to the nearest 32-byte boundary. + const uintptr_t align_mask = 32 - 1; + m_stack_base -= ((std::size_t) m_stack_base) & align_mask; +} + +cilk_fiber_sysdep::cilk_fiber_sysdep(from_thread_t) + : cilk_fiber() + , m_magic(magic_number) +{ + this->set_allocated_from_thread(true); + + // Dummy stack data for thread-main fiber + m_stack = NULL; + m_stack_base = NULL; +} + +void cilk_fiber_sysdep::convert_fiber_back_to_thread() +{ + // Does nothing on Linux. +} + +cilk_fiber_sysdep::~cilk_fiber_sysdep() +{ + CILK_ASSERT(magic_number == m_magic); + if (!this->is_allocated_from_thread()) + free_stack(); +} + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber_sysdep* cilk_fiber_sysdep::get_current_fiber_sysdep() +{ + return cilkos_get_tls_cilk_fiber(); +} +#endif + +// Jump to resume other fiber. We may or may not come back. +inline void cilk_fiber_sysdep::resume_other_sysdep(cilk_fiber_sysdep* other) +{ + if (other->is_resumable()) { + other->set_resumable(false); + // Resume by longjmp'ing to the place where we suspended. + CILK_LONGJMP(other->m_resume_jmpbuf); + } + else { + // Otherwise, we've never ran this fiber before. Start the + // proc method. + other->run(); + } +} + +void cilk_fiber_sysdep::suspend_self_and_resume_other_sysdep(cilk_fiber_sysdep* other) +{ +#if SUPPORT_GET_CURRENT_FIBER + cilkos_set_tls_cilk_fiber(other); +#endif + CILK_ASSERT(this->is_resumable()); + + + // Jump to the other fiber. We expect to come back. + if (! CILK_SETJMP(m_resume_jmpbuf)) { + resume_other_sysdep(other); + } + + // Return here when another fiber resumes me. + // If the fiber that switched to me wants to be deallocated, do it now. + do_post_switch_actions(); +} + +NORETURN cilk_fiber_sysdep::jump_to_resume_other_sysdep(cilk_fiber_sysdep* other) +{ +#if SUPPORT_GET_CURRENT_FIBER + cilkos_set_tls_cilk_fiber(other); +#endif + CILK_ASSERT(!this->is_resumable()); + + // Jump to the other fiber. But we are never coming back because + // this fiber is being reset. + resume_other_sysdep(other); + + // We should never come back here... 
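    /* resume_other_sysdep() above either longjmps into the other fiber's
     * saved context or starts its proc via run(); since this path is
     * NORETURN and the current fiber is being reset, control must never
     * arrive here.  The __cilkrts_bug() call below is a loud guard for that
     * invariant.
     */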
+ __cilkrts_bug("Should not get here"); +} + + +NORETURN cilk_fiber_sysdep::run() +{ + // Only fibers created from a pool have a proc method to run and execute. + CILK_ASSERT(m_start_proc); + CILK_ASSERT(!this->is_allocated_from_thread()); + CILK_ASSERT(!this->is_resumable()); + + // TBD: This setjmp/longjmp pair simply changes the stack pointer. + // We could probably replace this code with some assembly. + if (! CILK_SETJMP(m_resume_jmpbuf)) + { + // Calculate the size of the current stack frame (i.e., this + // run() function. + size_t frame_size = (size_t)JMPBUF_FP(m_resume_jmpbuf) - (size_t)JMPBUF_SP(m_resume_jmpbuf); + + // Macs require 16-byte alignment. Do it always because it just + // doesn't matter + if (frame_size & (16-1)) + frame_size += 16 - (frame_size & (16-1)); + + // Assert that we are getting a reasonable frame size out of + // it. If this run() function is using more than 4096 bytes + // of space for its local variables / any state that spills to + // registers, something is probably *very* wrong here... + // + // 4096 bytes just happens to be a number that seems "large + // enough" --- for an example GCC 32-bit compilation, the + // frame size was 48 bytes. + CILK_ASSERT(frame_size < 4096); + + // Change stack pointer to fiber stack. Offset the + // calculation by the frame size, so that we've allocated + // enough extra space from the top of the stack we are + // switching to for any temporaries required for this run() + // function. + JMPBUF_SP(m_resume_jmpbuf) = m_stack_base - frame_size; + CILK_LONGJMP(m_resume_jmpbuf); + } + + // Note: our resetting of the stack pointer is valid only if the + // compiler has not saved any temporaries onto the stack for this + // function before the longjmp that we still care about at this + // point. + + // Verify that 1) 'this' is still valid and 2) '*this' has not been + // corrupted. + CILK_ASSERT(magic_number == m_magic); + + // If the fiber that switched to me wants to be deallocated, do it now. + do_post_switch_actions(); + + // Now call the user proc on the new stack + m_start_proc(this); + + // alloca() to force generation of frame pointer. The argument to alloca + // is contrived to prevent the compiler from optimizing it away. This + // code should never actually be executed. + int* dummy = (int*) alloca((sizeof(int) + (std::size_t) m_start_proc) & 0x1); + *dummy = 0xface; + + // User proc should never return. + __cilkrts_bug("Should not get here"); +} + +void cilk_fiber_sysdep::make_stack(size_t stack_size) +{ + char* p; + // We've already validated that the stack size is page-aligned and + // is a reasonable value. No need to do any extra rounding here. + size_t rounded_stack_size = stack_size; + + // Normally, we have already validated that the stack size is + // aligned to 4K. In the rare case that pages are huge though, we + // need to do some extra checks. + if (rounded_stack_size < 3 * (size_t)s_page_size) { + // If the specified stack size is too small, round up to 3 + // pages. We need at least 2 extra for the guard pages. + rounded_stack_size = 3 * (size_t)s_page_size; + } + else { + // Otherwise, the stack size is large enough, but might not be + // a multiple of page size. Round up to nearest multiple of + // s_page_size, just to be safe. 
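    /* Illustrative arithmetic (not from the original sources): with
     * s_page_size == 4096 and a requested stack_size of 1,000,000 bytes,
     * remainder == 1,000,000 % 4096 == 576, so the size is rounded up by
     * 4096 - 576 == 3520 to 1,003,520 bytes (245 pages), two of which become
     * the mprotect'ed guard pages below.
     */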
+ size_t remainder = rounded_stack_size % s_page_size; + if (remainder) { + rounded_stack_size += s_page_size - remainder; + } + } + + p = (char*)mmap(0, rounded_stack_size, + PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS, + -1, 0); + if (MAP_FAILED == p) { + // For whatever reason (probably ran out of memory), mmap() failed. + // There is no stack to return, so the program loses parallelism. + m_stack = NULL; + m_stack_base = NULL; + return; + } + + // mprotect guard pages. + mprotect(p + rounded_stack_size - s_page_size, s_page_size, PROT_NONE); + mprotect(p, s_page_size, PROT_NONE); + + m_stack = p; + m_stack_base = p + rounded_stack_size - s_page_size; +} + + +void cilk_fiber_sysdep::free_stack() +{ + if (m_stack) { + size_t rounded_stack_size = m_stack_base - m_stack + s_page_size; + if (munmap(m_stack, rounded_stack_size) < 0) + __cilkrts_bug("Cilk: stack munmap failed error %d\n", errno); + } +} + +/* End cilk_fiber-unix.cpp */ diff --git a/libcilkrts/runtime/cilk_fiber-unix.h b/libcilkrts/runtime/cilk_fiber-unix.h new file mode 100644 index 00000000000..9f47d5b0437 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber-unix.h @@ -0,0 +1,149 @@ +/* cilk_fiber-unix.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_CILK_FIBER_UNIX_DOT_H +#define INCLUDED_CILK_FIBER_UNIX_DOT_H + +#ifndef __cplusplus +# error cilk_fiber-unix.h is a C++-only header +#endif + +#include "cilk_fiber.h" +#include "jmpbuf.h" + +/** + * @file cilk_fiber-unix.h + * + * @brief Unix-specific implementation for cilk_fiber. + */ + +/** + * @brief Unix-specific fiber class derived from portable fiber class + */ +struct cilk_fiber_sysdep : public cilk_fiber +{ + public: + +#if SUPPORT_GET_CURRENT_FIBER + /** + * @brief Gets the current fiber from TLS. 
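 * (This accessor is compiled only when SUPPORT_GET_CURRENT_FIBER is
 * defined; per the implementation file above it simply forwards to the
 * TLS helper cilkos_get_tls_cilk_fiber().)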
+ */ + static cilk_fiber_sysdep* get_current_fiber_sysdep(); +#endif + + /** + * @brief Construct the system-dependent portion of a fiber. + * + * @param stack_size The size of the stack for this fiber. + */ + cilk_fiber_sysdep(std::size_t stack_size); + + /** + * @brief Construct the system-dependent of a fiber created from a + * thread. + */ + cilk_fiber_sysdep(from_thread_t); + + /** + * @brief Destructor + */ + ~cilk_fiber_sysdep(); + + /** + * @brief OS-specific calls to convert this fiber back to thread. + * + * Nothing to do for Linux. + */ + void convert_fiber_back_to_thread(); + + /** + * @brief System-dependent function to suspend self and resume execution of "other". + * + * This fiber is suspended. + * + * @pre @c is_resumable() should be true. + * + * @param other Fiber to resume. + */ + void suspend_self_and_resume_other_sysdep(cilk_fiber_sysdep* other); + + /** + * @brief System-dependent function called to jump to @p other + * fiber. + * + * @pre @c is_resumable() should be false. + * + * @param other Fiber to resume. + */ + NORETURN jump_to_resume_other_sysdep(cilk_fiber_sysdep* other); + + /** + * @brief Runs the start_proc. + * @pre is_resumable() should be false. + * @pre is_allocated_from_thread() should be false. + * @pre m_start_proc must be valid. + */ + NORETURN run(); + + /** + * @brief Returns the base of this fiber's stack. + */ + inline char* get_stack_base_sysdep() { return m_stack_base; } + + private: + char* m_stack_base; ///< The base of this fiber's stack. + char* m_stack; // Stack memory (low address) + __CILK_JUMP_BUFFER m_resume_jmpbuf; // Place to resume fiber + unsigned m_magic; // Magic number for checking + + static int s_page_size; // Page size for + // stacks. + + // Allocate memory for a stack. This method + // initializes m_stack and m_stack_base. + void make_stack(size_t stack_size); + + // Deallocates memory for the stack. + void free_stack(); + + // Common helper method for implementation of resume_other_sysdep + // variants. + inline void resume_other_sysdep(cilk_fiber_sysdep* other); +}; + +#endif // ! defined(INCLUDED_CILK_FIBER_UNIX_DOT_H) diff --git a/libcilkrts/runtime/cilk_fiber.cpp b/libcilkrts/runtime/cilk_fiber.cpp new file mode 100644 index 00000000000..0c66f234d3b --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber.cpp @@ -0,0 +1,1078 @@ +/* cilk_fiber.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* Implementations of non-platform-specific aspects of cilk_fiber, especially + * the cilk_fiber_pool interface. + */ +#include "cilk_fiber.h" +#ifdef _WIN32 +# include "cilk_fiber-win.h" +#else +# include "cilk_fiber-unix.h" +#endif +#include "cilk_malloc.h" +#include "bug.h" +#include <new> + +#include <climits> +#include <cstdio> +#include <cstdlib> +#include <cstring> + +#include "sysdep.h" + + +extern "C" { + +inline int cilk_fiber_pool_sanity_check(cilk_fiber_pool *pool, const char* desc) +{ + int errors = 0; +#if FIBER_DEBUG >= 1 + if ((NULL != pool) && pool->total > 0) { + + // Root pool should not allocate more fibers than alloc_max + errors += ((pool->parent == NULL) && + (pool->total > pool->alloc_max)); + errors += (pool->total > pool->high_water); + + if (errors) { + fprintf(stderr, "ERROR at %s: pool=%p has max_size=%u, total=%d, high_water=%d\n", + desc, + pool, pool->max_size, pool->total, pool->high_water); + } + } +#endif + return (errors == 0); +} + +inline void increment_pool_total(cilk_fiber_pool* pool) +{ + ++pool->total; + if (pool->high_water < pool->total) + pool->high_water = pool->total; +} + +inline void decrement_pool_total(cilk_fiber_pool* pool, int fibers_freed) +{ + pool->total -= fibers_freed; +} + + +/** + * @brief Free fibers from this pool until we have at most @c + * num_to_keep fibers remaining, and then put a fiber back. + * + * @pre We do not hold @c pool->lock + * @post After completion, we do not hold @c pool->lock + */ +static void cilk_fiber_pool_free_fibers_from_pool(cilk_fiber_pool* pool, + unsigned num_to_keep, + cilk_fiber* fiber_to_return) +{ + // Free our own fibers, until we fall below our desired threshold. + // Each iteration of this loop proceeds in the following stages: + // 1. Acquire the pool lock, + // 2. Grabs up to B fibers from the pool, stores them into a buffer. + // 3. Check if pool is empty enough. If yes, put the last fiber back, + // and remember that we should quit. + // 4. Release the pool lock, and actually free any buffered fibers. + // 5. Check if we are done and should exit the loop. Otherwise, try again. + // + const bool need_lock = pool->lock; + bool last_fiber_returned = false; + + do { + const int B = 10; // Pull at most this many fibers from the + // parent for one lock acquisition. Make + // this value large enough to amortize + // against the cost of acquiring and + // releasing the lock. + int num_to_free = 0; + cilk_fiber* fibers_to_free[B]; + + // Stage 1: Grab the lock. + if (need_lock) { + spin_mutex_lock(pool->lock); + } + + // Stage 2: Grab up to B fibers to free. + int fibers_freed = 0; + while ((pool->size > num_to_keep) && (num_to_free < B)) { + fibers_to_free[num_to_free++] = pool->fibers[--pool->size]; + fibers_freed++; + } + decrement_pool_total(pool, fibers_freed); + + // Stage 3. Pool is below threshold. Put extra fiber back. 
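        /* A worked example of the loop (illustration, not from the original
         * sources): with pool->size == 25, num_to_keep == 7 and B == 10, the
         * first pass buffers 10 fibers (size 25 -> 15); the second buffers 8
         * more (size 15 -> 7), the threshold test below then succeeds, the
         * returned fiber (if any) goes back into the pool (size 7 -> 8), and
         * the loop exits after freeing the buffered fibers outside the lock.
         */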
+ if (pool->size <= num_to_keep) { + // Put the last fiber back into the pool. + if (fiber_to_return) { + CILK_ASSERT(pool->size < pool->max_size); + pool->fibers[pool->size] = fiber_to_return; + pool->size++; + } + last_fiber_returned = true; + } + + // Stage 4: Release the lock, and actually free any fibers + // buffered. + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + + for (int i = 0; i < num_to_free; ++i) { + fibers_to_free[i]->deallocate_to_heap(); + } + + } while (!last_fiber_returned); +} + + +/****************************************************************** + * TBD: We want to simplify / rework the logic for allocating and + * deallocating fibers, so that they are hopefully simpler and work + * more elegantly for more than two levels. + ******************************************************************/ + +/** + * @brief Transfer fibers from @c pool to @c pool->parent. + * + * @pre Must hold @c pool->lock if it exists. + * @post After completion, some number of fibers + * have been moved from this pool to the parent. + * The lock @c pool->lock is still held. + * + * TBD: Do we wish to guarantee that the lock has never been + * released? It may depend on the implementation... + */ +static void cilk_fiber_pool_move_fibers_to_parent_pool(cilk_fiber_pool* pool, + unsigned num_to_keep) +{ + // ASSERT: We should hold the lock on pool (if it has one). + CILK_ASSERT(pool->parent); + cilk_fiber_pool* parent_pool = pool->parent; + + // Move fibers from our pool to the parent until we either run out + // of space in the parent, or hit our threshold. + // + // This operation must be done while holding the parent lock. + + // If the parent pool appears to be full, just return early. + if (parent_pool->size >= parent_pool->max_size) + return; + + spin_mutex_lock(pool->parent->lock); + while ((parent_pool->size < parent_pool->max_size) && + (pool->size > num_to_keep)) { + parent_pool->fibers[parent_pool->size++] = + pool->fibers[--pool->size]; + } + + // If the child pool has deallocated more than fibers to the heap + // than it has allocated, then transfer this "surplus" to the + // parent, so that the parent is free to allocate more from the + // heap. + // + // This transfer means that the total in the parent can + // temporarily go negative. + if (pool->total < 0) { + // Reduce parent total by the surplus we have in the local + // pool. + parent_pool->total += pool->total; + pool->total = 0; + } + + spin_mutex_unlock(pool->parent->lock); +} + +void cilk_fiber_pool_init(cilk_fiber_pool* pool, + cilk_fiber_pool* parent, + size_t stack_size, + unsigned buffer_size, + int alloc_max, + int is_shared) +{ +#if FIBER_DEBUG >= 1 + fprintf(stderr, "fiber_pool_init, pool=%p, parent=%p, alloc_max=%u\n", + pool, parent, alloc_max); +#endif + + pool->lock = (is_shared ? spin_mutex_create() : NULL); + pool->parent = parent; + pool->stack_size = stack_size; + pool->max_size = buffer_size; + pool->size = 0; + pool->total = 0; + pool->high_water = 0; + pool->alloc_max = alloc_max; + pool->fibers = + (cilk_fiber**) __cilkrts_malloc(buffer_size * sizeof(cilk_fiber*)); + CILK_ASSERT(NULL != pool->fibers); + +#ifdef __MIC__ +#define PREALLOCATE_FIBERS +#endif + +#ifdef PREALLOCATE_FIBERS + // Pre-allocate 1/4 of fibers in the pools ahead of time. This + // value is somewhat arbitrary. It was chosen to be less than the + // threshold (of about 3/4) of fibers to keep in the pool when + // transferring fibers to the parent. 
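    /* Illustrative numbers (not from the original sources): with
     * buffer_size == 32 this pre-allocates 8 fibers, while deallocate_self()
     * keeps up to max_size/2 + max_size/4 == 24 fibers before pushing the
     * surplus to the parent pool, so the pre-allocated fibers stay well
     * below that transfer threshold.
     */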
+ + int pre_allocate_count = buffer_size/4; + for (pool->size = 0; pool->size < pre_allocate_count; pool->size++) { + pool->fibers[pool->size] = cilk_fiber::allocate_from_heap(pool->stack_size); + } +#endif +} + + +void cilk_fiber_pool_set_fiber_limit(cilk_fiber_pool* root_pool, + unsigned max_fibers_to_allocate) +{ + // Should only set limit on root pool, not children. + CILK_ASSERT(NULL == root_pool->parent); + root_pool->alloc_max = max_fibers_to_allocate; +} + +void cilk_fiber_pool_destroy(cilk_fiber_pool* pool) +{ + CILK_ASSERT(cilk_fiber_pool_sanity_check(pool, "pool_destroy")); + + // Lock my own pool, if I need to. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + + // Give any remaining fibers to parent pool. + if (pool->parent) { + cilk_fiber_pool_move_fibers_to_parent_pool(pool, 0); + } + + // Unlock pool. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // If I have any left in my pool, just free them myself. + // This method may acquire the pool lock. + cilk_fiber_pool_free_fibers_from_pool(pool, 0, NULL); + + // Destroy the lock if there is one. + if (pool->lock) { + spin_mutex_destroy(pool->lock); + } + __cilkrts_free(pool->fibers); +} + + +cilk_fiber* cilk_fiber_allocate(cilk_fiber_pool* pool) +{ + CILK_ASSERT(cilk_fiber_pool_sanity_check(pool, "allocate")); + return cilk_fiber::allocate(pool); +} + +cilk_fiber* cilk_fiber_allocate_from_heap(size_t stack_size) +{ + return cilk_fiber::allocate_from_heap(stack_size); +} + +void cilk_fiber_reset_state(cilk_fiber* fiber, cilk_fiber_proc start_proc) +{ + fiber->reset_state(start_proc); +} + +int cilk_fiber_remove_reference(cilk_fiber *fiber, cilk_fiber_pool *pool) +{ + return fiber->remove_reference(pool); +} + +cilk_fiber* cilk_fiber_allocate_from_thread() +{ + return cilk_fiber::allocate_from_thread(); +} + +int cilk_fiber_deallocate_from_thread(cilk_fiber *fiber) +{ + return fiber->deallocate_from_thread(); +} + +int cilk_fiber_remove_reference_from_thread(cilk_fiber *fiber) +{ + return fiber->remove_reference_from_thread(); +} + +int cilk_fiber_is_allocated_from_thread(cilk_fiber *fiber) +{ + return fiber->is_allocated_from_thread(); +} + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber* cilk_fiber_get_current_fiber(void) +{ + return cilk_fiber::get_current_fiber(); +} +#endif + +void cilk_fiber_suspend_self_and_resume_other(cilk_fiber* self, + cilk_fiber* other) +{ + self->suspend_self_and_resume_other(other); +} + + +void cilk_fiber::reset_state(cilk_fiber_proc start_proc) +{ + // Setup the fiber and return. + this->m_start_proc = start_proc; + + CILK_ASSERT(!this->is_resumable()); + CILK_ASSERT(NULL == this->m_pending_remove_ref); + CILK_ASSERT(NULL == this->m_pending_pool); +} + +NORETURN +cilk_fiber_remove_reference_from_self_and_resume_other(cilk_fiber* self, + cilk_fiber_pool* self_pool, + cilk_fiber* other) +{ +#if FIBER_DEBUG >= 3 + __cilkrts_worker* w = __cilkrts_get_tls_worker(); + fprintf(stderr, "W=%d: cilk_fiber_deactivate_self_and_resume_other: self=%p, other=%p\n", + w->self, + self, other); +#endif + CILK_ASSERT(cilk_fiber_pool_sanity_check(self_pool, "remove_reference_from_self_resume_other")); + self->remove_reference_from_self_and_resume_other(self_pool, other); + + // We should never return here. 
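    /* The fiber cannot free itself while still running on its own stack, so
     * the member function called above records `self` in
     * other->m_pending_remove_ref (and self_pool in m_pending_pool); the
     * reference is actually dropped by do_post_switch_actions() once
     * execution is safely running on the other fiber.
     */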
+} + +void cilk_fiber_set_post_switch_proc(cilk_fiber *self, + cilk_fiber_proc post_switch_proc) +{ + self->set_post_switch_proc(post_switch_proc); +} + +void cilk_fiber_invoke_tbb_stack_op(cilk_fiber* fiber, + __cilk_tbb_stack_op op) +{ + fiber->invoke_tbb_stack_op(op); +} + +cilk_fiber_data* cilk_fiber_get_data(cilk_fiber* fiber) +{ + return fiber->get_data(); + + /// TBD: Change this code to "return (cilk_fiber_data*)fiber;" + // plus a static assert, so that this function is + // more easily inlined by the compiler. +} + +int cilk_fiber_is_resumable(cilk_fiber *fiber) +{ + return fiber->is_resumable(); +} + +char* cilk_fiber_get_stack_base(cilk_fiber *fiber) +{ + return fiber->get_stack_base(); +} + + +#if defined(_WIN32) && 0 // Only works on Windows. Disable debugging for now. +#define DBG_STACK_OPS(_fmt, ...) __cilkrts_dbgprintf(_fmt, __VA_ARGS__) +#else +#define DBG_STACK_OPS(_fmt, ...) +#endif + +void cilk_fiber_set_stack_op(cilk_fiber *fiber, + __cilk_tbb_stack_op_thunk o) +{ + cilk_fiber_data *fdata = cilk_fiber_get_data(fiber); + DBG_STACK_OPS ("cilk_fiber_set_stack_op - cilk_fiber %p, routine: %p, data: %p\n", + fiber, + o.routine, + o.data); + fdata->stack_op_routine = o.routine; + fdata->stack_op_data = o.data; +} + +#if 0 // Debugging function +static +const char *NameStackOp (enum __cilk_tbb_stack_op op) +{ + switch(op) + { + case CILK_TBB_STACK_ORPHAN: return "CILK_TBB_STACK_ORPHAN"; + case CILK_TBB_STACK_ADOPT: return "CILK_TBB_STACK_ADOPT"; + case CILK_TBB_STACK_RELEASE: return "CILK_TBB_STACK_RELEASE"; + default: return "Unknown"; + } +} +#endif + +/* + * Save TBB interop information for an unbound thread. It will get picked + * up when the thread is bound to the runtime. + */ +void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + DBG_STACK_OPS("Calling save_stack_op; o.routine=%p, o.data=%p, saved_thunk=%p\n", + o.routine, o.data, saved_thunk); + + // If there is not already space allocated, allocate some. + if (NULL == saved_thunk) { + saved_thunk = (__cilk_tbb_stack_op_thunk*) + __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk)); + __cilkrts_set_tls_tbb_interop(saved_thunk); + } + + *saved_thunk = o; + + DBG_STACK_OPS ("Unbound Thread %04x: tbb_interop_save_stack_op_info - saved info\n", + cilkos_get_current_thread_id()); +} + +/* + * Save TBB interop information from the cilk_fiber. It will get picked + * up when the thread is bound to the runtime next time. + */ +void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber *fiber) +{ + __cilk_tbb_stack_op_thunk *saved_thunk; + cilk_fiber_data* fdata; + + if (NULL == fiber) + return; + + fdata = cilk_fiber_get_data(fiber); + // If there is no TBB interop data, just return + if (NULL == fdata->stack_op_routine) + return; + + saved_thunk = __cilkrts_get_tls_tbb_interop(); + + // If there is not already space allocated, allocate some. 
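    /* The TLS slot written here holds a single heap-allocated
     * __cilk_tbb_stack_op_thunk.  It is consumed by
     * cilk_fiber_tbb_interop_use_saved_stack_op_info() when the thread binds
     * to the runtime, and released by
     * cilk_fiber_tbb_interop_free_stack_op_info().
     */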
+ if (NULL == saved_thunk) { + saved_thunk = (__cilk_tbb_stack_op_thunk*) + __cilkrts_malloc(sizeof(__cilk_tbb_stack_op_thunk)); + __cilkrts_set_tls_tbb_interop(saved_thunk); + } + + saved_thunk->routine = fdata->stack_op_routine; + saved_thunk->data = fdata->stack_op_data; +} + +/* + * If there's TBB interop information that was saved before the thread was + * bound, apply it now + */ +void cilk_fiber_tbb_interop_use_saved_stack_op_info(cilk_fiber* fiber) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + CILK_ASSERT(fiber); + // If we haven't allocated a TBB interop index, we don't have any saved info + if (NULL == saved_thunk) { + DBG_STACK_OPS ("cilk_fiber %p: tbb_interop_use_saved_stack_op_info - no saved info\n", + fiber); + return; + } + + DBG_STACK_OPS ("cilk_fiber %p: tbb_interop_use_saved_stack_op_info - using saved info\n", + fiber); + + // Associate the saved info with the __cilkrts_stack + cilk_fiber_set_stack_op(fiber, *saved_thunk); + + // Free the saved data. We'll save it again if needed when the code + // returns from the initial function + cilk_fiber_tbb_interop_free_stack_op_info(); +} + +/* + * Free saved TBB interop memory. Should only be called when the thread is + * not bound. + */ +void cilk_fiber_tbb_interop_free_stack_op_info(void) +{ + __cilk_tbb_stack_op_thunk *saved_thunk = + __cilkrts_get_tls_tbb_interop(); + + // If we haven't allocated a TBB interop index, we don't have any saved info + if (NULL == saved_thunk) + return; + + DBG_STACK_OPS ("tbb_interop_free_stack_op_info - freeing saved info\n"); + + // Free the memory and wipe out the TLS value + __cilkrts_free(saved_thunk); + __cilkrts_set_tls_tbb_interop(NULL); +} + + + +#if NEED_FIBER_REF_COUNTS +int cilk_fiber_has_references(cilk_fiber *fiber) +{ + return (fiber->get_ref_count() > 0); +} + +int cilk_fiber_get_ref_count(cilk_fiber *fiber) +{ + return fiber->get_ref_count(); +} + +void cilk_fiber_add_reference(cilk_fiber *fiber) +{ + fiber->inc_ref_count(); +} +#endif // NEED_FIBER_REF_COUNTS + + +} // End extern "C" + + +cilk_fiber_sysdep* cilk_fiber::sysdep() +{ + return static_cast<cilk_fiber_sysdep*>(this); +} + + +cilk_fiber::cilk_fiber() + : m_start_proc(NULL) + , m_post_switch_proc(NULL) + , m_pending_remove_ref(NULL) + , m_pending_pool(NULL) + , m_flags(0) +{ + // Clear cilk_fiber_data base-class data members + std::memset((cilk_fiber_data*) this, 0, sizeof(cilk_fiber_data)); + + // cilk_fiber data members + init_ref_count(0); +} + +cilk_fiber::cilk_fiber(std::size_t stack_size) +{ + *this = cilk_fiber(); // A delegating constructor would be nice here + this->stack_size = stack_size; +} + +cilk_fiber::~cilk_fiber() +{ + // Empty destructor. +} + + +char* cilk_fiber::get_stack_base() +{ + return this->sysdep()->get_stack_base_sysdep(); +} + +cilk_fiber* cilk_fiber::allocate_from_heap(std::size_t stack_size) +{ + // Case 1: pool is NULL. create a new fiber from the heap + // No need for locks here. + cilk_fiber_sysdep* ret = + (cilk_fiber_sysdep*) __cilkrts_malloc(sizeof(cilk_fiber_sysdep)); + + // Error condition. If we failed to allocate a fiber from the + // heap, we are in trouble though... 
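    /* A NULL return here propagates to callers such as
     * cilk_fiber_allocate_from_heap(); as seen earlier in this patch, the
     * scheduler then calls __cilkrts_disallow_stealing(), trading
     * parallelism for survival rather than crashing outright.
     */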
+ if (!ret) + return NULL; + + ::new(ret) cilk_fiber_sysdep(stack_size); + + CILK_ASSERT(0 == ret->m_flags); + CILK_ASSERT(NULL == ret->m_pending_remove_ref); + CILK_ASSERT(NULL == ret->m_pending_pool); + ret->init_ref_count(1); + return ret; +} + + +#if USE_FIBER_TRY_ALLOCATE_FROM_POOL +/** + * Helper method: try to allocate a fiber from this pool or its + * ancestors without going to the OS / heap. + * + * Returns allocated pool, or NULL if no pool is found. + * + * If pool contains a suitable fiber. Return it. Otherwise, try to + * recursively grab a fiber from the parent pool, if there is one. + * + * This method will not allocate a fiber from the heap. + * + * This method could be written either recursively or iteratively. + * It probably does not matter which one we do. + * + * @note This method is compiled, but may not be used unless the + * USE_FIBER_TRY_ALLOCATE_FROM_POOL switch is set. + */ +cilk_fiber* cilk_fiber::try_allocate_from_pool_recursive(cilk_fiber_pool* pool) +{ + cilk_fiber* ret = NULL; + + if (pool->size > 0) { + // Try to get the lock. + if (pool->lock) { + // For some reason, it seems to be better to just block on the parent + // pool lock, instead of using a try-lock? +#define USE_TRY_LOCK_IN_FAST_ALLOCATE 0 +#if USE_TRY_LOCK_IN_FAST_ALLOCATE + int got_lock = spin_mutex_trylock(pool->lock); + if (!got_lock) { + // If we fail, skip to the parent. + if (pool->parent) { + return try_allocate_from_pool_recursive(pool->parent); + } + } +#else + spin_mutex_lock(pool->lock); +#endif + } + + // Check in the pool if we have the lock. + if (pool->size > 0) { + ret = pool->fibers[--pool->size]; + } + + // Release the lock once we are done updating pool fields. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + } + + if ((!ret) && (pool->parent)) { + return try_allocate_from_pool_recursive(pool->parent); + } + + if (ret) { + // When we pull a fiber out of the pool, set its reference + // count before we return it. + ret->init_ref_count(1); + } + return ret; +} +#endif // USE_FIBER_TRY_ALLOCATE_FROM_POOL + + +cilk_fiber* cilk_fiber::allocate(cilk_fiber_pool* pool) +{ + // Pool should not be NULL in this method. But I'm not going to + // actually assert it, because we are likely to seg fault anyway + // if it is. + // CILK_ASSERT(NULL != pool); + + cilk_fiber *ret = NULL; + +#if USE_FIBER_TRY_ALLOCATE_FROM_POOL + // "Fast" path, which doesn't go to the heap or OS until checking + // the ancestors first. + ret = try_allocate_from_pool_recursive(pool); + if (ret) + return ret; +#endif + + // If we don't get anything from the "fast path", then go through + // a slower path to look for a fiber. + // + // 1. Lock the pool if it is shared. + // 2. Look in our local pool. If we find one, release the lock + // and quit searching. + // 3. Otherwise, check whether we can allocate from heap. + // 4. Release the lock if it was acquired. + // 5. Try to allocate from the heap, if step 3 said we could. + // If we find a fiber, then quit searching. + // 6. If none of these steps work, just recursively try again + // from the parent. + + // 1. Lock the pool if it is shared. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + + // 2. Look in local pool. + if (pool->size > 0) { + ret = pool->fibers[--pool->size]; + if (ret) { + // If we found one, release the lock once we are + // done updating pool fields, and break out of the + // loop. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // When we pull a fiber out of the pool, set its reference + // count just in case. 
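            /* Fibers sitting in a pool hold no references; the single
             * reference set here belongs to the caller that now owns the
             * fiber, matching the count given to fibers freshly allocated
             * from the heap.
             */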
+ ret->init_ref_count(1); + return ret; + } + } + + // 3. Check whether we can allocate from the heap. + bool can_allocate_from_heap = false; + if (pool->total < pool->alloc_max) { + // Track that we are allocating a new fiber from the + // heap, originating from this pool. + // This increment may be undone if we happen to fail to + // allocate from the heap. + increment_pool_total(pool); + can_allocate_from_heap = true; + } + + // 4. Unlock the pool, and then allocate from the heap. + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + + // 5. Actually try to allocate from the heap / OS. + if (can_allocate_from_heap) { + ret = allocate_from_heap(pool->stack_size); + // If we got something from the heap, just return it. + if (ret) { + return ret; + } + + // Otherwise, we failed in our attempt to allocate a + // fiber from the heap. Grab the lock and decrement + // the total again. + if (pool->lock) { + spin_mutex_lock(pool->lock); + } + decrement_pool_total(pool, 1); + if (pool->lock) { + spin_mutex_unlock(pool->lock); + } + } + + // 6. If we get here, then searching this pool failed. Go search + // the parent instead if we have one. + if (pool->parent) { + return allocate(pool->parent); + } + + return ret; +} + +int cilk_fiber::remove_reference(cilk_fiber_pool* pool) +{ + int ref_count = this->dec_ref_count(); + if (ref_count == 0) { + if (pool) { + deallocate_self(pool); + } + else { + deallocate_to_heap(); + } + } + return ref_count; +} + +cilk_fiber* cilk_fiber::allocate_from_thread() +{ + void* retmem = __cilkrts_malloc(sizeof(cilk_fiber_sysdep)); + CILK_ASSERT(retmem); + cilk_fiber_sysdep* ret = ::new(retmem) cilk_fiber_sysdep(from_thread); + + // A fiber allocated from a thread begins with a reference count + // of 2. The first is for being created, and the second is for + // being running. + // + // Suspending this fiber will decrement the count down to 1. + ret->init_ref_count(2); + +#if SUPPORT_GET_CURRENT_FIBER + // We're creating the main fiber for this thread. Set this fiber as the + // current fiber. + cilkos_set_tls_cilk_fiber(ret); +#endif + return ret; +} + +int cilk_fiber::deallocate_from_thread() +{ + CILK_ASSERT(this->is_allocated_from_thread()); +#if SUPPORT_GET_CURRENT_FIBER + CILK_ASSERT(this == cilkos_get_tls_cilk_fiber()); + // Reverse of "allocate_from_thread". + cilkos_set_tls_cilk_fiber(NULL); +#endif + + this->assert_ref_count_at_least(2); + + // Suspending the fiber should conceptually decrement the ref + // count by 1. + cilk_fiber_sysdep* self = this->sysdep(); + self->convert_fiber_back_to_thread(); + + // Then, freeing the fiber itself decrements the ref count again. 
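    /* Worked ref-count example (illustration, not from the original
     * sources): allocate_from_thread() started this fiber at a count of 2
     * (created + running), so subtracting 2 normally lands on 0 and the
     * fiber is destroyed immediately below.  If another party still holds a
     * reference (possible when NEED_FIBER_REF_COUNTS is in play), the result
     * stays positive and destruction is deferred to the final
     * remove_reference_from_thread() call.
     */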
+ int ref_count = this->sub_from_ref_count(2); + if (ref_count == 0) { + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); + } + return ref_count; +} + +int cilk_fiber::remove_reference_from_thread() +{ + int ref_count = dec_ref_count(); + if (ref_count == 0) { + cilk_fiber_sysdep* self = this->sysdep(); + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); + } + return ref_count; +} + + +#if SUPPORT_GET_CURRENT_FIBER +cilk_fiber* cilk_fiber::get_current_fiber() +{ + return cilk_fiber_sysdep::get_current_fiber_sysdep(); +} +#endif + +void cilk_fiber::do_post_switch_actions() +{ + if (m_post_switch_proc) + { + cilk_fiber_proc proc = m_post_switch_proc; + m_post_switch_proc = NULL; + proc(this); + } + + if (m_pending_remove_ref) + { + m_pending_remove_ref->remove_reference(m_pending_pool); + + // Even if we don't free it, + m_pending_remove_ref = NULL; + m_pending_pool = NULL; + } +} + +void cilk_fiber::suspend_self_and_resume_other(cilk_fiber* other) +{ +#if FIBER_DEBUG >=1 + fprintf(stderr, "suspend_self_and_resume_other: self =%p, other=%p [owner=%p, resume_sf=%p]\n", + this, other, other->owner, other->resume_sf); +#endif + + // Decrement my reference count (to suspend) + // Increment other's count (to resume) + // Suspended fiber should have a reference count of at least 1. (It is not in a pool). + this->dec_ref_count(); + other->inc_ref_count(); + this->assert_ref_count_at_least(1); + + // Pass along my owner. + other->owner = this->owner; + this->owner = NULL; + + // Change this fiber to resumable. + CILK_ASSERT(!this->is_resumable()); + this->set_resumable(true); + + // Normally, I'd assert other->is_resumable(). But this flag may + // be false the first time we try to "resume" a fiber. + cilk_fiber_sysdep* self = this->sysdep(); + self->suspend_self_and_resume_other_sysdep(other->sysdep()); + + // HAVE RESUMED EXECUTION + // When we come back here, we should have at least two references: + // one for the fiber being allocated / out of a pool, and one for it being active. + this->assert_ref_count_at_least(2); +} + +NORETURN +cilk_fiber::remove_reference_from_self_and_resume_other(cilk_fiber_pool* self_pool, + cilk_fiber* other) +{ + // Decrement my reference count once (to suspend) + // Increment other's count (to resume) + // Suspended fiber should have a reference count of at least 1. (It is not in a pool). + this->dec_ref_count(); + other->inc_ref_count(); + + // Set a pending remove reference for this fiber, once we have + // actually switched off. + other->m_pending_remove_ref = this; + other->m_pending_pool = self_pool; + + // Pass along my owner. + other->owner = this->owner; + this->owner = NULL; + + // Since we are deallocating self, this fiber does not become + // resumable. + CILK_ASSERT(!this->is_resumable()); + + cilk_fiber_sysdep* self = this->sysdep(); + self->jump_to_resume_other_sysdep(other->sysdep()); + + __cilkrts_bug("Deallocating fiber. We should never come back here."); + std::abort(); +} + + +void cilk_fiber::deallocate_to_heap() +{ + cilk_fiber_sysdep* self = this->sysdep(); + self->~cilk_fiber_sysdep(); + __cilkrts_free(self); +} + +void cilk_fiber::deallocate_self(cilk_fiber_pool* pool) +{ + this->set_resumable(false); + + CILK_ASSERT(NULL != pool); + CILK_ASSERT(!this->is_allocated_from_thread()); + this->assert_ref_count_equals(0); + + // Cases: + // + // 1. pool has space: Add to this pool. + // 2. pool is full: Give some fibers to parent, and then free + // enough to make space for the fiber we are deallocating. 
+ // Then put the fiber back into the pool. + + const bool need_lock = pool->lock; + // Grab the lock for the remaining cases. + if (need_lock) { + spin_mutex_lock(pool->lock); + } + + // Case 1: this pool has space. Return the fiber. + if (pool->size < pool->max_size) + { + // Add this fiber to pool + pool->fibers[pool->size++] = this; + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + return; + } + + // Case 2: Pool is full. + // + // First free up some space by giving fibers to the parent. + if (pool->parent) + { + // Pool is full. Move all but "num_to_keep" fibers to parent, + // if we can. + unsigned num_to_keep = pool->max_size/2 + pool->max_size/4; + cilk_fiber_pool_move_fibers_to_parent_pool(pool, num_to_keep); + } + + if (need_lock) { + spin_mutex_unlock(pool->lock); + } + + // Now, free a fiber to make room for the one we need to put back, + // and then put this fiber back. This step may actually return + // fibers to the heap. + cilk_fiber_pool_free_fibers_from_pool(pool, pool->max_size -1, this); +} + + +// NOTE: Except for print-debug, this code is the same as in Windows. +void cilk_fiber::invoke_tbb_stack_op(__cilk_tbb_stack_op op) +{ + cilk_fiber_data *fdata = this->get_data(); + + if (0 == fdata->stack_op_routine) + { + if (CILK_TBB_STACK_RELEASE != op) + DBG_STACK_OPS ("Wkr %p: invoke_tbb_stack_op - %s (%d) for cilk_fiber %p, fiber %p, thread id %04x - No stack op routine\n", + fdata->owner, + NameStackOp(op), + op, + fdata, + this, + cilkos_get_current_thread_id()); + return; + } + + // Call TBB to do it's thing + DBG_STACK_OPS ("Wkr %p: invoke_tbb_stack_op - op %s data %p for cilk_fiber %p, fiber %p, thread id %04x\n", + fdata->owner, + NameStackOp(op), + fdata->stack_op_data, + fdata, + this, + cilkos_get_current_thread_id()); + + (*fdata->stack_op_routine)(op, fdata->stack_op_data); + if (op == CILK_TBB_STACK_RELEASE) + { + fdata->stack_op_routine = 0; + fdata->stack_op_data = 0; + } +} + + + +#if NEED_FIBER_REF_COUNTS + +void cilk_fiber::atomic_inc_ref_count() +{ + cilkos_atomic_add(&m_outstanding_references, 1); +} + +long cilk_fiber::atomic_dec_ref_count() +{ + return cilkos_atomic_add(&m_outstanding_references, -1); +} + +long cilk_fiber::atomic_sub_from_ref_count(long v) +{ + return cilkos_atomic_add(&m_outstanding_references, -v); +} + +#endif // NEED_FIBER_REF_COUNTS + +/* End cilk_fibers.cpp */ diff --git a/libcilkrts/runtime/cilk_fiber.h b/libcilkrts/runtime/cilk_fiber.h new file mode 100644 index 00000000000..2671f924681 --- /dev/null +++ b/libcilkrts/runtime/cilk_fiber.h @@ -0,0 +1,882 @@ +/* cilk_fiber.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
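The reference-count helpers at the end of cilk_fiber.cpp above (atomic_inc_ref_count() and friends) funnel through cilkos_atomic_add(), whose definition lives in the OS layer and is not part of this hunk. Judging from the way dec_ref_count() compares the result against zero, the helper returns the updated value. A stand-in sketch of that behaviour, using the same GCC __sync builtin family that os-unix-sysdep.c uses (illustration only, not the runtime's own definition):

    /* Illustrative stand-in for cilkos_atomic_add(): atomically add v to *p
     * and return the new value. */
    static long toy_atomic_add(volatile long *p, long v)
    {
        return __sync_add_and_fetch(p, v);
    }

    /* dec_ref_count()-style use: whichever caller observes 0 owns the cleanup.
     *     long remaining = toy_atomic_add(&refs, -1);
     *     if (remaining == 0) destroy();
     */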
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file cilk_fiber.h + * + * @brief Abstraction of a "fiber": A coprocess-like stack and auxiliary data + */ + +#ifndef INCLUDED_CILK_FIBER_DOT_H +#define INCLUDED_CILK_FIBER_DOT_H + +#include <cilk/common.h> +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +#include "bug.h" +#include "cilk-tbb-interop.h" +#include "spin_mutex.h" +#include "internal/abi.h" // Define __cilkrts_stack_frame + +/** + * @brief Debugging level for Cilk fiber code. + * + * A value of 0 means no debugging. + * Higher values generate more debugging output. + */ +#define FIBER_DEBUG 0 + +/** + * @brief Flag for validating reference counts. + * + * Set to 1 to assert that fiber reference counts are reasonable. + */ +#define FIBER_CHECK_REF_COUNTS 1 + +/** + * @brief Flag to determine whether fibers support reference counting. + * We require reference counting only on Windows, for exception + * processing. Unix does not need reference counting. + */ +#if defined(_WIN32) +# define NEED_FIBER_REF_COUNTS 1 +#endif + +/** + * @brief Flag to enable support for the + * cilk_fiber_get_current_fiber() method. + * + * I'd like this flag to be 0. However, the cilk_fiber test depends + * on being able to call this method. + */ +#if !defined(SUPPORT_GET_CURRENT_FIBER) +# define SUPPORT_GET_CURRENT_FIBER 0 +#endif + +/** + * @brief Switch for enabling "fast path" check for fibers, which + * doesn't go to the heap or OS until checking the ancestors first. + * + * Doing this check seems to make the stress test in + * cilk_fiber_pool.t.cpp run faster. But it doesn't seem to make much + * difference in other benchmarks, so it is disabled by default. + */ +#define USE_FIBER_TRY_ALLOCATE_FROM_POOL 0 + + +__CILKRTS_BEGIN_EXTERN_C + +/// @brief Forward reference to fiber pool. +typedef struct cilk_fiber_pool cilk_fiber_pool; + +/** @brief Opaque data structure representing a fiber */ +typedef struct cilk_fiber cilk_fiber; + +/** @brief Function pointer type for use as a fiber's "main" procedure */ +typedef void (*cilk_fiber_proc)(cilk_fiber*); + +/** @brief Data structure associated with each fiber. 
*/ +typedef struct cilk_fiber_data +{ + __STDNS size_t stack_size; /**< Size of stack for fiber */ + __cilkrts_worker* owner; /**< Worker using this fiber */ + __cilkrts_stack_frame* resume_sf; /**< Stack frame to resume */ + __cilk_tbb_pfn_stack_op stack_op_routine; /**< Cilk/TBB interop callback */ + void* stack_op_data; /**< Data for Cilk/TBB callback */ + void* client_data; /**< Data managed by client */ + +#ifdef _WIN32 + char *initial_sp; /**< Initalized in fiber_stub */ +# ifdef _WIN64 + char *steal_frame_sp; /**< RSP for frame stealing work */ + // Needed for exception handling so we can + // identify when about to unwind off stack +# endif +#endif + +} cilk_fiber_data; + +/** @brief Pool of cilk_fiber for fiber reuse + * + * Pools form a hierarchy, with each pool pointing to its parent. When the + * pool undeflows, it gets a fiber from its parent. When a pool overflows, + * it returns some fibers to its parent. If the root pool underflows, it + * allocates and initializes a new fiber from the heap but only if the total + * is less than max_size; otherwise, fiber creation fails. + */ +struct cilk_fiber_pool +{ + spin_mutex* lock; ///< Mutual exclusion for pool operations + __STDNS size_t stack_size; ///< Size of stacks for fibers in this pool. + cilk_fiber_pool* parent; ///< @brief Parent pool. + ///< If this pool is empty, get from parent + + // Describes inactive fibers stored in the pool. + cilk_fiber** fibers; ///< Array of max_size fiber pointers + unsigned max_size; ///< Limit on number of fibers in pool + unsigned size; ///< Number of fibers currently in the pool + + // Statistics on active fibers that were allocated from this pool, + // but no longer in the pool. + int total; ///< @brief Fibers allocated - fiber deallocated from pool + ///< total may be negative for non-root pools. + int high_water; ///< High water mark of total fibers + int alloc_max; ///< Limit on number of fibers allocated from the heap/OS +}; + +/** @brief Initializes a cilk_fiber_pool structure + * + * @param pool - The address of the pool that is to be initialized + * @param parent - The address of this pool's parent, or NULL for root pool + * @param stack_size - Size of stacks for fibers allocated from this pool. + * @param buffer_size - The maximum number of fibers that may be pooled. + * @param alloc_max - Limit on # of fibers this pool can allocate from the heap. + * @param is_shared - True if accessing this pool needs a lock, false otherwise. + */ +void cilk_fiber_pool_init(cilk_fiber_pool* pool, + cilk_fiber_pool* parent, + size_t stack_size, + unsigned buffer_size, + int alloc_max, + int is_shared); + +/** @brief Sets the maximum number of fibers to allocate from a root pool. + * + * @param root_pool - A root fiber pool + * @param max_fibers_to_allocate - The limit on # of fibers to allocate. + * + * Sets the maximum number of fibers that can be allocated from this + * pool and all its descendants. This pool must be a root pool. + */ +void cilk_fiber_pool_set_fiber_limit(cilk_fiber_pool* root_pool, + unsigned max_fibers_to_allocate); + +/** @brief De-initalizes a cilk_fiber_pool + * + * @param pool - The address of the pool that is to be destroyed + */ +void cilk_fiber_pool_destroy(cilk_fiber_pool* pool); + +/** @brief Allocates a new cilk_fiber. + * + * If the specified pool is empty, this method may choose to either + * allocate a fiber from the heap (if pool->total < pool->alloc_max), + * or retrieve a fiber from the parent pool. 
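The pool structure above is a bounded cache that defers to its parent on underflow and is capped by alloc_max when it has to fall back on the heap. A self-contained sketch of that get/put discipline, with invented toy_* names and with the locking and total/high-water bookkeeping left out (the real logic is in cilk_fiber::allocate and cilk_fiber::deallocate_self in cilk_fiber.cpp above):

    #include <stdlib.h>

    typedef struct toy_pool {
        struct toy_pool *parent;    /* NULL for the root pool             */
        void           **items;     /* cache of up to max_size free items */
        unsigned         size;      /* items currently cached             */
        unsigned         max_size;  /* cache capacity                     */
        int              total;     /* items handed out by this pool      */
        int              alloc_max; /* cap on heap allocations            */
    } toy_pool;

    /* Search order used on allocation: local cache, then the heap while the
     * cap allows it, then the parent pool. */
    static void *toy_pool_get(toy_pool *p, size_t bytes)
    {
        if (p->size > 0)
            return p->items[--p->size];
        if (p->total < p->alloc_max) {
            p->total++;
            return malloc(bytes);
        }
        if (p->parent)
            return toy_pool_get(p->parent, bytes);
        return NULL;
    }

    /* On release: keep the item if there is room, overflow to the parent,
     * and let the root return excess items to the heap. */
    static void toy_pool_put(toy_pool *p, void *item)
    {
        if (p->size < p->max_size)
            p->items[p->size++] = item;
        else if (p->parent)
            toy_pool_put(p->parent, item);
        else
            free(item);
    }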
+ * + * @note If a non-null fiber is returned, @c cilk_fiber_reset_state + * should be called on this fiber before using it. + * + * An allocated fiber begins with a reference count of 1. + * This method may lock @c pool or one of its ancestors. + * + * @pre pool should not be NULL. + * + * @param pool The fiber pool from which to retrieve a fiber. + * @return An allocated fiber, or NULL if failed to allocate. + */ +cilk_fiber* cilk_fiber_allocate(cilk_fiber_pool* pool); + +/** @brief Allocate and initialize a new cilk_fiber using memory from + * the heap and/or OS. + * + * The allocated fiber begins with a reference count of 1. + * + * @param stack_size The size (in bytes) to be allocated for the fiber's + * stack. + * @return An initialized fiber. This method should not return NULL + * unless some exceptional condition has occurred. + */ +cilk_fiber* cilk_fiber_allocate_from_heap(size_t stack_size); + + +/** @brief Resets an fiber object just allocated from a pool with the + * specified proc. + * + * After this call, cilk_fiber_data object associated with this fiber + * is filled with zeros. + * + * This function can be called only on a fiber that has been allocated + * from a pool, but never used. + * + * @param fiber The fiber to reset and initialize. + * @param start_proc The function to run when switching to the fiber. If + * null, the fiber can be used with cilk_fiber_run_proc() + * but not with cilk_fiber_resume(). + */ +void cilk_fiber_reset_state(cilk_fiber* fiber, + cilk_fiber_proc start_proc); + +/** @brief Remove a reference from this fiber, possibly deallocating it. + * + * This fiber is deallocated only when there are no other references + * to it. Deallocation happens either by returning the fiber to the + * specified pool, or returning it to the heap. + * + * A fiber that is currently executing should not remove the last + * reference to itself. + * + * When a fiber is deallocated, destructors are not called for the + * objects (if any) still on its stack. The fiber's stack and fiber + * data is returned to the stack pool but the client fiber data is not + * deallocated. + * + * If the pool overflows because of a deallocation, then some fibers + * will be returned to the parent pool. If the root pool overflows, + * then the fiber is returned to the heap. + * + * @param fiber The Cilk fiber to remove a reference to. + * @param pool The fiber pool to which the fiber should be returned. The + * caller is assumed to have exclusive access to the pool + * either because there is no contention for it or because + * its lock has been acquired. If pool is NULL, any + * deallocated fiber is destroyed and returned to the + * heap. + * + * @return Final reference count. If the count is 0, the fiber was + * returned to a pool or the heap. + */ +int cilk_fiber_remove_reference(cilk_fiber *fiber, cilk_fiber_pool *pool); + +/** @brief Allocates and intializes this thread's main fiber + * + * Each thread has an "implicit" main fiber that control's the + * thread's initial stack. This function makes this fiber visible to + * the client and allocates the Cilk-specific aspects of the implicit + * fiber. A call to this function must be paired with a call to + * cilk_fiber_deallocate_fiber_from_thread() + * or a memory leak (or worse) will result. + * + * A fiber allocated from a thread begins with a reference count of 2. + * One is for being allocated, and one is for being active. + * (A fiber created from a thread is automatically currently executing.) 
+ * The matching calls above each decrement the reference count by 1. + * + * @return A fiber for the currently executing thread. + */ +cilk_fiber* cilk_fiber_allocate_from_thread(void); + +/** @brief Remove a fiber created from a thread, + * possibly deallocating it. + * + * Same as cilk_fiber_remove_reference, except that it works on fibers + * created via cilk_fiber_allocate_from_thread(). + * + * Fibers created from a thread are never returned to a pool. + * + * @param fiber The Cilk fiber to remove a reference from. + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ +int cilk_fiber_remove_reference_from_thread(cilk_fiber *fiber); + +/** @brief Deallocate a fiber created from a thread, + * possibly destroying it. + * + * This method decrements the reference count of the fiber by 2, and + * destroys the fiber struct if the reference count is 0. + * + * OS-specific cleanup for the fiber executes unconditionally with + * this method. The destruction of the actual object, however, does + * not occur unless the reference count is 0. + * + * @param fiber The cilk_fiber to deallocate from a thread. + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ +int cilk_fiber_deallocate_from_thread(cilk_fiber *fiber); + +/** @brief Returns true if this fiber is allocated from a thread. + */ +int cilk_fiber_is_allocated_from_thread(cilk_fiber *fiber); + + +/** @brief Suspend execution on current fiber resumes other fiber. + * + * Suspends the current fiber and transfers control to a new fiber. Execution + * on the new fiber resumes from the point at which fiber suspended itself to + * run a different fiber. If fiber was freshly allocated, then runs the + * start_proc function specified at allocation. This function returns when + * another fiber resumes the self fiber. Note that the state of the + * floating-point control register (i.e., the register that controls rounding + * mode, etc.) is valid but indeterminate on return -- different + * implementations will have different results. + * + * When the @c self fiber is resumed, execution proceeds as though + * this function call returns. + * + * This operation increments the reference count of @p other. + * This operation decrements the reference count of @p self. + * + * @param self Fiber to switch from. Must equal current fiber. + * @param other Fiber to switch to. + */ +void cilk_fiber_suspend_self_and_resume_other(cilk_fiber* self, + cilk_fiber* other); + +/** @brief Removes a reference from the currently executing fiber and + * resumes other fiber. + * + * Removes a reference from @p self and transfer control to @p other + * fiber. Execution on @p other resumes from the point at which @p + * other suspended itself to run a different fiber. If @p other fiber + * was freshly allocated, then runs the function specified at + * creation. + * + * + * This operation increments the reference count of @p other. + * + * This operation conceptually decrements the reference count of + * @p self twice, once to suspend it, and once to remove a reference to + * it. Then, if the count is 0, it is returned to the specified pool + * or destroyed. + * + * @pre @p self is the currently executing fiber. + * + * @param self Fiber to remove reference switch from. + * @param self_pool Pool to which the current fiber should be returned + * @param other Fiber to switch to. 
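Taken together, the allocation, reference-count, and switching rules documented above admit a small host-side usage sketch. Only the cilk_fiber_* calls below come from this header; the include path, the 1 MB stack size, and the work_proc/host_sketch glue are invented for illustration, and a real start procedure must switch away rather than return:

    #include <stddef.h>
    #include "cilk_fiber.h"            /* hypothetical include path */

    static cilk_fiber_pool g_pool;
    static cilk_fiber     *g_main_fiber;

    /* Runs on the new fiber's stack.  When done it hands the fiber back to
     * the pool and resumes the thread's main fiber in one step. */
    static void work_proc(cilk_fiber *self)
    {
        /* ... real work on the fiber's own stack ... */
        cilk_fiber_remove_reference_from_self_and_resume_other(self, &g_pool,
                                                               g_main_fiber);
    }

    static void host_sketch(void)
    {
        cilk_fiber_pool_init(&g_pool, /*parent=*/NULL, /*stack_size=*/1 << 20,
                             /*buffer_size=*/8, /*alloc_max=*/64,
                             /*is_shared=*/0);

        g_main_fiber = cilk_fiber_allocate_from_thread();  /* ref count 2 */

        cilk_fiber *f = cilk_fiber_allocate(&g_pool);      /* ref count 1 */
        if (f) {
            cilk_fiber_reset_state(f, work_proc);
            /* Starts work_proc on f; control returns here when work_proc
             * resumes the main fiber. */
            cilk_fiber_suspend_self_and_resume_other(g_main_fiber, f);
        }

        cilk_fiber_deallocate_from_thread(g_main_fiber);    /* 2 -> 0 */
        cilk_fiber_pool_destroy(&g_pool);
    }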
+ */ +NORETURN +cilk_fiber_remove_reference_from_self_and_resume_other(cilk_fiber* self, + cilk_fiber_pool* self_pool, + cilk_fiber* other); + +/** @brief Set the proc method to execute immediately after a switch + * to this fiber. + * + * The @c post_switch_proc method executes immediately after switching + * away form @p self fiber to some other fiber, but before @c self + * gets cleaned up. + * + * @note A fiber can have only one post_switch_proc method at a time. + * If this method is called multiple times before switching to the + * fiber, only the last proc method will execute. + * + * @param self Fiber. + * @param post_switch_proc Proc method to execute immediately after switching to this fiber. + */ +void cilk_fiber_set_post_switch_proc(cilk_fiber* self, cilk_fiber_proc post_switch_proc); + +/** @brief Invoke TBB stack op for this fiber. + * + * @param fiber Fiber to invoke stack op for. + * @param op The stack op to invoke + */ +void cilk_fiber_invoke_tbb_stack_op(cilk_fiber* fiber, __cilk_tbb_stack_op op); + +/** @brief Returns the fiber data associated with the specified fiber. + * + * The returned struct is owned by the fiber and is deallocated automatically + * when the fiber is destroyed. However, the client_data field is owned by + * the client and must be deallocated separately. When called for a + * newly-allocated fiber, the returned data is zero-filled. + * + * @param fiber The fiber for which data is being requested. + * @return The fiber data for the specified fiber + */ +cilk_fiber_data* cilk_fiber_get_data(cilk_fiber* fiber); + +/** @brief Retrieve the owner field from the fiber. + * + * This method is provided for convenience. One can also get the + * fiber data, and then get the owner field. + */ +__CILKRTS_INLINE +__cilkrts_worker* cilk_fiber_get_owner(cilk_fiber* fiber) +{ + // TBD: We really want a static assert here, that this cast is + // doing the right thing. + cilk_fiber_data* fdata = (cilk_fiber_data*)fiber; + return fdata->owner; +} + +/** @brief Sets the owner field of a fiber. + * + * This method is provided for convenience. One can also get the + * fiber data, and then get the owner field. + */ +__CILKRTS_INLINE +void cilk_fiber_set_owner(cilk_fiber* fiber, __cilkrts_worker* owner) +{ + // TBD: We really want a static assert here, that this cast is + // doing the right thing. + cilk_fiber_data* fdata = (cilk_fiber_data*)fiber; + fdata->owner = owner; +} + +/** @brief Returns true if this fiber is resumable. + * + * A fiber is considered resumable when it is not currently being + * executed. + * + * This function is used by Windows exception code. + * @param fiber The fiber to check. + * @return Nonzero value if fiber is resumable. + */ +int cilk_fiber_is_resumable(cilk_fiber* fiber); + +/** + * @brief Returns the base of this fiber's stack. + * + * On some platforms (e.g., Windows), the fiber must have started + * running before we can get this information. + * + * @param fiber The fiber to get the stack pointer from. + * @return The base of the stack, or NULL if this + * information is not available yet. + */ +char* cilk_fiber_get_stack_base(cilk_fiber* fiber); + + +/**************************************************************************** + * TBB interop functions + * **************************************************************************/ +/** + * @brief Set the TBB callback information for a stack + * + * @param fiber The fiber to set the TBB callback information for + * @param o The TBB callback thunk. 
Specifies the callback address and + * context value. + */ +void cilk_fiber_set_stack_op(cilk_fiber *fiber, + __cilk_tbb_stack_op_thunk o); + +/** + * @brief Save the TBB callback address and context value in + * thread-local storage. + * + * We'll use it later when the thread binds to a worker. + * + * @param o The TBB callback thunk which is to be saved. + */ +void cilk_fiber_tbb_interop_save_stack_op_info(__cilk_tbb_stack_op_thunk o); + +/** + * @brief Move TBB stack-op info from thread-local storage and store + * it into the fiber. + * + * Called when we bind a thread to the runtime. If there is any TBB + * interop information in thread-local storage, bind it to the stack + * now. + * + * @pre \c fiber should not be NULL. + * @param fiber The fiber that should take over the TBB interop information. + */ +void cilk_fiber_tbb_interop_use_saved_stack_op_info(cilk_fiber *fiber); + +/** + * @brief Free any TBB interop information saved in thread-local storage + */ +void cilk_fiber_tbb_interop_free_stack_op_info(void); + +/** + * @brief Migrate any TBB interop information from a cilk_fiber to + * thread-local storage. + * + * Returns immediately if no TBB interop information has been + * associated with the stack. + * + * @param fiber The cilk_fiber who's TBB interop information should be + * saved in thread-local storage. + */ +void cilk_fiber_tbb_interop_save_info_from_stack(cilk_fiber* fiber); + + +#if SUPPORT_GET_CURRENT_FIBER +/** @brief Returns the fiber associated with the currently executing thread + * + * @note This function is currently used only for testing the Cilk + * runtime. + * + * @return Fiber associated with the currently executing thread or NULL if no + * fiber was associated with this thread. + */ +cilk_fiber* cilk_fiber_get_current_fiber(void); +#endif + + +#if NEED_FIBER_REF_COUNTS +/** @brief Returns true if this fiber has reference count > 0. + * + * @param fiber The fiber to check for references. + * @return Nonzero value if the fiber has references. + */ +int cilk_fiber_has_references(cilk_fiber *fiber); + +/** @brief Returns the value of the reference count. + * + * @param fiber The fiber to check for references. + * @return The value of the reference count of fiber. + */ +int cilk_fiber_get_ref_count(cilk_fiber *fiber); + +/** @brief Adds a reference to this fiber. + * + * Increments the reference count of a current fiber. Fibers with + * nonzero reference count will not be freed or returned to a fiber + * pool. + * + * @param fiber The fiber to add a reference to. + */ +void cilk_fiber_add_reference(cilk_fiber *fiber); + +#endif // NEED_FIBER_REF_COUNTS + +__CILKRTS_END_EXTERN_C + +#ifdef __cplusplus +// Some C++ implementation details + +/// Opaque declaration of a cilk_fiber_sysdep object. +struct cilk_fiber_sysdep; + +/** + * cilk_fiber is a base-class for system-dependent fiber implementations. + */ +struct cilk_fiber : protected cilk_fiber_data +{ + protected: + // This is a rare acceptable use of protected inheritence and protected + // variable access: when the base class and derived class collaborate + // tightly to comprise a single component. + + /// For overloading constructor of cilk_fiber. + enum from_thread_t { from_thread = 1 }; + + // Boolean flags capturing the status of the fiber. + // Each one can be set independently. + // A default fiber is constructed with a flag value of 0. + static const int RESUMABLE = 0x01; ///< True if the fiber is in a suspended state and can be resumed. 
+ static const int ALLOCATED_FROM_THREAD = 0x02; ///< True if fiber was allocated from a thread. + + cilk_fiber_proc m_start_proc; ///< Function to run on start up/reset + cilk_fiber_proc m_post_switch_proc; ///< Function that executes when we first switch to a new fiber from a different one. + + cilk_fiber* m_pending_remove_ref;///< Fiber to possibly delete on start up or resume + cilk_fiber_pool* m_pending_pool; ///< Pool where m_pending_remove_ref should go if it is deleted. + unsigned m_flags; ///< Captures the status of this fiber. + +#if NEED_FIBER_REF_COUNTS + volatile long m_outstanding_references; ///< Counts references to this fiber. +#endif + + /// Creates a fiber with NULL data. + cilk_fiber(); + + /** + * @brief Creates a fiber with user-specified arguments. + * + * @param stack_size Size of stack to use for this fiber. + */ + cilk_fiber(std::size_t stack_size); + + /// Empty destructor. + ~cilk_fiber(); + + /** + * @brief Performs any actions that happen after switching from + * one fiber to another. + * + * These actions are: + * 1. Execute m_post_switch_proc on a fiber. + * 2. Do any pending deallocations from the previous fiber. + */ + void do_post_switch_actions(); + + /** + *@brief Helper method that converts a @c cilk_fiber object into a + * @c cilk_fiber_sysdep object. + * + * The @c cilk_fiber_sysdep object contains the system-dependent parts + * of the implementation of a @\c cilk_fiber. + * + * We could have @c cilk_fiber_sysdep inherit from @c cilk_fiber and + * then use virtual functions. But since a given platform only uses + * one definition of @c cilk_fiber_sysdep at a time, we statically + * cast between them. + */ + inline cilk_fiber_sysdep* sysdep(); + + /** + * @brief Set resumable flag to specified state. + */ + inline void set_resumable(bool state) { + m_flags = state ? (m_flags | RESUMABLE) : (m_flags & (~RESUMABLE)); + } + + /** + *@brief Set the allocated_from_thread flag. + */ + inline void set_allocated_from_thread(bool state) { + m_flags = state ? (m_flags | ALLOCATED_FROM_THREAD) : (m_flags & (~ALLOCATED_FROM_THREAD)); + } + + public: + + /** + * @brief Allocates and initializes a new cilk_fiber, either from + * the specified pool or from the heap. + * + * @pre pool should not be NULL. + */ + static cilk_fiber* allocate(cilk_fiber_pool* pool); + + /** + * @brief Allocates a fiber from the heap. + */ + static cilk_fiber* allocate_from_heap(size_t stack_size); + + /** + * @brief Return a fiber to the heap. + */ + void deallocate_to_heap(); + + /** + * @brief Reset the state of a fiber just allocated from a pool. + */ + void reset_state(cilk_fiber_proc start_proc); + + /** + * @brief Remove a reference from this fiber, possibly + * deallocating it if the reference count becomes 0. + * + * @param pool The fiber pool to which this fiber should be returned. + * @return The final reference count. + */ + int remove_reference(cilk_fiber_pool* pool); + + /** + * @brief Deallocate the fiber by returning it to the pool. + * @pre This method should only be called if the reference count + * is 0. + * + * @param pool The fiber pool to return this fiber to. If NULL, + * fiber is returned to the heap. + */ + void deallocate_self(cilk_fiber_pool *pool); + + /** @brief Allocates and intializes this thread's main fiber. */ + static cilk_fiber* allocate_from_thread(); + + /** @brief Deallocate a fiber created from a thread, + * possibly destroying it. 
+ * + * This method decrements the reference count of this fiber by 2, + * and destroys the fiber if the reference count is 0. + * + * OS-specific cleanup for the fiber executes unconditionally with for + * this method. The destruction of the actual object, however, does + * not occur unless the reference count is 0. + * + * @return Final reference count. If the count is 0, the fiber was + * returned to the heap. + */ + int deallocate_from_thread(); + + /** @brief Removes a reference from this fiber. + * + * This method deallocates this fiber if the reference count + * becomes 0. + * + * @pre This fiber must be allocated from a thread. + * @return The final reference count of this fiber. + */ + int remove_reference_from_thread(); + +#if SUPPORT_GET_CURRENT_FIBER + /** @brief Get the current fiber from TLS. + * + * @note This function is only used for testing the runtime. + */ + static cilk_fiber* get_current_fiber(); +#endif + + /** @brief Suspend execution on current fiber resumes other fiber. + * + * Control returns after resuming execution of the self fiber. + */ + void suspend_self_and_resume_other(cilk_fiber* other); + + + /** @brief Removes a reference from the currently executing fiber + * and resumes other fiber. + * + * This fiber may be returned to a pool or deallocated. + */ + NORETURN remove_reference_from_self_and_resume_other(cilk_fiber_pool* self_pool, + cilk_fiber* other); + + /** @brief Set the proc method to execute immediately after a switch + * to this fiber. + * + * @param post_switch_proc Proc method to execute immediately + * after switching to this fiber. + */ + inline void set_post_switch_proc(cilk_fiber_proc post_switch_proc) { + m_post_switch_proc = post_switch_proc; + } + + /** @brief Returns true if this fiber is resumable. + * + * A fiber is considered resumable when it is not currently being + * executed. + */ + inline bool is_resumable(void) { + return (m_flags & RESUMABLE); + } + + /** @brief Returns true if fiber was allocated from a thread. */ + inline bool is_allocated_from_thread(void) { + return (m_flags & ALLOCATED_FROM_THREAD); + } + + /** + *@brief Get the address at the base of the stack for this fiber. + */ + inline char* get_stack_base(); + + /** @brief Return the data for this fiber. */ + cilk_fiber_data* get_data() { return this; } + + /** @brief Return the data for this fiber. */ + cilk_fiber_data const* get_data() const { return this; } + + +#if NEED_FIBER_REF_COUNTS + /** @brief Verifies that this fiber's reference count equals v. */ + inline void assert_ref_count_equals(long v) { + #if FIBER_CHECK_REF_COUNTS + CILK_ASSERT(m_outstanding_references >= v); + #endif + } + + /** @brief Verifies that this fiber's reference count is at least v. */ + inline void assert_ref_count_at_least(long v) { + #if FIBER_CHECK_REF_COUNTS + CILK_ASSERT(m_outstanding_references >= v); + #endif + } + + /** @brief Get reference count. */ + inline long get_ref_count() { return m_outstanding_references; } + + /** @brief Initialize reference count. + * Operation is not atomic. + */ + inline void init_ref_count(long v) { m_outstanding_references = v; } + + // For Windows, updates to the fiber reference count need to be + // atomic, because exceptions can live on a stack that we are not + // currently executing on. Thus, we can update the reference + // count of a fiber we are not currently executing on. + + /** @brief Increment reference count for this fiber [Windows]. 
*/ + inline void inc_ref_count() { atomic_inc_ref_count(); } + + /** @brief Decrement reference count for this fiber [Windows]. */ + inline long dec_ref_count() { return atomic_dec_ref_count(); } + + /** @brief Subtract v from the reference count for this fiber [Windows]. */ + inline long sub_from_ref_count(long v) { return atomic_sub_from_ref_count(v); } +#else // NEED_FIBER_REF_COUNTS + + // Without reference counting, we have placeholder methods. + inline void init_ref_count(long v) { } + + inline void inc_ref_count() { } + + // With no reference counting, dec_ref_count always return 0. + // Thus, anyone checking is always the "last" one. + inline long dec_ref_count() { return 0; } + inline long sub_from_ref_count(long v) { return 0; } + + // The assert methods do nothing. + inline void assert_ref_count_equals(long v) { } + inline void assert_ref_count_at_least(long v) { } +#endif + + /** + * @brief Call TBB to tell it about an "interesting" event. + * + * @param op Value specifying the event to track. + */ + void invoke_tbb_stack_op(__cilk_tbb_stack_op op); + +private: + + /** + * @brief Helper method: try to allocate a fiber from this pool or + * its ancestors without going to the OS / heap. + * + * Returns allocated pool, or NULL if no pool is found. + * + * If pool contains a suitable fiber. Return it. Otherwise, try to + * recursively grab a fiber from the parent pool, if there is one. + * + * This method will not allocate a fiber from the heap. + */ + static cilk_fiber* try_allocate_from_pool_recursive(cilk_fiber_pool* pool); + + +#if NEED_FIBER_REF_COUNTS + /** + * @brief Atomic increment of reference count. + */ + void atomic_inc_ref_count(); + + /** + * @brief Atomic decrement of reference count. + */ + long atomic_dec_ref_count(); + + /** + * @brief Atomic subtract of v from reference count. + * @param v Value to subtract. + */ + long atomic_sub_from_ref_count(long v); +#endif // NEED_FIBER_REF_COUNTS + +}; + +#endif // __cplusplus + +#endif // ! defined(INCLUDED_CILK_FIBER_DOT_H) diff --git a/libcilkrts/runtime/cilk_malloc.c b/libcilkrts/runtime/cilk_malloc.c new file mode 100644 index 00000000000..9d02c52d037 --- /dev/null +++ b/libcilkrts/runtime/cilk_malloc.c @@ -0,0 +1,84 @@ +/* cilk_malloc.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "cilk_malloc.h" + +#include <stdlib.h> +#if defined _WIN32 || defined _WIN64 || defined __linux__ +#include <malloc.h> +#define HAS_MEMALIGN 1 +#endif +#ifdef __VXWORKS__ +#define HAS_MEMALIGN 1 +#include <memLib.h> +#endif + +#define PREFERRED_ALIGNMENT 64 /* try to keep runtime system data + structures within one cache line */ + +void *__cilkrts_malloc(size_t size) +{ + /* TODO: check for out of memory */ +#ifdef _WIN32 + return _aligned_malloc(size, PREFERRED_ALIGNMENT); +#elif defined HAS_MEMALIGN + return memalign(PREFERRED_ALIGNMENT, size); +#else + return malloc(size); +#endif +} + +void *__cilkrts_realloc(void *ptr, size_t size) +{ +#ifdef _WIN32 + return _aligned_realloc(ptr, size, PREFERRED_ALIGNMENT); +#else + return realloc(ptr, size); +#endif +} + +void __cilkrts_free(void *ptr) +{ +#ifdef _WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +/* End cilk_malloc.c */ diff --git a/libcilkrts/runtime/cilk_malloc.h b/libcilkrts/runtime/cilk_malloc.h new file mode 100644 index 00000000000..fa0fa6d5c9d --- /dev/null +++ b/libcilkrts/runtime/cilk_malloc.h @@ -0,0 +1,90 @@ +/* cilk_malloc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
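The allocator above asks for 64-byte alignment so that runtime structures sit on their own cache line. The same guarantee can be demonstrated outside the runtime with the standard POSIX allocator (standalone illustration, not runtime code):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define CACHE_LINE 64

    int main(void)
    {
        void *p = NULL;

        /* posix_memalign is the portable spelling of the memalign() used above. */
        if (posix_memalign(&p, CACHE_LINE, 256) != 0)
            return 1;

        /* The point of the exercise: the block starts on a cache-line boundary. */
        assert(((uintptr_t)p % CACHE_LINE) == 0);

        free(p);
        return 0;
    }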
+ **************************************************************************/ + +/** + * @file cilk_malloc.h + * + * @brief Provides replacement memory allocation functions to allocate + * (and free) memory on cache line boundaries, if supported by the OS. + * + * If aligned memory functions are not provided by the OS, the calls just + * pass through to the standard memory allocation functions. + */ + +#ifndef INCLUDED_CILK_MALLOC_DOT_H +#define INCLUDED_CILK_MALLOC_DOT_H + +#include <cilk/common.h> +#include <stddef.h> + +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * malloc replacement function to allocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param size Number of bytes to allocate. + * + * @return pointer to memory block allocated, or NULL if unsuccessful. + */ +COMMON_PORTABLE void *__cilkrts_malloc(size_t size); + +/** + * realloc replacement function to allocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param ptr Block to be reallocated. + * @param size Number of bytes to allocate. + * + * @return pointer to memory block allocated, or NULL if unsuccessful. + */ +COMMON_PORTABLE void *__cilkrts_realloc(void *ptr, size_t size); + +/** + * free replacement function to deallocate memory aligned on a cache line + * boundary if aligned memory allocations are supported by the OS. + * + * @param ptr Block to be freed. + */ +COMMON_PORTABLE void __cilkrts_free(void *ptr); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_CILK_MALLOC_DOT_H) diff --git a/libcilkrts/runtime/component.h b/libcilkrts/runtime/component.h new file mode 100644 index 00000000000..64ff3e5fc42 --- /dev/null +++ b/libcilkrts/runtime/component.h @@ -0,0 +1,52 @@ +/* component.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_COMPONENT_DOT_H +#define INCLUDED_COMPONENT_DOT_H + +#define COMPONENT_NAME "Intel® Cilk™ Plus Runtime" + +#define COMPONENT_INTERNAL_NAME COMPONENT_NAME + +#define COMPONENT_FILENAME "CILKRTS20" + +#define BuildVersionString(_major, _minor, _build, _rev) #_major "," #_minor "," #_build "," #_rev + +#define COMPONENT_VERSION_STRING BuildVersionString (VERSION_MAJOR, VERSION_MINOR, VERSION_BUILD, VERSION_REVISION) + +#endif // ! defined(INCLUDED_COMPONENT_DOT_H) diff --git a/libcilkrts/runtime/config/generic/cilk-abi-vla.c b/libcilkrts/runtime/config/generic/cilk-abi-vla.c new file mode 100644 index 00000000000..98fefa101bd --- /dev/null +++ b/libcilkrts/runtime/config/generic/cilk-abi-vla.c @@ -0,0 +1,107 @@ +/* cilk-abi-vla.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of Variable Length Array (VLA) ABI. + * + * The compiler calls these functions to allocate Variable Length Arrays + * at runtime. The compiler must guarantee that __cilkrts_stack_free() is + * called to cleanup any memory allocated by __cilkrts_stack_alloc(). 
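One wrinkle in component.h above: arguments used with the # operator are not macro-expanded, so stringizing VERSION_MAJOR and friends directly yields the macro names rather than their values. The usual remedy is a second expansion level, sketched below with made-up version numbers (illustration only, not part of the patch):

    #include <stdio.h>

    /* Two-level stringization: STR1 expands its argument first, then STR0
     * turns the expanded tokens into a string literal. */
    #define STR0(x) #x
    #define STR1(x) STR0(x)

    #define VERSION_MAJOR    2
    #define VERSION_MINOR    0
    #define VERSION_BUILD    1
    #define VERSION_REVISION 0

    #define VERSION_STRING \
        STR1(VERSION_MAJOR) "," STR1(VERSION_MINOR) "," \
        STR1(VERSION_BUILD) "," STR1(VERSION_REVISION)

    int main(void)
    {
        puts(VERSION_STRING);   /* prints: 2,0,1,0 */
        return 0;
    }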
+ *
+ * This generic implementation always allocates the memory from the heap.
+ * Optimally, the implementation should expand the frame of the calling
+ * function if possible, since that will be faster. See the x86 version
+ * for one possible implementation.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "internal/abi.h"
+#include "cilk-abi-vla-internal.h"
+
+#define c_cilk_ptr_from_heap 0xc2f2f00d
+#define c_cilk_ptr_from_stack 0xc3f30d0f
+
+// Allocate space for a variable length array
+CILK_ABI(__cilkrts_void_ptr)
+__cilkrts_stack_alloc(
+ __cilkrts_stack_frame *sf,
+ size_t size,
+ size_t distance_from_sp_to_alloca_area,
+ uint32_t align, // align is always >= minimum stack alignment and
+ // >= ptr_size as well, and must be a power of 2.
+ uint32_t needs_tag // non-zero if the pointer being returned needs to
+ // be tagged
+)
+{
+ // full_size will be a multiple of align, and contains
+ // enough extra space to allocate a marker.
+ size_t full_size = (size + align - 1) & ~(align - 1);
+
+ // Allocate memory from the heap. The compiler is responsible
+ // for guaranteeing us a chance to free it before the function
+ // exits.
+
+ return (void *)vla_internal_heap_alloc(sf, full_size, align);
+}
+
+// Free the space allocated for a variable length array.
+CILK_ABI(void)
+__cilkrts_stack_free(
+ __cilkrts_stack_frame *sf,
+ void *p,
+ size_t size,
+ size_t distance_from_sp_to_alloca_area,
+ uint32_t align, // same requirements as for align in allocation,
+ // and must match alignment that was passed when
+ // doing the allocation
+ uint32_t known_from_stack // non-zero if this is known to be allocated
+ // on the stack, and therefore has no tag
+)
+{
+ // full_size will be a multiple of align, and contains
+ // enough extra space to allocate a marker if one was needed.
+ size_t full_size = (size + align - 1) & ~(align - 1);
+
+ // Just free the allocated memory to the heap since we don't know
+ // how to expand/contract the calling frame.
+ vla_internal_heap_free(p, full_size);
+}
diff --git a/libcilkrts/runtime/config/generic/os-fence.h b/libcilkrts/runtime/config/generic/os-fence.h new file mode 100644 index 00000000000..841307a5296 --- /dev/null +++ b/libcilkrts/runtime/config/generic/os-fence.h @@ -0,0 +1,53 @@ +/* os.h -*-C++-*-
+ *
+ *************************************************************************
+ *
+ * @copyright
+ * Copyright (C) 2009-2013, Intel Corporation
+ * All rights reserved.
+ *
+ * @copyright
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * @copyright
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* + * void __cilkrts_fence(void) + * + * Executes an MFENCE instruction to serialize all load and store instructions + * that were issued prior the MFENCE instruction. This serializing operation + * guarantees that every load and store instruction that precedes the MFENCE + * instruction is globally visible before any load or store instruction that + * follows the MFENCE instruction. The MFENCE instruction is ordered with + * respect to all load and store instructions, other MFENCE instructions, any + * SFENCE and LFENCE instructions, and any serializing instructions (such as + * the CPUID instruction). + */ + +COMMON_SYSDEP void __cilkrts_fence(void); ///< MFENCE instruction + diff --git a/libcilkrts/runtime/config/generic/os-unix-sysdep.c b/libcilkrts/runtime/config/generic/os-unix-sysdep.c new file mode 100644 index 00000000000..fda7fc414bc --- /dev/null +++ b/libcilkrts/runtime/config/generic/os-unix-sysdep.c @@ -0,0 +1,94 @@ +/* os-unix-sysdep.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
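The full fence described above is what lets one thread publish data before raising a flag and another thread trust the data after seeing the flag. A standalone illustration using the GCC builtin barrier (the x86 build of the runtime emits an actual MFENCE; this is only a sketch of the pattern):

    #include <stdio.h>

    static int          payload;
    static volatile int ready;

    /* Producer: make the payload globally visible before the flag. */
    static void publish(int value)
    {
        payload = value;
        __sync_synchronize();   /* full barrier, comparable to MFENCE */
        ready = 1;
    }

    /* Consumer: read the payload only after the flag is seen. */
    static int consume(void)
    {
        while (!ready)
            ;                   /* spin; a real consumer would pause or yield */
        __sync_synchronize();
        return payload;
    }

    int main(void)
    {
        publish(42);
        printf("%d\n", consume());   /* single-threaded demo: prints 42 */
        return 0;
    }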
+ ************************************************************************* + * + * This file contains generic implementations of system-specific code for + * Unix-based systems + */ + +#include "os.h" +#include "sysdep.h" + +/* + * The cycle counter is used for debugging. This funciton is only called if + * CILK_PROFILE is defined when the runtime is built. + */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void) +{ +# warning "unimplemented cycle counter" + return 0; +} + +/* + * A "short pause" - called from the Cilk runtime's spinloops. + */ +COMMON_SYSDEP void __cilkrts_short_pause(void) +{ +# warning __cilkrts_short_pause empty +} + +/* + * Interlocked exchange - used to implement the Cilk runtime's spinloops + */ +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x) +{ + x = __sync_lock_test_and_set(ptr, x); + return x; +} + + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. + * + * Only valid for IA32 and Intel64 processors. + */ +void restore_x86_fp_state (__cilkrts_stack_frame *sf) +{ +} + + +/* + * Save the floating point state to the __cilkrts_stack_frame at each spawn. + * + * Architecture-specific - Should only be needed on IA32 and Intel64 + * processors. + */ +void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ +} + diff --git a/libcilkrts/runtime/config/x86/cilk-abi-vla.c b/libcilkrts/runtime/config/x86/cilk-abi-vla.c new file mode 100644 index 00000000000..2d38e7f9a56 --- /dev/null +++ b/libcilkrts/runtime/config/x86/cilk-abi-vla.c @@ -0,0 +1,422 @@ +/* cilk-abi-vla.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of Variable Length Array (VLA) ABI. 
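An interlocked exchange such as the __cilkrts_xchg() shown above is all that is needed for the simple test-and-set spin locks the comment mentions; the runtime's own spin_mutex lives in a separate file of this patch. A minimal sketch built on the same builtin (illustration only):

    /* 0 = unlocked, 1 = locked. */
    typedef volatile int toy_spinlock;

    static void toy_spin_lock(toy_spinlock *lock)
    {
        /* __sync_lock_test_and_set atomically stores 1 and returns the old
         * value; the lock is ours once that old value was 0. */
        while (__sync_lock_test_and_set(lock, 1) != 0) {
            while (*lock)
                ;   /* spin on a plain read; a real lock would pause here */
        }
    }

    static void toy_spin_unlock(toy_spinlock *lock)
    {
        __sync_lock_release(lock);   /* store 0 with release semantics */
    }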
+ * + * __cilkrts_stack_alloc() and __cilkrts_stack_free must be compiled + * such that ebp/rbp is used for the stack frames. This is done by having + * each of them use alloca, which forces the special frame types needed on + * each of the ABIs. Additionally, for some forms of stack frame, special + * care must be taken because the alloca space may not be at the bottom of the + * stack frame of the caller. For Intel64 windows, and for some options + * with other ABIs, a preallocated parameter block may exist on the stack + * at a lower address than the alloca. If this is the case, the parameter + * distance_from_sp_to_alloca_area will be non-zero, and will indicate how + * much pre-allocated parameter space resides in the caller's stack frame + * between the alloca area, and the bottom of the stack when the call to + * the cilkrts is made. As such, when non-zero it also includes any space + * used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters. + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdint.h> +#ifdef _WIN32 +# define alloca _alloca +# define INLINE static __inline +# pragma warning(disable:1025) // Don't whine about zero extending result of unary operation +#else +# include <alloca.h> +# define INLINE static inline +#endif + +#include "internal/abi.h" +#include "cilk-abi-vla-internal.h" + +#if defined(__x86_64) || defined(_M_X64) +INLINE void setsp(void *val) +{ + __asm__("movq %0, %%rsp" : : "r"(val): "rsp"); +} +INLINE char* getsp(void) +{ + void *res; + + __asm__("movq %%rsp, %0" : "=r"(res): : "rsp"); + return res; +} +INLINE char* getbp(void) +{ + void *res; + + __asm__("movq %%rbp, %0" : "=r"(res): : "rbp"); + return res; +} +INLINE void copy_frame_down_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be lower address than src, + // therefore copying upwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 8, and the copy is done in 64 bit word chunks for + // best efficiency. + __asm__( + "movq %0, %%rdi;" + "movq %1, %%rsi;" + "movq %2, %%rcx;" + "shrq $3, %%rcx;" + "rep movsq;" + "movq %3, %%rbp" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "rsi", "rdi", "rcx", "rbp", "memory"); +} +INLINE void copy_frame_up_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be higher address than src, + // therefore copying downwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 8, and the copy is done in 64 bit word chunks for + // best efficiency. 
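+    // Start from the last 64-bit word of each region: the string move below
+    // runs with the direction flag set (std), so rep movsq walks rdi/rsi
+    // downwards through memory, and cld restores the normal forward
+    // direction once the copy is done.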
+ dst += cpy_bytes - 8; + src += cpy_bytes - 8; + __asm__( + "movq %0, %%rdi;" + "movq %1, %%rsi;" + "movq %2, %%rcx;" + "shrq $3, %%rcx;" + "std; rep movsq; cld;" + "movl %3, %%rbp;" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "rsi", "rdi", "rcx", "rbp", "memory"); +} +#else +INLINE void setsp(void *val) +{ + __asm__("movl %0, %%esp" : : "r"(val): "esp"); +} +INLINE char* getsp(void) +{ + void *res; + + __asm__("movl %%esp, %0" : "=r"(res): : "esp"); + return res; +} +INLINE char* getbp(void) +{ + void *res; + + __asm__("movl %%ebp, %0" : "=r"(res): : "ebp"); + return res; +} +INLINE void copy_frame_down_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be lower address than src, + // therefore copying upwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 4, and the copy is done in 32 bit word chunks for + // best efficiency. + __asm__( + "movl %0, %%edi;" + "movl %1, %%esi;" + "movl %2, %%ecx;" + "shrl $2, %%ecx;" + "rep movsd;" + "movl %3, %%ebp" : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "esi", "edi", "ecx", "ebp", "memory"); +} +INLINE void copy_frame_up_and_move_bp( + char *dst, + char *src, + size_t cpy_bytes, + char *new_ebp +) +{ + // In this version, dst is guaranteed to be higher address than src, + // therefore copying downwards from src into dst is safe in case + // there is overlap. The number of bytes is also guaranteed to be + // a multiple of 4, and the copy is done in 32 bit word chunks for + // best efficiency. + dst += cpy_bytes - 4; + src += cpy_bytes - 4; + __asm__( + "movl %0, %%edi;" + "movl %1, %%esi;" + "movl %2, %%ecx;" + "shrl $2, %%ecx;" + "std; rep movsd; cld;" + "movl %3, %%ebp" : + // "=D"(dst), "=S"(src), "=C"(cpy_bytes) : + : + "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp) : + "esi", "edi", "ecx", "ebp", "memory"); +} +#endif + + +#define c_cilk_ptr_from_heap 0xc2f2f00d +#define c_cilk_ptr_from_stack 0xc3f30d0f + +CILK_ABI(__cilkrts_void_ptr) +__cilkrts_stack_alloc( + __cilkrts_stack_frame *sf, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // align is always >= minimum stack alignment and + // >= ptr_size as well, and must be a power of 2. + uint32_t needs_tag // non-zero if the pointer being returned needs to + // be tagged +) +{ +#ifdef __INTEL_COMPILER + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker. + size_t full_size = (size + align - 1) & ~(align - 1); + + if (needs_tag) { + full_size += align; + } + + char *t; + if (sf->worker != 0 && + ((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) { + t = vla_internal_heap_alloc(sf, full_size, align); + if (needs_tag) { + t += align; + ((uint32_t*)t)[-1] = c_cilk_ptr_from_heap; + } + return (void *)t; + } + + // stack is still synced, allocate full_size from esp, + // and record in 32 bits immediately below the space + // allocated that this was space that this was + // allocated in the stack. + char *old_ebp = getbp(); + char *old_esp = getsp(); + + // make top_ptr point to base of first parameter. 
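+    // _AddressOfReturnAddress() gives the address of the stack slot holding
+    // this function's return address, so one pointer-size above that slot is
+    // where the first stack-passed parameter (if any) lives.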
+ char *top_ptr = ((char *)(_AddressOfReturnAddress()) + + sizeof(char *)); + size_t param_size = 0; + +#if defined(__x86_64) + // For Intel64 linux & MACH ABI, all the parameters were passed in + // register, so top of the stack frame above the return address + // is just the size of the return address plus + // distance_from_sp_to_alloca_area on the chance that the alloca + // area isn't at the very bottom of the calling functions stack. +#elif defined(__MACH__) + // For ia32 MACH, parameter size is always a mutliple of 16 + // bytes to keep the stack 16 byte aligned. So we need to round + // number of parameters up to multiple of 4. + param_size = 8 * sizeof(char *); +#else + // For both windows Intel64 ABI, and the IA32 windows and + // linux ABIs, space is reserved on the stack for all these + // parameters. param_size is 5 * size of a stack slot. + param_size = 5 * sizeof(char *); +#endif + + // now make top_ptr point above the params, or if + // distance_from_sp_to_alloca_area is not zero, make + // it point above that area. When non-zero, + // distance_from_sp_to_alloca area is expected to contain + // the parameter space, so we only add one or the other, + // not both. + top_ptr += (distance_from_sp_to_alloca_area != 0) ? + distance_from_sp_to_alloca_area : param_size; + + // t needs to end up at current value of top_ptr less full_size and less + // distance_from_sp_to_alloca_area and + // then rounded down to the alignment needed. Then we have to bump + // esp down by current frame_size, so that when all is done with respect + // to executing the return sequence, the final value of esp will be the + // same value as t. + t = (top_ptr - full_size) - distance_from_sp_to_alloca_area; + intptr_t temp = (intptr_t)t; + temp &= ~((intptr_t)(align - 1)); + t = (char *)temp; + + // ok, the value of t is set where we need it. Now set esp + // to the value of t less the current frame size. + // So now when we do regular return esp should be left such + // that it has moved down by full_size. + size_t cur_fm_size = (top_ptr - old_esp); + char *new_esp = t - cur_fm_size; + char *new_ebp = old_ebp - (old_esp - new_esp); + + // extend the stack down by at least the difference between where + // I want it to be and where it currently is. This should take care + // of touching any pages necessary. + char *foo = alloca(old_esp - new_esp); + setsp(foo < new_esp ? foo : new_esp); + + // Now set esp exactly where I want it. + // setsp(new_esp); + + copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); + + if (needs_tag) { + t += align; + ((uint32_t*)t)[-1] = c_cilk_ptr_from_stack; + } + + return t; +#else // Not __INTEL_COMPILER + // Not supported unless we can figure out how to get the size of the frame + return NULL; +#endif +} + +// This frees the space allocated for a variable length array. +CILK_ABI(void) +__cilkrts_stack_free( + __cilkrts_stack_frame *sf, + void *p, + size_t size, + size_t distance_from_sp_to_alloca_area, + uint32_t align, // same requirements as for align in allocation, + // and must match alignment that was passed when + // doing the allocation + uint32_t known_from_stack // non-zero if this is known to be allocated + // on the stack, and therefore has no tag +) +{ +#ifdef __INTEL_COMPILER + uint32_t *t = (uint32_t*)p; + + // full_size will be a multiple of align, and contains + // enough extra space to allocate a marker if one was needed. 
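+    // Round size up to the next multiple of align (align is a power of 2);
+    // e.g. size == 20 with align == 16 gives (20 + 15) & ~15 == 32.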
+ size_t full_size = (size + align - 1) & ~(align - 1); + if (known_from_stack == 0) { + // if the compiler hasn't told the run-time that this is + // known to be on the stack, then this pointer must have been + // tagged such that the run-time can tell. + assert(t[-1] == c_cilk_ptr_from_stack || + t[-1] == c_cilk_ptr_from_heap); + + known_from_stack = t[-1] == c_cilk_ptr_from_stack; + full_size += align; // accounts for extra space for marker + t = (uint32_t *)(((char *)t) - align); + } + + if (known_from_stack) { + // alloca useage forces an ebp/rbp based stack frame even though + // 0 and unused. + char *foo = alloca(0); + if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) { + // p was allocated from current stack frame and we + // are synced on current stack frame. Return the + // amount of the stack that needs to be freed. + char *old_ebp = getbp(); + char *old_esp = getsp(); + + // make top_ptr point to base of first parameter. + char *top_ptr = ((char *)(_AddressOfReturnAddress()) + + sizeof(char *)); + size_t param_size = 0; + +#if defined(__x86_64) + // For Intel64 linux & MACH ABI, all the parameters were passed in + // register, so top of the stack frame above the return address + // is just the size of the return address plus + // distance_from_sp_to_alloca_area on the chance that the alloca + // area isn't at the very bottom of the calling functions stack. +#elif defined(__MACH__) + // For ia32 MACH, parameter size is always a mutliple of 16 + // bytes to keep the stack 16 byte aligned. So we need to round + // number of parameters up to multiple of 4. + param_size = 8 * sizeof(char *); +#else + // For both windows Intel64 ABI, and the IA32 windows and + // linux ABIs, space is reserved on the stack for all these + // parameters. param_size is 5 * size of a stack slot. + param_size = 6 * sizeof(char *); +#endif + + // now make top_ptr point above the params, or if + // distance_from_sp_to_alloca_area is not zero, make + // it point above that area. When non-zero, + // distance_from_sp_to_alloca area is expected to contain + // the parameter space, so we only add one or the other, + // not both. + top_ptr += (distance_from_sp_to_alloca_area != 0) ? + distance_from_sp_to_alloca_area : param_size; + + size_t cur_fm_size = (top_ptr - old_esp); + char *new_esp = old_esp + full_size; + char *new_ebp = old_ebp + full_size; + + copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp); + setsp(new_esp); + } + else { + // p was allocated on stack frame, but that is + // no longer the current stack frame. Need to adjust the + // saved esp that is somewhere in the cilk runtime so that + // on sync, esp will be cut back correctly. + vla_free_from_original_stack(sf, full_size); + } + } + else { + vla_internal_heap_free(t, full_size); + } +#else // Not __INTEL_COMPILER + // Not supported unless we can figure out how to get the size of the frame +#endif +} diff --git a/libcilkrts/runtime/config/x86/os-fence.h b/libcilkrts/runtime/config/x86/os-fence.h new file mode 100644 index 00000000000..ec704e94ef2 --- /dev/null +++ b/libcilkrts/runtime/config/x86/os-fence.h @@ -0,0 +1,72 @@ +/* os.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* gcc before 4.4 does not implement __sync_synchronize properly */ +#if (__ICC >= 1110 && !(__MIC__ || __MIC2__)) \ + || (!defined __ICC && __GNUC__ * 10 + __GNUC_MINOR__ > 43) +# define HAVE_SYNC_INTRINSICS 1 +#endif + + +/* + * void __cilkrts_fence(void) + * + * Executes an MFENCE instruction to serialize all load and store instructions + * that were issued prior the MFENCE instruction. This serializing operation + * guarantees that every load and store instruction that precedes the MFENCE + * instruction is globally visible before any load or store instruction that + * follows the MFENCE instruction. The MFENCE instruction is ordered with + * respect to all load and store instructions, other MFENCE instructions, any + * SFENCE and LFENCE instructions, and any serializing instructions (such as + * the CPUID instruction). + */ +#ifdef HAVE_SYNC_INTRINSICS +# define __cilkrts_fence() __sync_synchronize() +#elif defined __ICC || defined __GNUC__ + /* mfence is a strict subset of lock add but takes longer on many + * processors. */ +// # define __cilkrts_fence() __asm__ volatile ("mfence") + /* On MIC, fence seems to be completely unnecessary. 
+ * Just for simplicity of 1st implementation, it defaults to x86 */ +# define __cilkrts_fence() __asm__ volatile ("lock addl $0,(%rsp)") +// #elif defined _WIN32 +// # pragma intrinsic(_ReadWriteBarrier) +// # define __cilkrts_fence() _ReadWriteBarrier() +#else +COMMON_SYSDEP void __cilkrts_fence(void); ///< MFENCE instruction +#endif diff --git a/libcilkrts/runtime/config/x86/os-unix-sysdep.c b/libcilkrts/runtime/config/x86/os-unix-sysdep.c new file mode 100644 index 00000000000..881bc3f4283 --- /dev/null +++ b/libcilkrts/runtime/config/x86/os-unix-sysdep.c @@ -0,0 +1,123 @@ +/* os-unix-sysdep.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ************************************************************************* + * + * This file contains system-specific code for Unix systems + */ + +#include "os.h" +#include "sysdep.h" +#include <internal/abi.h> + +// On x86 processors (but not MIC processors), the compiler generated code to +// save the FP state (rounding mode and the like) before calling setjmp. We +// will need to restore that state when we resume. 
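+//
+// Concretely (see restore_x86_fp_state() and sysdep_save_fp_ctrl_state() at
+// the bottom of this file): the SSE control/status register is captured with
+// stmxcsr into sf->mxcsr and reloaded with ldmxcsr, and the x87 word saved
+// into sf->fpcsr is reloaded with fldcw, when a frame is resumed.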
+#ifndef __MIC__ +# if defined(__i386__) || defined(__x86_64) +# define RESTORE_X86_FP_STATE +# endif // defined(__i386__) || defined(__x86_64) +#endif // __MIC__ + +/* timer support */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void) +{ +#if defined __i386__ || defined __x86_64 + unsigned a, d; + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((unsigned long long)a) | (((unsigned long long)d) << 32); +#else +# warning "unimplemented cycle counter" + return 0; +#endif +} + +COMMON_SYSDEP void __cilkrts_short_pause(void) +{ +#if __ICC >= 1110 +# if __MIC__ || __MIC2__ + _mm_delay_32(16); // stall for 16 cycles +# else + _mm_pause(); +# endif +#elif defined __i386__ || defined __x86_64 + __asm__("pause"); +#else +# warning __cilkrts_short_pause empty +#endif +} + +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x) +{ +#if defined __i386__ || defined __x86_64 + /* asm statement here works around icc bugs */ + __asm__("xchgl %0,%a1" :"=r" (x) : "r" (ptr), "0" (x) :"memory"); +#else + x = __sync_lock_test_and_set(ptr, x); +#endif + return x; +} + + +/* + * Restore the floating point state that is stored in a stack frame at each + * spawn. This should be called each time a frame is resumed. + * + * Only valid for IA32 and Intel64 processors. + */ +void restore_x86_fp_state (__cilkrts_stack_frame *sf) { +#ifdef RESTORE_X86_FP_STATE + __asm__ ( "ldmxcsr %0\n\t" + "fnclex\n\t" + "fldcw %1" + : + : "m" (sf->mxcsr), "m" (sf->fpcsr)); +#endif +} + + +void sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf) +{ +// If we're not going to restore, don't bother saving it +#ifdef RESTORE_X86_FP_STATE + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + __asm__ ("stmxcsr %0" : "=m" (sf->mxcsr)); + __asm__ ("fnstsw %0" : "=m" (sf->fpcsr)); + } +#endif +} + diff --git a/libcilkrts/runtime/doxygen-layout.xml b/libcilkrts/runtime/doxygen-layout.xml new file mode 100644 index 00000000000..fabe0ab3cd8 --- /dev/null +++ b/libcilkrts/runtime/doxygen-layout.xml @@ -0,0 +1,222 @@ +<doxygenlayout version="1.0"> + +<!-- +# @copyright +# Copyright (C) 2011-2013, Intel Corporation +# All rights reserved. +# +# @copyright +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# @copyright +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +--> + + <!-- Navigation index tabs for HTML output --> + <navindex> + <tab type="mainpage" visible="yes" title=""/> + <tab type="pages" visible="yes" title="" intro=""/> + <tab type="modules" visible="yes" title="" intro=""/> + <tab type="namespaces" visible="yes" title=""> + <tab type="namespaces" visible="yes" title="" intro=""/> + <tab type="namespacemembers" visible="yes" title="" intro=""/> + </tab> + <tab type="classes" visible="yes" title="Classes, Structs and Unions"> + <tab type="classes" visible="yes" title="Classes, Structs and Unions" intro=""/> + <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/> + <tab type="hierarchy" visible="yes" title="" intro=""/> + <tab type="classmembers" visible="yes" title="" intro=""/> + </tab> + <tab type="files" visible="yes" title=""> + <tab type="files" visible="yes" title="" intro=""/> + <tab type="globals" visible="yes" title="" intro=""/> + </tab> + <tab type="globals" visible="yes" title="Global Functions" intro=""/> + <tab type="dirs" visible="yes" title="" intro=""/> + <tab type="examples" visible="yes" title="" intro=""/> + </navindex> + + <!-- Layout definition for a class page --> + <class> + <briefdescription visible="yes"/> + <includes visible="$SHOW_INCLUDE_FILES"/> + <inheritancegraph visible="$CLASS_GRAPH"/> + <collaborationgraph visible="$COLLABORATION_GRAPH"/> + <allmemberslink visible="yes"/> + <memberdecl> + <nestedclasses visible="yes" title=""/> + <publictypes title=""/> + <publicslots title=""/> + <signals title=""/> + <publicmethods title=""/> + <publicstaticmethods title=""/> + <publicattributes title=""/> + <publicstaticattributes title=""/> + <protectedtypes title=""/> + <protectedslots title=""/> + <protectedmethods title=""/> + <protectedstaticmethods title=""/> + <protectedattributes title=""/> + <protectedstaticattributes title=""/> + <packagetypes title=""/> + <packagemethods title=""/> + <packagestaticmethods title=""/> + <packageattributes title=""/> + <packagestaticattributes title=""/> + <properties title=""/> + <events title=""/> + <privatetypes title=""/> + <privateslots title=""/> + <privatemethods title=""/> + <privatestaticmethods title=""/> + <privateattributes title=""/> + <privatestaticattributes title=""/> + <friends title=""/> + <related title="" subtitle=""/> + <membergroups visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + <memberdef> + <typedefs title=""/> + <enums title=""/> + <constructors title=""/> + <functions title=""/> + <related title=""/> + <variables title=""/> + <properties title=""/> + <events title=""/> + </memberdef> + <usedfiles visible="$SHOW_USED_FILES"/> + <authorsection visible="yes"/> + </class> + + <!-- Layout definition for a namespace page --> + <namespace> + <briefdescription visible="yes"/> + <memberdecl> + <nestednamespaces visible="yes" title=""/> + <classes visible="yes" title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + <membergroups visible="yes"/> + 
</memberdecl> + <detaileddescription title=""/> + <memberdef> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + </memberdef> + <authorsection visible="yes"/> + </namespace> + + <!-- Layout definition for a file page --> + <file> + <briefdescription visible="no"/> + <includegraph visible="$INCLUDE_GRAPH"/> + <includedbygraph visible="$INCLUDED_BY_GRAPH"/> + <detaileddescription title="Description"/> + <includes visible="no"/> + <sourcelink visible="yes"/> + <memberdecl> + <classes visible="yes" title="Structures and Classes"/> + <namespaces visible="yes" title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + <membergroups visible="yes"/> + </memberdecl> + <memberdef> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <functions title=""/> + <variables title=""/> + </memberdef> + <authorsection/> + </file> + + <!-- Layout definition for a group page --> + <group> + <briefdescription visible="yes"/> + <groupgraph visible="$GROUP_GRAPHS"/> + <memberdecl> + <classes visible="yes" title=""/> + <namespaces visible="yes" title=""/> + <dirs visible="yes" title=""/> + <nestedgroups visible="yes" title=""/> + <files visible="yes" title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <enumvalues title=""/> + <functions title=""/> + <variables title=""/> + <signals title=""/> + <publicslots title=""/> + <protectedslots title=""/> + <privateslots title=""/> + <events title=""/> + <properties title=""/> + <friends title=""/> + <membergroups visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + <memberdef> + <pagedocs/> + <inlineclasses title=""/> + <defines title=""/> + <typedefs title=""/> + <enums title=""/> + <enumvalues title=""/> + <functions title=""/> + <variables title=""/> + <signals title=""/> + <publicslots title=""/> + <protectedslots title=""/> + <privateslots title=""/> + <events title=""/> + <properties title=""/> + <friends title=""/> + </memberdef> + <authorsection visible="yes"/> + </group> + + <!-- Layout definition for a directory page --> + <directory> + <briefdescription visible="yes"/> + <directorygraph visible="yes"/> + <memberdecl> + <dirs visible="yes"/> + <files visible="yes"/> + </memberdecl> + <detaileddescription title=""/> + </directory> +</doxygenlayout> diff --git a/libcilkrts/runtime/doxygen.cfg b/libcilkrts/runtime/doxygen.cfg new file mode 100644 index 00000000000..684dcb51b51 --- /dev/null +++ b/libcilkrts/runtime/doxygen.cfg @@ -0,0 +1,1774 @@ +# Doxyfile 1.7.4
+
+# @copyright +# Copyright (C) 2011-2013, Intel Corporation +# All rights reserved. +# +# @copyright +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# @copyright +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = "Intel Cilk Plus Runtime"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
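+# (Illustrative only - no aliases are defined for this project. One could,
+# for example, write: ALIASES = "sideeffect=\par Side Effects:\n")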
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance, to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES = YES
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE = doxygen-layout.xml
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = ./ \
+ ../include/internal/abi.h \
+ ../include/cilk/cilk_api.h \
+ ../include/cilk/common.h \
+ ./readme.dox
+
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE = attributes.h \
+ cilk-ittnotify.h \
+ component.h \
+ rts-common.h \
+ windows-clean.h
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS = _UNWIND_INFO \
+ _UNWIND_CODE \
+ _DISPATCHER_CONTEXT \
+ __cilkrts_stack \
+ pending_exception_info
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
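+
+# A commented-out sketch of the pattern=filter syntax described above. It
+# reuses the hypothetical my_cpp_filter name from the comment; no such
+# filter program is provided with this runtime.
+#
+# FILTER_PATTERNS = *.cpp=my_cpp_filter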
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = com.Intel.CilkPlusRuntime
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = com.Intel.CilkPlusRuntime
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = "Intel Corporation"
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
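+
+# If a local copy of MathJax were installed next to the HTML output, the
+# relative form mentioned above would be used instead. The commented-out
+# line below is only a sketch of that setup, not a path present in this tree.
+#
+# MATHJAX_RELPATH = ../mathjax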
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED = _WIN32 \
+ COMMON_SYSDEP= \
+ COMMON_PORTABLE= \
+ NON_COMMON= \
+ __CILKRTS_BEGIN_EXTERN_C= \
+ __CILKRTS_END_EXTERN_C= \
+ CILK_API(t)=t \
+ CILK_ABI(t)=t \
+ CILK_ABI_THROWS(t)=t \
+ CALLBACK= \
+ __CILKRTS_INLINE=inline \
+ __CILKRTS_ABI_VERSION=1 \
+ __cplusplus \
+
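+
+# As a sketch of how the expansions above affect parsing: with CILK_API(t)=t
+# predefined, a declaration written in the public headers along the lines of
+#
+#     CILK_API(int) __cilkrts_get_nworkers(void);
+#
+# is seen by doxygen's preprocessor as the plain prototype
+#
+#     int __cilkrts_get_nworkers(void);
+#
+# so the entry is documented with its real return type. The exact declaration
+# shown here is assumed for illustration rather than quoted from the headers.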
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
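+
+# A commented-out sketch of the file=location form described above; both the
+# tag file name and the URL are hypothetical and do not exist in this tree.
+#
+# TAGFILES = cilkrts.tag=http://example.com/cilkplus-docs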
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that doxygen if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/libcilkrts/runtime/except-gcc.cpp b/libcilkrts/runtime/except-gcc.cpp new file mode 100644 index 00000000000..bd08d1826b3 --- /dev/null +++ b/libcilkrts/runtime/except-gcc.cpp @@ -0,0 +1,597 @@ +/* except-gcc.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "except-gcc.h" +#include "except.h" +#include "sysdep.h" +#include "bug.h" +#include "local_state.h" +#include "full_frame.h" +#include "scheduler.h" +#include "frame_malloc.h" +#include "pedigrees.h" + +#include <stdint.h> +#include <typeinfo> + +#define DEBUG_EXCEPTIONS 0 + +struct pending_exception_info +{ + void make(__cxa_eh_globals *, _Unwind_Exception *, bool); + void destruct(); + bool empty() const; + void check() const; + /* Active exception at time of suspend. */ + _Unwind_Exception *active; + /* If true the most recently caught exception is to be rethrown + on resume. This handling is technically incorrect but allows + running without compiler support; the proper standards-compliant + method is to save the exception in the previous field. */ + bool rethrow; + struct __cxa_eh_globals runtime_state; +}; + +void pending_exception_info::check() const +{ + if (active) + CILK_ASSERT((int)runtime_state.uncaughtExceptions > 0); +} + +void pending_exception_info::make(__cxa_eh_globals *state_in, + _Unwind_Exception *exc_in, bool rethrow_in) +{ + active = exc_in; + rethrow = rethrow_in; + runtime_state = *state_in; + /* Read and clear C++ runtime state. 
*/ + state_in->caughtExceptions = 0; + state_in->uncaughtExceptions = 0; +#if CILK_LIB_DEBUG + check(); +#endif +} + +bool +pending_exception_info::empty() const +{ + return !active && !rethrow && !runtime_state.caughtExceptions && + !runtime_state.uncaughtExceptions; +} + +#if DEBUG_EXCEPTIONS +#include <stdio.h> +static void +decode_exceptions(char *out, size_t len, struct pending_exception_info *info) +{ + if (info->empty()) + snprintf(out, len, "[empty]"); + else if (info->rethrow) + snprintf(out, len, "[rethrow %p]", + info->runtime_state.caughtExceptions); + else + snprintf(out, len, "[throw %p]", (void *)info->active); +} +#endif + +static void +save_exception_info(__cilkrts_worker *w, + __cxa_eh_globals *state, + _Unwind_Exception *exc, + bool rethrow, + const char *why) +{ + struct pending_exception_info *info = + (struct pending_exception_info *)__cilkrts_frame_malloc(w, sizeof (struct pending_exception_info)); + CILK_ASSERT(info); + info->make(state, exc, rethrow); + +#if DEBUG_EXCEPTIONS + { + char buf[40]; + decode_exceptions(buf, sizeof buf, info); + fprintf(stderr, "make exception info W%u %p %s (%s)\n", + w->self, info, buf, why); + } +#endif + + CILK_ASSERT(w->l->pending_exception == 0); + w->l->pending_exception = info; +} + +#if DEBUG_EXCEPTIONS +#include <stdio.h> /* DEBUG */ + +static void decode_flags(int flags, char out[9]) +{ + out[0] = (flags & CILK_FRAME_STOLEN) ? 'S' : '_'; + out[1] = (flags & CILK_FRAME_UNSYNCHED) ? 'U' : '_'; + out[2] = (flags & CILK_FRAME_DETACHED) ? 'D' : '_'; + out[3] = (flags & CILK_FRAME_EXCEPTING) ? 'X' : '_'; + out[4] = '\0'; +} +#endif + +/* __cilkrts_save_except is called from the runtime epilogue + when a function is returning with an exception pending. + + If the function has a parent to which it could return normally, + return and have the caller call _Unwind_Resume, the same as if + an exception filter had not matched. + + Otherwise save the exception in the worker. + + If this is a return from a ordinary call that must go through + the runtime, the assembly epilogue must have saved the call-saved + register state in the parent frame. */ + +extern "C" +CILK_ABI_THROWS_VOID +__cilkrts_return_exception(__cilkrts_stack_frame *sf) +{ + __cilkrts_worker *w = sf->worker; + _Unwind_Exception *exc = (_Unwind_Exception *)sf->except_data; + + CILK_ASSERT(sf->flags & CILK_FRAME_DETACHED); + sf->flags &= ~CILK_FRAME_DETACHED; + + /* + * If we are in replay mode, and a steal occurred during the recording + * phase, stall till a steal actually occurs. + */ + replay_wait_for_steal_if_parent_was_stolen(w); + + /* If this is to be an abnormal return, save the active exception. */ + if (!__cilkrts_pop_tail(w)) { + /* Write a record to the replay log for an attempt to return to a + stolen parent. This must be done before the exception handler + invokes __cilkrts_leave_frame which will bump the pedigree so + the replay_wait_for_steal_if_parent_was_stolen() above will match on + replay */ + replay_record_orphaned(w); + + /* Now that the record/replay stuff is done, update the pedigree */ + update_pedigree_on_leave_frame(w, sf); + + /* Inline pop_frame; this may not be needed. 
*/ + w->current_stack_frame = sf->call_parent; + sf->call_parent = 0; + __cxa_eh_globals *state = __cxa_get_globals(); + +#if DEBUG_EXCEPTIONS + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + fprintf(stderr, "__cilkrts_save_except W%u sf %p/%s exc %p [%u %p] suspend\n", + w->self, sf, decoded, exc, + state->uncaughtExceptions, + state->caughtExceptions); +#endif + + /* Like __cilkrts_save_exception_state except for setting the + rethrow flag. */ + save_exception_info(w, state, exc, exc == NULL, "save_except"); + { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } + __cilkrts_exception_from_spawn(w, sf); /* does not return */ + } + /* This code path is taken when the parent is attached. It is on + the same stack and part of the same full frame. The caller is + cleaning up the Cilk frame during unwind and will reraise the + exception */ + + /* Now that the record/replay stuff is done, update the pedigree */ + update_pedigree_on_leave_frame(w, sf); + +#if DEBUG_EXCEPTIONS /* DEBUG ONLY */ + { + __cxa_eh_globals *state = __cxa_get_globals(); + + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + fprintf(stderr, "__cilkrts_save_except W%d %p/%s %p->%p [%u %p] escape\n", + w->self, sf, decoded, exc, + exc ? to_cxx(exc)->nextException : 0, + state->uncaughtExceptions, + state->caughtExceptions); + + /* XXX This is triggering in the user thread which gets an exception + from somewhere but does not get the corresponding runtime exception + state. + XXX There might be two or more uncaught exceptions. Test could be + (uncaught != 0) == (exc != 0). First, design tests to see if that + case is otherwise handled correctly. And what if there's an uncaught + exception that does not belong to this function? I.e. this is a return + from spawn in a destructor. */ + if (exc) + CILK_ASSERT((int)state->uncaughtExceptions > 0); + /*CILK_ASSERT(state->uncaughtExceptions == (exc != 0));*/ + } +#endif + + /* The parent is attached so this exception can be propagated normally. */ + return; +} + +/* Save the exception state into the full frame, which is exiting + or suspending. */ +extern "C" +void __cilkrts_save_exception_state(__cilkrts_worker *w, full_frame *ff) +{ + save_exception_info(w, __cxa_get_globals(), 0, false, "undo-detach"); + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +} + +/* __cilkrts_c_sync_except is like __cilkrts_c_sync except that it + saves exception state. __cilkrts_c_sync never returns here and + always reinstalls the saved exception state. + + This function must be used because a parent of this function may + be propagating an uncaught exception. The uncaught exception + count must be saved by the child and passed back to the parent. 
*/ + +extern "C" +NORETURN __cilkrts_c_sync_except (__cilkrts_worker *w, __cilkrts_stack_frame *sf) +{ + __cxa_eh_globals *state = __cxa_get_globals(); + _Unwind_Exception *exc = (_Unwind_Exception *)sf->except_data; + + CILK_ASSERT((sf->flags & (CILK_FRAME_UNSYNCHED|CILK_FRAME_EXCEPTING)) == + (CILK_FRAME_UNSYNCHED|CILK_FRAME_EXCEPTING)); + sf->flags &= ~CILK_FRAME_EXCEPTING; + +#if DEBUG_EXCEPTIONS + fflush(stdout); + char decoded[9]; + decode_flags(sf->flags, decoded); + if (exc) + fprintf(stderr, "__cilkrts_sync_except W%u %p/%s %p->%p [%u %p]\n", + w->self, sf, decoded, exc, + to_cxx(exc)->nextException, + state->uncaughtExceptions, + state->caughtExceptions); + else + fprintf(stderr, "__cilkrts_sync_except W%d %p/%s none [%u %p]\n", + w->self, sf, decoded, + state->uncaughtExceptions, + state->caughtExceptions); +#endif + + /* Here the identity of an rethrown exception is always known. + If exc is NULL this call is only to preserve parent state. */ + save_exception_info(w, state, exc, false, "sync_except"); +#if 0 + { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } +#endif + CILK_ASSERT(!std::uncaught_exception()); + __cilkrts_c_sync(w, sf); +} + +void +pending_exception_info::destruct() +{ + if (active) { +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy exception info %p %p\n", this, active); +#endif + _Unwind_DeleteException(active); + active = 0; + } else { +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy exception info %p\n", this); +#endif + } + while (runtime_state.caughtExceptions) { + __cxa_exception *exc = runtime_state.caughtExceptions; + runtime_state.caughtExceptions = exc->nextException; +#if DEBUG_EXCEPTIONS + fprintf(stderr, "destroy caught exception %p\n", this); +#endif + _Unwind_DeleteException(&exc->unwindHeader); + } +} + +/* + * __cilkrts_merge_pending_exceptions + * + * Merge the right exception record into the left. The left is logically + * earlier. + * + * The active exception of E is + * E->active if it is non-NULL (in which case E->rethrow is false) + * unresolved if E->active is NULL and E->rethrow is true + * nil if E->active is NULL and E->rethrow is false + * + * The merged active exception is left active exception if it is not + * nil, otherwise the right. + * + * On entry the left state is synched and can not have an unresolved + * exception. The merge may result in an unresolved exception. + * + * Due to scoping rules at most one of the caught exception lists is + * non-NULL. 
+ */ + +struct pending_exception_info * +__cilkrts_merge_pending_exceptions ( + __cilkrts_worker *w, + struct pending_exception_info *left, + struct pending_exception_info *right) +{ + /* If we've only got one exception, return it */ + + if (NULL == left) { +#if DEBUG_EXCEPTIONS + if (right) { + char buf[40]; + decode_exceptions(buf, sizeof buf, right); + fprintf(stderr, "__cilkrts merge W%u nil %p -> %p %s\n", + w->self, right, right, buf); + } +#endif + return right; + } + + if (NULL == right) { +#if DEBUG_EXCEPTIONS + if (left) { + char buf[40]; + decode_exceptions(buf, sizeof buf, left); + fprintf(stderr, "__cilkrts merge W%u %p nil -> %p %s\n", + w->self, left, left, buf); + } +#endif + return left; + } + +#if CILK_LIB_DEBUG + /*volatile struct pending_exception_info left_in = *left, right_in = *right;*/ + left->check(); + right->check(); +#endif + +#if DEBUG_EXCEPTIONS + { + char buf1[40], buf2[40]; + decode_exceptions(buf1, sizeof buf1, left); + decode_exceptions(buf2, sizeof buf2, right); + fprintf(stderr, "__cilkrts merge W%u %p %s %p %s\n", + w->self, left, buf1, right, buf2); + } +#endif + + /* It should not be possible for both left and right to + have accumulated catch blocks. + + The left exception record may always have a catch + chain it kept when its parent was stolen. + + If they are siblings, the right sibling should not + have accumulated any net catches. (Catch is lexically + scoped.) + + If the right frame is a parent, it should not have entered + a catch block without syncing first. If it spawned in a + catch block, the child got its catch. */ + __cxa_exception *caught = left->runtime_state.caughtExceptions; + if (caught) + CILK_ASSERT(!right->runtime_state.caughtExceptions); + else { + CILK_ASSERT(!left->rethrow); + left->rethrow = right->rethrow; + left->runtime_state.caughtExceptions = caught = right->runtime_state.caughtExceptions; + right->runtime_state.caughtExceptions = NULL; + } + + /* Merge the uncaught exception and count of uncaught exceptions. */ + const unsigned int right_uncaught = right->runtime_state.uncaughtExceptions; + if (!left->active){ + left->active = right->active; /* could be NULL */ + right->active = 0; + left->runtime_state.uncaughtExceptions += right_uncaught; + if (left->active) + /* assert is C++ exception */ + /*CILK_ASSERT(__cxxabiv1::__is_gxx_exception_class(left->active->exception_class))*/; + } else { + /* Subtract 1 if the right exception is being destructed. */ + left->runtime_state.uncaughtExceptions += right_uncaught - (right->active != 0); + } + + right->destruct(); + __cilkrts_frame_free(w, right, sizeof *right); + + /* If there is no state left, return NULL. */ + if (left->empty()) { + left->destruct(); + __cilkrts_frame_free(w, left, sizeof *left); + left = NULL; + } + +#if CILK_LIB_DEBUG + if (left) + left->check(); +#endif + + return left; +} + +#if 0 +/* __cilkrts_c_resume_except is called from the assembly language + restart code when a resumed frame has a pending exception. + + The handler count negation on rethrow was done when the throw was + resolved. + + The assembly language runtime must make the throw unwind to + the sync, spawn, or other location where the exception should + be injected. (This should not happen after a spawn but nothing + here depends on there being no exception on steal.) + + This function is unused in the Intel stack based system. 
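A worked example of the uncaught-exception accounting at the end of __cilkrts_merge_pending_exceptions, with made-up counts; the subtraction mirrors the right_uncaught - (right->active != 0) term in the code, since the right record's active exception object is destructed along with the record.

    #include <cassert>

    // Sketch only: illustrative counts, not taken from a real run.
    int main()
    {
        // Left strand: one uncaught exception, and it is the left active exception.
        unsigned left_uncaught   = 1;
        bool     left_has_active = true;

        // Right strand: two uncaught exceptions, one of which is its active exception.
        unsigned right_uncaught   = 2;
        bool     right_has_active = true;

        if (!left_has_active) {
            // Left adopts the right active exception and all of its uncaught count.
            left_has_active = right_has_active;
            left_uncaught  += right_uncaught;
        } else {
            // Right's active exception is destructed with the right record,
            // so it no longer counts as uncaught.
            left_uncaught  += right_uncaught - (right_has_active ? 1 : 0);
        }

        assert(left_uncaught == 2);   // 1 (left) + 2 (right) - 1 (right active dropped)
        return 0;
    }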
*/ +extern "C" +void __cilkrts_c_resume_except (_Unwind_Exception *exc) +{ +#if DEBUG_EXCEPTIONS + fprintf(stderr, "resume exception %p\n", exc); +#endif + _Unwind_Reason_Code why = _Unwind_RaiseException(exc); + __cilkrts_bug ("Cilk runtime error: failed to reinstate suspended exception %p (%d)\n", exc, why); +} +#endif + +/* Restore the caught exception chain. This assumes no C++ exception + code will run before the frame is resumed. If there is no exception + to be resumed free the object. */ + +extern "C" +void __cilkrts_setup_for_execution_sysdep(__cilkrts_worker *w, full_frame *ff) +{ + // ASSERT: We own w->lock and ff->lock || P == 1 + + __cxa_eh_globals *state = __cxa_get_globals (); + struct pending_exception_info *info = w->l->pending_exception; + + if (info == NULL) + return; + + w->l->pending_exception = 0; + +#if DEBUG_EXCEPTIONS + _Unwind_Exception *exc = info->active; + if (exc) { + fflush(stdout); + fprintf(stderr, "__cilkrts_resume_except W%u %p->%p [%u %p]\n", + w->self, exc, + to_cxx(exc)->nextException, + info->runtime_state.uncaughtExceptions, + info->runtime_state.caughtExceptions); + /*CILK_ASSERT(info->runtime_state.uncaughtExceptions > 0);*/ + } +#endif + + if (state->uncaughtExceptions || state->caughtExceptions) + __cilkrts_bug("W%u: resuming with non-empty prior exception state %u %p\n", state->uncaughtExceptions, state->caughtExceptions); + + *state = info->runtime_state; + info->runtime_state.caughtExceptions = 0; + info->runtime_state.uncaughtExceptions = 0; + + if (info->rethrow) { + info->rethrow = false; + /* Resuming function will rethrow. Runtime calls + std::terminate if there is no caught exception. */ + ff->call_stack->flags |= CILK_FRAME_EXCEPTING; + } + if (info->active) { + ff->call_stack->flags |= CILK_FRAME_EXCEPTING; + ff->call_stack->except_data = info->active; + info->active = 0; + } + + if (info->empty()) { + info->destruct(); + __cilkrts_frame_free(w, info, sizeof *info); + w->l->pending_exception = NULL; + } + +#if CILK_LIB_DEBUG + if (ff->call_stack->except_data) + CILK_ASSERT(std::uncaught_exception()); +#endif +} + +#if 0 +extern "C" +struct pending_exception_info *__cilkrts_get_exception(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + struct pending_exception_info *info = w->l->pending_exception; + + if (info == NULL) { + sf->flags &= ~CILK_FRAME_EXCEPTING; + return 0; + } + + w->l->pending_exception = NULL; + + /* This exception goes into the frame. 
*/ + + _Unwind_Exception *exc = info->active; + info->active = NULL; + info->destruct(); + __cilkrts_frame_free(w, info, sizeof *info); + info = 0; + sf->flags |= CILK_FRAME_EXCEPTING; + sf->exception = exc; + return 0; +} +#endif + +extern "C" +void __attribute__((nonnull)) __cilkrts_gcc_rethrow(__cilkrts_stack_frame *sf) +{ +#ifdef __CYGWIN__ + // Cygwin doesn't support exceptions, so _Unwind_Resume isn't available + // Which means we can't support exceptions either + __cilkrts_bug("The Cygwin implementation of the Intel Cilk Plus runtime doesn't support exceptions\n"); +#else + if (sf->except_data) { +#if CILK_LIB_DEBUG + CILK_ASSERT(std::uncaught_exception()); +#endif + _Unwind_Resume ((_Unwind_Exception *)sf->except_data); + } else { + throw; + } +#endif // __CYGWIN__ +} + +/* End except-gcc.cpp */ + diff --git a/libcilkrts/runtime/except-gcc.h b/libcilkrts/runtime/except-gcc.h new file mode 100644 index 00000000000..aa76adbc233 --- /dev/null +++ b/libcilkrts/runtime/except-gcc.h @@ -0,0 +1,146 @@ +/* except-gcc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file except-gcc.h + * + * @brief ABI for gcc exception handling. + * + * @par Origin + * The code below is generally copied from the Intel Itanium ABI (Intel + * download 245370). + */ + +#ifndef INCLUDED_EXCEPT_GCC_DOT_H +#define INCLUDED_EXCEPT_GCC_DOT_H + +#ifndef __cplusplus +# error except-gcc.h should be used in C++ code only. 
+#endif + +#include <cilk/common.h> +#include <exception> +#include <typeinfo> + +struct __cxa_exception; + +__CILKRTS_BEGIN_EXTERN_C + +/** Unwind reason code (Itanium ABI 6.1.2.1) */ +typedef enum _Unwind_Reason_Code { + _URC_NO_REASON = 0, + _URC_FOREIGN_EXCEPTION_CAUGHT = 1, + _URC_FATAL_PHASE2_ERROR = 2, + _URC_FATAL_PHASE1_ERROR = 3, + _URC_NORMAL_STOP = 4, + _URC_END_OF_STACK = 5, + _URC_HANDLER_FOUND = 6, + _URC_INSTALL_CONTEXT = 7, + _URC_CONTINUE_UNWIND = 8 +} _Unwind_Reason_Code; + +typedef struct _Unwind_Exception _Unwind_Exception; + +/** Exception cleanup function pointer (Itanium ABI 6.1.2.2) */ +typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code reason, + _Unwind_Exception *exc); + +/** + * @brief Exception undwinding information + * + * This is copied from the Intel Itanium ABI except that the + * private fields are declared unsigned long for binary + * compatibility with gcc/g++ on 32 bit machines. + */ +struct _Unwind_Exception +{ + uint64_t exception_class; + _Unwind_Exception_Cleanup_Fn exception_cleanup; + unsigned long private_1; + unsigned long private_2; +}; + +/** Throw or rethrow an exception */ +_Unwind_Reason_Code +_Unwind_RaiseException(_Unwind_Exception *exception_object); + +/** Resume an exception other than by rethrowing it. */ +void _Unwind_Resume(_Unwind_Exception *exception_object); + +/** Delete an exception object */ +void _Unwind_DeleteException(_Unwind_Exception *exception_object); + +/** + * C++ exception ABI. + * The following declarations are from + * + * http://www.codesourcery.com/public/cxx-abi/abi-eh.html#cxx-abi + */ + +struct __cxa_exception { + std::type_info * exceptionType; + void (*exceptionDestructor)(void *); + std::unexpected_handler unexpectedHandler; + std::terminate_handler terminateHandler; + __cxa_exception * nextException; + + int handlerCount; + int handlerSwitchValue; + const char * actionRecord; + const char * languageSpecificData; + void * catchTemp; + void * adjustedPtr; + + _Unwind_Exception unwindHeader; +}; + +static inline __cxa_exception *to_cxx(_Unwind_Exception *e) +{ + return ((__cxa_exception *)(e+1)) - 1; +} + +typedef struct __cxa_eh_globals { + __cxa_exception *caughtExceptions; + unsigned int uncaughtExceptions; +} __cxa_eh_globals; + +__cxa_eh_globals*__cxa_get_globals(void) throw(); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_EXCEPT_GCC_DOT_H) diff --git a/libcilkrts/runtime/except.h b/libcilkrts/runtime/except.h new file mode 100644 index 00000000000..58e2238c581 --- /dev/null +++ b/libcilkrts/runtime/except.h @@ -0,0 +1,123 @@ +/* except.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
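The to_cxx() conversion above relies on the Itanium C++ ABI placing the _Unwind_Exception header at the end of __cxa_exception, with no tail padding: stepping one header past e and then one whole record back recovers the start of the enclosing __cxa_exception. A self-contained sketch of that layout argument, using stand-in types rather than the real ABI structs:

    // Sketch only: stand-ins with the same "header last" layout as __cxa_exception.
    #include <cstddef>
    #include <cassert>

    struct UnwindHeader  { unsigned long long cls; void (*cleanup)(); unsigned long p1, p2; };
    struct CxxException {
        void *fields[10];             // type info, handlers, chain pointer, etc.
        UnwindHeader unwindHeader;    // the ABI keeps the unwind header last
    };

    static CxxException *to_cxx_sketch(UnwindHeader *e)
    {
        return reinterpret_cast<CxxException *>(e + 1) - 1;
    }

    int main()
    {
        // The conversion is valid only when the header ends the record with no tail padding.
        static_assert(offsetof(CxxException, unwindHeader) + sizeof(UnwindHeader)
                      == sizeof(CxxException), "unwindHeader must end the record");
        CxxException rec = {};
        assert(to_cxx_sketch(&rec.unwindHeader) == &rec);   // round trip recovers the record
        return 0;
    }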
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file except.h + * + * @brief Common definitions for the various implementations of exception + * handling. + */ + +#ifndef INCLUDED_EXCEPT_DOT_H +#define INCLUDED_EXCEPT_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include "full_frame.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * OS-dependent information about an exception that's being moved between + * strands. + */ +typedef struct pending_exception_info pending_exception_info; + +/** + * Merge the right exception record into the left. The left is logically + * earlier. + * + * On entry the left state is synched and can not have an unresolved + * exception. The merge may result in an unresolved exception. + * + * If there is both a right and left exception, the right exception will + * be disposed of in preference to the left exception, destructing the + * exception object. + * + * @param w The worker that is preparing to resume execution. + * @param left_exception The exception that would have happened earlier + * if the code executed serially. Can be NULL if the left strand has not + * raised an exception. + * @param right_exception The exception that would have happened later + * if the code executed serially. Can be NULL if the right strand has not + * raised an exception. + * + * @return NULL if there both the right and left exception are NULL. This + * indicates that there are no pending exceptions. + * @return The pending exception that is to be raised to continue searching + * for a catch block to handle the exception. + */ +COMMON_SYSDEP +struct pending_exception_info *__cilkrts_merge_pending_exceptions( + __cilkrts_worker *w, + pending_exception_info *left_exception, + pending_exception_info *right_exception); + +/** + * Move the exception information from the worker to the full_frame. + * + * @param w The worker which is suspending work on a full_frame. + * @param ff The full_frame which is being suspended. + */ +COMMON_SYSDEP +void __cilkrts_save_exception_state(__cilkrts_worker *w, + full_frame *ff); + +/** + * Function to delete pending exception. This will delete the + * exception object and then free the stack/fiber. + * + * @param w The worker we're running on. + * @param pei The pending exception to be delete + * @param delete_object Unused. Should always be 1. 
+ */ +void delete_exception_obj (__cilkrts_worker *w, + struct pending_exception_info *pei, + int delete_object); + +#ifndef _WIN32 +/* gcc-style exception handling */ +NON_COMMON NORETURN __cilkrts_c_sync_except(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); +NON_COMMON void __attribute__((nonnull)) +__cilkrts_gcc_rethrow(__cilkrts_stack_frame *sf); +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_EXCEPT_DOT_H) diff --git a/libcilkrts/runtime/frame_malloc.c b/libcilkrts/runtime/frame_malloc.c new file mode 100644 index 00000000000..0b38bd209a9 --- /dev/null +++ b/libcilkrts/runtime/frame_malloc.c @@ -0,0 +1,462 @@ +/* frame_malloc.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "frame_malloc.h" +#include "bug.h" +#include "local_state.h" +#include "cilk_malloc.h" + +#ifndef __VXWORKS__ +#include <memory.h> +#endif + +/* #define USE_MMAP 1 */ +#if USE_MMAP +#define __USE_MISC 1 +#include <sys/mman.h> +#include <errno.h> +#endif + +// Define to fill the stack frame header with the fill character when pushing +// it on a free list. Note that this should be #ifdef'd out when checked in! 
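A toy illustration of what the debug-only header fill buys: using a stale header pointer (for example a dangling cdr link) then shows up as an unmistakable 0xbf byte pattern instead of a plausible-looking address. The snippet below is a sketch, not part of the patch.

    // Sketch only; 0xbf matches the fill value chosen above.
    #include <stdio.h>
    #include <string.h>

    struct free_list_sketch { struct free_list_sketch *cdr; };

    int main(void)
    {
        struct free_list_sketch node = { 0 };
        memset(&node, 0xbf, sizeof node);                 // what push() does under _DEBUG
        printf("poisoned cdr = %p\n", (void *)node.cdr);  // 0xbfbfbf... on typical targets
        return 0;
    }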
+ +#ifdef _DEBUG +#define HEADER_FILL_CHAR 0xbf +#endif + +// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning +// message if this is a release build + +#if defined(NDEBUG) && defined (HEADER_FILL_CHAR) +#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build") +#endif + +static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size); + +#ifndef _WIN32 + +const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] = +{ + 64, 128, 256, 512, 1024, 2048 +}; + +#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket] + +/* threshold above which we use slow malloc */ +#define FRAME_MALLOC_MAX_SIZE 2048 + +#else // _WIN32 + +/* Note that this must match the implementation of framesz_to_bucket in + * asmilator/layout.ml! */ +#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket))) + +/* threshold above which we use slow malloc */ +#define FRAME_MALLOC_MAX_SIZE \ + FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1) + +#endif // _WIN32 + +/* utility procedures */ +static void push(struct free_list **b, struct free_list *p) +{ +#ifdef HEADER_FILL_CHAR + memset (p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0)); +#endif + /* cons! onto free list */ + p->cdr = *b; + *b = p; +} + +static struct free_list *pop(struct free_list **b) +{ + struct free_list *p = *b; + if (p) + *b = p->cdr; + return p; +} + +/************************************************************* + global allocator: +*************************************************************/ +/* request slightly less than 2^K from the OS, which after malloc + overhead and alignment should end up filling each VM page almost + completely. 128 is a guess of the total malloc overhead and cache + line alignment */ +#define FRAME_MALLOC_CHUNK (32 * 1024 - 128) + +/** Implements linked list of frames */ +struct pool_cons { + char *p; /**< This element of the list */ + struct pool_cons *cdr; /**< Remainder of the list */ +}; + +static void extend_global_pool(global_state_t *g) +{ + /* FIXME: memalign to a cache line? */ + struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c)); + g->frame_malloc.pool_begin = + (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK); + g->frame_malloc.pool_end = + g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK; + g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK; + c->p = g->frame_malloc.pool_begin; + c->cdr = g->frame_malloc.pool_list; + g->frame_malloc.pool_list = c; +} + +/* the size is already canonicalized at this point */ +static struct free_list *global_alloc(global_state_t *g, int bucket) +{ + struct free_list *mem; + size_t size; + + CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + g->frame_malloc.allocated_from_global_pool += size; + + if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) { + + CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end); + if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) { + /* We waste the fragment of pool. 
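push() and pop() above implement an intrusive LIFO free list: the freed block itself stores the link, so the cache needs no extra memory, and the most recently freed (likely still cache-warm) block is handed out first. A standalone sketch of the same cons-list idea:

    // Sketch only: the same cons-list behavior, outside the runtime.
    #include <assert.h>
    #include <stddef.h>

    struct free_list_sketch { struct free_list_sketch *cdr; };

    static void push_sketch(struct free_list_sketch **b, struct free_list_sketch *p)
    {
        p->cdr = *b;      // cons onto the head of the list
        *b = p;
    }

    static struct free_list_sketch *pop_sketch(struct free_list_sketch **b)
    {
        struct free_list_sketch *p = *b;
        if (p)
            *b = p->cdr;
        return p;
    }

    int main(void)
    {
        struct free_list_sketch a, b, c, *head = NULL;
        push_sketch(&head, &a);
        push_sketch(&head, &b);
        push_sketch(&head, &c);
        assert(pop_sketch(&head) == &c);   // LIFO: last block freed is reused first
        assert(pop_sketch(&head) == &b);
        assert(pop_sketch(&head) == &a);
        assert(pop_sketch(&head) == NULL);
        return 0;
    }

When a worker's list runs dry, allocate_batch() below refills it from the global pool in one locked operation.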
*/ + g->frame_malloc.wasted += + g->frame_malloc.pool_end - g->frame_malloc.pool_begin; + extend_global_pool(g); + } + mem = (struct free_list *)g->frame_malloc.pool_begin; + g->frame_malloc.pool_begin += size; + } + + return mem; +} + +static void global_free(global_state_t *g, void *mem, int bucket) +{ + size_t size; + + CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + g->frame_malloc.allocated_from_global_pool -= size; + + push(&g->frame_malloc.global_free_list[bucket], mem); +} + +void __cilkrts_frame_malloc_global_init(global_state_t *g) +{ + int i; + + __cilkrts_mutex_init(&g->frame_malloc.lock); + g->frame_malloc.check_for_leaks = 1; + g->frame_malloc.pool_list = 0; + g->frame_malloc.pool_begin = 0; + g->frame_malloc.pool_end = 0; + g->frame_malloc.batch_size = 8000; + g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size; + g->frame_malloc.allocated_from_os = 0; + g->frame_malloc.allocated_from_global_pool = 0; + g->frame_malloc.wasted = 0; + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) + g->frame_malloc.global_free_list[i] = 0; +} + +// Counts how many bytes are in the global free list. +static size_t count_memory_in_global_list(global_state_t *g) +{ + + // Count the memory remaining in the global free list. + size_t size_remaining_in_global_list = 0; + int i; + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + struct free_list *p; + size_t size_in_bucket = 0; + p = g->frame_malloc.global_free_list[i]; + + while (p) { + size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i); + p = p->cdr; + } + size_remaining_in_global_list += size_in_bucket; + } + return size_remaining_in_global_list; +} + + +void __cilkrts_frame_malloc_global_cleanup(global_state_t *g) +{ + struct pool_cons *c; + + if (g->frame_malloc.check_for_leaks) { + size_t memory_in_global_list = count_memory_in_global_list(g); + // TBD: This check is weak. Short of memory corruption, + // I don't see how we have more memory in the free list + // than allocated from the os. + // Ideally, we should count the memory in the global free list + // and check that we have it all. But I believe the runtime + // itself also uses some memory, which is not being tracked. + if (memory_in_global_list > g->frame_malloc.allocated_from_os) { + __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n"); + } + } + + while ((c = g->frame_malloc.pool_list)) { + g->frame_malloc.pool_list = c->cdr; + __cilkrts_free(c->p); + __cilkrts_free(c); + } + + __cilkrts_mutex_destroy(0, &g->frame_malloc.lock); + + // Check that all the memory moved from the global pool into + // workers has been returned to the global pool. + if (g->frame_malloc.check_for_leaks + && (g->frame_malloc.allocated_from_global_pool != 0)) + { + __cilkrts_bug("\n" + "---------------------------" "\n" + " MEMORY LEAK DETECTED!!! 
" "\n" + "---------------------------" "\n" + "\n" + ); + } +} + +/************************************************************* + per-worker allocator +*************************************************************/ +/* allocate a batch of frames of size SIZE from the global pool and + store them in the worker's free list */ +static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size) +{ + global_state_t *g = w->g; + + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { +#if USE_MMAP + char *p = mmap(0, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + __cilkrts_bug("mmap failed %d", errno); + assert(size < 4096); + assert(p != MAP_FAILED); + mprotect(p, 4096, PROT_NONE); + mprotect(p + 8192, 4096, PROT_NONE); + w->l->bucket_potential[bucket] += size; + push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size)); +#else + size_t bytes_allocated = 0; + do { + w->l->bucket_potential[bucket] += size; + bytes_allocated += size; + push(&w->l->free_list[bucket], global_alloc(g, bucket)); + } while (bytes_allocated < g->frame_malloc.batch_size); +#endif + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + +} + +static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size) +{ + struct free_list *p, *q; + global_state_t *g = w->g; + size_t pot = w->l->bucket_potential[bucket]; + size_t newpot; + + /* Keep up to POT/2 elements in the free list. The cost of + counting up to POT/2 is amortized against POT. */ + newpot = 0; + for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot; + p = p->cdr, newpot += size) + ; + w->l->bucket_potential[bucket] = newpot; + + if (p) { + /* free the rest of the list. The cost of grabbing the lock + is amortized against POT/2; the cost of traversing the rest + of the list is amortized against the free operation that + puts the element on the list. */ + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { + while ((q = pop(&p->cdr))) +#if USE_MMAP + munmap((char *)q + size - 8192, 12288); +#else + global_free(g, q, bucket); +#endif + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + } +} + +// Free all the memory in this bucket for the specified worker, +// returning it to the global pool's free list. +static void move_bucket_to_global_free_list(__cilkrts_worker *w, + int bucket) +{ + struct free_list *p, *q; + global_state_t *g = w->g; + p = w->l->free_list[bucket]; + + if (p) { + __cilkrts_mutex_lock(w, &g->frame_malloc.lock); { + while ((q = pop(&p))) { +#if USE_MMAP + size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + munmap((char *)q + size - 8192, 12288); +#else + global_free(g, q, bucket); +#endif + } + } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock); + } + + // I'm not sure this does anything useful now, since + // the worker is about to be destroyed. But why not? 
+ w->l->bucket_potential[bucket] = 0; +} + +static int bucket_of_size(size_t size) +{ + int i; + + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) + if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i)) + return i; + + CILK_ASSERT(0 /* can't happen */); + return -1; +} + +size_t __cilkrts_frame_malloc_roundup(size_t size) +{ + if (size > FRAME_MALLOC_MAX_SIZE) { + /* nothing, leave it alone */ + } else { + int bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + } + return size; +} + +size_t __cilkrts_size_of_bucket(int bucket) +{ + CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS); + return FRAME_MALLOC_BUCKET_TO_SIZE(bucket); +} + +void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size) +{ + int bucket; + void *mem; + + /* if too large, or if no worker, fall back to __cilkrts_malloc() */ + if (!w || size > FRAME_MALLOC_MAX_SIZE) { + NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE); + return __cilkrts_malloc(size); + } + + START_INTERVAL(w, INTERVAL_FRAME_ALLOC); { + bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + + while (!(mem = pop(&w->l->free_list[bucket]))) { + /* get a batch of frames from the global pool */ + START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) { + allocate_batch(w, bucket, size); + } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL); + } + } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC); + + return mem; +} + +void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size) +{ + int bucket; + struct free_list *p = (struct free_list *)p0; + + /* if too large, or if no worker, fall back to __cilkrts_free() */ + if (!w || size > FRAME_MALLOC_MAX_SIZE) { + NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE); + __cilkrts_free(p); + return; + } + +#if CILK_LIB_DEBUG + *(volatile long *)w; +#endif + + START_INTERVAL(w, INTERVAL_FRAME_FREE); { + bucket = bucket_of_size(size); + size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket); + w->l->bucket_potential[bucket] += size; + push(&w->l->free_list[bucket], p); + if (w->l->bucket_potential[bucket] > + w->g->frame_malloc.potential_limit) { + START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) { + gc_bucket(w, bucket, size); + } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL); + } + } STOP_INTERVAL(w, INTERVAL_FRAME_FREE); +} + +void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w) +{ + int i; + local_state *l = w->l; + + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + l->free_list[i] = 0; + l->bucket_potential[i] = 0; + } +} + +void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w) +{ + int i; + // Move memory to the global pool. This operation + // ensures the memory does not become unreachable / leak + // when the worker is destroyed. + for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) { + move_bucket_to_global_free_list(w, i); + } +} + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/frame_malloc.h b/libcilkrts/runtime/frame_malloc.h new file mode 100644 index 00000000000..d412fb620fe --- /dev/null +++ b/libcilkrts/runtime/frame_malloc.h @@ -0,0 +1,205 @@ +/* frame_malloc.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
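A few concrete values for the size-class mapping implemented by bucket_of_size() and __cilkrts_frame_malloc_roundup() above, assuming the non-Windows bucket table of 64/128/256/512/1024/2048 bytes; requests above 2048 bytes bypass the buckets and fall back to the heap.

    // Sketch only: mirrors the non-Windows bucket table declared above.
    #include <assert.h>
    #include <stddef.h>

    static const unsigned short bucket_sizes[6] = { 64, 128, 256, 512, 1024, 2048 };

    static size_t roundup_sketch(size_t size)
    {
        for (int i = 0; i < 6; i++)
            if (size <= bucket_sizes[i])
                return bucket_sizes[i];   // smallest bucket that holds the request
        return size;                      // above 2048: left unchanged, heap malloc is used
    }

    int main(void)
    {
        assert(roundup_sketch(1)    == 64);
        assert(roundup_sketch(100)  == 128);
        assert(roundup_sketch(512)  == 512);
        assert(roundup_sketch(2049) == 2049);  // too large for any bucket
        return 0;
    }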
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file frame_malloc.h + * + * @brief The frame allocation routines manage memory in a per-worker pool. + * + * The name "frame malloc" refers to an earlier implementation of Cilk which + * allocated frames from the heap using this allocator. + */ + +#ifndef INCLUDED_FRAME_MALLOC_DOT_H +#define INCLUDED_FRAME_MALLOC_DOT_H + +#include "worker_mutex.h" +#include "rts-common.h" +#include <internal/abi.h> // __cilkrts_worker + +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Number of buckets. Gives us buckets to hold 64, 128, 256, 512, 1024 + * and 2048 bytes + */ +#define FRAME_MALLOC_NBUCKETS 6 + +/** Layout of frames when unallocated */ +struct free_list { + /** Pointer to next free frame */ + struct free_list *cdr; +}; + +/** per-worker memory cache */ +struct __cilkrts_frame_cache +{ + /** Mutex to serialize access */ + struct mutex lock; + + /** Linked list of frames */ + struct pool_cons *pool_list; + + /** Low bound of memory in pool */ + char *pool_begin; + + /** High bound of memory in pool */ + char *pool_end; + + /** Global free-list buckets */ + struct free_list *global_free_list[FRAME_MALLOC_NBUCKETS]; + + /** + * How many bytes to obtain at once from the global pool + * (approximately) + */ + size_t batch_size; + + /** Garbage-collect a bucket when its potential exceeds the limit */ + size_t potential_limit; + + /** If TRUE, check for memory leaks at the end of execution */ + int check_for_leaks; + + /** Bytes of memory allocated from the OS by the global cache */ + size_t allocated_from_os; + + /** Tracks memory allocated by a chunk that isn't a full bucket size */ + size_t wasted; + + /** Bytes of memory allocated from the global cache */ + size_t allocated_from_global_pool; +}; + +/** + * Allocate memory from the per-worker pool. 
If the size is too large, or + * if we're given a NULL worker, the memory is allocated using + * __cilkrts_malloc(). + * + * @param w The worker to allocate the memory from. + * @param size The number of bytes to allocate. + * + * @return pointer to allocated memory block. + */ +COMMON_PORTABLE +void *__cilkrts_frame_malloc(__cilkrts_worker *w, + size_t size) cilk_nothrow; + +/** + * Return memory to the per-worker pool. If the size is too large, or + * if we're given a NULL worker, the memory is freed using + * __cilkrts_free(). + * + * @param w The worker to return the memory to. + * @param p The memory block to be released. + * @param size The size of the block, in bytes. + */ +COMMON_PORTABLE +void __cilkrts_frame_free(__cilkrts_worker *w, + void* p, + size_t size) cilk_nothrow; + +/** + * Destroy the global cache stored in the global state, freeing all memory + * to the global heap. Checks whether any memory has been allocated but + * not freed. + * + * @param g The global state. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_global_cleanup(global_state_t *g); + +/** + * Initialize a worker's memory cache. Initially it is empty. + * + * @param w The worker whose memory cache is to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w); + +/** + * If check_for_leaks is set in the global state's memory cache, free any + * memory in the worker's memory cache. + * + * If check_for_leaks is not set, nothing happens. + * + * @param w The worker whose memory cache is to be cleaned up. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w); + +/** + * Round a number of bytes to the size of the smallest bucket that will + * hold it. If the size is bigger than the largest bucket, the value is + * unchanged. + * + * @param size Number of bytes to be rounded up to the nearest bucket size. + * + * @return The size of the smallest bucket that will hold the specified bytes. + */ +COMMON_PORTABLE +size_t __cilkrts_frame_malloc_roundup(size_t size) cilk_nothrow; + +/** + * Return the number of bytes that can fit into a bucket. + * + * Preconditions: + * - The index must be in the range 0 to FRAME_MALLOC_NBUCKETS - 1 + * + * @param bucket Index of the bucket to be sized. + */ +COMMON_PORTABLE +size_t __cilkrts_size_of_bucket(int bucket) cilk_nothrow; + +/** + * Initialize the global memory cache. + * + * @param g The global state. + */ +COMMON_PORTABLE +void __cilkrts_frame_malloc_global_init(global_state_t *g); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_FRAME_MALLOC_DOT_H) diff --git a/libcilkrts/runtime/full_frame.c b/libcilkrts/runtime/full_frame.c new file mode 100644 index 00000000000..9ccfd110d6b --- /dev/null +++ b/libcilkrts/runtime/full_frame.c @@ -0,0 +1,181 @@ +/* full_frame.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
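A hedged sketch of how callers are expected to pair these entry points, based on the contracts documented above: the size passed to __cilkrts_frame_free must match the allocation request so the block returns to the same bucket, and a NULL worker silently degrades to the ordinary heap. The record type here is hypothetical, and the snippet compiles only inside the runtime tree where frame_malloc.h and the worker type are visible.

    // Sketch only: 'w' may be NULL, in which case the heap path is used.
    struct fake_record { int a; double b; };   // hypothetical payload

    static struct fake_record *alloc_record(__cilkrts_worker *w)
    {
        return (struct fake_record *)
            __cilkrts_frame_malloc(w, sizeof(struct fake_record));
    }

    static void free_record(__cilkrts_worker *w, struct fake_record *r)
    {
        // The size must match the allocation request so the block lands
        // back in the same bucket (or takes the heap path, if it was large).
        __cilkrts_frame_free(w, r, sizeof(struct fake_record));
    }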
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "full_frame.h" +#include "stats.h" +#include "os.h" +#include "bug.h" +#include "jmpbuf.h" +#include "frame_malloc.h" + +COMMON_PORTABLE +full_frame *__cilkrts_make_full_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + full_frame *ff; + + START_INTERVAL(w, INTERVAL_ALLOC_FULL_FRAME) { + ff = (full_frame *)__cilkrts_frame_malloc(w, sizeof(*ff)); + __cilkrts_mutex_init(&ff->lock); + + ff->full_frame_magic_0 = FULL_FRAME_MAGIC_0; + ff->join_counter = 0; + ff->parent = 0; + ff->rightmost_child = 0; + ff->left_sibling = ff->right_sibling = 0; + ff->call_stack = sf; + ff->is_call_child = 0; + ff->simulated_stolen = 0; + ff->children_reducer_map = ff->right_reducer_map = 0; + ff->pending_exception = + ff->child_pending_exception = + ff->right_pending_exception = NULL; + + ff->sync_sp = 0; +#ifdef _WIN32 + ff->exception_sp = 0; + ff->trylevel = (unsigned long)-1; + ff->registration = 0; +#endif + ff->frame_size = 0; + ff->fiber_self = 0; + ff->fiber_child = 0; + + ff->sync_master = 0; + + /*__cilkrts_init_full_frame_sysdep(w, ff);*/ + ff->full_frame_magic_1 = FULL_FRAME_MAGIC_1; + } STOP_INTERVAL(w, INTERVAL_ALLOC_FULL_FRAME); + return ff; +} + +COMMON_PORTABLE void __cilkrts_put_stack(full_frame *ff, + __cilkrts_stack_frame *sf) +{ + /* When suspending frame ff prior to stealing it, __cilkrts_put_stack is + * used to store the stack pointer for eventual sync. When suspending + * frame ff prior to a sync, __cilkrts_put_stack is called to re-establish + * the sync stack pointer, offsetting it by any change in the stack depth + * that occured between the spawn and the sync. + * Although it is not usually meaningful to add two pointers, the value of + * ff->sync_sp at the time of this call is really an integer, not a + * pointer. + */ + ptrdiff_t sync_sp_i = (ptrdiff_t) ff->sync_sp; + char* sp = (char*) __cilkrts_get_sp(sf); + + ff->sync_sp = sp + sync_sp_i; + + DBGPRINTF("%d- __cilkrts_put_stack - adjust (+) sync " + "stack of full frame %p (+sp: %p) to %p\n", + __cilkrts_get_tls_worker()->self, ff, sp, ff->sync_sp); +} + +COMMON_PORTABLE void __cilkrts_take_stack(full_frame *ff, void *sp) +{ + /* When resuming the parent after a steal, __cilkrts_take_stack is used to + * subtract the new stack pointer from the current stack pointer, storing + * the offset in ff->sync_sp. 
When resuming after a sync, + * __cilkrts_take_stack is used to subtract the new stack pointer from + * itself, leaving ff->sync_sp at zero (null). Although the pointers being + * subtracted are not part of the same contiguous chunk of memory, the + * flat memory model allows us to subtract them and get a useable offset. + */ + ptrdiff_t sync_sp_i = ff->sync_sp - (char*) sp; + + ff->sync_sp = (char *) sync_sp_i; + + DBGPRINTF("%d- __cilkrts_take_stack - adjust (-) sync " + "stack of full frame %p to %p (-sp: %p)\n", + __cilkrts_get_tls_worker()->self, ff, ff->sync_sp, sp); +} + +COMMON_PORTABLE void __cilkrts_adjust_stack(full_frame *ff, size_t size) +{ + /* When resuming the parent after a steal, __cilkrts_take_stack is used to + * subtract the new stack pointer from the current stack pointer, storing + * the offset in ff->sync_sp. When resuming after a sync, + * __cilkrts_take_stack is used to subtract the new stack pointer from + * itself, leaving ff->sync_sp at zero (null). Although the pointers being + * subtracted are not part of the same contiguous chunk of memory, the + * flat memory model allows us to subtract them and get a useable offset. + * + * __cilkrts_adjust_stack() is used to deallocate a Variable Length Array + * by adding it's size to ff->sync_sp. + */ + ff->sync_sp = ff->sync_sp + size; + + DBGPRINTF("%d- __cilkrts_adjust_stack - adjust (+) sync " + "stack of full frame %p to %p (+ size: 0x%x)\n", + __cilkrts_get_tls_worker()->self, ff, ff->sync_sp, size); +} + +COMMON_PORTABLE +void __cilkrts_destroy_full_frame(__cilkrts_worker *w, full_frame *ff) +{ + validate_full_frame(ff); + CILK_ASSERT(ff->children_reducer_map == 0); + CILK_ASSERT(ff->right_reducer_map == 0); + CILK_ASSERT(NULL == ff->pending_exception); + CILK_ASSERT(NULL == ff->child_pending_exception); + CILK_ASSERT(NULL == ff->right_pending_exception); + __cilkrts_mutex_destroy(w, &ff->lock); + __cilkrts_frame_free(w, ff, sizeof(*ff)); +} + +COMMON_PORTABLE void validate_full_frame(full_frame *ff) +{ + /* check the magic numbers, for debugging purposes */ + if (ff->full_frame_magic_0 != FULL_FRAME_MAGIC_0 || + ff->full_frame_magic_1 != FULL_FRAME_MAGIC_1) + abort_because_rts_is_corrupted(); +} + +void __cilkrts_frame_lock(__cilkrts_worker *w, full_frame *ff) +{ + validate_full_frame(ff); + __cilkrts_mutex_lock(w, &ff->lock); +} + +void __cilkrts_frame_unlock(__cilkrts_worker *w, full_frame *ff) +{ + __cilkrts_mutex_unlock(w, &ff->lock); +} + +/* End full_frame.c */ diff --git a/libcilkrts/runtime/full_frame.h b/libcilkrts/runtime/full_frame.h new file mode 100644 index 00000000000..327a3337afe --- /dev/null +++ b/libcilkrts/runtime/full_frame.h @@ -0,0 +1,493 @@ +/* full_frame.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
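A worked numeric trace of the sync_sp bookkeeping implemented by __cilkrts_put_stack() and __cilkrts_take_stack() above, using made-up addresses on a downward-growing stack; sync_sp alternates between holding a real pointer (after a put) and a plain integer offset (after a take).

    // Sketch only: the same additions and subtractions, on illustrative values.
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        intptr_t sync_sp = 0;            // ff->sync_sp, initially null

        // 1. put_stack when the frame is stolen: sp = 0x9000 on the original stack.
        sync_sp = sync_sp + 0x9000;      // sync_sp now holds the pointer 0x9000

        // 2. take_stack when the continuation resumes on a new stack at sp = 0x5000.
        sync_sp = sync_sp - 0x5000;      // now an integer offset: 0x4000

        // 3. put_stack when the frame suspends at the sync, sp = 0x4FF0
        //    (16 bytes were pushed on the new stack since step 2).
        sync_sp = sync_sp + 0x4FF0;      // 0x8FF0: the step-1 value minus those 16 bytes
        assert(sync_sp == 0x9000 - 16);

        // 4. take_stack when execution resumes after the sync at exactly that address.
        sync_sp = sync_sp - 0x8FF0;
        assert(sync_sp == 0);            // back to null, as the header comments describe
        return 0;
    }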
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_FULL_FRAME_DOT_H +#define INCLUDED_FULL_FRAME_DOT_H + + +#include "rts-common.h" +#include "worker_mutex.h" + +#include <cilk/common.h> +#include <internal/abi.h> +#include <stddef.h> +#include "cilk_fiber.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** Magic numbers for full_frame, used for debugging */ +typedef unsigned long long ff_magic_t; + +/* COMMON_SYSDEP */ struct pending_exception_info; /* opaque */ + +/************************************************************* + Full frames +*************************************************************/ + +/** + * @file full_frame.h + * @brief A full frame includes additional information such as a join + * counter and parent frame. + * @defgroup FullFrames Full Frames + * A full frame includes additional information such as a join + * counter and parent frame. + * @{ + */ + +/** + * Convenience typedef so we don't have to specify "struct full_frame" + * all over the code. Putting it before the structure definition allows + * us to use the typedef within the structure itself + */ +typedef struct full_frame full_frame; + +/** + * @brief A full frame includes additional information such as a join + * counter and parent frame. + * + * The frame at the top of a worker's stack is promoted into a "full" + * frame, which carries additional information, such as join counter + * and parent frame. Full frames can be suspended at a sync, in which + * case they lie somewhere in memory and do not belong to any + * worker. + * + * Full frames are in contrast to the entries in the worker's deque which + * are only represented by a pointer to their __cilkrts_stack_frame. + * + * At any instant, we say that a full frame ff is either "suspended", + * or "owned" by some worker w. + * + * More precisely, we say that a worker w owns a frame ff under one of + * the following conditions: + * + * 1. Creation: Worker w has just created ff, but not yet linked ff + * into the tree of full frames. This situation can occur when a + * worker is unrolling a call stack to promote a + * __cilkrts_stack_frame to a full_frame. + * 2. Executing frame: We have w->l->frame_ff == ff, i.e,. ff is the + * currently executing frame for w. + * 3. Next frame: We have w->l->next_frame_ff == ff, i.e,. ff is the + * next frame that w is about to execute. + * 4. Resume execution: Worker w has popped ff from + * w->l->next_frame_ff, and is about to resume execution of ff. + * 5. 
Dying leaf: Worker w has finished executing a frame ff + * that is a leaf the tree of full frames, and is in the process + * of unlinking "ff" from the tree. + * + * Otherwise, the frame ff is suspended, and has no owner. + * Note that work-stealing changes the owner of a full frame from the + * victim to the thief. + * + * Using this notion of ownership, we classify the fields of a full + * frame into one of several categories: + * + * 1. Local: + * These fields are accessed only by the owner of the full frame. + * Because a frame can have only one owner at a time, these fields + * can be modified without any (additional) locking or + * synchronization, assuming the correct synchronization for + * changing the ownership of full frame (e.g., on a successful + * steal) is already in place. + * + * 2. Constant (i.e., read-only): + * This field is constant for the lifetime of the full frame. + * No locks are needed to access this field. + * Technically, a field could be read-only and local, but we assume + * it is shared. + * + * 3. Self-locked: + * To access this field in the frame ff, a worker should acquire + * the lock on ff. + * A self-locked field is conceptually "shared" between the worker + * that owns frame ff (which is a child) and the worker that + * owns the frame ff->parent (which is the parent of ff). + * + * 4. Parent-locked: + * To access this field in the frame ff, a worker should + * acquire the lock on ff->parent. + * A parent-locked field is conceptually "shared" between the worker + * that owns frame ff, and a worker that is either owns the + * parent frame (ff->parent) or owns a sibling frame of ff (i.e., + * any child of ff->parent). + * + * 5. Synchronization + * A field used explicitly for synchronization (i.e., locks). + */ + +/* COMMON_PORTABLE */ +struct full_frame +{ + /** + * Value to detect writes off the beginning of a full_frame. + */ +# define FULL_FRAME_MAGIC_0 ((ff_magic_t)0x361e710b9597d553ULL) + + /** + * Field to detect writes off the beginning of a full_frame. Must be + * FULL_FRAME_MAGIC_0. + * [constant] + */ + ff_magic_t full_frame_magic_0; + + /** + * Used to serialize access to this full_frame + * [synchronization] + */ + struct mutex lock; + + /** + * Count of outstanding children running in parallel + * [self-locked] + */ + int join_counter; + + /** + * If TRUE: frame was called by the parent. + * If FALSE: frame was spawned by parent. + * [constant] + */ + int is_call_child; + + /** + * TRUE if this frame is the loot of a simulated steal. + * + * This situation never happens in normal execution. However, + * when running under cilkscreen, a worker may promote frames and + * then immediately suspend them, in order to simulate an + * execution on an infinite number of processors where all spawns + * are stolen. In this case, the frame is marked as the loot of a fake + * steal. + * [local] + */ + int simulated_stolen; + + /** + * Caller of this full_frame + * [constant] + */ + full_frame *parent; + + /** + * Doubly-linked list of children. The serial execution order is + * by definition from left to right. Because of how we do work + * stealing, the parent is always to the right of all its + * children. + * + * For a frame ff, we lock the ff->parent to follow the sibling + * links for ff. 
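A hedged sketch of the locking discipline the [parent-locked] annotation implies: take the parent's lock before following any of its children's sibling links. The traversal below is illustrative only and compiles only against the runtime's own headers.

    // Sketch only: walk the children of 'parent' under the parent's lock.
    static void visit_children_sketch(__cilkrts_worker *w, full_frame *parent)
    {
        full_frame *child;

        __cilkrts_frame_lock(w, parent);                 // guards the sibling links
        for (child = parent->rightmost_child; child; child = child->left_sibling) {
            // left_sibling / right_sibling are parent-locked fields, so reading
            // them here is safe; fields marked [self-locked] would additionally
            // need child->lock.
        }
        __cilkrts_frame_unlock(w, parent);
    }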
+ * + * [parent-locked] + */ + full_frame *left_sibling; + + /** + * @copydoc left_sibling + */ + full_frame *right_sibling; + + /** + * Pointer to rightmost child + * + * [self-locked] + */ + full_frame *rightmost_child; + + /** + * Call stack associated with this frame. + * Set and reset in make_unrunnable and make_runnable + * + * [self-locked] + */ + __cilkrts_stack_frame *call_stack; + + /** + * Accumulated reducers of children + * + * [self-locked] + */ + struct cilkred_map *children_reducer_map; + + /** + * Accumulated reducers of right siblings that have already + * terminated + * + * [parent-locked] + */ + struct cilkred_map *right_reducer_map; + + /** + * Exception that needs to be pass to our parent + * + * [local] + * + * TBD: verify that the exception code satisfies this requirement. + */ + struct pending_exception_info *pending_exception; + + /** + * Exception from one of our children + * + * [self-locked] + */ + struct pending_exception_info *child_pending_exception; + + /** + * Exception from any right siblings + * + * [parent-locked] + */ + struct pending_exception_info *right_pending_exception; + + /** + * Stack pointer to restore on sync. + * [local] + */ + char *sync_sp; + +#ifdef _WIN32 + /** + * Stack pointer to restore on exception. + * [local] + */ + char *exception_sp; + + /** + * Exception trylevel at steal + * [local] + * + * TBD: this field is set but not read? + */ + unsigned long trylevel; + + /** + * Exception registration head pointer to restore on sync. + * [local] + */ + unsigned long registration; +#endif + + /** + * Size of frame to match sync sp + * [local] + * TBD: obsolete field only used in debugging? + */ + ptrdiff_t frame_size; + + /** + * Allocated fibers that need to be freed. The fibers work + * like a reducer. The leftmost frame may have @c fiber_self + * null and owner non-null. + * + * [local] + * TBD: verify exception code satisfies this requirement. + */ + cilk_fiber *fiber_self; + + /** + * Allocated fibers that need to be freed. The fibers work + * like a reducer. The leftmost frame may have @c fiber_self + * null and owner non-null. + * + * [self-locked] + */ + cilk_fiber *fiber_child; + + /** + * If the sync_master is set, this function can only be sync'd by the team + * leader, who first entered Cilk. This is set by the first worker to steal + * from the user worker. + * + * [self-locked] + */ + __cilkrts_worker *sync_master; + + /** + * Value to detect writes off the end of a full_frame. + */ +# define FULL_FRAME_MAGIC_1 ((ff_magic_t)0x189986dcc7aee1caULL) + + /** + * Field to detect writes off the end of a full_frame. Must be + * FULL_FRAME_MAGIC_1. + * + * [constant] + */ + ff_magic_t full_frame_magic_1; +}; + +/* The functions __cilkrts_put_stack and __cilkrts_take_stack keep track of + * changes in the stack's depth between when the point at which a frame is + * stolen and when it is resumed at a sync. A stolen frame typically goes + * through the following phase changes: + * + * 1. Suspend frame while stealing it. + * 2. Resume stolen frame at begining of continuation + * 3. Suspend stolen frame at a sync + * 4. Resume frame (no longer marked stolen) after the sync + * + * When the frame is suspended (steps 1 and 3), __cilkrts_put_stack is called to + * establish the stack pointer for the sync. When the frame is resumed (steps + * 2 and 4), __cilkrts_take_stack is called to indicate the stack pointer + * (which may be on a different stack) at + * the point of resume. 
If the stack pointer changes between steps 2 and 3, + * e.g., as a result of pushing 4 bytes onto the stack, + * the offset is reflected in the value of ff->sync_sp after step 3 relative to + * its value after step 1 (e.g., the value of ff->sync_sp after step 3 would be + * 4 less than its value after step 1, for a down-growing stack). + * + * Implementation detail: The actual call chains for each of these phase-change events are: + * + * 1. unroll_call_stack -> make_unrunnable -> __cilkrts_put_stack + * 2. do_work -> __cilkrts_resume -> __cilkrts_take_stack + * 3. do_sync -> disown -> make_runnable -> __cilkrts_put_stack + * 4. __cilkrts_resume -> __cilkrts_take_stack + * + * (The above is a changeable implementation detail. The resume sequence, in + * particular, is more complex on some operating systems.) + */ + +/** + * @brief Records the stack pointer within the @c sf stack frame as the + * current stack pointer at the point of suspending full frame @c ff. + * + * @pre @c ff->sync_sp must be either null or contain the result of a prior call to + * @c __cilkrts_take_stack(). + * @pre If @c ff->sync_sp is not null, then @c SP(sf) must refer to the same stack as + * the @c sp argument to the prior call to @c __cilkrts_take_stack(). + * + + * @post If @c ff->sync_sp was null before the call, then @c + * ff->sync_sp will be set to @c SP(sf). + * @post Otherwise, @c ff->sync_sp will be restored to the value it had just prior + * to the last call to @c __cilkrts_take_stack(), except offset by any change + * in the stack pointer between the call to @c __cilkrts_take_stack() and + * this call to @c __cilkrts_put_stack(). + * + * @param ff The full frame that is being suspended. + * @param sf The @c __cilkrts_stack_frame that is being suspended. The stack + * pointer will be taken from the jmpbuf contained within this + * @c __cilkrts_stack_frame. + */ +COMMON_PORTABLE void __cilkrts_put_stack(full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief Records the stack pointer @c sp as the stack pointer at the point of + * resuming execution on full frame @c ff. + * + * The value of @c sp may be on a different stack than the original + * value recorded for the stack pointer using __cilkrts_put_stack(). + * + * @pre @c ff->sync_sp must contain a value set by @c __cilkrts_put_stack(). + * + * @post @c ff->sync_sp contains an *integer* value used to compute a change in the + * stack pointer upon the next call to @c __cilkrts_take_stack(). + * @post If @c sp equals @c ff->sync_sp, then @c ff->sync_sp is set to null. + * + * @param ff The full frame that is being resumed. + * @param sp The stack pointer for the stack the function is being resumed on. + */ +COMMON_PORTABLE void __cilkrts_take_stack(full_frame *ff, void *sp); + +/* + * @brief Adjust the stack to deallocate a Variable Length Array + * + * @param ff The full frame that is being adjusted. + * @param size The size of the array being deallocated from the stack + */ +COMMON_PORTABLE void __cilkrts_adjust_stack(full_frame *ff, size_t size); + +/** + * @brief Allocates and initializes a full_frame. + * + * @param w The memory for the full_frame will be allocated out of the + * worker's pool. + * @param sf The @c __cilkrts_stack_frame which will be saved as the call_stack + * for this full_frame. + * + * @return The newly allocated and initialized full_frame. + */ +COMMON_PORTABLE +full_frame *__cilkrts_make_full_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +/** + * @brief Deallocates a full_frame. 
+ * + * @param w The memory for the full_frame will be returned to the worker's pool. + * @param ff The full_frame to be deallocated. + */ +COMMON_PORTABLE +void __cilkrts_destroy_full_frame(__cilkrts_worker *w, full_frame *ff); + +/** + * @brief Performs sanity checks to check the integrity of a full_frame. + * + * @param ff The full_frame to be validated. + */ +COMMON_PORTABLE void validate_full_frame(full_frame *ff); + +/** + * @brief Locks the mutex contained in a full_frame. + * + * The full_frame is validated before the runtime attempts to lock it. + * + * @post @c ff->lock will be owned by @c w. + * + * @param w The worker that will own the full_frame. If the runtime is + * collecting stats, the intervals will be attributed to the worker. + * @param ff The full_frame containing the mutex to be locked. + */ +COMMON_PORTABLE void __cilkrts_frame_lock(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief Unlocks the mutex contained in a full_frame. + * + * @pre @c ff->lock must must be owned by @c w. + * + * @param w The worker that currently owns the full_frame. + * @param ff The full_frame containing the mutex to be unlocked. + */ +COMMON_PORTABLE void __cilkrts_frame_unlock(__cilkrts_worker *w, + full_frame *ff); +/** @} */ + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_FULL_FRAME_DOT_H) diff --git a/libcilkrts/runtime/global_state.cpp b/libcilkrts/runtime/global_state.cpp new file mode 100644 index 00000000000..02de54f43b1 --- /dev/null +++ b/libcilkrts/runtime/global_state.cpp @@ -0,0 +1,628 @@ +/* global_state.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************/ + +#include "global_state.h" +#include "os.h" +#include "bug.h" +#include "metacall_impl.h" +#include "stats.h" +#include "cilk/cilk_api.h" +#include "cilk_malloc.h" +#include "record-replay.h" + +#include <algorithm> // For max() +#include <cstring> +#include <cstdlib> +#include <climits> +#include <cerrno> + +#ifdef _WIN32 +# include <wchar.h> +#endif + +// TBD: There is a race when multiple threads try to initialize the +// user_settable_values?? +// +// Set to true if the user settable values portion of the global state +// singleton is initialized, even if the rest of the singleton is not +// initialized. +int cilkg_user_settable_values_initialized = false; + +namespace { + +// Single copy of the global state. Zero-filled until +// cilkg_get_user_settable_values() is called and partially-zero-filled until +// cilkg_init_global_state() is called. The first field is filled in with +// the size of a void* for the debugger and must be valid before initialization +global_state_t global_state_singleton = +{ + sizeof(void *), // addr_size +}; + + +// Variables that need to export C-style names +extern "C" +{ + // Pointer to the global state singleton. + global_state_t *cilkg_singleton_ptr = NULL; + + // __cilkrts_global_state is exported and referenced by the debugger. + // The debugger expects it to be valid when the module loads. +// CILK_EXPORT_DATA + global_state_t *__cilkrts_global_state = &global_state_singleton; +} + +// Returns true if 'a' and 'b' are equal null-terminated strings +inline bool strmatch(const char* a, const char* b) +{ + return 0 == std::strcmp(a, b); +} + +// Returns the integer value represented by the null-terminated string at 's'. +inline long to_long(const char* s) +{ + char *end; + + errno = 0; + return std::strtol(s, &end, 0); +} + +#ifdef _WIN32 +// Returns true if 'a' and 'b' are equal null-terminated wide-char strings +inline bool strmatch(const wchar_t* a, const wchar_t* b) +{ + return 0 == wcscmp(a, b); +} + +// Returns true if the multi-byte character string at 'a' represents the same +// character sequence as the wide-character string at 'b'. The behavior is +// undefined if 'a' contains more than 30 multi-byte characters. +bool strmatch(const char* a, const wchar_t* b) +{ + // Convert 'a' to wide-characters, then compare. + wchar_t wa[31]; + std::size_t count; + errno_t err = mbstowcs_s(&count, wa, a, 30); + CILK_ASSERT(0 == err); + if (err) return false; + return strmatch(wa, b); +} + +// Returns true if the wide-character string at 'a' represents the same +// character sequence as the multi-byte character string at 'b'. The behavior +// id undefined if 'b' contains more than 30 multi-byte characters. +inline +bool strmatch(const wchar_t* a, const char* b) +{ + return strmatch(b, a); +} + + +// Returns the integer value represented by the null-terminated wide-char +// string at 's'. +inline long to_long(const wchar_t* s) +{ + wchar_t *end; + + errno = 0; + return wcstol(s, &end, 0); +} +#endif + +// Check if Cilkscreen or other sequential ptool wants to force reducers. +bool always_force_reduce() +{ + // Metacall *looks* like a no-op. volatile needed to keep compiler from + // optimizing away variable. + volatile char not_force_reduce = '\377'; + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ZERO_IF_FORCE_REDUCE, + const_cast<char*>(¬_force_reduce)); + return ! 
not_force_reduce; +} + +// Stores the boolean value represented by the null-terminated string at 'val' +// into the integer object at 'out'. Returns '__CILKRTS_SET_PARAM_SUCCESS' if +// 'val' is "true", "false", "0" or "1" and '__CILKRTS_SET_PARAM_INVALID' +// otherwise. +template <typename INT_T, typename CHAR_T> +int store_bool(INT_T *out, const CHAR_T *val) +{ + static const char* const s_zero = "0"; + static const char* const s_one = "1"; + static const char* const s_true = "true"; + static const char* const s_false = "false"; + + if (val == 0) + return __CILKRTS_SET_PARAM_INVALID; + + if (strmatch(s_false, val) || strmatch(s_zero, val)) { + *out = 0; + return __CILKRTS_SET_PARAM_SUCCESS; + } + + if (strmatch(s_true, val) || strmatch(s_one, val)) { + *out = 1; + return __CILKRTS_SET_PARAM_SUCCESS; + } + + return __CILKRTS_SET_PARAM_INVALID; +} + +// Stores the integer value represented by the null-terminated string at 'val' +// into the integer object at 'out', restricting the result to the range 'min' +// to 'max', inclusive. Returns '__CILKRTS_SET_PARAM_SUCCESS' if the conversion +// succeeds and is in range, '__CILKRTS_SET_PARAM_XRANGE' if the conversion +// succeeds but is out of range, and '__CILKRTS_SET_PARAM_INVALID' otherwise. In +// the case of any error, '*out' is unchanged. +template <typename INT_T, typename CHAR_T> +int store_int(INT_T *out, const CHAR_T *val, INT_T min, INT_T max) +{ + errno = 0; + long val_as_long = to_long(val); + if (val_as_long == 0 && errno != 0) + return __CILKRTS_SET_PARAM_INVALID; + if (val_as_long < min || val_as_long == LONG_MIN) + return __CILKRTS_SET_PARAM_XRANGE; + else if (val_as_long > max || val_as_long == LONG_MAX) + return __CILKRTS_SET_PARAM_XRANGE; + + *out = val_as_long; + return __CILKRTS_SET_PARAM_SUCCESS; +} + +// Implementaton of cilkg_set_param templatized on character type. +// Windows will instantiate with both char and wchar_t. +// Note that g must have its user settable values set, but need not be fully +// initialized. +template <class CHAR_T> +int set_param_imp(global_state_t* g, const CHAR_T* param, const CHAR_T* value) +{ + static const char* const s_force_reduce = "force reduce"; + static const char* const s_nworkers = "nworkers"; + static const char* const s_max_user_workers = "max user workers"; + static const char* const s_local_stacks = "local stacks"; + static const char* const s_shared_stacks = "shared stacks"; + static const char* const s_nstacks = "nstacks"; + static const char* const s_stack_size = "stack size"; + + // We must have a parameter and a value + if (0 == param) + return __CILKRTS_SET_PARAM_INVALID; + if (0 == value) + return __CILKRTS_SET_PARAM_INVALID; + + if (strmatch(param, s_force_reduce)) + { + // Sets whether we force a reduce operation at every sync. Useful for + // debugging reducers. Off by default. Overridden by Cilkscreen + // + // Documented in cilk_api_<os>.h + if (always_force_reduce()) + // Force reduce is set by cilkscreen. User cannot change it. + return __CILKRTS_SET_PARAM_LATE; + + return store_bool(&g->force_reduce, value); + } + else if (strmatch(param, s_nworkers)) + { + // Set the total number of workers. Overrides count of cores we get + // from the OS and the setting of the CILK_NWORKERS environment + // variable. Setting to 0 indicates that the default worker count + // should be used. + // + // Documented in cilk_api_<os>.h + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + + // Fetch the number of cores. 
There must be at last 1, since we're + // executing on *something*, aren't we!? + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT(hardware_cpu_count > 0); + + int max_cpu_count = 16 * hardware_cpu_count; + if (__cilkrts_running_under_sequential_ptool()) + { + hardware_cpu_count = 1; + max_cpu_count = 1; + } + // Allow a value of 0, which means "set to hardware thread count". + int ret = store_int(&g->P, value, 0, max_cpu_count); + if (0 == g->P) + g->P = hardware_cpu_count; + return ret; + } + else if (strmatch(param, s_max_user_workers)) + { + // ** UNDOCUMENTED ** + // + // Sets the number of slots allocated for user worker threads + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT (hardware_cpu_count > 0); + + return store_int(&g->max_user_workers, value, 1, + 16 * hardware_cpu_count); + } + else if (strmatch(param, s_local_stacks)) + { + // ** UNDOCUMENTED ** + // + // Number of stacks we'll hold in the per-worker stack cache. Maximum + // value is 42. See __cilkrts_make_global_state for details. + return store_int(&g->fiber_pool_size, value, 0, 42); + } + else if (strmatch(param, s_shared_stacks)) + { + // ** UNDOCUMENTED ** + // + // Maximum number of stacks we'll hold in the global stack + // cache. Maximum value is 42. See __cilkrts_make_global_state for + // details. + return store_int(&g->global_fiber_pool_size, value, 0, 42); + } + else if (strmatch(param, s_nstacks)) + { + // Sets the maximum number of stacks permitted at one time. If the + // runtime reaches this maximum, it will cease to allocate stacks and + // the app will lose parallelism. 0 means unlimited. Default is + // unlimited. Minimum is twice the number of worker threads, though + // that cannot be tested at this time. + // + // Undocumented at this time, though there are plans to expose it. + // The current implentation is for Linux debugging only and is not + // robust enough for users. + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + return store_int<unsigned>(&g->max_stacks, value, 0, INT_MAX); + } + else if (strmatch(param, s_stack_size)) + { + // ** UNDOCUMENTED ** + // + // Sets the size (in bytes) of the stacks that Cilk creates. + // Can only be set before the runtime starts. + if (cilkg_singleton_ptr) + return __CILKRTS_SET_PARAM_LATE; + + // Maximum value that can be parsed is MAX_INT (32-bit). + int ret = store_int<size_t>(&g->stack_size, value, 0, INT_MAX); + + // Process the value the user set (or 0 if the user didn't set + // anything) into something nice for the current OS. This + // processing is done immediately and stored into + // g->stack_size so that a call to get stack size will return + // the value that the runtime will actually use. + g->stack_size = cilkos_validate_stack_size(g->stack_size); + return ret; + } + + + // If got here, then didn't match any of the strings + return __CILKRTS_SET_PARAM_UNIMP; +} + +inline +int calc_max_user_workers(global_state_t *g) +{ + // If it's been set by the user, give back what we got + if (g->max_user_workers > 0) + return g->max_user_workers; + + // Calculate it + return std::max(3, g->P * 2); +} + +} // end unnamed namespace + +__CILKRTS_BEGIN_EXTERN_C + +/** + * @brief Returns the global state object. If called for the first time, + * initializes the user-settable values in the global state, but does not + * initialize the rest of the structure. + */ +global_state_t* cilkg_get_user_settable_values() +{ + // Environment variable value. More than big enough for a 64-bit signed + // integer. 
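+    // (A 64-bit signed value needs at most 20 characters including the sign,
+    // plus a terminating NUL, so 24 bytes is more than enough.)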
+ char envstr[24]; + + // Abbreviating &global_state_singleton as g is not only shorter, it also + // facilitates grepping for the string "g->", which appears ubiquitously + // in the runtime code. + global_state_t* g = &global_state_singleton; + + // TBD: We need synchronization around this loop to prevent + // multiple threads from initializing this data. + if (! cilkg_user_settable_values_initialized) + { + size_t len; + + // Preserve stealing disabled since it may have been set by the + // debugger + int stealing_disabled = g->stealing_disabled; + + // All fields will be zero until set. In particular + std::memset(g, 0, sizeof(global_state_t)); + + // Fetch the number of cores. There must be at last 1, since we're + // executing on *something*, aren't we!? + int hardware_cpu_count = __cilkrts_hardware_cpu_count(); + CILK_ASSERT(hardware_cpu_count > 0); + + bool under_ptool = __cilkrts_running_under_sequential_ptool(); + if (under_ptool) + hardware_cpu_count = 1; + + g->stealing_disabled = stealing_disabled; + g->under_ptool = under_ptool; + g->force_reduce = 0; // Default Off + g->P = hardware_cpu_count; // Defaults to hardware CPU count + g->max_user_workers = 0; // 0 unless set by user + g->fiber_pool_size = 7; // Arbitrary default + + g->global_fiber_pool_size = 3 * 3* g->P; // Arbitrary default + // 3*P was the default size of the worker array (including + // space for extra user workers). This parameter was chosen + // to match previous versions of the runtime. + + if (4 == sizeof(void *)) + g->max_stacks = 1200; // Only 1GB on 32-bit machines + else + g->max_stacks = 2400; // 2GB on 64-bit machines + + // If we have 2400 1MB stacks, that is 2 gb. If we reach this + // limit on a single-socket machine, we may have other + // problems. Is 2400 too small for large multicore machines? + + // TBD(jsukha, 11/27/2012): I set this limit on stacks to be a + // value independent of P. When running on a Xeon Phi with + // small values of P, I recall seeing a few microbenchmarks + // (e.g., fib) where a limit of 10*P seemed to be + // unnecessarily slowing things down. + // + // That being said, the code has changed sufficiently that + // this observation may no longer be true. + // + // Note: in general, the worst-case number of stacks required + // for a Cilk computation with spawn depth "d" on P workers is + // O(Pd). Code with unbalanced recursion may run into issues + // with this stack usage. + + g->max_steal_failures = 128; // TBD: depend on max_workers? + g->stack_size = 0; // 0 unless set by the user + + // Assume no record or replay log for now + g->record_replay_file_name = NULL; + g->record_or_replay = RECORD_REPLAY_NONE; // set by user + + if (always_force_reduce()) + g->force_reduce = true; + else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_FORCE_REDUCE")) + store_bool(&g->force_reduce, envstr); + + if (under_ptool) + g->P = 1; // Ignore environment variable if under cilkscreen + else if (cilkos_getenv(envstr, sizeof(envstr), "CILK_NWORKERS")) + // Set P to environment variable, but limit to no less than 1 + // and no more than 16 times the number of hardware threads. + store_int(&g->P, envstr, 1, 16 * hardware_cpu_count); + + if (cilkos_getenv(envstr, sizeof(envstr), "CILK_MAX_USER_WORKERS")) + // Set max_user_workers to environment variable, but limit to no + // less than 1 and no more 16 times the number of hardware + // threads. If not specified, defaults (somewhat arbitrarily) to + // the larger of 3 and twice the number of hardware threads. 
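+            // (Illustrative numbers: with 8 hardware threads the accepted
+            // range below is 1..128; if the variable is not set at all,
+            // max_user_workers stays 0 and calc_max_user_workers() later
+            // substitutes max(3, 2 * P).)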
+ store_int(&g->max_user_workers, envstr, 1, 16*hardware_cpu_count); + + if (cilkos_getenv(envstr, sizeof(envstr), "CILK_STEAL_FAILURES")) + // Set the number of times a worker should fail to steal before + // it looks to see whether it should suspend itself. + store_int<unsigned>(&g->max_steal_failures, envstr, 1, INT_MAX); + + // Compute the total number of workers to allocate. Subtract one from + // nworkers and user workers so that the first user worker isn't + // factored in twice. + // + // total_workers must be computed now to support __cilkrts_get_total_workers + g->total_workers = g->P + calc_max_user_workers(g) - 1; + +#ifdef CILK_RECORD_REPLAY + // RecordReplay: See if we've been asked to replay a log + len = cilkos_getenv(envstr, 0, "CILK_REPLAY_LOG"); + if (len > 0) + { + len += 1; // Allow for trailing NUL + g->record_or_replay = REPLAY_LOG; + g->record_replay_file_name = (char *)__cilkrts_malloc(len); + cilkos_getenv(g->record_replay_file_name, len, "CILK_REPLAY_LOG"); + } + + // RecordReplay: See if we've been asked to record a log + len = cilkos_getenv(envstr, 0, "CILK_RECORD_LOG"); + if (len > 0) + { + if (RECORD_REPLAY_NONE != g->record_or_replay) + cilkos_warning("CILK_RECORD_LOG ignored since CILK_REPLAY_LOG is defined.\n"); + else + { + len += 1; // Allow for trailing NUL + g->record_or_replay = RECORD_LOG; + g->record_replay_file_name = (char *)__cilkrts_malloc(len); + cilkos_getenv(g->record_replay_file_name, len, "CILK_RECORD_LOG"); + } + } +#endif + + cilkg_user_settable_values_initialized = true; + } + + return g; +} + +int cilkg_calc_total_workers() +{ + global_state_t* g = cilkg_get_user_settable_values(); + + // Compute the total number of workers to allocate. Subtract one from + // nworkers and user workers so that the first user worker isn't + // factored in twice. + return g->P + calc_max_user_workers(g) - 1; +} + +// Should be called while holding the global lock. +global_state_t* cilkg_init_global_state() +{ + if (cilkg_singleton_ptr) + return cilkg_singleton_ptr; + + // Get partially-initialized global state. + global_state_t* g = cilkg_get_user_settable_values(); + + if (g->max_stacks > 0) { + + // nstacks is currently honored on non-Windows systems only. + + // Set an upper bound on the number of stacks that are allocated. If + // nstacks is set, each worker gets up to one stack in its cache so that + // no one worker can hog all of the free stacks and keep work from being + // stolen by the other workers. + + // nstacks corresponds to the number of stacks that will be allocated by + // the runtime apart from the initial stack created for each thread by + // the system. Therefore, if a user asks for n stacks, and there are + // p workers created, the total number of stacks is actually n + p. + + // This feature is primarily for MIC which has flat memory + // instead of virtual addresses and tends to run out really quickly. + // It is not implemented for Windows and it's non-intuitive + // interaction with the local stack cache is specifically to help out + // MIC. + + // About max_stacks / P stacks, except we require at least 1 + // per pool. 
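+        // Illustrative numbers (not the defaults): with max_stacks == 16 and
+        // P == 8, fiber_pool_size is clamped from 7 down to 16 / 8 == 2, and
+        // global_fiber_pool_size below becomes 8 * (2 + 1) == 24.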
+ if (((int)g->max_stacks / g->P) < g->fiber_pool_size) + g->fiber_pool_size = g->max_stacks / g->P; + + if (g->fiber_pool_size <= 0) { + g->fiber_pool_size = 1; + } + + if ((int)g->max_stacks < g->P) + g->max_stacks = g->P; + + g->global_fiber_pool_size = g->P * (g->fiber_pool_size+1); + } + + // Number of bytes/address - validation for debugger integration + g->addr_size = sizeof(void *); + + __cilkrts_init_stats(&g->stats); + + __cilkrts_frame_malloc_global_init(g); + + g->Q = 0; + g->total_workers = cilkg_calc_total_workers(); + g->system_workers = g->P - 1; // system_workers is here for the debugger. + g->work_done = 0; + g->workers_running = 0; + g->ltqsize = 1024; /* FIXME */ + + g->stack_size = cilkos_validate_stack_size(g->stack_size); + g->failure_to_allocate_stack = 0; + + + return g; +} + +void cilkg_publish_global_state(global_state_t* g) +{ + + // TBD: which one of these needs to be executed first? I say + // cilkg_singleton_ptr needs to be set last, with a mfence in + // between, since it is the flag that cilkg_is_published_is + // checking for. + __cilkrts_global_state = g; + __cilkrts_fence(); + cilkg_singleton_ptr = g; +} + +void cilkg_deinit_global_state() +{ + cilkg_singleton_ptr = NULL; + __cilkrts_global_state = NULL; +} + +int cilkg_is_published(void) +{ + return NULL != cilkg_singleton_ptr; +} + +int cilkg_set_param(const char* param, const char* value) +{ + return set_param_imp(cilkg_get_user_settable_values(), param, value); +} + +#ifdef _WIN32 +int cilkg_set_param_w(const wchar_t* param, const wchar_t* value) +{ + return set_param_imp(cilkg_get_user_settable_values(), param, value); +} +#endif + +extern "C++" { + // C++ scheduler function (that may throw exceptions) + typedef void cpp_scheduler_t(__cilkrts_worker *w); +} + +void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w) +{ + global_state_t* g = cilkg_get_global_state(); + CILK_ASSERT(g->scheduler); + + cpp_scheduler_t* scheduler = (cpp_scheduler_t*) g->scheduler; + + try { + scheduler(w); + } catch (...) { + __cilkrts_bug("Exception escaped Cilk context"); + } +} + +__CILKRTS_END_EXTERN_C + +/* End global_state.cpp */ diff --git a/libcilkrts/runtime/global_state.h b/libcilkrts/runtime/global_state.h new file mode 100644 index 00000000000..ef455e479d5 --- /dev/null +++ b/libcilkrts/runtime/global_state.h @@ -0,0 +1,417 @@ +/* global_state.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file global_state.h + * + * @brief The global_state_t structure contains most of the global context + * maintained by the Intel Cilk runtime. + */ + +#ifndef INCLUDED_GLOBAL_STATE_DOT_H +#define INCLUDED_GLOBAL_STATE_DOT_H + +#include <cilk/common.h> + +#include "frame_malloc.h" +#include "stats.h" +#include "bug.h" +#include "cilk_fiber.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Non-null place-holder for a stack handle that has no meaningful value. + */ +#define PLACEHOLDER_FIBER ((cilk_fiber *) -2) + +/** + * States for record_or_replay + */ +enum record_replay_t { + RECORD_REPLAY_NONE, + RECORD_LOG, + REPLAY_LOG +}; + +/** + * @brief The global state is a structure that is shared by all workers in + * Cilk. + * + * Make the structure ready for use by calling + * cilkg_init_global_state() and then cilkg_publish_global_state(). + * + * The same global lock should be held while both of these methods are + * called. These methods are split because it is useful to execute + * other runtime initialization code in between. + * + * After cilkg_publish_global_state() has completed, Cilk runtime + * methods may call cilkg_get_global_state() to look at the published + * value without holding the global lock. + * + * Finally, clean up the global state by calling + * cilkg_deinit_global_state(). This method should be called only + * after all calls to cilkg_get_global_state() have completed, and + * while holding the global lock. + * + * Before initialization and after deinitialization, the fields in the + * global state have unspecified values, except for a few special + * fields labeled "USER SETTING", which can be read and written before + * initialization and after deinitialization. + */ + +struct global_state_t { /* COMMON_PORTABLE */ + + /* Fields described as "(fixed)" should not be changed after + * initialization. + */ + + /************************************************************************* + * Note that debugger integration must reach into the + * global state! The debugger integration is depending on the + * offsets of the addr_size, system_workers, total_workers, + * stealing_disabled, sysdep, and workers. If these offsets change, the + * debugger integration library will need to be changed to match!!! + *************************************************************************/ + + int addr_size; ///< Number of bytes for an address, used by debugger (fixed) + + int system_workers; ///< Number of system workers (fixed) + + /** + * @brief USER SETTING: Maximum number of user workers that can be + * bound to cilk workers. + * + * 0 unless set by user. 
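+     * When left at 0, the effective value is computed as max(3, 2 * P).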
Call cilkg_calc_max_user_workers to get + * the value. + */ + int max_user_workers; + + int total_workers; ///< Total number of worker threads allocated (fixed) + + int workers_running; ///< True when system workers have beens started */ + + /// Set by debugger to disable stealing (fixed) + int stealing_disabled; + + /// System-dependent part of the global state + struct global_sysdep_state *sysdep; + + /// Array of worker structures. + __cilkrts_worker **workers; + + /******* END OF DEBUGGER-INTEGRATION FIELDS ***************/ + + /// Number of frames in each worker's lazy task queue + __STDNS size_t ltqsize; + + /** + * @brief USER SETTING: Force all possible reductions. + * + * TRUE if running a p-tool that requires reducers to call the reduce() + * method even if no actual stealing occurs. + * + * When set to TRUE, runtime will simulate steals, forcing calls to the + * the reduce() methods of reducers. + * + */ + int force_reduce; + + /// USER SETTING: Per-worker fiber pool size + int fiber_pool_size; + + /// USER SETTING: Global fiber pool size + int global_fiber_pool_size; + + /** + * @brief TRUE when workers should exit scheduling loop so we can + * shut down the runtime and free the global state. + * + * @note @c work_done will be checked *FREQUENTLY* in the scheduling loop + * by idle workers. We need to ensure that it's not in a cache line which + * may be invalidated by other cores. The surrounding fields are either + * constant after initialization or not used until shutdown (stats) so we + * should be OK. + */ + volatile int work_done; + + int under_ptool; ///< True when running under a serial PIN tool + + statistics stats; ///< Statistics on use of runtime + + /** + * @brief USER SETTING: Maximum number of stacks the runtime will + * allocate (apart from those created by the OS when worker + * threads are created). + * + * If max_stacks == 0,there is no pre-defined maximum. + */ + unsigned max_stacks; + + /// Size of each stack + size_t stack_size; + + /// Global cache for per-worker memory + struct __cilkrts_frame_cache frame_malloc; + + /// Global fiber pool + cilk_fiber_pool fiber_pool; + + + /** + * @brief Track whether the runtime has failed to allocate a + * stack. + * + * Setting this flag prevents multiple warnings from being + * issued. + */ + int failure_to_allocate_stack; + + /** + * @brief USER SETTING: indicate record or replay log. + * Set to NULL if not used in this run. + */ + char *record_replay_file_name; + + /** + * @brief Record/replay state. + * Valid states are: + * RECORD_REPLAY_NONE - Not recording or replaying a log + * RECORD_LOG - Recording a log for replay later + * REPLAY_LOG - Replay a log recorded earlier + */ + enum record_replay_t record_or_replay; + + /** + * @brief Buffer to force max_steal_failures to appear on a + * different cache line from the previous member variables. + * + * This padding is needed because max_steal_failures is read + * constantly and other modified values in the global state will + * cause thrashing. + */ + char cache_buf[64]; + + /** + * @brief Maximum number of times a thread should fail to steal + * before checking if Cilk is shutting down. + */ + unsigned int max_steal_failures; + + /// Pointer to scheduler entry point + void (*scheduler)(__cilkrts_worker *w); + + /** + * @brief Buffer to force P and Q to appear on a different cache + * line from the previous member variables. 
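+     * (The 64-byte size is assumed to be at least one cache line on the
+     * targets of interest.)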
+ */ + char cache_buf_2[64]; + + int P; ///< USER SETTING: number of system workers + 1 (fixed) + int Q; ///< Number of user threads currently bound to workers +}; + +/** + * @brief Initialize the global state object. This method must both + * complete before referencing any fields in the global state, except + * those specified as "user-settable values". + */ +global_state_t* cilkg_init_global_state(); + +/** + * @brief Publish the global state object, so that + * cilkg_is_published can return true. + * + * @param g - the global state created by cilkg_init_global_state() to + * publish. + * + * After the global state object has been published, a thread should + * not modify this state unless it has exclusive access (i.e., holds + * the global lock). + */ +void cilkg_publish_global_state(global_state_t* g); + +/** + * @brief Return true if the global state has been fully initialized + * and published, and has not been deinitialized. + */ +int cilkg_is_published(void); + +/** + * @brief De-initializes the global state object. Must be called to free + * resources when the global state is no longer needed. + */ +void cilkg_deinit_global_state(void); + +/** + * @brief Returns the global state object. Result is valid only if the + * global state has been published (see cilkg_publish_global_state()). + */ +static inline +global_state_t* cilkg_get_global_state(void) +{ + // "private" extern declaration: + extern global_state_t *cilkg_singleton_ptr; + + __CILKRTS_ASSERT(cilkg_singleton_ptr); // Debug only + return cilkg_singleton_ptr; +} + + +/** + * @brief Implementation of __cilkrts_set_params. + * + * Set user controllable parameters + * @param param - string specifying parameter to be set + * @param value - string specifying new value + * @returns One of: CILKG_SET_PARAM_SUCCESS ( = 0), + * CILKG_SET_PARAM_UNIMP, CILKG_SET_PARAM_XRANGE, + * CILKG_SET_PARAM_INVALID, or CILKG_SET_PARAM_LATE. + * + * @attention The wide character version __cilkrts_set_param_w() is available + * only on Windows. + * + * Allowable parameter names: + * + * - "nworkers" - number of processors that should run Cilk code. + * The value is a string of digits to be parsed by strtol. + * + * - "force reduce" - test reducer callbacks by allocating new views + * for every spawn within which a reducer is accessed. This can + * significantly reduce performance. The value is "1" or "true" + * to enable, "0" or "false" to disable. + * @warning Enabling "force reduce" when running with more than a single + * worker is currently broken. + * + * - "max user workers" - (Not publicly documented) Sets the number of slots + * allocated for user worker threads + * + * - "local stacks" - (Not publicly documented) Number of stacks we'll hold in + * the per-worker stack cache. Range 1 .. 42. See + * cilkg_init_global_state for details. + * + * - "shared stacks" - (Not publicly documented) Maximum number of stacks + * we'll hold in the global stack cache. Maximum value is 42. See + * __cilkrts_make_global_state for details + * + * - "nstacks" - (Not publicly documented at this time, though it may be + * exposed in the future) Sets the maximum number of stacks permitted at one + * time. If the runtime reaches this maximum, it will cease to allocate + * stacks and the app will lose parallelism. 0 means unlimited. Default is + * unlimited. Minimum is twice the number of worker threads, though that + * cannot be tested at this time. 
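+ *
+ * Illustrative usage via the public entry point (the parameter values here
+ * are examples only):
+ *
+ *     __cilkrts_set_param("nworkers", "4");      // run Cilk code on 4 workers
+ *     __cilkrts_set_param("force reduce", "1");  // stress reducer callbacks
+ *
+ * Note that "nworkers" can only be changed before the runtime starts;
+ * afterwards the call returns __CILKRTS_SET_PARAM_LATE.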
+ */ +int cilkg_set_param(const char* param, const char* value); +#ifdef _WIN32 +/** + * @brief Implementation of __cilkrts_set_params for Unicode characters on + * Windows. See the documentation on @ref cilkg_set_param for more details. + * + * Set user controllable parameters + * @param param - string specifying parameter to be set + * @param value - string specifying new value + * @returns One of: CILKG_SET_PARAM_SUCCESS ( = 0), + * CILKG_SET_PARAM_UNIMP, CILKG_SET_PARAM_XRANGE, + * CILKG_SET_PARAM_INVALID, or CILKG_SET_PARAM_LATE. + */ +int cilkg_set_param_w(const wchar_t* param, const wchar_t* value); +#endif + +/** + * @brief implementation of __cilkrts_get_nworkers() + */ +static inline +int cilkg_get_nworkers(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->P; +} + +/** + * @brief implementation of __cilkrts_get_total_workers() + */ +static inline +int cilkg_get_total_workers(void) +{ + // "private" extern declaration + extern int cilkg_calc_total_workers(void); + + // This number can fluctate until initialization so we + // compute it from scratch + return cilkg_calc_total_workers(); +} + +/** + * @brief implementation of __cilkrts_get_force_reduce() + */ +static inline +int cilkg_get_force_reduce(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->force_reduce; +} + +/** + * @brief implementation of __cilkrts_get_stack_size() + */ +static inline +size_t cilkg_get_stack_size(void) +{ + // "private" extern declaration + extern global_state_t* cilkg_get_user_settable_values(void); + return cilkg_get_user_settable_values()->stack_size; +} + +/** + * @brief Run the scheduler function stored in the global_state + * + * Look up the scheduler function in global_state and run it. Report a fatal + * error if an exception escapes the scheduler function. + * + * @param w - Worker structure to associate with the current thread. + * + * @attention The scheduler field of the global state must be set before this + * function is called. + */ +void __cilkrts_run_scheduler_with_exceptions(__cilkrts_worker *w); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_GLOBAL_STATE_DOT_H) diff --git a/libcilkrts/runtime/jmpbuf.c b/libcilkrts/runtime/jmpbuf.c new file mode 100644 index 00000000000..39b51a593ce --- /dev/null +++ b/libcilkrts/runtime/jmpbuf.c @@ -0,0 +1,48 @@ +/* jmpbuf.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "jmpbuf.h" + +/* + * C99 requires that every inline function with external linkage have + * one extern declaration in the program. + */ +extern char *__cilkrts_get_sp(__cilkrts_stack_frame *sf); +extern ptrdiff_t __cilkrts_get_frame_size(__cilkrts_stack_frame *sf); + +/* End jmpbuf.c */ diff --git a/libcilkrts/runtime/jmpbuf.h b/libcilkrts/runtime/jmpbuf.h new file mode 100644 index 00000000000..60573f3a5fa --- /dev/null +++ b/libcilkrts/runtime/jmpbuf.h @@ -0,0 +1,136 @@ +/* jmpbuf.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file jmpbuf.h + * + * @brief Macros and functions to access the _JUMP_BUFFER initialized by a + * call to CILK_SETJMP before a cilk_spawn or cilk_sync. 
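+ * For example, __cilkrts_get_frame_size() below computes FP(sf) - SP(sf),
+ * the distance in bytes between the saved frame and stack pointers.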
The definition of + * CILK_SETJMP and CILK_LONGJMP are OS dependent and in abi.h + * + */ + +#ifndef INCLUDED_JMPBUF_DOT_H +#define INCLUDED_JMPBUF_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include <stddef.h> +#include <setjmp.h> + +#if 0 /* defined CILK_USE_C_SETJMP && defined JB_RSP */ +# define JMPBUF_SP(ctx) (ctx)[0].__jmpbuf[JB_RSP] +# define JMPBUF_FP(ctx) (ctx)[0].__jmpbuf[JB_RBP] +# define JMPBUF_PC(ctx) (ctx)[0].__jmpbuf[JB_PC] +#elif 0 /* defined CILK_USE_C_SETJMP && defined JB_SP */ +# define JMPBUF_SP(ctx) (ctx)[0].__jmpbuf[JB_SP] +# define JMPBUF_FP(ctx) (ctx)[0].__jmpbuf[JB_BP] +# define JMPBUF_PC(ctx) (ctx)[0].__jmpbuf[JB_PC] +#elif defined _WIN64 +# define JMPBUF_SP(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rsp +# define JMPBUF_FP(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rbp +# define JMPBUF_PC(ctx) ((_JUMP_BUFFER*)(&(ctx)))->Rip +#elif defined _WIN32 + /** Fetch stack pointer from a __cilkrts_stack_frame */ +# define JMPBUF_SP(ctx) (ctx).Esp + /** Fetch frame pointer from a __cilkrts_stack_frame */ +# define JMPBUF_FP(ctx) (ctx).Ebp + /** Fetch program counter from a __cilkrts_stack_frame */ +# define JMPBUF_PC(ctx) (ctx).Eip +#else /* defined __GNUC__ || defined __ICC */ + /* word 0 is frame address + * word 1 is resume address + * word 2 is stack address */ +# define JMPBUF_FP(ctx) (ctx)[0] +# define JMPBUF_PC(ctx) (ctx)[1] +# define JMPBUF_SP(ctx) (ctx)[2] +#endif + +/** + * @brief Get frame pointer from jump buffer in__cilkrts_stack_frame. + */ +#define FP(SF) JMPBUF_FP((SF)->ctx) + +/** + * @brief Get program counter from jump buffer in__cilkrts_stack_frame. + */ +#define PC(SF) JMPBUF_PC((SF)->ctx) + +/** + * @brief Get stack pointer from jump buffer in__cilkrts_stack_frame. + */ +#define SP(SF) JMPBUF_SP((SF)->ctx) + + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Fetch the stack pointer from a __cilkrts_stack_frame. The jmpbuf was + * initialized before a cilk_spawn or cilk_sync. + * + * @param sf __cilkrts_stack_frame containing the jmpbuf. + * + * @return the stack pointer from the ctx. + */ +inline char *__cilkrts_get_sp(__cilkrts_stack_frame *sf) +{ + return (char *)SP(sf); +} + +/** + * Calculate the frame size from __cilkrts_stack_frame. The jmpbuf was + * initialized before a cilk_spawn or cilk_sync. + * + * @warning Returning an arbitrary value on Windows! + * + * @param sf __cilkrts_stack_frame containing the jmpbuf. + * + * @return the stack pointer from the ctx. + */ +inline ptrdiff_t __cilkrts_get_frame_size(__cilkrts_stack_frame *sf) +{ +#ifdef _WIN32 + if (0 == SP(sf)) + return 256; // Arbitrary! +#endif + return (ptrdiff_t)FP(sf) - (ptrdiff_t)SP(sf); +} + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_JMPBUF_DOT_H) diff --git a/libcilkrts/runtime/linux-symbols.ver b/libcilkrts/runtime/linux-symbols.ver new file mode 100644 index 00000000000..aeb4a5fb13d --- /dev/null +++ b/libcilkrts/runtime/linux-symbols.ver @@ -0,0 +1,369 @@ +/* + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +CILKABI0 +{ + global: + __cilkrts_bind_thread; + __cilkrts_cilk_for_32; + __cilkrts_cilk_for_64; + __cilkrts_debugger_notification; + __cilkrts_dump_stats; + __cilkrts_end_cilk; + __cilkrts_enter_frame; + __cilkrts_enter_frame_fast; + __cilkrts_get_force_reduce; + __cilkrts_get_nworkers; + __cilkrts_get_tls_worker; + __cilkrts_get_tls_worker_fast; + __cilkrts_get_total_workers; + __cilkrts_get_worker_number; + __cilkrts_global_state; + __cilkrts_hyper_create; + __cilkrts_hyper_destroy; + __cilkrts_hyper_lookup; + __cilkrts_hyperobject_alloc; + __cilkrts_hyperobject_dealloc; + __cilkrts_hyperobject_noop_destroy; + __cilkrts_init; + __cilkrts_irml_version; + __cilkrts_leave_frame; + __cilkrts_metacall; + __cilkrts_rethrow; + __cilkrts_return_exception; + __cilkrts_set_param; + __cilkrts_sync; + __cilkrts_synched; + __cilkrts_worker_stub; + local: *; +}; + +CILKABI1 +{ + global: + __cilkrts_bind_thread_1; + __cilkrts_bump_loop_rank; + __cilkrts_bump_loop_rank_internal; + __cilkrts_bump_worker_rank; + __cilkrts_bump_worker_rank_internal; + __cilkrts_enter_frame_1; + __cilkrts_enter_frame_fast_1; + __cilkrts_get_pedigree_info; + __cilkrts_get_pedigree_internal; + __cilkrts_get_sf; + __cilkrts_get_stack_size; + __cilkrts_get_worker_rank; + __cilkrts_save_fp_ctrl_state; + __cilkrts_stack_alloc; + __cilkrts_stack_free; + __cilkrts_watch_stack; +} CILKABI0; + +CILKLIB1.02 +{ + global: + cilk_c_reducer_max_identity_char; + cilk_c_reducer_max_identity_double; + cilk_c_reducer_max_identity_float; + cilk_c_reducer_max_identity_int; + cilk_c_reducer_max_identity_long; + cilk_c_reducer_max_identity_longdouble; + cilk_c_reducer_max_identity_longlong; + cilk_c_reducer_max_identity_schar; + cilk_c_reducer_max_identity_short; + cilk_c_reducer_max_identity_uchar; + cilk_c_reducer_max_identity_uint; + cilk_c_reducer_max_identity_ulong; + cilk_c_reducer_max_identity_ulonglong; + cilk_c_reducer_max_identity_unsigned; + cilk_c_reducer_max_identity_ushort; + cilk_c_reducer_max_identity_wchar_t; + cilk_c_reducer_max_index_identity_char; + cilk_c_reducer_max_index_identity_double; + cilk_c_reducer_max_index_identity_float; + cilk_c_reducer_max_index_identity_int; + cilk_c_reducer_max_index_identity_long; + 
cilk_c_reducer_max_index_identity_longdouble; + cilk_c_reducer_max_index_identity_longlong; + cilk_c_reducer_max_index_identity_schar; + cilk_c_reducer_max_index_identity_short; + cilk_c_reducer_max_index_identity_uchar; + cilk_c_reducer_max_index_identity_uint; + cilk_c_reducer_max_index_identity_ulong; + cilk_c_reducer_max_index_identity_ulonglong; + cilk_c_reducer_max_index_identity_unsigned; + cilk_c_reducer_max_index_identity_ushort; + cilk_c_reducer_max_index_identity_wchar_t; + cilk_c_reducer_max_index_reduce_char; + cilk_c_reducer_max_index_reduce_double; + cilk_c_reducer_max_index_reduce_float; + cilk_c_reducer_max_index_reduce_int; + cilk_c_reducer_max_index_reduce_long; + cilk_c_reducer_max_index_reduce_longdouble; + cilk_c_reducer_max_index_reduce_longlong; + cilk_c_reducer_max_index_reduce_schar; + cilk_c_reducer_max_index_reduce_short; + cilk_c_reducer_max_index_reduce_uchar; + cilk_c_reducer_max_index_reduce_uint; + cilk_c_reducer_max_index_reduce_ulong; + cilk_c_reducer_max_index_reduce_ulonglong; + cilk_c_reducer_max_index_reduce_unsigned; + cilk_c_reducer_max_index_reduce_ushort; + cilk_c_reducer_max_index_reduce_wchar_t; + cilk_c_reducer_max_reduce_char; + cilk_c_reducer_max_reduce_double; + cilk_c_reducer_max_reduce_float; + cilk_c_reducer_max_reduce_int; + cilk_c_reducer_max_reduce_long; + cilk_c_reducer_max_reduce_longdouble; + cilk_c_reducer_max_reduce_longlong; + cilk_c_reducer_max_reduce_schar; + cilk_c_reducer_max_reduce_short; + cilk_c_reducer_max_reduce_uchar; + cilk_c_reducer_max_reduce_uint; + cilk_c_reducer_max_reduce_ulong; + cilk_c_reducer_max_reduce_ulonglong; + cilk_c_reducer_max_reduce_unsigned; + cilk_c_reducer_max_reduce_ushort; + cilk_c_reducer_max_reduce_wchar_t; + cilk_c_reducer_min_identity_char; + cilk_c_reducer_min_identity_double; + cilk_c_reducer_min_identity_float; + cilk_c_reducer_min_identity_int; + cilk_c_reducer_min_identity_long; + cilk_c_reducer_min_identity_longdouble; + cilk_c_reducer_min_identity_longlong; + cilk_c_reducer_min_identity_schar; + cilk_c_reducer_min_identity_short; + cilk_c_reducer_min_identity_uchar; + cilk_c_reducer_min_identity_uint; + cilk_c_reducer_min_identity_ulong; + cilk_c_reducer_min_identity_ulonglong; + cilk_c_reducer_min_identity_unsigned; + cilk_c_reducer_min_identity_ushort; + cilk_c_reducer_min_identity_wchar_t; + cilk_c_reducer_min_index_identity_char; + cilk_c_reducer_min_index_identity_double; + cilk_c_reducer_min_index_identity_float; + cilk_c_reducer_min_index_identity_int; + cilk_c_reducer_min_index_identity_long; + cilk_c_reducer_min_index_identity_longdouble; + cilk_c_reducer_min_index_identity_longlong; + cilk_c_reducer_min_index_identity_schar; + cilk_c_reducer_min_index_identity_short; + cilk_c_reducer_min_index_identity_uchar; + cilk_c_reducer_min_index_identity_uint; + cilk_c_reducer_min_index_identity_ulong; + cilk_c_reducer_min_index_identity_ulonglong; + cilk_c_reducer_min_index_identity_unsigned; + cilk_c_reducer_min_index_identity_ushort; + cilk_c_reducer_min_index_identity_wchar_t; + cilk_c_reducer_min_index_reduce_char; + cilk_c_reducer_min_index_reduce_double; + cilk_c_reducer_min_index_reduce_float; + cilk_c_reducer_min_index_reduce_int; + cilk_c_reducer_min_index_reduce_long; + cilk_c_reducer_min_index_reduce_longdouble; + cilk_c_reducer_min_index_reduce_longlong; + cilk_c_reducer_min_index_reduce_schar; + cilk_c_reducer_min_index_reduce_short; + cilk_c_reducer_min_index_reduce_uchar; + cilk_c_reducer_min_index_reduce_uint; + cilk_c_reducer_min_index_reduce_ulong; + 
cilk_c_reducer_min_index_reduce_ulonglong; + cilk_c_reducer_min_index_reduce_unsigned; + cilk_c_reducer_min_index_reduce_ushort; + cilk_c_reducer_min_index_reduce_wchar_t; + cilk_c_reducer_min_reduce_char; + cilk_c_reducer_min_reduce_double; + cilk_c_reducer_min_reduce_float; + cilk_c_reducer_min_reduce_int; + cilk_c_reducer_min_reduce_long; + cilk_c_reducer_min_reduce_longdouble; + cilk_c_reducer_min_reduce_longlong; + cilk_c_reducer_min_reduce_schar; + cilk_c_reducer_min_reduce_short; + cilk_c_reducer_min_reduce_uchar; + cilk_c_reducer_min_reduce_uint; + cilk_c_reducer_min_reduce_ulong; + cilk_c_reducer_min_reduce_ulonglong; + cilk_c_reducer_min_reduce_unsigned; + cilk_c_reducer_min_reduce_ushort; + cilk_c_reducer_min_reduce_wchar_t; + cilk_c_reducer_opadd_identity_char; + cilk_c_reducer_opadd_identity_double; + cilk_c_reducer_opadd_identity_float; + cilk_c_reducer_opadd_identity_int; + cilk_c_reducer_opadd_identity_long; + cilk_c_reducer_opadd_identity_longdouble; + cilk_c_reducer_opadd_identity_longlong; + cilk_c_reducer_opadd_identity_schar; + cilk_c_reducer_opadd_identity_short; + cilk_c_reducer_opadd_identity_uchar; + cilk_c_reducer_opadd_identity_uint; + cilk_c_reducer_opadd_identity_ulong; + cilk_c_reducer_opadd_identity_ulonglong; + cilk_c_reducer_opadd_identity_unsigned; + cilk_c_reducer_opadd_identity_ushort; + cilk_c_reducer_opadd_identity_wchar_t; + cilk_c_reducer_opadd_reduce_char; + cilk_c_reducer_opadd_reduce_double; + cilk_c_reducer_opadd_reduce_float; + cilk_c_reducer_opadd_reduce_int; + cilk_c_reducer_opadd_reduce_long; + cilk_c_reducer_opadd_reduce_longdouble; + cilk_c_reducer_opadd_reduce_longlong; + cilk_c_reducer_opadd_reduce_schar; + cilk_c_reducer_opadd_reduce_short; + cilk_c_reducer_opadd_reduce_uchar; + cilk_c_reducer_opadd_reduce_uint; + cilk_c_reducer_opadd_reduce_ulong; + cilk_c_reducer_opadd_reduce_ulonglong; + cilk_c_reducer_opadd_reduce_unsigned; + cilk_c_reducer_opadd_reduce_ushort; + cilk_c_reducer_opadd_reduce_wchar_t; + cilk_c_reducer_opand_identity_char; + cilk_c_reducer_opand_identity_int; + cilk_c_reducer_opand_identity_long; + cilk_c_reducer_opand_identity_longlong; + cilk_c_reducer_opand_identity_schar; + cilk_c_reducer_opand_identity_short; + cilk_c_reducer_opand_identity_uchar; + cilk_c_reducer_opand_identity_uint; + cilk_c_reducer_opand_identity_ulong; + cilk_c_reducer_opand_identity_ulonglong; + cilk_c_reducer_opand_identity_unsigned; + cilk_c_reducer_opand_identity_ushort; + cilk_c_reducer_opand_identity_wchar_t; + cilk_c_reducer_opand_reduce_char; + cilk_c_reducer_opand_reduce_int; + cilk_c_reducer_opand_reduce_long; + cilk_c_reducer_opand_reduce_longlong; + cilk_c_reducer_opand_reduce_schar; + cilk_c_reducer_opand_reduce_short; + cilk_c_reducer_opand_reduce_uchar; + cilk_c_reducer_opand_reduce_uint; + cilk_c_reducer_opand_reduce_ulong; + cilk_c_reducer_opand_reduce_ulonglong; + cilk_c_reducer_opand_reduce_unsigned; + cilk_c_reducer_opand_reduce_ushort; + cilk_c_reducer_opand_reduce_wchar_t; + cilk_c_reducer_opmul_identity_char; + cilk_c_reducer_opmul_identity_double; + cilk_c_reducer_opmul_identity_float; + cilk_c_reducer_opmul_identity_int; + cilk_c_reducer_opmul_identity_long; + cilk_c_reducer_opmul_identity_longdouble; + cilk_c_reducer_opmul_identity_longlong; + cilk_c_reducer_opmul_identity_schar; + cilk_c_reducer_opmul_identity_short; + cilk_c_reducer_opmul_identity_uchar; + cilk_c_reducer_opmul_identity_uint; + cilk_c_reducer_opmul_identity_ulong; + cilk_c_reducer_opmul_identity_ulonglong; + cilk_c_reducer_opmul_identity_unsigned; + 
cilk_c_reducer_opmul_identity_ushort; + cilk_c_reducer_opmul_identity_wchar_t; + cilk_c_reducer_opmul_reduce_char; + cilk_c_reducer_opmul_reduce_double; + cilk_c_reducer_opmul_reduce_float; + cilk_c_reducer_opmul_reduce_int; + cilk_c_reducer_opmul_reduce_long; + cilk_c_reducer_opmul_reduce_longdouble; + cilk_c_reducer_opmul_reduce_longlong; + cilk_c_reducer_opmul_reduce_schar; + cilk_c_reducer_opmul_reduce_short; + cilk_c_reducer_opmul_reduce_uchar; + cilk_c_reducer_opmul_reduce_uint; + cilk_c_reducer_opmul_reduce_ulong; + cilk_c_reducer_opmul_reduce_ulonglong; + cilk_c_reducer_opmul_reduce_unsigned; + cilk_c_reducer_opmul_reduce_ushort; + cilk_c_reducer_opmul_reduce_wchar_t; + cilk_c_reducer_opor_identity_char; + cilk_c_reducer_opor_identity_int; + cilk_c_reducer_opor_identity_long; + cilk_c_reducer_opor_identity_longlong; + cilk_c_reducer_opor_identity_schar; + cilk_c_reducer_opor_identity_short; + cilk_c_reducer_opor_identity_uchar; + cilk_c_reducer_opor_identity_uint; + cilk_c_reducer_opor_identity_ulong; + cilk_c_reducer_opor_identity_ulonglong; + cilk_c_reducer_opor_identity_unsigned; + cilk_c_reducer_opor_identity_ushort; + cilk_c_reducer_opor_identity_wchar_t; + cilk_c_reducer_opor_reduce_char; + cilk_c_reducer_opor_reduce_int; + cilk_c_reducer_opor_reduce_long; + cilk_c_reducer_opor_reduce_longlong; + cilk_c_reducer_opor_reduce_schar; + cilk_c_reducer_opor_reduce_short; + cilk_c_reducer_opor_reduce_uchar; + cilk_c_reducer_opor_reduce_uint; + cilk_c_reducer_opor_reduce_ulong; + cilk_c_reducer_opor_reduce_ulonglong; + cilk_c_reducer_opor_reduce_unsigned; + cilk_c_reducer_opor_reduce_ushort; + cilk_c_reducer_opor_reduce_wchar_t; + cilk_c_reducer_opxor_identity_char; + cilk_c_reducer_opxor_identity_int; + cilk_c_reducer_opxor_identity_long; + cilk_c_reducer_opxor_identity_longlong; + cilk_c_reducer_opxor_identity_schar; + cilk_c_reducer_opxor_identity_short; + cilk_c_reducer_opxor_identity_uchar; + cilk_c_reducer_opxor_identity_uint; + cilk_c_reducer_opxor_identity_ulong; + cilk_c_reducer_opxor_identity_ulonglong; + cilk_c_reducer_opxor_identity_unsigned; + cilk_c_reducer_opxor_identity_ushort; + cilk_c_reducer_opxor_identity_wchar_t; + cilk_c_reducer_opxor_reduce_char; + cilk_c_reducer_opxor_reduce_int; + cilk_c_reducer_opxor_reduce_long; + cilk_c_reducer_opxor_reduce_longlong; + cilk_c_reducer_opxor_reduce_schar; + cilk_c_reducer_opxor_reduce_short; + cilk_c_reducer_opxor_reduce_uchar; + cilk_c_reducer_opxor_reduce_uint; + cilk_c_reducer_opxor_reduce_ulong; + cilk_c_reducer_opxor_reduce_ulonglong; + cilk_c_reducer_opxor_reduce_unsigned; + cilk_c_reducer_opxor_reduce_ushort; + cilk_c_reducer_opxor_reduce_wchar_t; +}; diff --git a/libcilkrts/runtime/local_state.c b/libcilkrts/runtime/local_state.c new file mode 100644 index 00000000000..14ac8271936 --- /dev/null +++ b/libcilkrts/runtime/local_state.c @@ -0,0 +1,68 @@ +/* local_state.c -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "local_state.h" +#include "bug.h" +#include "full_frame.h" + +void run_scheduling_stack_fcn(__cilkrts_worker *w) +{ + scheduling_stack_fcn_t fcn = w->l->post_suspend; + full_frame *ff2 = w->l->frame_ff; + __cilkrts_stack_frame *sf2 = w->l->suspended_stack; + + w->l->post_suspend = 0; + w->l->suspended_stack = 0; + + // Conceptually, after clearing w->l->frame_ff, + // w no longer owns the full frame ff. + // The next time another (possibly different) worker takes + // ownership of ff will be at a provably_good_steal on ff. + w->l->frame_ff = NULL; + + CILK_ASSERT(fcn); + CILK_ASSERT(ff2); + fcn(w, ff2, sf2); + + // After we run the scheduling stack function, we should + // (still) not have a full frame. + CILK_ASSERT(NULL == w->l->frame_ff); +} + +/* End local_state.c */ diff --git a/libcilkrts/runtime/local_state.h b/libcilkrts/runtime/local_state.h new file mode 100644 index 00000000000..03f39897f51 --- /dev/null +++ b/libcilkrts/runtime/local_state.h @@ -0,0 +1,424 @@ +/* local_state.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file local_state.h + * + * @brief The local_state structure contains additional OS-independent + * information that's associated with a worker, but doesn't need to be visible + * to the code generated by the compiler. + */ + +#ifndef INCLUDED_LOCAL_STATE_DOT_H +#define INCLUDED_LOCAL_STATE_DOT_H + +#include <internal/abi.h> +#include "worker_mutex.h" +#include "global_state.h" +#include "record-replay.h" +#include "signal_node.h" + +#include <setjmp.h> +#include <stddef.h> +#include <stdio.h> + + +#ifndef _WIN32 +# include <pthread.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/* Opaque types. */ + +struct full_frame; +struct free_list; +struct pending_exception_info; +/// Opaque type for replay entry. +typedef struct replay_entry_t replay_entry_t; + +/** + * @brief Magic numbers for local_state, used for debugging + */ +typedef unsigned long long ls_magic_t; + +/** + * @brief Scheduling stack function: A function that is decided on the program stack, + * but that must be executed on the scheduling stack. + */ +typedef void (*scheduling_stack_fcn_t) (__cilkrts_worker *w, + struct full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief Type of this worker. + **/ +typedef enum cilk_worker_type +{ + WORKER_FREE, ///< Unused worker - available to be bound to user threads + WORKER_SYSTEM, ///< Worker created by runtime - able to steal from any worker + WORKER_USER ///< User thread - able to steal only from team members +} cilk_worker_type; + + +/** + * @brief The local_state structure contains additional OS-independent + * information that's associated with a worker, but doesn't need to be + * visible to the compiler. + * + * No compiler-generated code should need to know the layout of this + * structure. + * + * The fields of this struct can be classified as either local or + * shared. + * + * Local: This field is only accessed by the thread bound to this + * worker struct. Local fields can be freely accessed without + * acquiring locks. + * + * Shared: This field may be accessed by multiple worker threads. + * Accesses to shared fields usually requires locks, except in + * special situations where one can prove that locks are + * unnecessary. + * + * The fields of this can also be classified as "read-only" if the + * field does not change after it is initialized. Otherwise, the + * field is "read/write". Read-only fields do not require locks to + * access (ignoring the synchronization that might be needed for + * initialization if this can occur in parallel). + * + * Finally, we explicitly classify some fields as "synchronization" + * fields if they are used as part of a synchronization protocol in + * the runtime. These variables are generally shared and read/write. + * Mostly, this category includes lock variables and other variables + * that are involved in synchronization protocols (i.e., the THE + * protocol). 
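A minimal sketch of the access discipline described above (not part of this commit; lock_worker()/unlock_worker() are hypothetical stand-ins for whatever the runtime actually does with w->l->lock, and the two fields used are the local and shared examples from the struct below):

#include <internal/abi.h>
#include "local_state.h"

/* Hypothetical placeholders for the real worker locking. */
static void lock_worker(__cilkrts_worker *w)   { (void)w; }
static void unlock_worker(__cilkrts_worker *w) { (void)w; }

static void example_field_discipline(__cilkrts_worker *self,
                                     __cilkrts_worker *victim)
{
    /* [local read/write] field: only the bound thread touches it, no lock. */
    self->l->steal_failure_count++;

    /* [shared read/write] field of another worker: hold that worker's
       lock while reading or writing it. */
    lock_worker(victim);
    struct full_frame *ff = victim->l->frame_ff;
    unlock_worker(victim);
    (void)ff;
}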
+ */ +struct local_state /* COMMON_PORTABLE */ +{ + /** This value should be in the first field in any local_state */ +# define WORKER_MAGIC_0 ((ls_magic_t)0xe0831a4a940c60b8ULL) + + /** + * Should be WORKER_MAGIC_0 or the local_state has been corrupted + * This magic field is shared because it is read on lock acquisitions. + * + * [shared read-only] + */ + ls_magic_t worker_magic_0; + + /** + * Mutex used to serialize access to the local_state + * Synchronization field. [shared read/write] + */ + struct mutex lock; + + /** + * Flag that indicates that the worker is interested in grabbing + * LOCK, and thus thieves should leave the worker alone. + * Written only by self, may be read by others. + * + * Synchronization field. [shared read/write] + */ + int do_not_steal; + + /** + * Lock that all thieves grab in order to compete for the right + * to disturb this worker. + * + * Synchronization field. [shared read/write] + */ + struct mutex steal_lock; + + /** + * Full frame that the worker is working on. + * + * While a worker w is executing, a thief may change + * w->l->frame_ff (on a successful steal) after acquiring w's + * lock. + * + * Unlocked accesses to w->l->frame_ff are safe (by w itself) when + * w's deque is empty, or when stealing from w has been disabled. + * + * [shared read/write] + */ + struct full_frame *frame_ff; + + /** + * Full frame that the worker will be working on next + * + * This field is normally local for a worker w. Another worker v + * may modify w->l->next_frame_ff, however, in the special case + * when v is returning a frame to a user thread w since w is the + * team leader. + * + * [shared read/write] + */ + struct full_frame *next_frame_ff; + + /** + * This is set iff this is a WORKER_USER and there has been a steal. It + * points to the first frame that was stolen since the team was last fully + * sync'd. Only this worker may continue past a sync in this function. + * + * This field is set by a thief for a victim that is a user + * thread, while holding the victim's lock. + * It can be cleared without a lock by the worker that will + * continue exuecting past the sync. + * + * [shared read/write] + */ + struct full_frame *last_full_frame; + + /** + * Team on which this worker is a participant. When a user worker enters, + * its team is its own worker struct and it can never change teams. When a + * system worker steals, it adopts the team of its victim. + * + * When a system worker w steals, it reads victim->l->team and + * joins this team. w->l->team is constant until the next time w + * returns control to the runtime. + * We must acquire the worker lock to change w->l->team. + * + * @note This field is 64-byte aligned because it is the first in + * the group of shared read-only fields. We want this group to + * fall on a different cache line from the previous group, which + * is shared read-write. + * + * [shared read-only] + */ + __attribute__((aligned(64))) + __cilkrts_worker *team; + + /** + * Type of this worker + * + * This field changes only when a worker binds or unbinds. + * Otherwise, the field is read-only while the worker is bound. + * + * [shared read-only] + */ + cilk_worker_type type; + + /** + * Lazy task queue of this worker - an array of pointers to stack frames. + * + * Read-only because deques are a fixed size in the current + * implementation. + * + * @note This field is 64-byte aligned because it is the first in + * the group of local fields. 
We want this group to fall on a + * different cache line from the previous group, which is shared + * read-only. + * + * [local read-only] + */ + __attribute__((aligned(64))) + __cilkrts_stack_frame **ltq; + + /** + * Pool of fibers waiting to be reused. + * [local read/write] + */ + cilk_fiber_pool fiber_pool; + + /** + * The fiber for the scheduling stacks. + * [local read/write] + */ + cilk_fiber* scheduling_fiber; + + /** + * Saved pointer to the leaf node in thread-local storage, when a + * user thread is imported. This pointer gets set to a + * meaningful value when binding a user thread, and cleared on + * unbind. + * + * [local read/write] + */ + __cilkrts_pedigree* original_pedigree_leaf; + + /** + * State of the random number generator + * + * [local read/write] + */ + unsigned rand_seed; + + /** + * Function to execute after transferring onto the scheduling stack. + * + * [local read/write] + */ + scheduling_stack_fcn_t post_suspend; + + /** + * __cilkrts_stack_frame we suspended when we transferred onto the + * scheduling stack. + * + * [local read/write] + */ + __cilkrts_stack_frame *suspended_stack; + + /** + * cilk_fiber that should be freed after returning from a + * spawn with a stolen parent or after stalling at a sync. + + * We calculate the stack to free when executing a reduction on + * the user stack, but we can not actually release the stack + * until control longjmps onto a runtime scheduling stack. + * + * This field is used to pass information to the runtime across + * the longjmp onto the scheduling stack. + * + * [local read/write] + */ + cilk_fiber* fiber_to_free; + + /** + * Saved exception object for an exception that is being passed to + * our parent + * + * [local read/write] + */ + struct pending_exception_info *pending_exception; + + /** + * Buckets for the memory allocator + * + * [local read/write] + */ + struct free_list *free_list[FRAME_MALLOC_NBUCKETS]; + + /** + * Potential function for the memory allocator + * + * [local read/write] + */ + size_t bucket_potential[FRAME_MALLOC_NBUCKETS]; + + /** + * Support for statistics + * + * Useful only when CILK_PROFIlE is compiled in. + * [local read/write] + */ + statistics* stats; + + /** + * Count indicates number of failures since last successful steal. This is + * used by the scheduler to reduce contention on shared flags. + * + * [local read/write] + */ + unsigned int steal_failure_count; + + /** + * 1 if work was stolen from another worker. When true, this will flag + * setup_for_execution_pedigree to increment the pedigree when we resume + * execution to match the increment that would have been done on a return + * from a spawn helper. + * + * [local read/write] + */ + int work_stolen; + + /** + * File pointer for record or replay + * Does FILE * work on Windows? + * During record, the file will be opened in write-only mode. + * During replay, the file will be opened in read-only mode. + * + * [local read/write] + */ + FILE *record_replay_fptr; + + /** + * Root of array of replay entries - NULL if we're not replaying a log + * + * [local read/write] + */ + replay_entry_t *replay_list_root; + + /** + * Current replay entry - NULL if we're not replaying a log + * + * [local read/write] + */ + replay_entry_t *replay_list_entry; + + /** + * Separate the signal_node from other things in the local_state by the + * sizeof a cache line for performance reasons. + * + * unused + */ + char buf[64]; + + /** + * Signal object for waking/sleeping the worker. 
This should be a pointer + * to avoid the possibility of caching problems. + * + * [shared read-only] + */ + signal_node_t *signal_node; + + /** This value should be in the last field in any local_state */ +# define WORKER_MAGIC_1 ((ls_magic_t)0x16164afb0ea0dff9ULL) + + /** + * Should be WORKER_MAGIC_1 or the local_state has been corrupted + * This magic field is shared because it is read on lock acquisitions. + * [shared read-only] + */ + ls_magic_t worker_magic_1; +}; + +/** + * Perform cleanup according to the function set before the longjmp(). + * + * Call this after longjmp() has completed and the worker is back on a + * scheduling stack. + * + * @param w __cilkrts_worker currently executing. + */ +void run_scheduling_stack_fcn(__cilkrts_worker *w); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_LOCAL_STATE_DOT_H) diff --git a/libcilkrts/runtime/mac-symbols.txt b/libcilkrts/runtime/mac-symbols.txt new file mode 100644 index 00000000000..38d83a8675d --- /dev/null +++ b/libcilkrts/runtime/mac-symbols.txt @@ -0,0 +1,318 @@ +# Exported symbol list: +___cilkrts_bind_thread +___cilkrts_bind_thread_1 +___cilkrts_bump_loop_rank +___cilkrts_bump_loop_rank_internal +___cilkrts_bump_worker_rank +___cilkrts_bump_worker_rank_internal +___cilkrts_cilk_for_32 +___cilkrts_cilk_for_64 +___cilkrts_debugger_notification +___cilkrts_dump_stats +___cilkrts_end_cilk +___cilkrts_enter_frame +___cilkrts_enter_frame_1 +___cilkrts_enter_frame_fast +___cilkrts_enter_frame_fast_1 +___cilkrts_get_force_reduce +___cilkrts_get_nworkers +___cilkrts_get_pedigree_info +___cilkrts_get_pedigree_internal +___cilkrts_get_sf +___cilkrts_get_stack_size +___cilkrts_get_tls_worker +___cilkrts_get_tls_worker_fast +___cilkrts_get_total_workers +___cilkrts_get_worker_number +___cilkrts_get_worker_rank +___cilkrts_global_state +___cilkrts_hyper_create +___cilkrts_hyper_destroy +___cilkrts_hyper_lookup +___cilkrts_hyperobject_alloc +___cilkrts_hyperobject_dealloc +___cilkrts_hyperobject_noop_destroy +___cilkrts_init +___cilkrts_irml_version +___cilkrts_leave_frame +___cilkrts_metacall +___cilkrts_rethrow +___cilkrts_return_exception +___cilkrts_save_fp_ctrl_state +___cilkrts_set_param +___cilkrts_stack_alloc +___cilkrts_stack_free +___cilkrts_sync +___cilkrts_synched +___cilkrts_watch_stack +___cilkrts_worker_stub +_cilk_c_reducer_max_identity_char +_cilk_c_reducer_max_identity_double +_cilk_c_reducer_max_identity_float +_cilk_c_reducer_max_identity_int +_cilk_c_reducer_max_identity_long +_cilk_c_reducer_max_identity_longdouble +_cilk_c_reducer_max_identity_longlong +_cilk_c_reducer_max_identity_schar +_cilk_c_reducer_max_identity_short +_cilk_c_reducer_max_identity_uchar +_cilk_c_reducer_max_identity_uint +_cilk_c_reducer_max_identity_ulong +_cilk_c_reducer_max_identity_ulonglong +_cilk_c_reducer_max_identity_unsigned +_cilk_c_reducer_max_identity_ushort +_cilk_c_reducer_max_identity_wchar_t +_cilk_c_reducer_max_index_identity_char +_cilk_c_reducer_max_index_identity_double +_cilk_c_reducer_max_index_identity_float +_cilk_c_reducer_max_index_identity_int +_cilk_c_reducer_max_index_identity_long +_cilk_c_reducer_max_index_identity_longdouble +_cilk_c_reducer_max_index_identity_longlong +_cilk_c_reducer_max_index_identity_schar +_cilk_c_reducer_max_index_identity_short +_cilk_c_reducer_max_index_identity_uchar +_cilk_c_reducer_max_index_identity_uint +_cilk_c_reducer_max_index_identity_ulong +_cilk_c_reducer_max_index_identity_ulonglong +_cilk_c_reducer_max_index_identity_unsigned 
+_cilk_c_reducer_max_index_identity_ushort +_cilk_c_reducer_max_index_identity_wchar_t +_cilk_c_reducer_max_index_reduce_char +_cilk_c_reducer_max_index_reduce_double +_cilk_c_reducer_max_index_reduce_float +_cilk_c_reducer_max_index_reduce_int +_cilk_c_reducer_max_index_reduce_long +_cilk_c_reducer_max_index_reduce_longdouble +_cilk_c_reducer_max_index_reduce_longlong +_cilk_c_reducer_max_index_reduce_schar +_cilk_c_reducer_max_index_reduce_short +_cilk_c_reducer_max_index_reduce_uchar +_cilk_c_reducer_max_index_reduce_uint +_cilk_c_reducer_max_index_reduce_ulong +_cilk_c_reducer_max_index_reduce_ulonglong +_cilk_c_reducer_max_index_reduce_unsigned +_cilk_c_reducer_max_index_reduce_ushort +_cilk_c_reducer_max_index_reduce_wchar_t +_cilk_c_reducer_max_reduce_char +_cilk_c_reducer_max_reduce_double +_cilk_c_reducer_max_reduce_float +_cilk_c_reducer_max_reduce_int +_cilk_c_reducer_max_reduce_long +_cilk_c_reducer_max_reduce_longdouble +_cilk_c_reducer_max_reduce_longlong +_cilk_c_reducer_max_reduce_schar +_cilk_c_reducer_max_reduce_short +_cilk_c_reducer_max_reduce_uchar +_cilk_c_reducer_max_reduce_uint +_cilk_c_reducer_max_reduce_ulong +_cilk_c_reducer_max_reduce_ulonglong +_cilk_c_reducer_max_reduce_unsigned +_cilk_c_reducer_max_reduce_ushort +_cilk_c_reducer_max_reduce_wchar_t +_cilk_c_reducer_min_identity_char +_cilk_c_reducer_min_identity_double +_cilk_c_reducer_min_identity_float +_cilk_c_reducer_min_identity_int +_cilk_c_reducer_min_identity_long +_cilk_c_reducer_min_identity_longdouble +_cilk_c_reducer_min_identity_longlong +_cilk_c_reducer_min_identity_schar +_cilk_c_reducer_min_identity_short +_cilk_c_reducer_min_identity_uchar +_cilk_c_reducer_min_identity_uint +_cilk_c_reducer_min_identity_ulong +_cilk_c_reducer_min_identity_ulonglong +_cilk_c_reducer_min_identity_unsigned +_cilk_c_reducer_min_identity_ushort +_cilk_c_reducer_min_identity_wchar_t +_cilk_c_reducer_min_index_identity_char +_cilk_c_reducer_min_index_identity_double +_cilk_c_reducer_min_index_identity_float +_cilk_c_reducer_min_index_identity_int +_cilk_c_reducer_min_index_identity_long +_cilk_c_reducer_min_index_identity_longdouble +_cilk_c_reducer_min_index_identity_longlong +_cilk_c_reducer_min_index_identity_schar +_cilk_c_reducer_min_index_identity_short +_cilk_c_reducer_min_index_identity_uchar +_cilk_c_reducer_min_index_identity_uint +_cilk_c_reducer_min_index_identity_ulong +_cilk_c_reducer_min_index_identity_ulonglong +_cilk_c_reducer_min_index_identity_unsigned +_cilk_c_reducer_min_index_identity_ushort +_cilk_c_reducer_min_index_identity_wchar_t +_cilk_c_reducer_min_index_reduce_char +_cilk_c_reducer_min_index_reduce_double +_cilk_c_reducer_min_index_reduce_float +_cilk_c_reducer_min_index_reduce_int +_cilk_c_reducer_min_index_reduce_long +_cilk_c_reducer_min_index_reduce_longdouble +_cilk_c_reducer_min_index_reduce_longlong +_cilk_c_reducer_min_index_reduce_schar +_cilk_c_reducer_min_index_reduce_short +_cilk_c_reducer_min_index_reduce_uchar +_cilk_c_reducer_min_index_reduce_uint +_cilk_c_reducer_min_index_reduce_ulong +_cilk_c_reducer_min_index_reduce_ulonglong +_cilk_c_reducer_min_index_reduce_unsigned +_cilk_c_reducer_min_index_reduce_ushort +_cilk_c_reducer_min_index_reduce_wchar_t +_cilk_c_reducer_min_reduce_char +_cilk_c_reducer_min_reduce_double +_cilk_c_reducer_min_reduce_float +_cilk_c_reducer_min_reduce_int +_cilk_c_reducer_min_reduce_long +_cilk_c_reducer_min_reduce_longdouble +_cilk_c_reducer_min_reduce_longlong +_cilk_c_reducer_min_reduce_schar +_cilk_c_reducer_min_reduce_short 
+_cilk_c_reducer_min_reduce_uchar +_cilk_c_reducer_min_reduce_uint +_cilk_c_reducer_min_reduce_ulong +_cilk_c_reducer_min_reduce_ulonglong +_cilk_c_reducer_min_reduce_unsigned +_cilk_c_reducer_min_reduce_ushort +_cilk_c_reducer_min_reduce_wchar_t +_cilk_c_reducer_opadd_identity_char +_cilk_c_reducer_opadd_identity_double +_cilk_c_reducer_opadd_identity_float +_cilk_c_reducer_opadd_identity_int +_cilk_c_reducer_opadd_identity_long +_cilk_c_reducer_opadd_identity_longdouble +_cilk_c_reducer_opadd_identity_longlong +_cilk_c_reducer_opadd_identity_schar +_cilk_c_reducer_opadd_identity_short +_cilk_c_reducer_opadd_identity_uchar +_cilk_c_reducer_opadd_identity_uint +_cilk_c_reducer_opadd_identity_ulong +_cilk_c_reducer_opadd_identity_ulonglong +_cilk_c_reducer_opadd_identity_unsigned +_cilk_c_reducer_opadd_identity_ushort +_cilk_c_reducer_opadd_identity_wchar_t +_cilk_c_reducer_opadd_reduce_char +_cilk_c_reducer_opadd_reduce_double +_cilk_c_reducer_opadd_reduce_float +_cilk_c_reducer_opadd_reduce_int +_cilk_c_reducer_opadd_reduce_long +_cilk_c_reducer_opadd_reduce_longdouble +_cilk_c_reducer_opadd_reduce_longlong +_cilk_c_reducer_opadd_reduce_schar +_cilk_c_reducer_opadd_reduce_short +_cilk_c_reducer_opadd_reduce_uchar +_cilk_c_reducer_opadd_reduce_uint +_cilk_c_reducer_opadd_reduce_ulong +_cilk_c_reducer_opadd_reduce_ulonglong +_cilk_c_reducer_opadd_reduce_unsigned +_cilk_c_reducer_opadd_reduce_ushort +_cilk_c_reducer_opadd_reduce_wchar_t +_cilk_c_reducer_opand_identity_char +_cilk_c_reducer_opand_identity_int +_cilk_c_reducer_opand_identity_long +_cilk_c_reducer_opand_identity_longlong +_cilk_c_reducer_opand_identity_schar +_cilk_c_reducer_opand_identity_short +_cilk_c_reducer_opand_identity_uchar +_cilk_c_reducer_opand_identity_uint +_cilk_c_reducer_opand_identity_ulong +_cilk_c_reducer_opand_identity_ulonglong +_cilk_c_reducer_opand_identity_unsigned +_cilk_c_reducer_opand_identity_ushort +_cilk_c_reducer_opand_identity_wchar_t +_cilk_c_reducer_opand_reduce_char +_cilk_c_reducer_opand_reduce_int +_cilk_c_reducer_opand_reduce_long +_cilk_c_reducer_opand_reduce_longlong +_cilk_c_reducer_opand_reduce_schar +_cilk_c_reducer_opand_reduce_short +_cilk_c_reducer_opand_reduce_uchar +_cilk_c_reducer_opand_reduce_uint +_cilk_c_reducer_opand_reduce_ulong +_cilk_c_reducer_opand_reduce_ulonglong +_cilk_c_reducer_opand_reduce_unsigned +_cilk_c_reducer_opand_reduce_ushort +_cilk_c_reducer_opand_reduce_wchar_t +_cilk_c_reducer_opmul_identity_char +_cilk_c_reducer_opmul_identity_double +_cilk_c_reducer_opmul_identity_float +_cilk_c_reducer_opmul_identity_int +_cilk_c_reducer_opmul_identity_long +_cilk_c_reducer_opmul_identity_longdouble +_cilk_c_reducer_opmul_identity_longlong +_cilk_c_reducer_opmul_identity_schar +_cilk_c_reducer_opmul_identity_short +_cilk_c_reducer_opmul_identity_uchar +_cilk_c_reducer_opmul_identity_uint +_cilk_c_reducer_opmul_identity_ulong +_cilk_c_reducer_opmul_identity_ulonglong +_cilk_c_reducer_opmul_identity_unsigned +_cilk_c_reducer_opmul_identity_ushort +_cilk_c_reducer_opmul_identity_wchar_t +_cilk_c_reducer_opmul_reduce_char +_cilk_c_reducer_opmul_reduce_double +_cilk_c_reducer_opmul_reduce_float +_cilk_c_reducer_opmul_reduce_int +_cilk_c_reducer_opmul_reduce_long +_cilk_c_reducer_opmul_reduce_longdouble +_cilk_c_reducer_opmul_reduce_longlong +_cilk_c_reducer_opmul_reduce_schar +_cilk_c_reducer_opmul_reduce_short +_cilk_c_reducer_opmul_reduce_uchar +_cilk_c_reducer_opmul_reduce_uint +_cilk_c_reducer_opmul_reduce_ulong +_cilk_c_reducer_opmul_reduce_ulonglong 
+_cilk_c_reducer_opmul_reduce_unsigned +_cilk_c_reducer_opmul_reduce_ushort +_cilk_c_reducer_opmul_reduce_wchar_t +_cilk_c_reducer_opor_identity_char +_cilk_c_reducer_opor_identity_int +_cilk_c_reducer_opor_identity_long +_cilk_c_reducer_opor_identity_longlong +_cilk_c_reducer_opor_identity_schar +_cilk_c_reducer_opor_identity_short +_cilk_c_reducer_opor_identity_uchar +_cilk_c_reducer_opor_identity_uint +_cilk_c_reducer_opor_identity_ulong +_cilk_c_reducer_opor_identity_ulonglong +_cilk_c_reducer_opor_identity_unsigned +_cilk_c_reducer_opor_identity_ushort +_cilk_c_reducer_opor_identity_wchar_t +_cilk_c_reducer_opor_reduce_char +_cilk_c_reducer_opor_reduce_int +_cilk_c_reducer_opor_reduce_long +_cilk_c_reducer_opor_reduce_longlong +_cilk_c_reducer_opor_reduce_schar +_cilk_c_reducer_opor_reduce_short +_cilk_c_reducer_opor_reduce_uchar +_cilk_c_reducer_opor_reduce_uint +_cilk_c_reducer_opor_reduce_ulong +_cilk_c_reducer_opor_reduce_ulonglong +_cilk_c_reducer_opor_reduce_unsigned +_cilk_c_reducer_opor_reduce_ushort +_cilk_c_reducer_opor_reduce_wchar_t +_cilk_c_reducer_opxor_identity_char +_cilk_c_reducer_opxor_identity_int +_cilk_c_reducer_opxor_identity_long +_cilk_c_reducer_opxor_identity_longlong +_cilk_c_reducer_opxor_identity_schar +_cilk_c_reducer_opxor_identity_short +_cilk_c_reducer_opxor_identity_uchar +_cilk_c_reducer_opxor_identity_uint +_cilk_c_reducer_opxor_identity_ulong +_cilk_c_reducer_opxor_identity_ulonglong +_cilk_c_reducer_opxor_identity_unsigned +_cilk_c_reducer_opxor_identity_ushort +_cilk_c_reducer_opxor_identity_wchar_t +_cilk_c_reducer_opxor_reduce_char +_cilk_c_reducer_opxor_reduce_int +_cilk_c_reducer_opxor_reduce_long +_cilk_c_reducer_opxor_reduce_longlong +_cilk_c_reducer_opxor_reduce_schar +_cilk_c_reducer_opxor_reduce_short +_cilk_c_reducer_opxor_reduce_uchar +_cilk_c_reducer_opxor_reduce_uint +_cilk_c_reducer_opxor_reduce_ulong +_cilk_c_reducer_opxor_reduce_ulonglong +_cilk_c_reducer_opxor_reduce_unsigned +_cilk_c_reducer_opxor_reduce_ushort +_cilk_c_reducer_opxor_reduce_wchar_t diff --git a/libcilkrts/runtime/metacall_impl.c b/libcilkrts/runtime/metacall_impl.c new file mode 100644 index 00000000000..ce1c51a202b --- /dev/null +++ b/libcilkrts/runtime/metacall_impl.c @@ -0,0 +1,167 @@ +/* metacall_impl.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "metacall_impl.h" + +NOINLINE +CILK_API_VOID +__cilkrts_metacall(unsigned int tool, unsigned int code, void *data) +{ +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC + // The metacall type, code and data are packed together into a single + // struct which will be interpreted by the tool. This function is the + // one and only use of a "cilkscreen_metacall" annotation + metacall_data_t d = { tool, code, data }; + + // Note that Inspector uses probe mode, and is implementing the metacall + // interface to force the runtime to run with a single worker. So + // __cilkrts_metacall must use __notify_intrinsic instead of + // __notify_zc_intrinsic + __notify_intrinsic("cilkscreen_metacall", &d); +#endif // ENABLE_NOTIFY_ZC_INTRINSIC +} + +int __cilkrts_running_under_sequential_ptool(void) +{ + static int running_under_sequential_ptool = -1; + volatile char c = ~0; + + // If we haven't been called before, see if we're running under Cilkscreen + // or Cilkview + if (-1 == running_under_sequential_ptool) + { + // metacall #2 writes 0 in C if we are running under + // a p-tools that requires serial execution, and is a + // no-op otherwise + // + // Note that removing the volatile is required to prevent the compiler + // from assuming that the value has not changed + __cilkrts_metacall(METACALL_TOOL_SYSTEM, + HYPER_ZERO_IF_SEQUENTIAL_PTOOL, (void *)&c); + + running_under_sequential_ptool = (0 == c); + } + + return running_under_sequential_ptool; +} + +/* + * __cilkrts_cilkscreen_establish_c_stack + * + * Notify Cilkscreen of the extent of the stack + */ + +void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end) +{ + char *limits[2] = {begin, end}; + + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ESTABLISH_C_STACK, limits); +} + +#ifdef WORKSPAN // Workspan stuff - remove when we're sure what we can drop + +void __cilkview_workspan_start(void) { + __cilkrts_metacall(HYPER_WORKSPAN_START, 0); +} + +void __cilkview_workspan_stop(void) { + __cilkrts_metacall(HYPER_WORKSPAN_STOP, 0); +} + +void __cilkview_workspan_dump(const char *str) { + __cilkrts_metacall(HYPER_WORKSPAN_DUMP, (void*)str); +} + + +void __cilkview_workspan_reset(void) { + __cilkrts_metacall(HYPER_WORKSPAN_RESET, 0); +} + + +void __cilkview_use_default_grain(void) { + __cilkrts_metacall(HYPER_USE_DEFAULT_GRAIN, 0); +} + +void __cilkview_get_workspan_data(unsigned long long *values, int size) +{ + void *data[2]; + + /* reset counters to zero in case we are not running under + a p-tool */ + + values[0] = 0; + + data[0] = (void*) values; + data[1] = (void*) &size; + __cilkrts_metacall(HYPER_WORKSPAN_QUERY, &data); +} + +void __cilkview_workspan_connected (int *flag) { + *flag = 0; + __cilkrts_metacall(HYPER_WORKSPAN_CONNECTED, (void *)flag); +} + +void __cilkview_workspan_suspend() { + __cilkrts_metacall(HYPER_WORKSPAN_SUSPEND, 0); +} + +void __cilkview_workspan_resume() { + __cilkrts_metacall(HYPER_WORKSPAN_RESUME, 
0); +} + +/* depreciated interfaces */ +void __cilkometer_workspan_start(void) { + __cilkrts_metacall(HYPER_WORKSPAN_START, 0); +} + +void __cilkometer_workspan_stop(void) { + __cilkrts_metacall(HYPER_WORKSPAN_STOP, 0); +} + +void __cilkometer_workspan_dump(const char *str) { + __cilkrts_metacall(HYPER_WORKSPAN_DUMP, (void*)str); +} + + +void __cilkometer_workspan_reset(void) { + __cilkrts_metacall(HYPER_WORKSPAN_RESET, 0); +} + +#endif // WORKSPAN + +/* End metacall_impl.c */ diff --git a/libcilkrts/runtime/metacall_impl.h b/libcilkrts/runtime/metacall_impl.h new file mode 100644 index 00000000000..90cc7f95168 --- /dev/null +++ b/libcilkrts/runtime/metacall_impl.h @@ -0,0 +1,123 @@ +/* metacall_impl.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file metacall_impl.h + * + * @brief Meta-function calls to be used within the Cilk runtime system. + * + * These differ from the macros in cilkscreen.h and cilkview.h because they go + * through the __cilkrts_metacall interface, which ensures that the operation + * is performed even when instrumentation is disabled. + */ + +#ifndef INCLUDED_CILKRTS_METACALL_H +#define INCLUDED_CILKRTS_METACALL_H + +#include "rts-common.h" +#include <internal/metacall.h> +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** + * This function is effectively an unconditional call from the runtime into + * a tool. It is used for operations that must be performed by the tool, + * even when the tool is not instrumenting. For example, Cilkscreen always + * recognizes the address of this function and performs the action specified + * in the contained metadata. + * + * Note that this function MUST NOT BE INLINED within the runtime. This must + * be the ONLY instance of the cilkscreen_metacall metadata. 
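As a usage sketch (hypothetical, not from this commit), runtime code can bracket work that tools should ignore with the enable/disable macros declared later in this header; both expand to __cilkrts_metacall() calls, so they take effect even when instrumentation is otherwise disabled:

#include <stddef.h>
#include <string.h>
#include "metacall_impl.h"

/* Hypothetical helper: hide internal buffer scrubbing from Cilkscreen. */
static void scrub_internal_buffer(void *p, size_t n)
{
    __cilkrts_cilkscreen_disable_instrumentation();
    memset(p, 0, n);   /* races on this buffer are not worth reporting */
    __cilkrts_cilkscreen_enable_instrumentation();
}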
+ */ +CILK_API_VOID +__cilkrts_metacall(unsigned int tool, unsigned int code, void *data); + +/** + * Return non-zero if running under Cilkscreen or Cilkview + */ +COMMON_PORTABLE +int __cilkrts_running_under_sequential_ptool(void); + +/** + * Disable Cilkscreen implementation + */ +#define __cilkrts_cilkscreen_disable_instrumentation() \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_DISABLE_INSTRUMENTATION, 0) + +/** + * Enable Cilkscreen implementation + */ +#define __cilkrts_cilkscreen_enable_instrumentation() \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ENABLE_INSTRUMENTATION, 0) + +/** + * Set the worker on entering runtime. + * + * @attention Deprecated in favor of __cilkrts_cilkscreen_ignore_block. The + * begin/enter pairs in the current metadata mean Cilkscreen no longer has to + * have improper knowledge of the __cilkrts_worker or __cilkrts_stack_frame + * structures. + */ +#define __cilkrts_cilkscreen_establish_worker(w) \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, HYPER_ESTABLISH_WORKER, w) + +/** + * Notify Cilkscreen of the extent of the stack. + * + * @param[in] begin Start (low address) of stack + * @param[in] end One past high address of stack + */ +void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end); + +/** + * Tell tools to ignore a block of memory - currently the global state and + * memory allocated for workers. + */ +#define __cilkrts_cilkscreen_ignore_block(_begin, _end) \ +{ \ + void *block[2] = {_begin, _end}; \ + __cilkrts_metacall(METACALL_TOOL_SYSTEM, \ + HYPER_IGNORE_MEMORY_BLOCK, \ + block); \ +} + +__CILKRTS_END_EXTERN_C + +#endif /* ! defined(INCLUDED_CILKRTS_METACALL_H) */ diff --git a/libcilkrts/runtime/os-unix.c b/libcilkrts/runtime/os-unix.c new file mode 100644 index 00000000000..b48fd623c6e --- /dev/null +++ b/libcilkrts/runtime/os-unix.c @@ -0,0 +1,508 @@ +/* os-unix.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifdef __linux__ + // define _GNU_SOURCE before *any* #include. + // Even <stdint.h> will break later #includes if this macro is not + // already defined when it is #included. +# define _GNU_SOURCE +#endif + +#include "os.h" +#include "bug.h" +#include "cilk_malloc.h" +#include <internal/abi.h> + +#if defined __linux__ +# include <sys/sysinfo.h> +# include <sys/syscall.h> +#elif defined __APPLE__ +# include <sys/sysctl.h> + // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output +#elif defined __FreeBSD__ +// No additional include files +#elif defined __CYGWIN__ +// Cygwin on Windows - no additional include files +#elif defined __VXWORKS__ +# include <vxWorks.h> +# include <vxCpuLib.h> +# include <taskLib.h> +#else +# error "Unsupported OS" +#endif + +#include <stdarg.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/types.h> + + + +// /* Thread-local storage */ +// #ifdef _WIN32 +// typedef unsigned cilkos_tls_key_t; +// #else +// typedef pthread_key_t cilkos_tls_key_t; +// #endif +// cilkos_tls_key_t cilkos_allocate_tls_key(); +// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); +// void* cilkos_get_tls_pointer(cilkos_tls_key_t key); + +#if !defined CILK_WORKER_TLS +static int cilk_keys_defined; +static pthread_key_t worker_key, pedigree_leaf_key, tbb_interop_key; + +#if SUPPORT_GET_CURRENT_FIBER > 0 +static pthread_key_t fiber_key; +#endif + +static void *serial_worker; + + +// This destructor is called when a pthread dies to deallocate the +// pedigree node. +static void __cilkrts_pedigree_leaf_destructor(void* pedigree_tls_ptr) +{ + __cilkrts_pedigree* pedigree_tls + = (__cilkrts_pedigree*)pedigree_tls_ptr; + if (pedigree_tls) { + // Assert that we have either one or two nodes + // left in the pedigree chain. + // If we have more, then something is going wrong... + CILK_ASSERT(!pedigree_tls->parent || !pedigree_tls->parent->parent); + __cilkrts_free(pedigree_tls); + } +} + +void __cilkrts_init_tls_variables(void) +{ + int status; + /* This will be called once in serial execution before any + Cilk parallelism so we do not need to worry about races + on cilk_keys_defined. 
*/ + if (cilk_keys_defined) + return; + status = pthread_key_create(&worker_key, NULL); + CILK_ASSERT (status == 0); + status = pthread_key_create(&pedigree_leaf_key, + __cilkrts_pedigree_leaf_destructor); + CILK_ASSERT (status == 0); + status = pthread_key_create(&tbb_interop_key, NULL); + CILK_ASSERT (status == 0); + +#if SUPPORT_GET_CURRENT_FIBER > 0 + status = pthread_key_create(&fiber_key, NULL); + CILK_ASSERT (status == 0); +#endif + cilk_keys_defined = 1; + return; +} + +COMMON_SYSDEP +void* cilkos_get_current_thread_id(void) +{ + return (void*)pthread_self(); +} + + +CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker() +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (__cilkrts_worker *)pthread_getspecific(worker_key); + else + return serial_worker; + +} + +CILK_ABI_WORKER_PTR __cilkrts_get_tls_worker_fast() +{ + return (__cilkrts_worker *)pthread_getspecific(worker_key); +} + +COMMON_SYSDEP +__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void) +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (__cilk_tbb_stack_op_thunk *) + pthread_getspecific(tbb_interop_key); + else + return 0; +} + +// This counter should be updated atomically. +static int __cilkrts_global_pedigree_tls_counter = -1; + +COMMON_SYSDEP +__cilkrts_pedigree *__cilkrts_get_tls_pedigree_leaf(int create_new) +{ + __cilkrts_pedigree *pedigree_tls; + if (__builtin_expect(cilk_keys_defined, 1)) { + pedigree_tls = + (struct __cilkrts_pedigree *)pthread_getspecific(pedigree_leaf_key); + } + else { + return 0; + } + + if (!pedigree_tls && create_new) { + // This call creates two nodes, X and Y. + // X == pedigree_tls[0] is the leaf node, which gets copied + // in and out of a user worker w when w binds and unbinds. + // Y == pedigree_tls[1] is the root node, + // which is a constant node that represents the user worker + // thread w. + pedigree_tls = (__cilkrts_pedigree*) + __cilkrts_malloc(2 * sizeof(__cilkrts_pedigree)); + + // This call sets the TLS pointer to the new node. + __cilkrts_set_tls_pedigree_leaf(pedigree_tls); + + pedigree_tls[0].rank = 0; + pedigree_tls[0].parent = &pedigree_tls[1]; + + // Create Y, whose rank begins as the global counter value. 
+ pedigree_tls[1].rank = + __sync_add_and_fetch(&__cilkrts_global_pedigree_tls_counter, 1); + + pedigree_tls[1].parent = NULL; + CILK_ASSERT(pedigree_tls[1].rank != -1); + } + return pedigree_tls; +} + +#if SUPPORT_GET_CURRENT_FIBER > 0 +COMMON_SYSDEP +cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void) +{ + if (__builtin_expect(cilk_keys_defined, 1)) + return (cilk_fiber_sysdep *)pthread_getspecific(fiber_key); + else + return NULL; +} +#endif + +COMMON_SYSDEP +void __cilkrts_set_tls_worker(__cilkrts_worker *w) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(worker_key, w); + CILK_ASSERT (status == 0); + return; + } + else + { + serial_worker = w; + } +} + +COMMON_SYSDEP +void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(tbb_interop_key, t); + CILK_ASSERT (status == 0); + return; + } + abort(); +} + +COMMON_SYSDEP +void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(pedigree_leaf_key, pedigree_leaf); + CILK_ASSERT (status == 0); + return; + } + abort(); +} + +#if SUPPORT_GET_CURRENT_FIBER > 0 +COMMON_SYSDEP +void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber) +{ + if (__builtin_expect(cilk_keys_defined, 1)) { + int status; + status = pthread_setspecific(fiber_key, fiber); + CILK_ASSERT (status == 0); + return; + } + abort(); +} +#endif + +#else +void __cilkrts_init_tls_variables(void) +{ +} +#endif + +#if defined (__linux__) && ! defined(ANDROID) +/* + * Get the thread id, rather than the pid. In the case of MIC offload, it's + * possible that we have multiple threads entering Cilk, and each has a + * different affinity. + */ +static pid_t linux_gettid(void) +{ + return syscall(SYS_gettid); +} + +/* + * On Linux we look at the thread affinity mask and restrict ourself to one + * thread for each of the hardware contexts to which we are bound. + * Therefore if user does + * % taskset 0-1 cilkProgram + * # restrict execution to hardware contexts zero and one + * the Cilk program will only use two threads even if it is running on a + * machine that has 32 hardware contexts. + * This is the right thing to do, because the threads are restricted to two + * hardware contexts by the affinity mask set by taskset, and if we were to + * create extra threads they would simply oversubscribe the hardware resources + * we can use. + * This is particularly important on MIC in offload mode, where the affinity + * mask is set by the offload library to force the offload code away from + * cores that have offload support threads running on them. + */ +static int linux_get_affinity_count (int tid) +{ + cpu_set_t process_mask; + + // Extract the thread affinity mask + int err = sched_getaffinity (tid, sizeof(process_mask),&process_mask); + + if (0 != err) + { + return 0; + } + + // We have extracted the mask OK, so now we can count the number of threads + // in it. This is linear in the maximum number of CPUs available, We + // could do a logarithmic version, if we assume the format of the mask, + // but it's not really worth it. We only call this at thread startup + // anyway. 
+ int available_procs = 0; + int i; + for (i = 0; i < CPU_SETSIZE; i++) + { + if (CPU_ISSET(i, &process_mask)) + { + available_procs++; + } + } + + return available_procs; +} +#endif + +/* + * __cilkrts_hardware_cpu_count + * + * Returns the number of available CPUs on this hardware. This is architecture- + * specific. + */ + +COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void) +{ +#if defined ANDROID + return sysconf (_SC_NPROCESSORS_ONLN); +#elif defined __MIC__ + /// HACK: Usually, the 3rd and 4th hyperthreads are not beneficial + /// on KNC. Also, ignore the last core. + int P = sysconf (_SC_NPROCESSORS_ONLN); + return P/2 - 2; +#elif defined __linux__ + int affinity_count = linux_get_affinity_count(linux_gettid()); + + return (0 != affinity_count) ? affinity_count : sysconf (_SC_NPROCESSORS_ONLN); +#elif defined __APPLE__ + int count = 0; + int cmd[2] = { CTL_HW, HW_NCPU }; + size_t len = sizeof count; + int status = sysctl(cmd, 2, &count, &len, 0, 0); + assert(status >= 0); + assert((unsigned)count == count); + + return count; +#elif defined __FreeBSD__ || defined __CYGWIN__ + int ncores = sysconf(_SC_NPROCESSORS_ONLN); + + return ncores; + // Just get the number of processors +// return sysconf(_SC_NPROCESSORS_ONLN); +#elif defined __VXWORKS__ + return __builtin_popcount( vxCpuEnabledGet() ); +#else +#error "Unknown architecture" +#endif +} + +COMMON_SYSDEP void __cilkrts_sleep(void) +{ +#ifdef __VXWORKS__ + taskDelay(1); +#else + usleep(1); +#endif +} + +COMMON_SYSDEP void __cilkrts_yield(void) +{ +#if __APPLE__ || __FreeBSD__ || __VXWORKS__ + // On MacOS, call sched_yield to yield quantum. I'm not sure why we + // don't do this on Linux also. + sched_yield(); +#elif defined(__MIC__) + // On MIC, pthread_yield() really trashes things. Arch's measurements + // showed that calling _mm_delay_32() (or doing nothing) was a better + // option. Delaying 1024 clock cycles is a reasonable compromise between + // giving up the processor and latency starting up when work becomes + // available + _mm_delay_32(1024); +#elif defined(ANDROID) + // On Android, call sched_yield to yield quantum. I'm not sure why we + // don't do this on Linux also. + sched_yield(); +#else + // On Linux, call pthread_yield (which in turn will call sched_yield) + // to yield quantum. + pthread_yield(); +#endif +} + +COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, + const char* varname) +{ + CILK_ASSERT(value); + CILK_ASSERT(varname); + + const char* envstr = getenv(varname); + if (envstr) + { + size_t len = strlen(envstr); + if (len > vallen - 1) + return len + 1; + + strcpy(value, envstr); + return len; + } + else + { + value[0] = '\0'; + return 0; + } +} + +/* + * Unrecoverable error: Print an error message and abort execution. + */ +COMMON_SYSDEP void cilkos_error(const char *fmt, ...) +{ + va_list l; + fflush(NULL); + fprintf(stderr, "Cilk error: "); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fprintf(stderr, "Exiting.\n"); + fflush(stderr); + + abort(); +} + +/* + * Print a warning message and return. + */ +COMMON_SYSDEP void cilkos_warning(const char *fmt, ...) 
+{ + va_list l; + fflush(NULL); + fprintf(stderr, "Cilk warning: "); + va_start(l, fmt); + vfprintf(stderr, fmt, l); + va_end(l); + fflush(stderr); +} + +static void __attribute__((constructor)) init_once() +{ + /*__cilkrts_debugger_notification_internal(CILK_DB_RUNTIME_LOADED);*/ + __cilkrts_init_tls_variables(); +} + + +#define PAGE 4096 +#define CILK_MIN_STACK_SIZE (4*PAGE) +// Default size for the stacks that we create in Cilk for Unix. +#define CILK_DEFAULT_STACK_SIZE 0x100000 + +/* + * Convert the user's specified stack size into a "reasonable" value + * for this OS. + */ +size_t cilkos_validate_stack_size(size_t specified_stack_size) { + // Convert any negative value to the default. + if (specified_stack_size == 0) { + CILK_ASSERT((CILK_DEFAULT_STACK_SIZE % PAGE) == 0); + return CILK_DEFAULT_STACK_SIZE; + } + // Round values in between 0 and CILK_MIN_STACK_SIZE up to + // CILK_MIN_STACK_SIZE. + if (specified_stack_size <= CILK_MIN_STACK_SIZE) { + return CILK_MIN_STACK_SIZE; + } + if ((specified_stack_size % PAGE) > 0) { + // Round the user's stack size value up to nearest page boundary. + return (PAGE * (1 + specified_stack_size / PAGE)); + } + return specified_stack_size; +} + +long cilkos_atomic_add(volatile long* p, long x) +{ + return __sync_add_and_fetch(p, x); +} + +/* End os-unix.c */ diff --git a/libcilkrts/runtime/os.h b/libcilkrts/runtime/os.h new file mode 100644 index 00000000000..8066f0313c2 --- /dev/null +++ b/libcilkrts/runtime/os.h @@ -0,0 +1,236 @@ +/* os.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file os.h + * + * @brief Low-level operating-system dependent facilities, not dependent on + * any Cilk facilities. 
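A short worked example of the stack-size rounding implemented in os-unix.c above, using the constants defined there (PAGE of 4096, a minimum of four pages, a default of 0x100000); this is a sanity check sketch, not part of the commit:

#include <assert.h>
#include <stddef.h>

size_t cilkos_validate_stack_size(size_t specified_stack_size); /* from os-unix.c */

static void check_stack_size_rounding(void)
{
    assert(cilkos_validate_stack_size(0)     == 0x100000); /* 0 selects the default */
    assert(cilkos_validate_stack_size(1)     == 4 * 4096); /* clamped up to the minimum */
    assert(cilkos_validate_stack_size(20000) == 5 * 4096); /* rounded up to a page boundary */
    assert(cilkos_validate_stack_size(32768) == 32768);    /* already page-aligned */
}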
+ */ + +#ifndef INCLUDED_OS_DOT_H +#define INCLUDED_OS_DOT_H + +#include "rts-common.h" +#include "cilk/common.h" +#include "cilk-tbb-interop.h" + +#ifdef __cplusplus +# include <cstddef> +#else +# include <stddef.h> +#endif + +__CILKRTS_BEGIN_EXTERN_C + + +// /* Thread-local storage */ +// #ifdef _WIN32 +// typedef unsigned cilkos_tls_key_t; +// #else +// typedef pthread_key_t cilkos_tls_key_t; +// #endif +// cilkos_tls_key_t cilkos_allocate_tls_key(); +// void cilkos_set_tls_pointer(cilkos_tls_key_t key, void* ptr); +// void* cilkos_get_tls_pointer(cilkos_tls_key_t key); + +/* The RTS assumes that some thread-local state exists that stores the + worker and reducer map currently associated with a thread. These routines + manipulate this state. */ + +/** @brief Thread-local state for cilk fibers. */ +typedef struct cilk_fiber_sysdep cilk_fiber_sysdep; + +/** @brief Initialize all TLS variables for Cilk. */ +COMMON_SYSDEP void __cilkrts_init_tls_variables(void); + +/** @brief Set worker struct in TLS. */ +COMMON_SYSDEP +void __cilkrts_set_tls_worker(__cilkrts_worker *w) cilk_nothrow; + +/** @brief Get stack_op for TBB-interop structures from TLS. */ +COMMON_SYSDEP +__cilk_tbb_stack_op_thunk *__cilkrts_get_tls_tbb_interop(void); + +/** @brief Set stack_op for TBB-interop structures in TLS. */ +COMMON_SYSDEP +void __cilkrts_set_tls_tbb_interop(__cilk_tbb_stack_op_thunk *t); + +/** + * @brief Get the pointer to the pedigree leaf node from TLS. + * + * Function to get a pointer to the thread's pedigree leaf node. This + * pointer can be NULL. + */ +COMMON_SYSDEP +__cilkrts_pedigree * __cilkrts_get_tls_pedigree_leaf(int create_new); + +/** + * @brief Sets the pointer to the pedigree leaf node in TLS. + * + * If the previous pointer value was not NULL, it is the caller's + * responsibility to ensure that previous pointer value is saved and + * freed. + * + * @param pedigree_leaf The leaf node to store into TLS. + */ +COMMON_SYSDEP +void __cilkrts_set_tls_pedigree_leaf(__cilkrts_pedigree* pedigree_leaf); + + +#if SUPPORT_GET_CURRENT_FIBER > 0 +/** + * @brief Get the cilk_fiber from TLS. + */ +COMMON_SYSDEP +cilk_fiber_sysdep* cilkos_get_tls_cilk_fiber(void); + +/** + * @brief Set the cilk_fiber in TLS. + * + * @param fiber The fiber to store into TLS. + */ +COMMON_SYSDEP +void cilkos_set_tls_cilk_fiber(cilk_fiber_sysdep* fiber); +#endif + +/** + * @brief Function for returning the current thread id. + * @warning This function is useful for debugging purposes only. + */ +COMMON_SYSDEP +void* cilkos_get_current_thread_id(void); + +/** @brief Return number of CPUs supported by this hardware, using whatever definition + of CPU is considered appropriate. */ +COMMON_SYSDEP int __cilkrts_hardware_cpu_count(void); + +/** @brief Get current value of timer */ +COMMON_SYSDEP unsigned long long __cilkrts_getticks(void); + +/* Machine instructions */ + +/// Stall execution for a few cycles. +COMMON_SYSDEP void __cilkrts_short_pause(void); +/// Wrapper for xchg instruction +COMMON_SYSDEP int __cilkrts_xchg(volatile int *ptr, int x); + +// Defines __cilkrts_fence - A macro for x86, a function call for other +// architectures +#include "os-fence.h" + +COMMON_SYSDEP void __cilkrts_sleep(void); ///< Sleep briefly +COMMON_SYSDEP void __cilkrts_yield(void); ///< Yield quantum + +/** + * @brief Gets environment variable 'varname' and copy its value into 'value'. + * + * If the entire value, including the null terminator fits into 'vallen' + * bytes, then returns the length of the value excluding the null. 
Otherwise, + * leaves the contents of 'value' undefined and returns the number of + * characters needed to store the environment variable's value, *including* + * the null terminator. + * + * @param value Buffer to store value. + * @param vallen Length of value buffer + * @param varname Name of the environment variable. + * @return Length of value buffer (excluding the null). + */ +COMMON_SYSDEP __STDNS size_t cilkos_getenv(char* value, __STDNS size_t vallen, + const char* varname); + +/** + * @brief Unrecoverable error: Print an error message and abort execution. + */ +COMMON_SYSDEP void cilkos_error(const char *fmt, ...); + +/** + * @brief Print a warning message and return. + */ +COMMON_SYSDEP void cilkos_warning(const char *fmt, ...); + +/** + * @brief Convert the user's specified stack size into a "reasonable" + * value for the current OS. + * + * @param specified_stack_size User-specified stack size. + * @return New stack size value, modified for the OS. + */ +COMMON_SYSDEP size_t cilkos_validate_stack_size(size_t specified_stack_size); + +/** + * @brief Atomic addition: computes *p += x. + * + * @param p Pointer to value to update + * @param x Value of x. + */ +COMMON_SYSDEP long cilkos_atomic_add(volatile long* p, long x); + +#ifdef _WIN32 + +/** + * @brief Windows-only low-level functions for processor groups. + */ +typedef struct _GROUP_AFFINITY GROUP_AFFINITY; + +/** + * @brief Probe the executing OS to see if it supports processor + * groups. These functions are expected to be available in Windows 7 + * or later. + */ +void win_init_processor_groups(void); + +unsigned long win_get_active_processor_count(unsigned short GroupNumber); +unsigned short win_get_active_processor_group_count(void); +int win_set_thread_group_affinity(/*HANDLE*/ void* hThread, + const GROUP_AFFINITY *GroupAffinity, + GROUP_AFFINITY* PreviousGroupAffinity); + +/** + * @brief Cleans up any state allocated in TLS. + * + * Only defined for Windows because Linux calls destructors for each + * thread-local variable. + */ +void __cilkrts_per_thread_tls_cleanup(void); + +#endif // _WIN32 + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_OS_DOT_H) diff --git a/libcilkrts/runtime/os_mutex-unix.c b/libcilkrts/runtime/os_mutex-unix.c new file mode 100644 index 00000000000..af398cdd089 --- /dev/null +++ b/libcilkrts/runtime/os_mutex-unix.c @@ -0,0 +1,193 @@ +/* os_mutex-unix.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "os_mutex.h" +#include "bug.h" + +#include <stdlib.h> +#include <errno.h> +#include <pthread.h> + +// contains notification macros for VTune. +#include "cilk-ittnotify.h" + +/* + * OS Mutex functions. + * + * Not to be confused with the spinlock mutexes implemented in cilk_mutex.c + */ + +struct os_mutex { + pthread_mutex_t mutex; ///< On Linux, os_mutex is implemented with a pthreads mutex +}; + +// Unix implementation of the global OS mutex. This will be created by the +// first call to global_os_mutex_lock() and *NEVER* destroyed. On gcc-based +// systems there's no way to guarantee the ordering of constructors and +// destructors, so we can't be guaranteed that our destructor for a static +// object will be called *after* any static destructors that may use Cilk +// in the user's application +static struct os_mutex *global_os_mutex = NULL; + +/* Sometimes during shared library load malloc doesn't work. + To handle that case, preallocate space for one mutex. 
*/ +static struct os_mutex static_mutex; +static int static_mutex_used; + +struct os_mutex *__cilkrts_os_mutex_create(void) +{ + int status; + struct os_mutex *mutex = (struct os_mutex *)malloc(sizeof(struct os_mutex)); + pthread_mutexattr_t attr; + + ITT_SYNC_CREATE(mutex, "OS Mutex"); + + if (!mutex) { + if (static_mutex_used) { + __cilkrts_bug("Cilk RTS library initialization failed"); + } else { + static_mutex_used = 1; + mutex = &static_mutex; + } + } + + status = pthread_mutexattr_init(&attr); + CILK_ASSERT (status == 0); +#if defined DEBUG || CILK_LIB_DEBUG +#ifdef PTHREAD_MUTEX_ERRORCHECK + status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK); +#else + status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP); +#endif + CILK_ASSERT (status == 0); +#endif + status = pthread_mutex_init (&mutex->mutex, &attr); + CILK_ASSERT (status == 0); + pthread_mutexattr_destroy(&attr); + + return mutex; +} + +void __cilkrts_os_mutex_lock(struct os_mutex *p) +{ + int status; + status = pthread_mutex_lock (&p->mutex); + ITT_SYNC_ACQUIRED(p); + if (__builtin_expect(status, 0) == 0) + return; + if (status == EDEADLK) + __cilkrts_bug("Cilk runtime error: deadlock acquiring mutex %p\n", + p); + else + __cilkrts_bug("Cilk runtime error %d acquiring mutex %p\n", + status, p); +} + +int __cilkrts_os_mutex_trylock(struct os_mutex *p) +{ + int status; + status = pthread_mutex_trylock (&p->mutex); + return (status == 0); +} + +void __cilkrts_os_mutex_unlock(struct os_mutex *p) +{ + int status; + ITT_SYNC_RELEASING(p); + status = pthread_mutex_unlock (&p->mutex); + CILK_ASSERT(status == 0); +} + +void __cilkrts_os_mutex_destroy(struct os_mutex *p) +{ + pthread_mutex_destroy (&p->mutex); + if (p == &static_mutex) { + static_mutex_used = 0; + } else { + free(p); + } +} + +/* + * create_global_os_mutex + * + * Function used with pthread_once to initialize the global OS mutex. Since + * pthread_once requires a function which takes no parameters and has no + * return value, the global OS mutex will be stored in the static (global + * to the compilation unit) variable "global_os_mutex." + * + * + * global_os_mutex will never be destroyed. + */ +static void create_global_os_mutex(void) +{ + CILK_ASSERT(NULL == global_os_mutex); + global_os_mutex = __cilkrts_os_mutex_create(); +} + +void global_os_mutex_lock(void) +{ + // pthread_once_t used with pthread_once to guarantee that + // create_global_os_mutex() is only called once + static pthread_once_t global_os_mutex_is_initialized = PTHREAD_ONCE_INIT; + + // Execute create_global_os_mutex once in a thread-safe manner + // Note that create_global_os_mutex returns the mutex in the static + // (global to the module) variable "global_os_mutex" + pthread_once(&global_os_mutex_is_initialized, + create_global_os_mutex); + + // We'd better have allocated a global_os_mutex + CILK_ASSERT(NULL != global_os_mutex); + + // Acquire the global OS mutex + __cilkrts_os_mutex_lock(global_os_mutex); +} + +void global_os_mutex_unlock(void) +{ + // We'd better have allocated a global_os_mutex. This means you should + // have called global_os_mutex_lock() before calling + // global_os_mutex_unlock(), but this is the only check for it. 
+ CILK_ASSERT(NULL != global_os_mutex); + + // Release the global OS mutex + __cilkrts_os_mutex_unlock(global_os_mutex); +} + +/* End os_mutex-unix.c */ diff --git a/libcilkrts/runtime/os_mutex.h b/libcilkrts/runtime/os_mutex.h new file mode 100644 index 00000000000..71d9eb14e51 --- /dev/null +++ b/libcilkrts/runtime/os_mutex.h @@ -0,0 +1,135 @@ +/* os_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file os_mutex.h + * + * @brief Portable interface to operating-system mutexes. + * + * Do not confuse os_mutex with Cilk runtime-specific spinlock mutexes. + */ + +#ifndef INCLUDED_OS_MUTEX_DOT_H +#define INCLUDED_OS_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/// Opaque type +typedef struct os_mutex os_mutex; + +/** + * Allocate and initialize an os_mutex + * + * @return A pointer to the initialized os_mutex + */ +COMMON_SYSDEP os_mutex* __cilkrts_os_mutex_create(void); + +/** + * Acquire the os_mutex for exclusive use + * + * @param m The os_mutex that is to be acquired. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_lock(os_mutex *m); + +/** + * Try to acquire the os_mutex. + * + * @param m The os_mutex to try to acquire + * @return 0 if the lock acquire failed + * @return nonzero if the lock was acquired + */ +COMMON_SYSDEP int __cilkrts_os_mutex_trylock(os_mutex *m); + +/** + * Release the os_mutex + * + * @param m The os_mutex that is to be released. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_unlock(os_mutex *m); + +/** + * Release any resources and deallocate the os_mutex. + * + * @param m The os_mutex that is to be deallocated. + */ +COMMON_SYSDEP void __cilkrts_os_mutex_destroy(os_mutex *m); + +/** + * Acquire the global os_mutex for exclusive use. 
The global os_mutex + * will be initialized the first time this function is called in a + * thread-safe manner. + */ +COMMON_SYSDEP void global_os_mutex_lock(); + +/** + * Release the global os_mutex. global_os_mutex_lock() must have been + * called first. + */ +COMMON_SYSDEP void global_os_mutex_unlock(); + + +#ifdef _MSC_VER + +/** + * @brief Create the global OS mutex - Windows only. + * + * On Windows we use DllMain() to create the global OS mutex when cilkrts20.dll + * is loaded. As opposed to Linux/MacOS where we use pthread_once to implement + * a singleton since there are no guarantees about constructor or destructor + * ordering between shared objects. + */ +NON_COMMON void global_os_mutex_create(); + +/** + * @brief Destroy the global OS mutex - Windows only + * + * On Windows we use DllMain() to destroy the global OS mutex when + * cilkrts20.dll is unloaded. As opposed to Linux/MacOS where we cannot + * know when it's safe to destroy the global OS mutex since there are no + * guarantees about constructor or destructor ordering. + */ +NON_COMMON void global_os_mutex_destroy(); + +#endif // _MSC_VER + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_OS_MUTEX_DOT_H) diff --git a/libcilkrts/runtime/pedigrees.c b/libcilkrts/runtime/pedigrees.c new file mode 100644 index 00000000000..dee4d9cb411 --- /dev/null +++ b/libcilkrts/runtime/pedigrees.c @@ -0,0 +1,112 @@ +/* pedigrees.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2007-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "pedigrees.h" +#include "local_state.h" + +/************************************************************* + Pedigree API code. 
+*************************************************************/ + +/* + * C99 requires that every inline function with external linkage have one + * extern declaration in the program (with the inline definition in scope). + */ +COMMON_PORTABLE +extern void update_pedigree_on_leave_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +void __cilkrts_set_pedigree_leaf(__cilkrts_pedigree *leaf) +{ + __cilkrts_set_tls_pedigree_leaf(leaf); +} + +void load_pedigree_leaf_into_user_worker(__cilkrts_worker *w) +{ + __cilkrts_pedigree *pedigree_leaf; + CILK_ASSERT(w->l->type == WORKER_USER); + pedigree_leaf = __cilkrts_get_tls_pedigree_leaf(1); + w->pedigree = *pedigree_leaf; + + // Save a pointer to the old leaf. + // We'll need to restore it later. + CILK_ASSERT(w->l->original_pedigree_leaf == NULL); + w->l->original_pedigree_leaf = pedigree_leaf; + + __cilkrts_set_tls_pedigree_leaf(&w->pedigree); + + // Check that this new pedigree root has at least two values. + CILK_ASSERT(w->pedigree.parent); + CILK_ASSERT(w->pedigree.parent->parent == NULL); +} + +void save_pedigree_leaf_from_user_worker(__cilkrts_worker *w) +{ + CILK_ASSERT(w->l->type == WORKER_USER); + + // Existing leaf in tls should be for the current worker. + // This assert is expensive to check though. + // CILK_ASSERT(&w->pedigree == __cilkrts_get_tls_pedigree_leaf(0)); + CILK_ASSERT(w->l->original_pedigree_leaf); + + // w should finish with a pedigree node that points to + // the same root that we just looked up. + + // TODO: This assert should be valid. + // But we are removing it now to make exceptions (without pedigrees) work. + // Currently, reading the pedigree after an exception is caught + // fails because the pedigree chain not restored correctly. + // CILK_ASSERT(w->l->original_pedigree_leaf->next == w->pedigree.parent); + w->l->original_pedigree_leaf->rank = w->pedigree.rank; + + // Save that leaf pointer back into tls. + __cilkrts_set_tls_pedigree_leaf(w->l->original_pedigree_leaf); + // Null out worker's leaf for paranoia. + w->l->original_pedigree_leaf = NULL; +} + + + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/pedigrees.h b/libcilkrts/runtime/pedigrees.h new file mode 100644 index 00000000000..3f6ebb977f9 --- /dev/null +++ b/libcilkrts/runtime/pedigrees.h @@ -0,0 +1,130 @@ +/* pedigrees.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_PEDIGREES_DOT_H +#define INCLUDED_PEDIGREES_DOT_H + + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "rts-common.h" +#include "global_state.h" +#include "os.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * @file pedigrees.h + * + * @brief pedigrees.h declares common routines related to pedigrees + * and the pedigree API. + */ + + +/** + * @brief Sets the leaf pedigree node for the current user thread. + * + * A typical implementation stores this pedigree node in thread-local + * storage. + * + * Preconditions: + * - Current thread should be a user thread. + * + * @param leaf The pedigree node to store as a leaf. + */ +COMMON_PORTABLE +void __cilkrts_set_pedigree_leaf(__cilkrts_pedigree* leaf); + + +/** + * Load the pedigree leaf node from thread-local storage into the + * current user worker. This method should execute as a part of + * binding the user thread to a worker. + * + * Preconditions: + * + * - w should be the worker for the current thread + * - w should be a user thread. + */ +COMMON_PORTABLE +void load_pedigree_leaf_into_user_worker(__cilkrts_worker *w); + +/** + * Save the pedigree leaf node from the worker into thread-local + * storage. This method should execute as part of unbinding a user + * thread from a worker. + * + * Preconditions: + * + * - w should be the worker for the current thread + * - w should be a user thread. + */ +COMMON_PORTABLE +void save_pedigree_leaf_from_user_worker(__cilkrts_worker *w); + + + +/** + * Update pedigree for a worker when leaving a frame. + * + * If this is the frame of a spawn helper (indicated by the + * CILK_FRAME_DETACHED flag) we must update the pedigree. The + * pedigree points to nodes allocated on the stack. Failing to + * update it will result in a accvio/segfault if the pedigree is + * walked. This must happen for all spawn helper frames, even if + * we're processing an exception. + */ +COMMON_PORTABLE +inline void update_pedigree_on_leave_frame(__cilkrts_worker *w, + __cilkrts_stack_frame *sf) +{ + // Update the worker's pedigree information if this is an ABI 1 or later + // frame + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + w->pedigree.rank = sf->spawn_helper_pedigree.rank + 1; + w->pedigree.parent = sf->spawn_helper_pedigree.parent; + } +} + + + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_PEDIGREES_DOT_H) diff --git a/libcilkrts/runtime/record-replay.cpp b/libcilkrts/runtime/record-replay.cpp new file mode 100644 index 00000000000..bc5a79f2411 --- /dev/null +++ b/libcilkrts/runtime/record-replay.cpp @@ -0,0 +1,770 @@ +/* record-replay.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Implementation of the record/replay functionality for Cilk Plus + */ + +#include <cstring> +#include <vector> +#include <stdlib.h> + +// clang is really strict about printf formats, so use the annoying integer +// printf macros. Unfortunately they're not avaiable on Windows +#ifdef _WIN32 +#define PRIu64 "llu" +#else +#define __STDC_FORMAT_MACROS 1 +#include <inttypes.h> +#endif + +#include "record-replay.h" +#include "bug.h" +#include "internal/abi.h" +#include "local_state.h" +#include "full_frame.h" +#include "global_state.h" +#include "cilk_malloc.h" +#include "os.h" // for cilkos_error() + +#if RECORD_ON_REPLAY +#pragma message ("*** Record on Replay is enabled!") +#endif + +// Defined to write sequence number to the logs. Note that you cannot +// diff logs with sequence numbers because the numbers may increment in +// different orders. +//#define INCLUDE_SEQUENCE_NUMBER 1 + +const int PED_VERSION = 1; // Log recording version + +// Log types +enum ped_type_t +{ + ped_type_unknown, + ped_type_steal, + ped_type_sync, + ped_type_orphaned, + ped_type_last // Flags end of the list +}; + +// Log type strings +#define PED_TYPE_STR_STEAL "Steal" +#define PED_TYPE_STR_SYNC "Sync" +#define PED_TYPE_STR_WORKERS "Workers" +#define PED_TYPE_STR_ORPHANED "Orphaned" + +#define PED_TYPE_SIZE 16 // Buffer size for the type of pedigree. Must + // hold largest pedigree record type string. 
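/*
 * Editorial sketch (not part of the original commit): each replay log record
 * written by write_to_replay_log() below is one text line of the form
 * "<type> <pedigree> <i1> <i2>", with the pedigree terms joined by
 * underscores.  With purely illustrative values, a steal of a frame whose
 * pedigree is 0_3_1 from worker 2 would appear in the thief's log as
 *
 *     Steal 0_3_1 2 -1
 *
 * i.e. the equivalent of fprintf(log, "%s %s %d %d\n", "Steal", "0_3_1", 2, -1),
 * where the trailing -1 marks the unused second integer field for this
 * record type.
 */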
+#define PEDIGREE_BUFF_SIZE 512 // Buffer size for the string representation + // of a pedigree. + +/** + * Data we store for a replay log entry + */ +typedef struct replay_entry_t +{ + uint64_t *m_reverse_pedigree; /**< Reverse pedigree for replay log entry */ + ped_type_t m_type; /**< Type of replay log entry */ + int16_t m_pedigree_len; /**< Number of terms in reverse pedigree */ + int16_t m_value; /**< Victim for STEALs, 0 if matching steal found for ORPHANs */ + + /** + * Load data read from the log into the entry + */ + bool load(const char *type, const char *pedigee_str, int32_t value1, int32_t value2) + { + // Convert the type into an enum + if (0 == strcmp(type, PED_TYPE_STR_STEAL)) + { + m_type = ped_type_steal; + m_value = (int16_t)value1; // Victim + } + else + { + m_value = -1; // Victim not valid + if (0 == strcmp(type, PED_TYPE_STR_SYNC)) + m_type = ped_type_sync; + else if (0 == strcmp(type, PED_TYPE_STR_ORPHANED)) + m_type = ped_type_orphaned; + else + { + m_type = ped_type_unknown; + return false; + } + } + + // Parse the pedigree + m_pedigree_len = 0; + + const char *p = pedigee_str; + char *end; + + uint64_t temp_pedigree[PEDIGREE_BUFF_SIZE/2]; + + while(1) + { + temp_pedigree[m_pedigree_len++] = (uint64_t)strtol(p, &end, 10); + if ('\0' == *end) + break; + p = end + 1; + } + + // Allocate memory to hold the pedigree. + // Copy the pedigree in reverse order since that's the order we'll + // traverse it + m_reverse_pedigree = + (uint64_t *)__cilkrts_malloc(sizeof(int64_t) * m_pedigree_len); + for (int n = 0; n < m_pedigree_len; n++) + m_reverse_pedigree[n] = temp_pedigree[(m_pedigree_len - 1) - n]; + + return true; + } + + /** + * Match this entry against the data supplied. This includes walking the + * pedigree from the specified node. + */ + bool match (ped_type_t type, const __cilkrts_pedigree *node, int victim = -1) + { + int i = 0; + + // If the type isn't what they're seeking, we don't have a match + if (type != m_type) + return false; + + // If we're looking for a STEAL, then the victim must match + if ((type == ped_type_steal) && (victim != m_value)) + return false; + + // Compare the current pedigree against what was recorded + while ((NULL != node) && (i < m_pedigree_len)) + { + // If we've got a pedigree rank difference, then we don't have + // a match + if (node->rank != m_reverse_pedigree[i]) + return false; + node = node->parent; + i++; + } + + // Make sure we exhausted both the pedigree chain and the recorded + // pedigree + return ((NULL == node) && (i == m_pedigree_len)); + } + + /** + * Advance to the next entry, skipping any ORPHANED records we didn't see + * a matching STEAL for + */ + replay_entry_t *next_entry() + { + replay_entry_t *entry = this; + + // You can't go beyond the end + if (ped_type_last == entry->m_type) + return entry; + + // Advance to the next entry + entry++; + + // Skip any ORPHANED records that don't have a matching steal. We + // initialized the value field to -1 for ORPHANED. After loading all + // the log data, we iterated through all the STEAL records setting the + // matching ORPHANED record's value field to 0. So if an ORPHANED + // record's value field is still -1, it doesn't have a matching STEAL + // record, and I don't know why we chose not to return from the + // spawned function. 
+ while ((ped_type_orphaned == entry->m_type) && (-1 == entry->m_value)) + { + entry++; + } + + return entry; + } + + /** + * Release any allocated resources + */ + void unload() + { + __cilkrts_free(m_reverse_pedigree); + m_reverse_pedigree = NULL; + } + +} replay_entry_t; + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Walk the pedigree and generate a string representation with underscores + * between terms. Currently does a recursive walk to generate a forward + * pedigree. + * + * @param p The buffer that is to be filled. Assumed to be PEDIGREE_BUFF_SIZE + * characters long + * @param pnode The initial pedigree term to be written. + * + * @return A pointer into the pedigree string buffer after a term has been + * written. + */ +static +char * walk_pedigree_nodes(char *p, const __cilkrts_pedigree *pnode) +{ + CILK_ASSERT(pnode); + if (pnode->parent) + { + p = walk_pedigree_nodes(p, pnode->parent); + p += sprintf(p, "_"); + } + + return p + sprintf(p, "%" PRIu64, pnode->rank); +} + +/** + * Write a record to a replay log file. + * + * @param w The worker we're writing the pedigree for. + * @param type The type of the pedigree record, as a string + * @param initial_node The initial pedigree node to be written, or NULL if + * there is no pedigree for this record type. + * @param i1 First integer value to be written to the record. + * @param i2 Second integer value to be written to the record. Only applies + * to STEAL records. Defaults to -1 (unused). The second value is always + * written to make parsing easier. + */ +static +void write_to_replay_log (__cilkrts_worker *w, const char *type, + const __cilkrts_pedigree *initial_node, + int i1 = -1, int i2 = -1) +{ + char pedigree[PEDIGREE_BUFF_SIZE]; + + // If we don't have an initial pedigree node, just use "0" to fill the slot + if (NULL == initial_node) + strcpy(pedigree, "0"); + else + walk_pedigree_nodes(pedigree, initial_node); + +#ifndef INCLUDE_SEQUENCE_NUMBER + // Simply write the record + fprintf(w->l->record_replay_fptr, "%s %s %d %d\n", + type, pedigree, i1, i2); +#else + // Write the record with a sequence number. The sequence number should + // always be the last term, and ignored on read + + static long volatile seq_num = 0; + long write_num; + + // Atomic increment functions are compiler/OS-specific +#ifdef _WIN32 + write_num = _InterlockedIncrement(&seq_num); +#else /* GCC */ + write_num = __sync_add_and_fetch(&seq_num, 1); +#endif // _WIN32 + + fprintf(w->l->record_replay_fptr, "%s %s %d %d %ld\n", + type, pedigree, i1, i2, write_num); +#endif // INCLUDE_SEQUENCE_NUMBER + + fflush(w->l->record_replay_fptr); +} + +/** + * Record data for a successful steal. + * + * The pedigree for a STEAL record is the pedigree of the stolen frame. + * + * @note It's assumed that replay_record_steal() has already checked that we're + * recording a log and that the record/replay functionality has not been + * compiled out. + * + * @param w The worker stealing a frame. + * @param victim_id The ID of the worker which had it's frame stolen. 
+ */ +void replay_record_steal_internal(__cilkrts_worker *w, int32_t victim_id) +{ + // Follow the pedigree chain using worker's stack frame + CILK_ASSERT(w->l->next_frame_ff); + CILK_ASSERT(w->l->next_frame_ff->call_stack); + + // Record steal: STEAL pedigree victim_id thief_id + write_to_replay_log (w, PED_TYPE_STR_STEAL, + &(w->l->next_frame_ff->call_stack->parent_pedigree), + victim_id); +} + +/** + * Record data for the worker that continues from a sync + * + * The pedigree for a SYNC record is the pedigree at the sync. + * + * @note It's assumed that replay_record_sync() has already checked that we're + * recording a log and that the record/replay functionality has not been + * compiled out. + * + * @param w The worker continuing from a sync. + */ +void replay_record_sync_internal(__cilkrts_worker *w) +{ + // Record sync: SYNC pedigree last_worker_id + write_to_replay_log (w, PED_TYPE_STR_SYNC, &w->pedigree); +} + +/** + * Record the pedigree of an attempt to return to a stolen parent + * + * The pedigree for an ORPHANED record is the pedigree of our parent + * + * @note It's assumed that replay_record_orphaned() has already checked that + * we're recording a log and that the record/replay functionality has not + * been compiled out. + * + * @param w The worker continuing noting that it has been orphaned. + */ +void replay_record_orphaned_internal(__cilkrts_worker *w) +{ + // Record steal: ORPHANED pedigree self + write_to_replay_log (w, PED_TYPE_STR_ORPHANED, w->pedigree.parent); +} + +/** + * Attempt to match a SYNC record. We have a match when this worker was + * recorded returning from the current call to __cilkrts_sync() with the + * same pedigree and this was the worker that continued from the sync, since + * it was the last to sync. + * + * If we find a match, the caller is expected to stall it is the last worker + * to reach a sync so it will be the worker to continue from the sync. + * + * @note It's assumed that replay_match_sync_pedigree() has already returned + * if we're not replaying a log, or if record/replay functionality has + * been compiled out. + * + * @param w The worker we're checking to see if we've got a match + */ +int replay_match_sync_pedigree_internal(__cilkrts_worker *w) +{ + // Return true if we have a match + if (w->l->replay_list_entry->match(ped_type_sync, &w->pedigree)) + return 1; + else + return 0; +} + +/** + * Advance to the next log entry from a SYNC record. Consume the current + * SYNC record on this worker and advance to the next one. + * + * @note It's assumed that replay_advance_from_sync() has already returned if + * we're not replaying a log, or if record/replay functionality has been + * compiled out. + * + * @param w The worker whose replay log we're advancing. + */ +void replay_advance_from_sync_internal (__cilkrts_worker *w) +{ + // The current replay entry must be a SYNC + CILK_ASSERT(ped_type_sync == w->l->replay_list_entry->m_type); + + // Advance to the next entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); +} + +/** + * Called from random_steal() to override the ID of the randomly chosen victim + * worker which this worker will attempt to steal from. Returns the worker id + * of the next victim this worker was recorded stealing from, or -1 if the + * next record in the log is not a STEAL. + * + * @note This call does NOT attempt to match the pedigree. That will be done + * by replay_match_victim_pedigree() after random_steal() has locked the victim + * worker. 
 + * + * @param w The __cilkrts_worker we're executing on. The worker's replay log + * is checked for a STEAL record. If we've got one, the stolen worker ID is + * returned. + * + * @return -1 if the next record is not a STEAL + * @return recorded stolen worker ID if we've got a matching STEAL record + */ +int replay_get_next_recorded_victim_internal(__cilkrts_worker *w) +{ + // If the next record isn't a STEAL, abort the attempt to steal work + if (ped_type_steal != w->l->replay_list_entry->m_type) + return -1; + + // Return the victim's worker ID from the STEAL record. We'll check + // the pedigree after random_steal has locked the victim worker. + return w->l->replay_list_entry->m_value; +} + +/** + * Called from random_steal() to determine if we have a STEAL record that + * matches the pedigree at the head of the victim worker. If we do have a + * match, the STEAL record is consumed. + * + * @note It's assumed that replay_match_victim_pedigree() has already returned if + * we're not replaying a log, or if record/replay functionality has been + * compiled out. + * + * @return 1 if we have a match + * @return 0 if the current replay record isn't a STEAL record, or the victim + * isn't correct, or the pedigree doesn't match. + */ +int replay_match_victim_pedigree_internal(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // If we don't have a match, return 0 + if (! w->l->replay_list_entry->match(ped_type_steal, + &((*victim->head)->parent_pedigree), + victim->self)) + return 0; + + // Consume this entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); + + // Return success + return 1; +} + +/** + * If the frame we're about to return to was recorded as being stolen, + * stall until it is. + * + * @note It's assumed that replay_wait_for_steal_if_parent_was_stolen() has + * already returned if we're not replaying a log, or if record/replay + * functionality has been compiled out. + * + * @param w The worker we're executing on. + */ +void replay_wait_for_steal_if_parent_was_stolen_internal(__cilkrts_worker *w) +{ + // If our parent wasn't recorded as orphaned, return now + if (! w->l->replay_list_entry->match (ped_type_orphaned, + w->pedigree.parent)) + return; + + // Stall until our parent is stolen. Note that we're comparing head + // and tail, not head and exc. The steal is not completed until tail + // is modified. + while (!((w->tail - 1) < w->head)) + __cilkrts_sleep(); + + // Consume the entry + w->l->replay_list_entry = w->l->replay_list_entry->next_entry(); +} + +/** + * Allocate memory for the list of logged events. + * + * This function will read through the file and count the number of records + * so it can estimate how big a buffer to allocate for the array of replay + * entries. It will then rewind the file to the beginning so it can be + * loaded into memory. + * + * @param w The worker we're loading the file for. + * @param f The file of replay data we're scanning. + */ +static +void allocate_replay_list(__cilkrts_worker *w, FILE *f) +{ + // Count the number of entries - yeah, it's a hack, but it lets me + // allocate the space all at once instead of in chunks + char buf[1024]; + int entries = 1; // Include "LAST" node + + while (! 
feof(f)) + { + if (fgets(buf, 1024, f)) + { + // Skip the Workers record - should only be in file for Worker 0 + if (0 != strncmp(PED_TYPE_STR_WORKERS, buf, sizeof(PED_TYPE_STR_WORKERS)-1)) + entries++; + } + } + + w->l->replay_list_root = + (replay_entry_t *)__cilkrts_malloc(entries * sizeof(replay_entry_t)); + w->l->replay_list_root[entries - 1].m_type = ped_type_last; + + // Reset the file to the beginning + rewind(f); +} + +/** + * Load the replay log for a worker into memory. + * + * @param w The worker we're loading the replay for. + */ +static +void load_recorded_log(__cilkrts_worker *w) +{ + char ped_type[PED_TYPE_SIZE]; + char ped_str[PEDIGREE_BUFF_SIZE]; + int32_t i1 = -1, i2 = -1; + int fret; + char local_replay_file_name[512]; + FILE *f; + + // Open the log for reading + sprintf(local_replay_file_name, "%s%d.cilklog", w->g->record_replay_file_name, w->self); + f = fopen(local_replay_file_name, "r"); + + // Make sure we found a log! + CILK_ASSERT (NULL != f); + + // Initialize the replay_list + allocate_replay_list(w, f); + replay_entry_t *entry = w->l->replay_list_root; + + // Read the data out and add it to our tables + while (! feof(f)) + { +#ifndef INCLUDE_SEQUENCE_NUMBER + fret = fscanf(f, "%s %s %d %d\n", ped_type, ped_str, &i1, &i2); + if(EOF == fret) + break; + + // We must have read 4 fields + CILK_ASSERT(4 == fret); +#else + int32_t write_num; + fret = fscanf(f, "%s %s %d %d %d\n", ped_type, ped_str, + &i1, &i2, &write_num); + if(EOF == fret) + break; + + // We must have read 5 fields + CILK_ASSERT(5 == fret); +#endif // INCLUDE_SEQUENCE_NUMBER + + // Load the data into the entry + if (0 == strcmp(ped_type, PED_TYPE_STR_WORKERS)) + { + // Verify we're replaying with the same number of workers we recorded with + if (i1 != w->g->P) + { + // Fatal error - does not return + cilkos_error("Cannot continue replay: number of workers(%d) doesn't match " + "that from the recording(%d).\n", w->g->P, i1); + } + + // Verify that we understand this version of the pedigree file + if (PED_VERSION != i2) + { + // Fatal error - does not return + cilkos_error("Pedigree file version %d doesn't match current " + "version %d - cannot continue.\n", + i2, PED_VERSION); + } + } + else + { + entry->load(ped_type, ped_str, i1, i2); + entry++; + } + } + + // Make sure we've filled the allocated memory. We initialized the last + // entry in allocate_replay_list(). + CILK_ASSERT(ped_type_last == entry->m_type); + w->l->replay_list_entry = w->l->replay_list_root; + + // Close the log and return + fclose(f); +} + +/** + * Scan a recorded log to match STEALs against ORPHANED records. + * + * @param g Cilk Runtime global state. Passed to access the worker array so + * we can scan a worker's ORPHANED entries for one that matches a STEAL entry. + * @param entry The root of a replay_list for a worker. + */ +static +void scan_for_matching_steals(global_state_t *g, replay_entry_t *entry) +{ + // Iterate over all of the entries + while (ped_type_last != entry->m_type) + { + // Look for STEALs. That will tell us which worker the frame was + // stolen from + if (ped_type_steal == entry->m_type) + { + bool found = false; + + // Validate the worker ID and make sure we've got a list + CILK_ASSERT((entry->m_value >= 0) && (entry->m_value < g->total_workers)); + replay_entry_t *victim_entry = g->workers[entry->m_value]->l->replay_list_root; + CILK_ASSERT(NULL != victim_entry); + + // Scan the victim's list for the matching ORPHANED record + while ((ped_type_last != victim_entry->m_type) && ! 
found) + { + if (ped_type_orphaned == victim_entry->m_type) + { + if (entry->m_pedigree_len == victim_entry->m_pedigree_len) + { + if (0 == memcmp(entry->m_reverse_pedigree, + victim_entry->m_reverse_pedigree, + entry->m_pedigree_len * sizeof(int64_t))) + { + // Note that this ORPHANED record has a matching steal + victim_entry->m_value = 0; + found = true; + } + } + } + victim_entry++; + } + } + entry++; + } +} + + +/* + * Initialize per-worker data for record or replay - See record-replay.h + * for full routine header. + */ +void replay_init_workers(global_state_t *g) +{ + int i; + char worker_file_name[512]; + + // If we're not recording or replaying a log, we're done. All of the + // fields in the global_state_t or local_state_t are already initialized + // to default values. + if (RECORD_REPLAY_NONE == g->record_or_replay) + return; + + // If we're replaying a log, read each worker's log and construct the + // in-memory log + if (REPLAY_LOG == g->record_or_replay) + { + // Read all of the data + for (i = 0; i < g->total_workers; ++i) + { + // This function will also initialize and fill the worker's + // replay list + load_recorded_log(g->workers[i]); + } + + // Scan for orphans with no matching steal. Mark them so they'll be + // skipped as we advance through the log. + for (i = 0; i < g->total_workers; ++i) + { + scan_for_matching_steals(g, g->workers[i]->l->replay_list_root); + } + + // If we're recording the logs while replaying, create the log files. + // This will only be used for debugging. Create the logs in the + // current directory. It should be as good a place as any... +#if RECORD_ON_REPLAY + for(i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + sprintf(worker_file_name, "replay_log_%d.cilklog", w->self); + w->l->record_replay_fptr = fopen(worker_file_name, "w+"); + CILK_ASSERT(NULL != w->l->record_replay_fptr); + } + + // Record the number of workers, file version in Worker 0's file + write_to_replay_log (g->workers[0], PED_TYPE_STR_WORKERS, NULL, g->P, PED_VERSION); +#endif // RECORD_ON_REPLAY + } + + // If we're recording, create the log files + if (RECORD_LOG == g->record_or_replay) + { + for(i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + sprintf(worker_file_name, "%s%d.cilklog", + g->record_replay_file_name, + w->self); + w->l->record_replay_fptr = fopen(worker_file_name, "w+"); + CILK_ASSERT(NULL != w->l->record_replay_fptr); + } + + // Record the number of workers, file version in Worker 0's file + write_to_replay_log (g->workers[0], PED_TYPE_STR_WORKERS, NULL, g->P, PED_VERSION); + } +} + +/* + * Do any necessary cleanup for the logs - See record-replay.h for full + * routine header. 
+ */ +void replay_term(global_state_t *g) +{ + // Free memory for the record/replay log file name, if we've got one + if (g->record_replay_file_name) + __cilkrts_free(g->record_replay_file_name); + + // Per-worker cleanup + for(int i = 0; i < g->total_workers; ++i) + { + __cilkrts_worker *w = g->workers[i]; + + // Close the log files, if we've opened them + if(w->l->record_replay_fptr) + fclose(w->l->record_replay_fptr); + + if (w->l->replay_list_root) + { + // We should have consumed the entire list + CILK_ASSERT(ped_type_last == w->l->replay_list_entry->m_type); + + replay_entry_t *entry = w->l->replay_list_root; + while (ped_type_last != entry->m_type) + { + // Free the pedigree memory for each entry + entry->unload(); + entry++; + } + __cilkrts_free(w->l->replay_list_root); + w->l->replay_list_root = NULL; + w->l->replay_list_entry = NULL; + } + } +} + +__CILKRTS_END_EXTERN_C diff --git a/libcilkrts/runtime/record-replay.h b/libcilkrts/runtime/record-replay.h new file mode 100644 index 00000000000..c1c5a68f579 --- /dev/null +++ b/libcilkrts/runtime/record-replay.h @@ -0,0 +1,432 @@ +/* record_replay.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2012-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/** + * @file record-replay.h + * + * @brief record-replay.h and .cpp encapsulate most of the functionality to + * record and play back a Cilk Plus application. + * + * Recording is directed by the setting of the CILK_RECORD_LOG environment + * variable. If it's defined, the value specifies the root we'll use to + * generate files for each worker using the following format string: + * "%s%d.cilklog", where the integer is the value of w->self. + * + * Replay is directed by the setting of the CILK_REPLAY_LOG environment + * variable, interpreted the same way as CILK_RECORD_LOG. 
If both + * CILK_RECORD_LOG and CILK_REPLAY_LOG are defined, a warning will be given + * and the attempt to record a log will be ignored. + * + * Recording is relatively straightforward. We write all information about a + * worker to a per-worker file. + * + * Each pedigree record consists of the following fields. All fields must be + * present in every record to make parsing easy. + * - Type - A string identifying the pedigree record. See the PED_TYPE_STR_ + * macros for the currently defined values. + * - Pedigree - A string of pedigree values, with underscores between + * adjacent values. + * - i1 - Record type-specific value. -1 if not used. + * - i2 - Record type-specific value. -1 if not used. + * + * WORKERS record - only written to the file for worker 0. Note that this is + * the first worker in the workers array. Worker 0 is the first system worker, + * *NOT* a user worker. + * - Type: "Workers" + * - Pedigree: Always "0" - ignored + * - i1: Number of workers (g->P) when we recorded the log. A mismatch when + * we attempt to replay the log will result in aborting the execution. + * - i2: Log version number - Specified by PED_VERSION in record-replay.cpp + * + * STEAL record - written after a successful steal. + * - Type: "Steal" + * - Pedigree: Pedigree of stolen frame + * - i1: Worker the frame was stolen from + * - i2: -1 + * + * SYNC record - written after a worker continues from a sync. + * - Type: "Sync" + * - Pedigree: Pedigree of sync. Note that this is the pedigree *before* + * the pedigree is incremented in setup_for_execution_pedigree(). + * - i1: -1 + * - i2: -1 + * + * ORPHANED record - saved on a return to a stolen parent. + * - Type: "Orphaned" + * - Pedigree: Pedigree of the parent frame *before* the pedigree is + * incremented by the return + * - i1: -1 + * - i2: -1 + * + * On replay, the data is loaded into a per-worker array, and the data is + * consumed in order as needed. + */ + +#ifndef INCLUDED_RECORD_REPLAY_DOT_H +#define INCLUDED_RECORD_REPLAY_DOT_H + +#include "cilk/common.h" +#include "global_state.h" + +/** + * Define CILK_RECORD_REPLAY to enable record/replay functionality. If + * CILK_RECORD_REPLAY is not defined, all of the record/replay functions in + * record-replay.h will be stubbed out. Since they're declared as inline + * functions, the resulting build should have no performance impact due to + * the implementation of record/replay. + */ + #define CILK_RECORD_REPLAY 1 + +/** + * Define RECORD_ON_REPLAY=1 to write logs when we're replaying a log. This + * should only be needed when debugging the replay functionality. This should + * always be defined as 0 when record-replay.h is checked in. + */ +#define RECORD_ON_REPLAY 0 + +__CILKRTS_BEGIN_EXTERN_C + +#ifdef CILK_RECORD_REPLAY +// Declarations of internal record/replay functions. 
The inlined versions +// further down do some preliminary testing (like if we're not recording or +// replaying) and will stub out the functionality if we've compiled out the +// record/replay feature +int replay_match_sync_pedigree_internal(__cilkrts_worker *w); +void replay_wait_for_steal_if_parent_was_stolen_internal(__cilkrts_worker *w); +void replay_record_steal_internal(__cilkrts_worker *w, int32_t victim_id); +void replay_record_sync_internal(__cilkrts_worker *w); +void replay_record_orphaned_internal(__cilkrts_worker *w); +int replay_match_victim_pedigree_internal(__cilkrts_worker *w, __cilkrts_worker *victim); +void replay_advance_from_sync_internal (__cilkrts_worker *w); +int replay_get_next_recorded_victim_internal(__cilkrts_worker *w); +#endif // CILK_RECORD_REPLAY + +// Publically defined record/replay API + +/** + * If we're replaying a log, wait for our parent to be stolen if it was when + * the log was recorded. If record/replay is compiled out, this is a noop. + * + * @param w The __cilkrts_worker we're executing on. The worker's replay + * list will be checked for a ORPHANED record with a matching pedigree. If + * there is a match, the ORPHANED record will be consumed. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_wait_for_steal_if_parent_was_stolen(__cilkrts_worker *w) +{ + // Only check if we're replaying a log + if (REPLAY_LOG == w->g->record_or_replay) + replay_wait_for_steal_if_parent_was_stolen_internal(w); +} +#else +__CILKRTS_INLINE +void replay_wait_for_steal_if_parent_was_stolen(__cilkrts_worker *w) +{ + // If record/replay is disabled, we never wait +} +#endif // CILK_RECORD_REPLAY + +/** + * Called from random_steal() to override the ID of the randomly chosen victim + * worker which this worker will attempt to steal from. Returns the worker id + * of the next victim this worker was recorded stealing from, or -1 if the + * next record in the log is not a STEAL. + * + * @note This call does NOT attempt to match the pedigree. That will be done + * by replay_match_victim_pedigree() after random_steal() has locked the victim + * worker. + * + * @param w The __cilkrts_worker we're executing on. The worker's replay log + * is checked for a STEAL record. If we've got one, the stolen worker ID is + * returned. + * @param id The randomly chosen victim worker ID. If we're not replaying a + * log, or if record/replay has been compiled out, this is the value that + * will be returned. + * + * @return id if we're not replaying a log + * @return -1 if the next record is not a STEAL + * @return recorded stolen worker ID if we've got a matching STEAL record + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_get_next_recorded_victim(__cilkrts_worker *w, int id) +{ + // Only check if we're replaying a log + if (REPLAY_LOG == w->g->record_or_replay) + return replay_get_next_recorded_victim_internal(w); + else + return id; +} +#else +__CILKRTS_INLINE +int replay_get_next_recorded_victim(__cilkrts_worker *w, int id) +{ + // Record/replay is disabled. Always return the original worker id + return id; +} +#endif // CILK_RECORD_REPLAY + +/** + * Initialize per-worker data for record/replay. A noop if record/replay + * is disabled, or if we're not recording or replaying anything. + * + * If we're recording a log, this will ready us to create the per-worker + * logs. + * + * If we're replaying a log, this will read the logs into the per-worker + * structures. 
 + * + * @param g Cilk runtime global state + */ +void replay_init_workers(global_state_t *g); + +/** + * Record a record on a successful steal. A noop if record/replay is + * disabled, or if we're not recording anything. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of + * the stolen frame will be walked to generate the STEAL record. + * + * @param victim_id The worker ID of the worker w stole from. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_steal(__cilkrts_worker *w, int32_t victim_id) +{ +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_steal_internal(w, victim_id); +} +#else +__CILKRTS_INLINE +void replay_record_steal(__cilkrts_worker *w, int32_t victim_id) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Record a record when continuing after a sync. A noop if record/replay is + * disabled, or if we're not recording anything, or if the sync was abandoned, + * meaning this isn't the worker that continues from the sync. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of + * the sync-ing frame will be walked to generate the SYNC record. + * + * @param continuing True if this worker will be continuing from the + * cilk_sync. A SYNC record will only be generated if continuing is true. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_sync(__cilkrts_worker *w, int continuing) +{ + // If this was not the last worker to the sync, return + if (! continuing) + return; + +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_sync_internal(w); +} +#else +__CILKRTS_INLINE +void replay_record_sync(__cilkrts_worker *w, int continuing) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Record a record on a return to a stolen parent. A noop if record/replay is + * disabled, or if we're not recording anything. + * + * @param w The __cilkrts_worker we're executing on. The pedigree of the + * frame that has discovered that its parent has been stolen will be walked + * to generate the ORPHANED record. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_record_orphaned(__cilkrts_worker *w) +{ +#if RECORD_ON_REPLAY + // If we're recording on replay, write the record if we're recording or + // replaying + if (RECORD_REPLAY_NONE == w->g->record_or_replay) + return; +#else + // Only write the record if we're recording + if (RECORD_LOG != w->g->record_or_replay) + return; +#endif + + replay_record_orphaned_internal(w); +} +#else +__CILKRTS_INLINE +void replay_record_orphaned(__cilkrts_worker *w) +{ +} +#endif // CILK_RECORD_REPLAY + +/** + * Test whether the frame at the head of the victim matches the pedigree of + * the frame that was recorded being stolen. Called in random steal to verify + * that we're about to steal the correct frame. + * + * @param w The __cilkrts_worker we're executing on. The current worker + * is needed to find the replay entry to be checked. + * + * @param victim The __cilkrts_worker we're proposing to steal a frame + * from. 
The victim's head entry + * is needed to find the replay entry to be checked. + * + * @return 0 if we're replaying a log and the victim's pedigree does NOT match + * the next frame the worker is expected to steal. + * + * @return 1 in all other cases to indicate that the steal attempt should + * continue + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_match_victim_pedigree(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // We're not replaying a log. The victim is always acceptable + if (REPLAY_LOG != w->g->record_or_replay) + return 1; + + // Return 1 if the victim's pedigree matches the frame the worker stole + // when we recorded the log + return replay_match_victim_pedigree_internal(w, victim); +} +#else +__CILKRTS_INLINE +int replay_match_victim_pedigree(__cilkrts_worker *w, __cilkrts_worker *victim) +{ + // Record/replay is disabled. The victim is always acceptable + return 1; +} +#endif // CILK_RECORD_REPLAY + +/** + * Test whether the current replay entry is a sync record matching the + * worker's pedigree. + * + * @param w The __cilkrts_worker we're executing on. + * + * @return 1 if the current replay entry matches the current pedigree. + * @return 0 if there's no match, or if we're not replaying a log. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +int replay_match_sync_pedigree(__cilkrts_worker *w) +{ + // If we're not replaying, assume no match + if (REPLAY_LOG != w->g->record_or_replay) + return 0; + + return replay_match_sync_pedigree_internal(w); +} +#else +__CILKRTS_INLINE +int replay_match_sync_pedigree(__cilkrts_worker *w) +{ + // Record/replay is disabled. Assume no match + return 0; +} +#endif + +/** + * Marks a sync record seen, advancing to the next record in the replay list. + * + * This function will only advance to the next record if: + * - Record/replay hasn't been compiled out AND + * - We're replaying a log AND + * - A match was found AND + * - The sync is not being abandoned + * + * @param w The __cilkrts_worker we're executing on. + * @param match_found The value returned by replay_match_sync_pedigree(). If + * match_found is false, nothing is done. + * @param continuing Flag indicating whether this worker will continue from + * the sync (it's the last worker to the sync) or if it will abandon the work + * and go to the scheduling loop to look for more work it can steal. + */ +#ifdef CILK_RECORD_REPLAY +__CILKRTS_INLINE +void replay_advance_from_sync(__cilkrts_worker *w, int match_found, int continuing) +{ + // If we're replaying a log, and the current sync wasn't abandoned, and we + // found a match in the log, mark the sync record seen. + if ((REPLAY_LOG == w->g->record_or_replay) && match_found && continuing) + replay_advance_from_sync_internal(w); +} +#else +__CILKRTS_INLINE +void replay_advance_from_sync(__cilkrts_worker *w, int match_found, int continuing) +{ +} +#endif + +/** + * Release any resources used to read or write a replay log. + * + * @param g Cilk runtime global state + */ +void replay_term(global_state_t *g); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_RECORD_REPLAY_DOT_H) diff --git a/libcilkrts/runtime/reducer_impl.cpp b/libcilkrts/runtime/reducer_impl.cpp new file mode 100644 index 00000000000..f20b9bc4592 --- /dev/null +++ b/libcilkrts/runtime/reducer_impl.cpp @@ -0,0 +1,1012 @@ +/* reducer_impl.cpp -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Patents Pending, Intel Corporation. + **************************************************************************/ + +/** + * Support for reducers + */ + +// ICL: Don't complain about conversion from pointer to same-sized integral type +// in hashfun. That's why we're using size_t +#ifdef _WIN32 +# pragma warning(disable: 1684) +#endif + +#include "reducer_impl.h" +#include "scheduler.h" +#include "bug.h" +#include "os.h" +#include "global_state.h" +#include "frame_malloc.h" + +#include "cilk/hyperobject_base.h" +#include "cilktools/cilkscreen.h" +#include "internal/abi.h" + +#if REDPAR_DEBUG > 0 +#include <stdio.h> +#include <stdlib.h> +#endif + + +#define DBG if(0) // if(1) enables some internal checks + +// Check that w is the currently executing worker. This method is a +// no-op unless the debug level is set high enough. +static inline void verify_current_wkr(__cilkrts_worker *w) +{ +#if REDPAR_DEBUG >= 5 + __cilkrts_worker* tmp = __cilkrts_get_tls_worker(); + if (w != tmp) { + fprintf(stderr, "W=%d, actual=%d... missing a refresh....\n", + w->self, + tmp->self); + } + CILK_ASSERT(w == tmp); // __cilkrts_get_tls_worker()); +#endif +} + +// Suppress clang warning that the expression result is unused +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-value" +#endif // __clang__ + +/// Helper class to disable and re-enable Cilkscreen +struct DisableCilkscreen +{ + DisableCilkscreen () { __cilkscreen_disable_checking(); } + ~DisableCilkscreen () { __cilkscreen_enable_checking(); } +}; + +/// Helper class to enable and re-disable Cilkscreen +struct EnableCilkscreen +{ + EnableCilkscreen () { __cilkscreen_enable_checking(); } + ~EnableCilkscreen () { __cilkscreen_disable_checking(); } +}; + +#if defined(__clang__) && (! 
defined(__INTEL_COMPILER)) +# pragma clang diagnostic pop +#endif // __clang__ + +/** + * @brief Element for a hyperobject + */ +struct elem { + void *key; ///< Shared key for this hyperobject + __cilkrts_hyperobject_base *hb; ///< Base of the hyperobject. + void *view; ///< Strand-private view of this hyperobject + /// Destroy and deallocate the view object for this element and set view to + /// null. + void destroy(); + + /// Returns true if this element contains a leftmost view. + bool is_leftmost() const; +}; + +/** Bucket containing at most NMAX elements */ +struct bucket { + /// Size of the array of elements for this bucket + size_t nmax; + + /** + * We use the ``struct hack'' to allocate an array of variable + * dimension at the end of the struct. However, we allocate a + * total of NMAX+1 elements instead of NMAX. The last one always + * has key == 0, which we use as a termination criterion + */ + elem el[1]; +}; + +/** + * Class that implements the map for reducers so we can find the + * view for a strand. + */ +struct cilkred_map { + /** Handy pointer to the global state */ + global_state_t *g; + + /** Number of elements in table */ + size_t nelem; + + /** Number of buckets */ + size_t nbuckets; + + /** Array of pointers to buckets */ + bucket **buckets; + + /** Set true if merging (for debugging purposes) */ + bool merging; + + /** Set true for leftmost reducer map */ + bool is_leftmost; + + /** @brief Return element mapped to 'key' or null if not found. */ + elem *lookup(void *key); + + /** + * @brief Insert key/value element into hash map without rehashing. + * Does not check for duplicate key. + */ + elem *insert_no_rehash(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *value); + + /** + * @brief Insert key/value element into hash map, rehashing if necessary. + * Does not check for duplicate key. + */ + inline elem *rehash_and_insert(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *value); + + /** @brief Grow bucket by one element, reallocating bucket if necessary */ + static elem *grow(__cilkrts_worker *w, bucket **bp); + + /** @brief Rehash a worker's reducer map */ + void rehash(__cilkrts_worker *); + + /** + * @brief Returns true if a rehash is needed due to the number of elements that + * have been inserted. + */ + inline bool need_rehash_p() const; + + /** @brief Allocate and initialize the buckets */ + void make_buckets(__cilkrts_worker *w, size_t nbuckets); + + /** + * Specify behavior when the same key is present in both maps passed + * into merge(). + */ + enum merge_kind + { + MERGE_UNORDERED, ///< Assertion fails + MERGE_INTO_LEFT, ///< Merges the argument from the right into the left + MERGE_INTO_RIGHT ///< Merges the argument from the left into the right + }; + + /** + * @brief Merge another reducer map into this one, destroying the other map in + * the process. 
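+     *
+     * For illustration only: when the two maps are actually combined
+     * (MERGE_INTO_LEFT or MERGE_INTO_RIGHT), the element-level reduction
+     * performed by merge() is, in effect,
+     *
+     *     reduce_fn((void*)hb, left_view, right_view);  // left <- left OP right
+     *
+     * MERGE_INTO_RIGHT merely swaps the two view pointers first, so that
+     * the surviving (right-hand) map ends up holding the reduced left view.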
+ */ + __cilkrts_worker* merge(__cilkrts_worker *current_wkr, + cilkred_map *other_map, + enum merge_kind kind); + + /** @brief check consistency of a reducer map */ + void check(bool allow_null_view); + + /** @brief Test whether the cilkred_map is empty */ + bool is_empty() { return nelem == 0; } +}; + +static inline struct cilkred_map* install_new_reducer_map(__cilkrts_worker *w) { + cilkred_map *h; + h = __cilkrts_make_reducer_map(w); + w->reducer_map = h; + return h; +} + +static size_t sizeof_bucket(size_t nmax) +{ + bucket *b = 0; + return (sizeof(*b) + nmax * sizeof(b->el[0])); +} + +static bucket *alloc_bucket(__cilkrts_worker *w, size_t nmax) +{ + bucket *b = (bucket *) + __cilkrts_frame_malloc(w, sizeof_bucket(nmax)); + b->nmax = nmax; + return b; +} + +static void free_bucket(__cilkrts_worker *w, bucket **bp) +{ + bucket *b = *bp; + if (b) { + __cilkrts_frame_free(w, b, sizeof_bucket(b->nmax)); + *bp = 0; + } +} + +/* round up nmax to fill a memory allocator block completely */ +static size_t roundup(size_t nmax) +{ + size_t sz = sizeof_bucket(nmax); + + /* round up size to a full malloc block */ + sz = __cilkrts_frame_malloc_roundup(sz); + + /* invert sizeof_bucket() */ + nmax = ((sz - sizeof(bucket)) / sizeof(elem)); + + return nmax; +} + +static bool is_power_of_2(size_t n) +{ + return (n & (n - 1)) == 0; +} + +void cilkred_map::make_buckets(__cilkrts_worker *w, + size_t new_nbuckets) +{ + nbuckets = new_nbuckets; + + CILK_ASSERT(is_power_of_2(nbuckets)); +#if defined __GNUC__ && defined __ICC + /* bug workaround -- suppress calls to _intel_fast_memset */ + bucket *volatile*new_buckets = (bucket *volatile*) +#else + bucket **new_buckets = (bucket **) +#endif + __cilkrts_frame_malloc(w, nbuckets * sizeof(*(buckets))); + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, desc=make_buckets, new_buckets=%p, new_nbuckets=%zd\n", + w->self, new_buckets, new_nbuckets); +#endif + + for (size_t i = 0; i < new_nbuckets; ++i) + new_buckets[i] = 0; +#if defined __GNUC__ && defined __ICC + buckets = (bucket **)new_buckets; +#else + buckets = new_buckets; +#endif + nelem = 0; +} + +static void free_buckets(__cilkrts_worker *w, + bucket **buckets, + size_t nbuckets) +{ + size_t i; + +#if REDPAR_DEBUG >= 1 + verify_current_wkr(w); + fprintf(stderr, "W=%d, desc=free_buckets, buckets=%p, size=%zd\n", + w->self, buckets, + nbuckets * sizeof(*buckets)); +#endif + + for (i = 0; i < nbuckets; ++i) + free_bucket(w, buckets + i); + + __cilkrts_frame_free(w, buckets, nbuckets * sizeof(*buckets)); +} + +static size_t minsz(size_t nelem) +{ + return 1U + nelem + nelem / 8U; +} + +static size_t nextsz(size_t nelem) +{ + return 2 * nelem; +} + +bool cilkred_map::need_rehash_p() const +{ + return minsz(nelem) > nbuckets; +} + +static inline size_t hashfun(const cilkred_map *h, void *key) +{ + size_t k = (size_t) key; + + k ^= k >> 21; + k ^= k >> 8; + k ^= k >> 3; + + return k & (h->nbuckets - 1); +} + +// Given a __cilkrts_hyperobject_base, return the key to that hyperobject in +// the reducer map. +static inline void* get_hyperobject_key(__cilkrts_hyperobject_base *hb) +{ + // The current implementation uses the address of the lefmost view as the + // key. + return reinterpret_cast<char*>(hb) + hb->__view_offset; +} + +// Given a hyperobject key, return a pointer to the leftmost object. In the +// current implementation, the address of the leftmost object IS the key, so +// this function is an effective noop. 
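+// For illustration (hypothetical layout, not a type defined by the runtime):
+// if a reducer object were declared as
+//
+//     struct my_reducer {
+//         __cilkrts_hyperobject_base base;
+//         long                       sum;   /* the leftmost view */
+//     };
+//
+// then base.__view_offset would be the offset of 'sum' from 'base', so
+// get_hyperobject_key(&r.base) returns &r.sum, and get_leftmost_view()
+// hands that same address back unchanged.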
+static inline void* get_leftmost_view(void *key) +{ + return key; +} + +/* debugging support: check consistency of a reducer map */ +void cilkred_map::check(bool allow_null_view) +{ + size_t count = 0; + + CILK_ASSERT(buckets); + for (size_t i = 0; i < nbuckets; ++i) { + bucket *b = buckets[i]; + if (b) + for (elem *el = b->el; el->key; ++el) { + CILK_ASSERT(allow_null_view || el->view); + ++count; + } + } + CILK_ASSERT(nelem == count); + /*global_reducer_map::check();*/ +} + +/* grow bucket by one element, reallocating bucket if necessary */ +elem *cilkred_map::grow(__cilkrts_worker *w, + bucket **bp) +{ + size_t i, nmax, nnmax; + bucket *b, *nb; + + b = *bp; + if (b) { + nmax = b->nmax; + /* find empty element if any */ + for (i = 0; i < nmax; ++i) + if (b->el[i].key == 0) + return &(b->el[i]); + /* do not use the last one even if empty */ + } else { + nmax = 0; + } + + verify_current_wkr(w); + /* allocate a new bucket */ + nnmax = roundup(2 * nmax); + nb = alloc_bucket(w, nnmax); + + + /* copy old bucket into new */ + for (i = 0; i < nmax; ++i) + nb->el[i] = b->el[i]; + + free_bucket(w, bp); *bp = nb; + + /* zero out extra elements */ + for (; i < nnmax; ++i) + nb->el[i].key = 0; + + /* zero out the last one */ + nb->el[i].key = 0; + + return &(nb->el[nmax]); +} + +elem *cilkred_map::insert_no_rehash(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *view) +{ + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=insert_no_rehash, this_map=%p]\n", + w->self, this); + verify_current_wkr(w); +#endif + + CILK_ASSERT((w == 0 && g == 0) || w->g == g); + CILK_ASSERT(key != 0); + CILK_ASSERT(view != 0); + + elem *el = grow(w, &(buckets[hashfun(this, key)])); + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, this=%p, inserting key=%p, view=%p, el = %p]\n", + w->self, this, key, view, el); +#endif + + el->key = key; + el->hb = hb; + el->view = view; + ++nelem; + + return el; +} + +void cilkred_map::rehash(__cilkrts_worker *w) +{ +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "[W=%d, desc=rehash, this_map=%p, g=%p, w->g=%p]\n", + w->self, this, g, w->g); + verify_current_wkr(w); +#endif + CILK_ASSERT((w == 0 && g == 0) || w->g == g); + + size_t onbuckets = nbuckets; + size_t onelem = nelem; + bucket **obuckets = buckets; + size_t i; + bucket *b; + + make_buckets(w, nextsz(nbuckets)); + + for (i = 0; i < onbuckets; ++i) { + b = obuckets[i]; + if (b) { + elem *oel; + for (oel = b->el; oel->key; ++oel) + insert_no_rehash(w, oel->key, oel->hb, oel->view); + } + } + + CILK_ASSERT(nelem == onelem); + + free_buckets(w, obuckets, onbuckets); +} + +elem *cilkred_map::rehash_and_insert(__cilkrts_worker *w, + void *key, + __cilkrts_hyperobject_base *hb, + void *view) +{ + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, this_map =%p, inserting key=%p, view=%p\n", + w->self, this, key, view); + verify_current_wkr(w); +#endif + + if (need_rehash_p()) + rehash(w); + + return insert_no_rehash(w, key, hb, view); +} + + +elem *cilkred_map::lookup(void *key) +{ + bucket *b = buckets[hashfun(this, key)]; + + if (b) { + elem *el; + for (el = b->el; el->key; ++el) { + if (el->key == key) { + CILK_ASSERT(el->view); + return el; + } + } + } + + return 0; +} + +void elem::destroy() +{ + if (! is_leftmost()) { + + // Call destroy_fn and deallocate_fn on the view, but not if it's the + // leftmost view. 
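+        // (For example, a reducer built on the helpers defined later in
+        // this file could supply __cilkrts_hyperobject_noop_destroy as its
+        // destroy_fn and __cilkrts_hyperobject_dealloc as its deallocate_fn.)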
+ cilk_c_monoid *monoid = &(hb->__c_monoid); + cilk_c_reducer_destroy_fn_t destroy_fn = monoid->destroy_fn; + cilk_c_reducer_deallocate_fn_t deallocate_fn = monoid->deallocate_fn; + + destroy_fn((void*)hb, view); + deallocate_fn((void*)hb, view); + } + + view = 0; +} + +inline +bool elem::is_leftmost() const +{ + // implementation uses the address of the leftmost view as the key, so if + // key == view, then this element refers to the leftmost view. + return key == view; +} + +/* remove the reducer from the current reducer map. If the reducer + exists in maps other than the current one, the behavior is + undefined. */ +extern "C" +CILK_EXPORT void __CILKRTS_STRAND_STALE( + __cilkrts_hyper_destroy(__cilkrts_hyperobject_base *hb)) +{ + // Disable Cilkscreen for the duration of this call. The destructor for + // this class will re-enable Cilkscreen when the method returns. This + // will prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen x; + + __cilkrts_worker* w = __cilkrts_get_tls_worker(); + if (! w) { + // If no worker, then Cilk is not running and there is no reducer + // map. Do nothing. The reducer's destructor will take care of + // destroying the leftmost view. + return; + } + +const char *UNSYNCED_REDUCER_MSG = + "Destroying a reducer while it is visible to unsynced child tasks, or\n" + "calling CILK_C_UNREGISTER_REDUCER() on an unregistered reducer.\n" + "Did you forget a _Cilk_sync or CILK_C_REGISTER_REDUCER()?"; + + cilkred_map* h = w->reducer_map; + if (NULL == h) + cilkos_error(UNSYNCED_REDUCER_MSG); // Does not return + + if (h->merging) { + verify_current_wkr(w); + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + } + + void* key = get_hyperobject_key(hb); + elem *el = h->lookup(key); + + // Verify that the reducer is being destroyed from the leftmost strand for + // which the reducer is defined. + if (! (el && el->is_leftmost())) + cilkos_error(UNSYNCED_REDUCER_MSG); + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, key=%p, lookup in map %p, found el=%p, about to destroy]\n", + w->self, key, h, el); +#endif + + // Remove the element from the hash bucket. Do not bother shrinking + // the bucket. Note that the destroy() function does not actually + // call the destructor for the leftmost view. + el->destroy(); + do { + el[0] = el[1]; + ++el; + } while (el->key); + --h->nelem; + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=hyper_destroy_finish, key=%p, w->reducer_map=%p]\n", + w->self, key, w->reducer_map); +#endif +} + +extern "C" +CILK_EXPORT +void __cilkrts_hyper_create(__cilkrts_hyperobject_base *hb) +{ + // This function registers the specified hyperobject in the current + // reducer map and registers the initial value of the hyperobject as the + // leftmost view of the reducer. + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (! w) { + // If there is no worker, then there is nothing to do: The iniitial + // value will automatically be used as the left-most view when we + // enter Cilk. + return; + } + + // Disable Cilkscreen for the duration of this call. The destructor for + // this class will re-enable Cilkscreen when the method returns. 
This + // will prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen x; + + void* key = get_hyperobject_key(hb); + void* view = get_leftmost_view(key); + cilkred_map *h = w->reducer_map; + + if (__builtin_expect(!h, 0)) { + h = install_new_reducer_map(w); +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, hb=%p, hyper_create, isntalled new map %p, view=%p]\n", + w->self, hb, h, view); +#endif + } + + /* Must not exist. */ + CILK_ASSERT(h->lookup(key) == NULL); + +#if REDPAR_DEBUG >= 3 + verify_current_wkr(w); + fprintf(stderr, "[W=%d, hb=%p, lookup in map %p of view %p, should be null]\n", + w->self, hb, h, view); + fprintf(stderr, "W=%d, h=%p, inserting key %p, view%p\n", + w->self, + h, + &(hb->__c_monoid), + view); +#endif + + if (h->merging) + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + + CILK_ASSERT(w->reducer_map == h); + // The address of the leftmost value is the same as the key for lookup. + (void) h->rehash_and_insert(w, view, hb, view); +} + +extern "C" +CILK_EXPORT void* __CILKRTS_STRAND_PURE( + __cilkrts_hyper_lookup(__cilkrts_hyperobject_base *hb)) +{ + __cilkrts_worker* w = __cilkrts_get_tls_worker_fast(); + void* key = get_hyperobject_key(hb); + if (! w) + return get_leftmost_view(key); + + // Disable Cilkscreen for the duration of this call. This will + // prevent Cilkscreen from reporting apparent races in reducers + DisableCilkscreen dguard; + + if (__builtin_expect(w->g->force_reduce, 0)) + __cilkrts_promote_own_deque(w); + cilkred_map* h = w->reducer_map; + + if (__builtin_expect(!h, 0)) { + h = install_new_reducer_map(w); + } + + if (h->merging) + __cilkrts_bug("User error: hyperobject used by another hyperobject"); + elem* el = h->lookup(key); + if (! el) { + /* lookup failed; insert a new default element */ + void *rep; + + { + /* re-enable cilkscreen while calling the constructor */ + EnableCilkscreen eguard; + if (h->is_leftmost) + { + // This special case is called only if the reducer was not + // registered using __cilkrts_hyper_create, e.g., if this is a + // C reducer in global scope or if there is no bound worker. + rep = get_leftmost_view(key); + } + else + { + rep = hb->__c_monoid.allocate_fn((void*)hb, + hb->__view_size); + // TBD: Handle exception on identity function + hb->__c_monoid.identity_fn((void*)hb, rep); + } + } + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "W=%d, h=%p, inserting key %p, view%p\n", + w->self, + h, + &(hb->__c_monoid), + rep); + CILK_ASSERT(w->reducer_map == h); +#endif + el = h->rehash_and_insert(w, key, hb, rep); + } + + return el->view; +} + +extern "C" CILK_EXPORT +void* __cilkrts_hyperobject_alloc(void* ignore, std::size_t bytes) +{ + return std::malloc(bytes); +} + +extern "C" CILK_EXPORT +void __cilkrts_hyperobject_dealloc(void* ignore, void* view) +{ + std::free(view); +} + +/* No-op destroy function */ +extern "C" CILK_EXPORT +void __cilkrts_hyperobject_noop_destroy(void* ignore, void* ignore2) +{ +} + +cilkred_map *__cilkrts_make_reducer_map(__cilkrts_worker *w) +{ + CILK_ASSERT(w); + + cilkred_map *h; + size_t nbuckets = 1; /* default value */ + + h = (cilkred_map *)__cilkrts_frame_malloc(w, sizeof(*h)); +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "[W=%d, desc=make_reducer_frame_malloc_reducer_map, h=%p]\n", + w->self, h); +#endif + + h->g = w ? w->g : 0; + h->make_buckets(w, nbuckets); + h->merging = false; + h->is_leftmost = false; + + return h; +} + +/* Destroy a reducer map. 
The map must have been allocated + from the worker's global context and should have been + allocated from the same worker. */ +void __cilkrts_destroy_reducer_map(__cilkrts_worker *w, cilkred_map *h) +{ + CILK_ASSERT((w == 0 && h->g == 0) || w->g == h->g); + verify_current_wkr(w); + + /* the reducer map is allowed to contain el->view == NULL here (and + only here). We set el->view == NULL only when we know that the + map will be destroyed immediately afterwards. */ + DBG h->check(/*allow_null_view=*/true); + + bucket *b; + size_t i; + + for (i = 0; i < h->nbuckets; ++i) { + b = h->buckets[i]; + if (b) { + elem *el; + for (el = b->el; el->key; ++el) { + if (el->view) + el->destroy(); + } + } + } + + free_buckets(w, h->buckets, h->nbuckets); + +#if REDPAR_DEBUG >= 1 + fprintf(stderr, "W=%d, destroy_red_map, freeing map h=%p, size=%zd\n", + w->self, h, sizeof(*h)); +#endif + + __cilkrts_frame_free(w, h, sizeof(*h)); +} + +/* Set the specified reducer map as the leftmost map if is_leftmost is true, + otherwise, set it to not be the leftmost map. */ +void __cilkrts_set_leftmost_reducer_map(cilkred_map *h, int is_leftmost) +{ + h->is_leftmost = is_leftmost; +} + + +__cilkrts_worker* cilkred_map::merge(__cilkrts_worker *w, + cilkred_map *other_map, + enum merge_kind kind) +{ + // Disable Cilkscreen while the we merge the maps. The destructor for + // the guard class will re-enable Cilkscreen when it goes out of scope. + // This will prevent Cilkscreen from reporting apparent races in between + // the reduce function and the reducer operations. The Cilk runtime + // guarantees that a pair of reducer maps will only be merged when no + // other strand will access them. + DisableCilkscreen guard; + +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=merge, this_map=%p, other_map=%p]\n", + w->self, + this, other_map); +#endif + // Remember the current stack frame. + __cilkrts_stack_frame *current_sf = w->current_stack_frame; + merging = true; + other_map->merging = true; + + // Merging to the leftmost view is a special case because every leftmost + // element must be initialized before the merge. + CILK_ASSERT(!other_map->is_leftmost /* || kind == MERGE_UNORDERED */); + bool merge_to_leftmost = (this->is_leftmost + /* && !other_map->is_leftmost */); + + DBG check(/*allow_null_view=*/false); + DBG other_map->check(/*allow_null_view=*/false); + + for (size_t i = 0; i < other_map->nbuckets; ++i) { + bucket *b = other_map->buckets[i]; + if (b) { + for (elem *other_el = b->el; other_el->key; ++other_el) { + /* Steal the value from the other map, which will be + destroyed at the end of this operation. */ + void *other_view = other_el->view; + CILK_ASSERT(other_view); + + void *key = other_el->key; + __cilkrts_hyperobject_base *hb = other_el->hb; + elem *this_el = lookup(key); + + if (this_el == 0 && merge_to_leftmost) { + /* Initialize leftmost view before merging. */ + void* leftmost = get_leftmost_view(key); + // leftmost == other_view can be true if the initial view + // was created in other than the leftmost strand of the + // spawn tree, but then made visible to subsequent strands + // (E.g., the reducer was allocated on the heap and the + // pointer was returned to the caller.) In such cases, + // parallel semantics says that syncing with earlier + // strands will always result in 'this_el' being null, + // thus propagating the initial view up the spawn tree + // until it reaches the leftmost strand. 
When synching + // with the leftmost strand, leftmost == other_view will be + // true and we must avoid reducing the initial view with + // itself. + if (leftmost != other_view) + this_el = rehash_and_insert(w, key, hb, leftmost); + } + + if (this_el == 0) { + /* move object from other map into this one */ + rehash_and_insert(w, key, hb, other_view); + other_el->view = 0; + continue; /* No element-level merge necessary */ + } + + /* The same key is present in both maps with values + A and B. Three choices: fail, A OP B, B OP A. */ + switch (kind) + { + case MERGE_UNORDERED: + __cilkrts_bug("TLS Reducer race"); + break; + case MERGE_INTO_RIGHT: + /* Swap elements in order to preserve object + identity */ + other_el->view = this_el->view; + this_el->view = other_view; + /* FALL THROUGH */ + case MERGE_INTO_LEFT: { + /* Stealing should be disabled during reduce + (even if force-reduce is enabled). */ + +#if DISABLE_PARALLEL_REDUCERS + __cilkrts_stack_frame * volatile *saved_protected_tail; + saved_protected_tail = __cilkrts_disallow_stealing(w, NULL); +#endif + + { + CILK_ASSERT(current_sf->worker == w); + CILK_ASSERT(w->current_stack_frame == current_sf); + + /* TBD: if reduce throws an exception we need to stop it + here. */ + hb->__c_monoid.reduce_fn((void*)hb, + this_el->view, + other_el->view); + w = current_sf->worker; + +#if REDPAR_DEBUG >= 2 + verify_current_wkr(w); + CILK_ASSERT(w->current_stack_frame == current_sf); +#endif + } + +#if DISABLE_PARALLEL_REDUCERS + /* Restore stealing */ + __cilkrts_restore_stealing(w, saved_protected_tail); +#endif + + } break; + } + } + } + } + this->is_leftmost = this->is_leftmost || other_map->is_leftmost; + merging = false; + other_map->merging = false; + verify_current_wkr(w); + __cilkrts_destroy_reducer_map(w, other_map); + return w; +} + + +/** + * Print routine for debugging the merging of reducer maps. + * A no-op unless REDPAR_DEBUG set high enough. + */ +static inline +void debug_map_merge(__cilkrts_worker *w, + cilkred_map *left_map, + cilkred_map *right_map, + __cilkrts_worker **final_wkr) +{ +#if REDPAR_DEBUG >= 2 + fprintf(stderr, "[W=%d, desc=finish_merge, left_map=%p, right_map=%p, w->reducer_map=%p, right_ans=%p, final_wkr=%d]\n", + w->self, left_map, right_map, w->reducer_map, right_map, (*final_wkr)->self); +#endif +} + + +/** + * merge RIGHT into LEFT; + * return whichever map allows for faster merge, and destroy the other one. + * + * *w_ptr should be the currently executing worker. + * *w_ptr may change during execution if the reduction is parallel. + */ +cilkred_map* +merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map) +{ + __cilkrts_worker *w = *w_ptr; + if (!left_map) { + debug_map_merge(w, left_map, right_map, w_ptr); + return right_map; + } + + if (!right_map) { + debug_map_merge(w, left_map, right_map, w_ptr); + return left_map; + } + + /* Special case, if left_map is leftmost, then always merge into it. + For C reducers this forces lazy creation of the leftmost views. */ + if (left_map->is_leftmost || left_map->nelem > right_map->nelem) { + *w_ptr = left_map->merge(w, right_map, cilkred_map::MERGE_INTO_LEFT); + debug_map_merge(*w_ptr, left_map, right_map, w_ptr); + return left_map; + } else { + *w_ptr = right_map->merge(w, left_map, cilkred_map::MERGE_INTO_RIGHT); + debug_map_merge(*w_ptr, left_map, right_map, w_ptr); + return right_map; + } +} + +/** + * Merges RIGHT into LEFT, and then repeatedly calls + * merge_reducer_maps_helper() until (*w_ptr)->reducer_map is NULL. 
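+ *
+ * The repetition is needed because a user-supplied reduce_fn may itself
+ * use reducers or run in parallel, which can leave a freshly created
+ * reducer map installed on the returning worker; that map, too, must be
+ * folded into LEFT before the merge is complete.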
+ * + * *w_ptr may change as reductions execute. + */ +cilkred_map* +repeated_merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map) +{ + // Note: if right_map == NULL but w->reducer_map != NULL, then + // this loop will reduce w->reducer_map into left_map. + do { + left_map = merge_reducer_maps(w_ptr, left_map, right_map); + verify_current_wkr(*w_ptr); + + // Pull any newly created reducer map and loop around again. + right_map = (*w_ptr)->reducer_map; + (*w_ptr)->reducer_map = NULL; + } while (right_map); + return left_map; +} + +/* End reducer_impl.cpp */ diff --git a/libcilkrts/runtime/reducer_impl.h b/libcilkrts/runtime/reducer_impl.h new file mode 100644 index 00000000000..3425967ad8d --- /dev/null +++ b/libcilkrts/runtime/reducer_impl.h @@ -0,0 +1,128 @@ +/* reducer_impl.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file reducer_impl.h + * + * @brief Functions to implement reducers in the runtime. + */ + +#ifndef INCLUDED_REDUCER_IMPL_DOT_H +#define INCLUDED_REDUCER_IMPL_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Construct an empty reducer map from the memory pool associated with the + * given worker. This reducer map must be destroyed before the worker's + * associated global context is destroyed. + * + * @param w __cilkrts_worker the cilkred_map is being created for. + * + * @return Pointer to the initialized cilkred_map. + */ +COMMON_SYSDEP +cilkred_map *__cilkrts_make_reducer_map(__cilkrts_worker *w); + +/** + * Destroy a reducer map. The map must have been allocated from the worker's + * global context and should have been allocated from the same worker. 
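+ *
+ * Illustrative pairing (a sketch, not a complete program):
+ *
+ *     cilkred_map *h = __cilkrts_make_reducer_map(w);
+ *     ...                                   // h accumulates views for w
+ *     __cilkrts_destroy_reducer_map(w, h);  // same w, before g is destroyed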
+ * + * @param w __cilkrts_worker the cilkred_map was created for. + * @param h The cilkred_map to be deallocated. + */ +COMMON_SYSDEP +void __cilkrts_destroy_reducer_map(__cilkrts_worker *w, + cilkred_map *h); + +/** + * Set the specified reducer map as the leftmost map if is_leftmost is true, + * otherwise, set it to not be the leftmost map. + * + * @param h The cilkred_map to be modified. + * @param is_leftmost true if the reducer map is leftmost. + */ +COMMON_SYSDEP +void __cilkrts_set_leftmost_reducer_map(cilkred_map *h, + int is_leftmost); + +/** + * Merge reducer map RIGHT_MAP into LEFT_MAP and return the result of the + * merge. Both maps must be allocated from the global context associated + * with the specified worker. The returned reducer map must be destroyed + * before the worker's associated global context is destroyed. + * + * If two cilkred_maps are specified, one will be destroyed and the other + * one will be returned as the merged cilkred_map. + * + * When reducers can contain nested parallelism, execution can return + * on a different worker than when it started (but still using the + * same stack). + * + * Upon return, *w_ptr stores the pointer to the worker that execution + * returns on. + * + * @param w_ptr Pointer to the currently executing worker. + * @param left_map The left cilkred_map. + * @param right_map The right cilkred_map. + * + * @return pointer to merged cilkred_map. + */ +extern +cilkred_map *merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map); + +/** + * Similar to merge_reducer_maps(), except that after merging + * RIGHT_MAP into LEFT_MAP, it repeatedly merges (*w_ptr)->reducer_map + * into LEFT_MAP. This procedure ensures that any new reducers + * created by the reductions themselves also get merged into LEFT_MAP. + */ +extern +cilkred_map *repeated_merge_reducer_maps(__cilkrts_worker **w_ptr, + cilkred_map *left_map, + cilkred_map *right_map); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_REDUCER_IMPL_DOT_H) diff --git a/libcilkrts/runtime/rts-common.h b/libcilkrts/runtime/rts-common.h new file mode 100644 index 00000000000..4ffde7ccb1e --- /dev/null +++ b/libcilkrts/runtime/rts-common.h @@ -0,0 +1,132 @@ +/* rts-common.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#ifndef INCLUDED_RTS_COMMON_DOT_H +#define INCLUDED_RTS_COMMON_DOT_H + +/* Abbreviations API functions returning different types. By using these + * abbreviations instead of using CILK_API(ret) directly, etags and other + * tools can more easily recognize function signatures. + */ +#define CILK_API_VOID CILK_API(void) +#define CILK_API_VOID_PTR CILK_API(void*) +#define CILK_API_INT CILK_API(int) +#define CILK_API_SIZET CILK_API(size_t) +#define CILK_API_TBB_RETCODE CILK_API(__cilk_tbb_retcode) +#define CILK_API_PEDIGREE CILK_API(__cilkrts_pedigree) + +/* Abbreviations ABI functions returning different types. By using these + * abbreviations instead of using CILK_ABI(ret) directly, etags and other + * tools can more easily recognize function signatures. + */ +#define CILK_ABI_VOID CILK_ABI(void) +#define CILK_ABI_WORKER_PTR CILK_ABI(__cilkrts_worker_ptr) +#define CILK_ABI_THROWS_VOID CILK_ABI_THROWS(void) + +/* documentation aid to identify portable vs. nonportable + parts of the runtime. See README for definitions. */ +#define COMMON_PORTABLE +#define COMMON_SYSDEP +#define NON_COMMON + +#if !(defined __GNUC__ || defined __ICC) +# define __builtin_expect(a_, b_) a_ +#endif + +#ifdef __cplusplus +# define cilk_nothrow throw() +#else +# define cilk_nothrow /*empty in C*/ +#endif + +#ifdef __GNUC__ +# define NORETURN void __attribute__((noreturn)) +#else +# define NORETURN void __declspec(noreturn) +#endif + +#ifdef __GNUC__ +# define NOINLINE __attribute__((noinline)) +#else +# define NOINLINE __declspec(noinline) +#endif + +#ifndef __GNUC__ +# define __attribute__(X) +#endif + +/* Microsoft CL accepts "inline" for C++, but not for C. It accepts + * __inline for both. Intel ICL accepts inline for C of /Qstd=c99 + * is set. The Cilk runtime is assumed to be compiled with /Qstd=c99 + */ +#if defined(_MSC_VER) && ! defined(__INTEL_COMPILER) +# error define inline +# define inline __inline +#endif + +/* Compilers that build the Cilk runtime are assumed to know about zero-cost + * intrinsics (__notify_intrinsic()). For those that don't, #undef the + * following definition: + */ +//#define ENABLE_NOTIFY_ZC_INTRINSIC 1 + +#if defined(__INTEL_COMPILER) +/* The notify intrinsic was introduced in ICC 12.0. */ +# if __INTEL_COMPILER <= 1200 +# undef ENABLE_NOTIFY_ZC_INTRINSIC +# endif +#elif defined(__VXWORKS__) +# undef ENABLE_NOTIFY_ZC_INTRINSIC +#elif defined(__clang__) +# if !defined(__has_extension) || !__has_extension(notify_zc_intrinsic) +# undef ENABLE_NOTIFY_ZC_INTRINSIC +# endif +#elif defined(__arm__) +// __notify_zc_intrinsic not yet supported by gcc for ARM +# undef ENABLE_NOTIFY_ZC_INTRINSIC +#endif + +// If ENABLE_NOTIFY_ZC_INTRINSIC is defined, use __notify_zc_intrisic +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC +# define NOTIFY_ZC_INTRINSIC(annotation, data) \ + __notify_zc_intrinsic(annotation, data) +#else +# define NOTIFY_ZC_INTRINSIC(annotation, data) +#endif + +#endif // ! 
defined(INCLUDED_RTS_COMMON_DOT_H) diff --git a/libcilkrts/runtime/scheduler.c b/libcilkrts/runtime/scheduler.c new file mode 100644 index 00000000000..bab6430d9db --- /dev/null +++ b/libcilkrts/runtime/scheduler.c @@ -0,0 +1,3940 @@ +/* scheduler.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2007-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +/* + * Cilk scheduler + */ + +#include "scheduler.h" +#include "bug.h" +#include "os.h" +#include "os_mutex.h" +#include "local_state.h" +#include "signal_node.h" +#include "full_frame.h" +#include "sysdep.h" +#include "except.h" +#include "cilk_malloc.h" +#include "pedigrees.h" +#include "record-replay.h" + +#include <limits.h> +#include <string.h> /* memcpy */ +#include <stdio.h> // sprintf +#include <stdlib.h> // malloc, free, abort + +#ifdef _WIN32 +# pragma warning(disable:1786) // disable warning: sprintf is deprecated +# include "sysdep-win.h" +# include "except-win32.h" +#endif // _WIN32 + +// ICL: Don't complain about conversion from pointer to same-sized integral +// type in __cilkrts_put_stack. 
That's why we're using ptrdiff_t +#ifdef _WIN32 +# pragma warning(disable: 1684) +#endif + +#include "cilk/cilk_api.h" +#include "frame_malloc.h" +#include "metacall_impl.h" +#include "reducer_impl.h" +#include "cilk-tbb-interop.h" +#include "cilk-ittnotify.h" +#include "stats.h" + +// ICL: Don't complain about loss of precision in myrand +// I tried restoring the warning after the function, but it didn't +// suppress it +#ifdef _WIN32 +# pragma warning(disable: 2259) +#endif + +#ifndef _WIN32 +# include <unistd.h> +#endif + +#ifdef __VXWORKS__ +// redeclare longjmp() with noreturn to stop warnings +extern __attribute__((noreturn)) + void longjmp(jmp_buf, int); +#endif + +//#define DEBUG_LOCKS 1 +#ifdef DEBUG_LOCKS +// The currently executing worker must own this worker's lock +# define ASSERT_WORKER_LOCK_OWNED(w) \ + { \ + __cilkrts_worker *tls_worker = __cilkrts_get_tls_worker(); \ + CILK_ASSERT((w)->l->lock.owner == tls_worker); \ + } +#else +# define ASSERT_WORKER_LOCK_OWNED(w) +#endif // DEBUG_LOCKS + +// Options for the scheduler. +enum schedule_t { SCHEDULE_RUN, + SCHEDULE_WAIT, + SCHEDULE_EXIT }; + +// Return values for provably_good_steal() +enum provably_good_steal_t +{ + ABANDON_EXECUTION, // Not the last child to the sync - attempt to steal work + CONTINUE_EXECUTION, // Last child to the sync - continue executing on this worker + WAIT_FOR_CONTINUE // The replay log indicates that this was the worker + // which continued. Loop until we are the last worker + // to the sync. +}; + + +// Verify that "w" is the worker we are currently executing on. +// Because this check is expensive, this method is usually a no-op. +static inline void verify_current_wkr(__cilkrts_worker *w) +{ +#if ((REDPAR_DEBUG >= 3) || (FIBER_DEBUG >= 1)) + // Lookup the worker from TLS and compare to w. + __cilkrts_worker* tmp = __cilkrts_get_tls_worker(); + if (w != tmp) { + fprintf(stderr, "Error. W=%d, actual worker =%d...\n", + w->self, + tmp->self); + } + CILK_ASSERT(w == tmp); +#endif +} + +static enum schedule_t worker_runnable(__cilkrts_worker *w); + +// Scheduling-fiber functions: +static void do_return_from_spawn (__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf); +static void do_sync (__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf); + +// max is defined on Windows and VxWorks +#if (! defined(_WIN32)) && (! defined(__VXWORKS__)) + // TBD: definition of max() for Linux. +# define max(a, b) ((a) < (b) ? (b) : (a)) +#endif + +void __cilkrts_dump_stats_to_stderr(global_state_t *g) +{ +#ifdef CILK_PROFILE + int i; + for (i = 0; i < g->total_workers; ++i) { + // Print out statistics for each worker. We collected them, + // so why not print them out? + fprintf(stderr, "Stats for worker %d\n", i); + dump_stats_to_file(stderr, g->workers[i]->l->stats); + __cilkrts_accum_stats(&g->stats, g->workers[i]->l->stats); + } + + // Also print out aggregate statistics. 
+ dump_stats_to_file(stderr, &g->stats); +#endif + fprintf(stderr, + "CILK PLUS Thread Info: P=%d, Q=%d\n", + g->P, + g->Q); + fprintf(stderr, + "CILK PLUS RUNTIME MEMORY USAGE: %lld bytes", + (long long)g->frame_malloc.allocated_from_os); +#ifdef CILK_PROFILE + if (g->stats.stack_hwm) + fprintf(stderr, ", %ld stacks", g->stats.stack_hwm); +#endif + fputc('\n', stderr); +} + +static void validate_worker(__cilkrts_worker *w) +{ + /* check the magic numbers, for debugging purposes */ + if (w->l->worker_magic_0 != WORKER_MAGIC_0 || + w->l->worker_magic_1 != WORKER_MAGIC_1) + abort_because_rts_is_corrupted(); +} + +static void double_link(full_frame *left_ff, full_frame *right_ff) +{ + if (left_ff) + left_ff->right_sibling = right_ff; + if (right_ff) + right_ff->left_sibling = left_ff; +} + +/* add CHILD to the right of all children of PARENT */ +static void push_child(full_frame *parent_ff, full_frame *child_ff) +{ + double_link(parent_ff->rightmost_child, child_ff); + double_link(child_ff, 0); + parent_ff->rightmost_child = child_ff; +} + +/* unlink CHILD from the list of all children of PARENT */ +static void unlink_child(full_frame *parent_ff, full_frame *child_ff) +{ + double_link(child_ff->left_sibling, child_ff->right_sibling); + + if (!child_ff->right_sibling) { + /* this is the rightmost child -- update parent link */ + CILK_ASSERT(parent_ff->rightmost_child == child_ff); + parent_ff->rightmost_child = child_ff->left_sibling; + } + child_ff->left_sibling = child_ff->right_sibling = 0; /* paranoia */ +} + +static void incjoin(full_frame *ff) +{ + ++ff->join_counter; +} + +static int decjoin(full_frame *ff) +{ + CILK_ASSERT(ff->join_counter > 0); + return (--ff->join_counter); +} + +static int simulate_decjoin(full_frame *ff) +{ + CILK_ASSERT(ff->join_counter > 0); + return (ff->join_counter - 1); +} + +/* + * Pseudo-random generator defined by the congruence S' = 69070 * S + * mod (2^32 - 5). Marsaglia (CACM July 1993) says on page 107 that + * this is a ``good one''. There you go. + * + * The literature makes a big fuss about avoiding the division, but + * for us it is not worth the hassle. + */ +static const unsigned RNGMOD = ((1ULL << 32) - 5); +static const unsigned RNGMUL = 69070U; + +static unsigned myrand(__cilkrts_worker *w) +{ + unsigned state = w->l->rand_seed; + state = (unsigned)((RNGMUL * (unsigned long long)state) % RNGMOD); + w->l->rand_seed = state; + return state; +} + +static void mysrand(__cilkrts_worker *w, unsigned seed) +{ + seed %= RNGMOD; + seed += (seed == 0); /* 0 does not belong to the multiplicative + group. Use 1 instead */ + w->l->rand_seed = seed; +} + +/* W grabs its own lock */ +void __cilkrts_worker_lock(__cilkrts_worker *w) +{ + validate_worker(w); + CILK_ASSERT(w->l->do_not_steal == 0); + + /* tell thieves to stay out of the way */ + w->l->do_not_steal = 1; + __cilkrts_fence(); /* probably redundant */ + + __cilkrts_mutex_lock(w, &w->l->lock); +} + +void __cilkrts_worker_unlock(__cilkrts_worker *w) +{ + __cilkrts_mutex_unlock(w, &w->l->lock); + CILK_ASSERT(w->l->do_not_steal == 1); + /* The fence is probably redundant. Use a release + operation when supported (gcc and compatibile); + that is faster on x86 which serializes normal stores. 
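+       (For illustration, the release below is roughly equivalent to the
+       C11 form
+           atomic_store_explicit(&w->l->do_not_steal, 0,
+                                 memory_order_release);
+       assuming an atomic flag; this code uses the older GCC
+       __sync_lock_release() builtin instead.)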
*/ +#if defined __GNUC__ && (__GNUC__ * 10 + __GNUC_MINOR__ > 43 || __ICC >= 1110) + __sync_lock_release(&w->l->do_not_steal); +#else + w->l->do_not_steal = 0; + __cilkrts_fence(); /* store-store barrier, redundant on x86 */ +#endif +} + +/* try to acquire the lock of some *other* worker */ +static int worker_trylock_other(__cilkrts_worker *w, + __cilkrts_worker *other) +{ + int status = 0; + + validate_worker(other); + + /* This protocol guarantees that, after setting the DO_NOT_STEAL + flag, worker W can enter its critical section after waiting for + the thief currently in the critical section (if any) and at + most one other thief. + + This requirement is overly paranoid, but it should protect us + against future nonsense from OS implementors. + */ + + /* compete for the right to disturb OTHER */ + if (__cilkrts_mutex_trylock(w, &other->l->steal_lock)) { + if (other->l->do_not_steal) { + /* leave it alone */ + } else { + status = __cilkrts_mutex_trylock(w, &other->l->lock); + } + __cilkrts_mutex_unlock(w, &other->l->steal_lock); + } + + + return status; +} + +static void worker_unlock_other(__cilkrts_worker *w, + __cilkrts_worker *other) +{ + __cilkrts_mutex_unlock(w, &other->l->lock); +} + + +/* Lock macro Usage: + BEGIN_WITH_WORKER_LOCK(w) { + statement; + statement; + BEGIN_WITH_FRAME_LOCK(w, ff) { + statement; + statement; + } END_WITH_FRAME_LOCK(w, ff); + } END_WITH_WORKER_LOCK(w); + */ +#define BEGIN_WITH_WORKER_LOCK(w) __cilkrts_worker_lock(w); do +#define END_WITH_WORKER_LOCK(w) while (__cilkrts_worker_unlock(w), 0) + +// TBD(jsukha): These are worker lock acquistions on +// a worker whose deque is empty. My conjecture is that we +// do not need to hold the worker lock at these points. +// I have left them in for now, however. +// +// #define REMOVE_POSSIBLY_OPTIONAL_LOCKS +#ifdef REMOVE_POSSIBLY_OPTIONAL_LOCKS + #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) do + #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (0) +#else + #define BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) __cilkrts_worker_lock(w); do + #define END_WITH_WORKER_LOCK_OPTIONAL(w) while (__cilkrts_worker_unlock(w), 0) +#endif + + +#define BEGIN_WITH_FRAME_LOCK(w, ff) \ + do { full_frame *_locked_ff = ff; __cilkrts_frame_lock(w, _locked_ff); do + +#define END_WITH_FRAME_LOCK(w, ff) \ + while (__cilkrts_frame_unlock(w, _locked_ff), 0); } while (0) + +/* W becomes the owner of F and F can be stolen from W */ +static void make_runnable(__cilkrts_worker *w, full_frame *ff) +{ + w->l->frame_ff = ff; + + /* CALL_STACK is invalid (the information is stored implicitly in W) */ + ff->call_stack = 0; +} + +/* + * The worker parameter is unused, except for print-debugging purposes. + */ +static void make_unrunnable(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int is_loot, + const char *why) +{ + /* CALL_STACK becomes valid again */ + ff->call_stack = sf; + + if (sf) { +#if CILK_LIB_DEBUG + if (__builtin_expect(sf->flags & CILK_FRAME_EXITING, 0)) + __cilkrts_bug("W%d suspending exiting frame %p/%p\n", w->self, ff, sf); +#endif + sf->flags |= CILK_FRAME_STOLEN | CILK_FRAME_SUSPENDED; + sf->worker = 0; + + if (is_loot) + __cilkrts_put_stack(ff, sf); + + /* perform any system-dependent action, such as saving the + state of the stack */ + __cilkrts_make_unrunnable_sysdep(w, ff, sf, is_loot, why); + } +} + + +/* Push the next full frame to be made active in this worker and increment its + * join counter. __cilkrts_push_next_frame and pop_next_frame work on a + * one-element queue. 
This queue is used to communicate across the runtime + * from the code that wants to activate a frame to the code that can actually + * begin execution on that frame. They are asymetrical in that push + * increments the join counter but pop does not decrement it. Rather, a + * single push/pop combination makes a frame active and increments its join + * counter once. */ +void __cilkrts_push_next_frame(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(ff); + CILK_ASSERT(!w->l->next_frame_ff); + incjoin(ff); + w->l->next_frame_ff = ff; +} + +/* Get the next full-frame to be made active in this worker. The join count + * of the full frame will have been incremented by the corresponding push + * event. See __cilkrts_push_next_frame, above. + */ +static full_frame *pop_next_frame(__cilkrts_worker *w) +{ + full_frame *ff; + ff = w->l->next_frame_ff; + // Remove the frame from the next_frame field. + // + // If this is a user worker, then there is a chance that another worker + // from our team could push work into our next_frame (if it is the last + // worker doing work for this team). The other worker's setting of the + // next_frame could race with our setting of next_frame to NULL. This is + // the only possible race condition on next_frame. However, if next_frame + // has a non-NULL value, then it means the team still has work to do, and + // there is no chance of another team member populating next_frame. Thus, + // it is safe to set next_frame to NULL, if it was populated. There is no + // need for an atomic op. + if (NULL != ff) { + w->l->next_frame_ff = NULL; + } + return ff; +} + +/* + * Identify the single worker that is allowed to cross a sync in this frame. A + * thief should call this function when it is the first to steal work from a + * user worker. "First to steal work" may mean that there has been parallelism + * in the user worker before, but the whole team sync'd, and this is the first + * steal after that. + * + * This should happen while holding the worker and frame lock. + */ +static void set_sync_master(__cilkrts_worker *w, full_frame *ff) +{ + w->l->last_full_frame = ff; + ff->sync_master = w; +} + +/* + * The sync that ends all parallelism for a particular user worker is about to + * be crossed. Decouple the worker and frame. + * + * No locks need to be held since the user worker isn't doing anything, and none + * of the system workers can steal from it. But unset_sync_master() should be + * called before the user worker knows about this work (i.e., before it is + * inserted into the w->l->next_frame_ff is set). + */ +static void unset_sync_master(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(WORKER_USER == w->l->type); + CILK_ASSERT(ff->sync_master == w); + ff->sync_master = NULL; + w->l->last_full_frame = NULL; +} + +/******************************************************************** + * THE protocol: + ********************************************************************/ +/* + * This is a protocol for work stealing that minimizes the overhead on + * the victim. + * + * The protocol uses three shared pointers into the worker's deque: + * - T - the "tail" + * - H - the "head" + * - E - the "exception" NB: In this case, "exception" has nothing to do + * with C++ throw-catch exceptions -- it refers only to a non-normal return, + * i.e., a steal or similar scheduling exception. + * + * with H <= E, H <= T. + * + * Stack frames SF, where H <= E < T, are available for stealing. + * + * The worker operates on the T end of the stack. 
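+ * For example, a sketch of the operations described above, in terms of the
+ * worker fields this file manipulates:
+ *
+ *     *w->tail++ = sf;        // spawn: expose the continuation (T++)
+ *     sf = *--w->tail;        // return: try to reclaim it (--T)
+ *     if (w->exc > w->tail)   // E > T: a thief (or a signal) intervened;
+ *         take_the_slow_path();   // (hypothetical name for the THE handler)
+ *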
The frame being + * worked on is not on the stack. To make a continuation available for + * stealing the worker pushes a from onto the stack: stores *T++ = SF. + * To return, it pops the frame off the stack: obtains SF = *--T. + * + * After decrementing T, the condition E > T signals to the victim that + * it should invoke the runtime system's "THE" exception handler. The + * pointer E can become INFINITY, in which case the victim must invoke + * the THE exception handler as soon as possible. + * + * See "The implementation of the Cilk-5 multithreaded language", PLDI 1998, + * http://portal.acm.org/citation.cfm?doid=277652.277725, for more information + * on the THE protocol. + */ + +/* the infinity value of E */ +#define EXC_INFINITY ((__cilkrts_stack_frame **) (-1)) + +static void increment_E(__cilkrts_worker *victim) +{ + __cilkrts_stack_frame *volatile *tmp; + + // The currently executing worker must own the worker lock to touch + // victim->exc + ASSERT_WORKER_LOCK_OWNED(victim); + + tmp = victim->exc; + if (tmp != EXC_INFINITY) { + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + victim->exc = tmp + 1; + __cilkrts_fence(); + } +} + +static void decrement_E(__cilkrts_worker *victim) +{ + __cilkrts_stack_frame *volatile *tmp; + + // The currently executing worker must own the worker lock to touch + // victim->exc + ASSERT_WORKER_LOCK_OWNED(victim); + + tmp = victim->exc; + if (tmp != EXC_INFINITY) { + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + victim->exc = tmp - 1; + __cilkrts_fence(); /* memory fence not really necessary */ + } +} + +#if 0 +/* for now unused, will be necessary if we implement abort */ +static void signal_THE_exception(__cilkrts_worker *wparent) +{ + wparent->exc = EXC_INFINITY; + __cilkrts_fence(); +} +#endif + +static void reset_THE_exception(__cilkrts_worker *w) +{ + // The currently executing worker must own the worker lock to touch + // w->exc + ASSERT_WORKER_LOCK_OWNED(w); + + w->exc = w->head; + __cilkrts_fence(); +} + +/* conditions under which victim->head can be stolen: */ +static int can_steal_from(__cilkrts_worker *victim) +{ + return ((victim->head < victim->tail) && + (victim->head < victim->protected_tail)); +} + +/* Return TRUE if the frame can be stolen, false otherwise */ +static int dekker_protocol(__cilkrts_worker *victim) +{ + // increment_E and decrement_E are going to touch victim->exc. 
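+    // (Illustration of the handshake: a thief first bumps E past H while
+    //  a returning victim decrements T; if both race for the same frame,
+    //  the victim then sees E > T and diverts to the runtime's THE
+    //  exception path, so at most one of them keeps the frame.)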
The + // currently executing worker must own victim's lock before they can + // modify it + ASSERT_WORKER_LOCK_OWNED(victim); + + /* ASSERT(E >= H); */ + + increment_E(victim); + + /* ASSERT(E >= H + 1); */ + if (can_steal_from(victim)) { + /* success, we can steal victim->head and set H <- H + 1 + in detach() */ + return 1; + } else { + /* failure, restore previous state */ + decrement_E(victim); + return 0; + } +} + + +/* Link PARENT and CHILD in the spawn tree */ +static full_frame *make_child(__cilkrts_worker *w, + full_frame *parent_ff, + __cilkrts_stack_frame *child_sf, + cilk_fiber *fiber) +{ + full_frame *child_ff = __cilkrts_make_full_frame(w, child_sf); + + child_ff->parent = parent_ff; + push_child(parent_ff, child_ff); + + //DBGPRINTF("%d- make_child - child_frame: %p, parent_frame: %p, child_sf: %p\n" + // " parent - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n" + // " child - parent: %p, left_sibling: %p, right_sibling: %p, rightmost_child: %p\n", + // w->self, child, parent, child_sf, + // parent->parent, parent->left_sibling, parent->right_sibling, parent->rightmost_child, + // child->parent, child->left_sibling, child->right_sibling, child->rightmost_child); + CILK_ASSERT(parent_ff->call_stack); + child_ff->is_call_child = (fiber == NULL); + + /* PLACEHOLDER_FIBER is used as non-null marker indicating that + child should be treated as a spawn child even though we have not + yet assigned a real fiber to its parent. */ + if (fiber == PLACEHOLDER_FIBER) + fiber = NULL; /* Parent actually gets a null fiber, for now */ + + /* perform any system-dependent actions, such as capturing + parameter passing information */ + /*__cilkrts_make_child_sysdep(child, parent);*/ + + /* Child gets reducer map and stack of parent. + Parent gets a new map and new stack. */ + child_ff->fiber_self = parent_ff->fiber_self; + child_ff->sync_master = NULL; + + if (child_ff->is_call_child) { + /* Cause segfault on any attempted access. The parent gets + the child map and stack when the child completes. */ + parent_ff->fiber_self = 0; + } else { + parent_ff->fiber_self = fiber; + } + + incjoin(parent_ff); + return child_ff; +} + +static inline __cilkrts_stack_frame *__cilkrts_advance_frame(__cilkrts_stack_frame *sf) +{ + __cilkrts_stack_frame *p = sf->call_parent; + sf->call_parent = 0; + return p; +} + +/* w should be the currently executing worker. + * loot_sf is the youngest stack frame in the call stack being + * unrolled (i.e., the most deeply nested stack frame.) + * + * When this method is called for a steal, loot_sf should be on a + * victim worker which is different from w. + * For CILK_FORCE_REDUCE, the victim worker will equal w. + * + * Before execution, the __cilkrts_stack_frame's have pointers from + * older to younger, i.e., a __cilkrts_stack_frame points to parent. + * + * This method creates a full frame for each __cilkrts_stack_frame in + * the call stack, with each full frame also pointing to its parent. + * + * The method returns the full frame created for loot_sf, i.e., the + * youngest full frame. + */ +static full_frame *unroll_call_stack(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *const loot_sf) +{ + __cilkrts_stack_frame *sf = loot_sf; + __cilkrts_stack_frame *rev_sf = 0; + __cilkrts_stack_frame *t_sf; + + CILK_ASSERT(sf); + /*CILK_ASSERT(sf->call_parent != sf);*/ + + /* The leafmost frame is unsynched. 
*/ + if (sf->worker != w) + sf->flags |= CILK_FRAME_UNSYNCHED; + + /* Reverse the call stack to make a linked list ordered from parent + to child. sf->call_parent points to the child of SF instead of + the parent. */ + do { + t_sf = (sf->flags & (CILK_FRAME_DETACHED|CILK_FRAME_STOLEN|CILK_FRAME_LAST))? 0 : sf->call_parent; + sf->call_parent = rev_sf; + rev_sf = sf; + sf = t_sf; + } while (sf); + sf = rev_sf; + + /* Promote each stack frame to a full frame in order from parent + to child, following the reversed list we just built. */ + make_unrunnable(w, ff, sf, sf == loot_sf, "steal 1"); + /* T is the *child* of SF, because we have reversed the list */ + for (t_sf = __cilkrts_advance_frame(sf); t_sf; + sf = t_sf, t_sf = __cilkrts_advance_frame(sf)) { + ff = make_child(w, ff, t_sf, NULL); + make_unrunnable(w, ff, t_sf, t_sf == loot_sf, "steal 2"); + } + + /* XXX What if the leafmost frame does not contain a sync + and this steal is from promote own deque? */ + /*sf->flags |= CILK_FRAME_UNSYNCHED;*/ + + CILK_ASSERT(!sf->call_parent); + return ff; +} + +/* detach the top of the deque frame from the VICTIM and install a new + CHILD frame in its place */ +static void detach_for_steal(__cilkrts_worker *w, + __cilkrts_worker *victim, + cilk_fiber* fiber) +{ + /* ASSERT: we own victim->lock */ + + full_frame *parent_ff, *child_ff, *loot_ff; + __cilkrts_stack_frame *volatile *h; + __cilkrts_stack_frame *sf; + + w->l->team = victim->l->team; + + CILK_ASSERT(w->l->frame_ff == 0 || w == victim); + + h = victim->head; + + CILK_ASSERT(*h); + + victim->head = h + 1; + + parent_ff = victim->l->frame_ff; + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + /* parent no longer referenced by victim */ + decjoin(parent_ff); + + /* obtain the victim call stack */ + sf = *h; + + /* perform system-dependent normalizations */ + /*__cilkrts_normalize_call_stack_on_steal(sf);*/ + + /* unroll PARENT_FF with call stack SF, adopt the youngest + frame LOOT. If loot_ff == parent_ff, then we hold loot_ff->lock, + otherwise, loot_ff is newly created and we can modify it without + holding its lock. */ + loot_ff = unroll_call_stack(w, parent_ff, sf); + + #if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, victim=%d, desc=detach, parent_ff=%p, loot=%p]\n", + w->self, victim->self, + parent_ff, loot_ff); + #endif + + if (WORKER_USER == victim->l->type && + NULL == victim->l->last_full_frame) { + // Mark this looted frame as special: only the original user worker + // may cross the sync. + // + // This call is a shared access to + // victim->l->last_full_frame. + set_sync_master(victim, loot_ff); + } + + /* LOOT is the next frame that the thief W is supposed to + run, unless the thief is stealing from itself, in which + case the thief W == VICTIM executes CHILD and nobody + executes LOOT. */ + if (w == victim) { + /* Pretend that frame has been stolen */ + loot_ff->call_stack->flags |= CILK_FRAME_UNSYNCHED; + loot_ff->simulated_stolen = 1; + } + else + __cilkrts_push_next_frame(w, loot_ff); + + // After this "push_next_frame" call, w now owns loot_ff. + child_ff = make_child(w, loot_ff, 0, fiber); + + BEGIN_WITH_FRAME_LOCK(w, child_ff) { + /* install child in the victim's work queue, taking + the parent_ff's place */ + /* child is referenced by victim */ + incjoin(child_ff); + + // With this call, w is bestowing ownership of the newly + // created frame child_ff to the victim, and victim is + // giving up ownership of parent_ff. 
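/* [Editorial sketch] The detach performed by detach_for_steal() is only legal
 * because dekker_protocol() already won the THE handshake against the victim's
 * own pop path (and the thief additionally holds the victim's worker lock).
 * Below is a minimal, simplified model of both sides of that handshake, using
 * the E/H/T names of the PLDI'98 paper cited earlier; it is not the runtime's
 * real code. */
#if 0  /* illustrative only; not part of this commit */
typedef struct simple_deque {
    volatile long E;    /* exception pointer, advanced by thieves        */
    volatile long H;    /* head: next frame a thief would steal          */
    volatile long T;    /* tail: next free slot for the victim's pushes  */
} simple_deque;

/* Thief side: corresponds to increment_E() + can_steal_from() above. */
static int thief_try_steal(simple_deque *d)
{
    d->E++;                      /* announce the steal attempt           */
    /* a memory fence is required here on real hardware                  */
    if (d->H < d->T) {           /* something to steal                   */
        d->H++;                  /* take the oldest frame                */
        return 1;
    }
    d->E--;                      /* lost the race: undo                  */
    return 0;
}

/* Victim side: every pop checks E against T, as described in the THE
   comment near the top of this file. */
static int victim_try_pop(simple_deque *d)
{
    d->T--;                      /* speculatively pop                    */
    /* a memory fence is required here on real hardware                  */
    if (d->E > d->T) {           /* a thief may be after the same frame  */
        d->T++;                  /* restore, then take the slow,         */
        return 0;                /* lock-protected exception path        */
    }
    return 1;                    /* uncontended fast path                */
}
#endif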
+ // + // Worker w will either take ownership of parent_ff + // if parent_ff == loot_ff, or parent_ff will be + // suspended. + // + // Note that this call changes the victim->frame_ff + // while the victim may be executing. + make_runnable(victim, child_ff); + } END_WITH_FRAME_LOCK(w, child_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); +} + +/** + * @brief cilk_fiber_proc that resumes user code after a successful + * random steal. + + * This function longjmps back into the user code whose state is + * stored in cilk_fiber_get_data(fiber)->resume_sf. The stack pointer + * is adjusted so that the code resumes on the specified fiber stack + * instead of its original stack. + * + * This method gets executed only on a fiber freshly allocated from a + * pool. + * + * @param fiber The fiber being used to resume user code. + * @param arg Unused. + */ +static +void fiber_proc_to_resume_user_code_for_random_steal(cilk_fiber *fiber) +{ + cilk_fiber_data *data = cilk_fiber_get_data(fiber); + __cilkrts_stack_frame* sf = data->resume_sf; + full_frame *ff; + + CILK_ASSERT(sf); + + // When we pull the resume_sf out of the fiber to resume it, clear + // the old value. + data->resume_sf = NULL; + CILK_ASSERT(sf->worker == data->owner); + ff = sf->worker->l->frame_ff; + + // For Win32, we need to overwrite the default exception handler + // in this function, so that when the OS exception handling code + // walks off the top of the current Cilk stack, it reaches our stub + // handler. + + // Also, this function needs to be wrapped into a try-catch block + // so the compiler generates the appropriate exception information + // in this frame. + + // TBD: IS THIS HANDLER IN THE WRONG PLACE? Can we longjmp out of + // this function (and does it matter?) +#if defined(_WIN32) && !defined(_WIN64) + install_exception_stub_handler(); + __try +#endif + { + char* new_sp = sysdep_reset_jump_buffers_for_resume(fiber, ff, sf); + + // Notify the Intel tools that we're stealing code + ITT_SYNC_ACQUIRED(sf->worker); + NOTIFY_ZC_INTRINSIC("cilk_continue", sf); + + // TBD: We'd like to move TBB-interop methods into the fiber + // eventually. + cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); + + sf->flags &= ~CILK_FRAME_SUSPENDED; + + // longjmp to user code. Don't process exceptions here, + // because we are resuming a stolen frame. + sysdep_longjmp_to_sf(new_sp, sf, NULL); + /*NOTREACHED*/ + // Intel's C compiler respects the preceding lint pragma + } +#if defined(_WIN32) && !defined(_WIN64) + __except (CILK_ASSERT(!"should not execute the the stub filter"), + EXCEPTION_EXECUTE_HANDLER) + { + // If we are here, that means something very wrong + // has happened in our exception processing... + CILK_ASSERT(! "should not be here!"); + } +#endif +} + +static void random_steal(__cilkrts_worker *w) +{ + __cilkrts_worker *victim = NULL; + cilk_fiber *fiber = NULL; + int n; + int success = 0; + int32_t victim_id; + + // Nothing's been stolen yet. When true, this will flag + // setup_for_execution_pedigree to increment the pedigree + w->l->work_stolen = 0; + + /* If the user has disabled stealing (using the debugger) we fail */ + if (__builtin_expect(w->g->stealing_disabled, 0)) + return; + + CILK_ASSERT(w->l->type == WORKER_SYSTEM || w->l->team == w); + + /* If there is only one processor work can still be stolen. + There must be only one worker to prevent stealing. 
*/ + CILK_ASSERT(w->g->total_workers > 1); + + /* pick random *other* victim */ + n = myrand(w) % (w->g->total_workers - 1); + if (n >= w->self) + ++n; + + // If we're replaying a log, override the victim. -1 indicates that + // we've exhausted the list of things this worker stole when we recorded + // the log so just return. If we're not replaying a log, + // replay_get_next_recorded_victim() just returns the victim ID passed in. + n = replay_get_next_recorded_victim(w, n); + if (-1 == n) + return; + + victim = w->g->workers[n]; + + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) { + /* Verify that we can get a stack. If not, no need to continue. */ + fiber = cilk_fiber_allocate(&w->l->fiber_pool); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE); + + + if (NULL == fiber) { +#if FIBER_DEBUG >= 2 + fprintf(stderr, "w=%d: failed steal because we could not get a fiber\n", + w->self); +#endif + return; + } + + /* do not steal from self */ + CILK_ASSERT (victim != w); + + /* Execute a quick check before engaging in the THE protocol. + Avoid grabbing locks if there is nothing to steal. */ + if (!can_steal_from(victim)) { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); + // Fibers we use when trying to steal should not be active, + // and thus should not have any other references. + CILK_ASSERT(0 == ref_count); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + return; + } + + /* Attempt to steal work from the victim */ + if (worker_trylock_other(w, victim)) { + if (w->l->type == WORKER_USER && victim->l->team != w) { + + // Fail to steal if this is a user worker and the victim is not + // on this team. If a user worker were allowed to steal work + // descended from another user worker, the former might not be + // done with its work by the time it was needed to resume and + // unbind. Therefore, user workers are not permitted to change + // teams. + + // There is no race on the victim's team because the victim cannot + // change its team until it runs out of work to do, at which point + // it will try to take out its own lock, and this worker already + // holds it. + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_USER_WORKER); + + } else if (victim->l->frame_ff) { + // A successful steal will change victim->frame_ff, even + // though the victim may be executing. Thus, the lock on + // the victim's deque is also protecting victim->frame_ff. + if (dekker_protocol(victim)) { + int proceed_with_steal = 1; // optimistic + + // If we're replaying a log, verify that this the correct frame + // to steal from the victim + if (! replay_match_victim_pedigree(w, victim)) + { + // Abort the steal attempt. decrement_E(victim) to + // counter the increment_E(victim) done by the + // dekker protocol + decrement_E(victim); + proceed_with_steal = 0; + } + + if (proceed_with_steal) + { + START_INTERVAL(w, INTERVAL_STEAL_SUCCESS) { + success = 1; + detach_for_steal(w, victim, fiber); + victim_id = victim->self; + + #if REDPAR_DEBUG >= 1 + fprintf(stderr, "Wkr %d stole from victim %d, fiber = %p\n", + w->self, victim->self, fiber); + #endif + + // The use of victim->self contradicts our + // classification of the "self" field as + // local. But since this code is only for + // debugging, it is ok. 
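/* [Editorial sketch] The victim-selection step near the top of random_steal()
 * draws a uniformly random worker other than ourselves: a value in
 * [0, total_workers-2] is drawn and every value >= w->self is shifted up by
 * one, so w->self can never come out.  A standalone illustration of that
 * trick (pick_random_other and the use of rand() are illustrative, not
 * runtime APIs): */
#if 0  /* illustrative only; not part of this commit */
#include <assert.h>
#include <stdlib.h>

static int pick_random_other(int self, int total_workers)
{
    int n = rand() % (total_workers - 1);   /* 0 .. total_workers-2 */
    if (n >= self)
        ++n;                                /* skip our own id      */
    return n;
}

static void check_pick_random_other(void)
{
    int self, i;
    for (self = 0; self < 8; self++)
        for (i = 0; i < 1000; i++) {
            int v = pick_random_other(self, 8);
            assert(v != self && 0 <= v && v < 8);
        }
}
#endif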
+ DBGPRINTF ("%d-%p: Stealing work from worker %d\n" + " sf: %p, call parent: %p\n", + w->self, GetCurrentFiber(), victim->self, + w->l->next_frame_ff->call_stack, + w->l->next_frame_ff->call_stack->call_parent); + } STOP_INTERVAL(w, INTERVAL_STEAL_SUCCESS); + } // end if(proceed_with_steal) + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_DEKKER); + } + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_EMPTYQ); + } + worker_unlock_other(w, victim); + } else { + NOTE_INTERVAL(w, INTERVAL_STEAL_FAIL_LOCK); + } + + // Record whether work was stolen. When true, this will flag + // setup_for_execution_pedigree to increment the pedigree + w->l->work_stolen = success; + + if (0 == success) { + // failed to steal work. Return the fiber to the pool. + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + int ref_count = cilk_fiber_remove_reference(fiber, &w->l->fiber_pool); + // Fibers we use when trying to steal should not be active, + // and thus should not have any other references. + CILK_ASSERT(0 == ref_count); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + } + else + { + // Since our steal was successful, finish initialization of + // the fiber. + cilk_fiber_reset_state(fiber, + fiber_proc_to_resume_user_code_for_random_steal); + // Record the pedigree of the frame that w has stolen. + // record only if CILK_RECORD_LOG is set + replay_record_steal(w, victim_id); + } +} + + + +/** + * At a provably good steal, we need to transfer the child reducer map + * from ff->children_reducer_map into v->reducer_map, where v is the + * worker that resumes execution of ff. + * + * Normally, we have v == w, where w is the currently executing + * worker. In the case where we are resuming a team leader on a user + * worker, however, v might differ from w. + + * Thus, this, operation is a no-op, since we can't really move + * ff->children_reducer_map into w here. + * + * Instead, this work is done in setup_for_execution_reducers(). + */ +static inline void provably_good_steal_reducers(__cilkrts_worker *w, + full_frame *ff) +{ + // No-op. +} + +/* at a provably good steal, incorporate the accumulated exceptions of + children into the parent's exception */ +static void provably_good_steal_exceptions(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we own ff->lock + ff->pending_exception = + __cilkrts_merge_pending_exceptions(w, + ff->child_pending_exception, + ff->pending_exception); + ff->child_pending_exception = NULL; +} + +/* At sync discard the frame's old stack and take the leftmost child's. */ +static void provably_good_steal_stacks(__cilkrts_worker *w, full_frame *ff) +{ + CILK_ASSERT(NULL == ff->fiber_self); + ff->fiber_self = ff->fiber_child; + ff->fiber_child = NULL; +} + +static void __cilkrts_mark_synched(full_frame *ff) +{ + ff->call_stack->flags &= ~CILK_FRAME_UNSYNCHED; + ff->simulated_stolen = 0; +} + +static +enum provably_good_steal_t provably_good_steal(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold w->lock and ff->lock + + enum provably_good_steal_t result = ABANDON_EXECUTION; + + // If the current replay entry is a sync record matching the worker's + // pedigree, AND this isn't the last child to the sync, return + // WAIT_FOR_CONTINUE to indicate that the caller should loop until + // we find the right frame to steal and CONTINUE_EXECUTION is returned. 
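/* [Editorial sketch] A "provably good steal" resumes a frame at its sync only
 * when the join counter maintained by incjoin()/decjoin() drops to zero,
 * i.e. when the last outstanding reference (in particular the last returning
 * child) is released.  A rough model of that discipline, with illustrative
 * names; the runtime's real accounting is more involved: */
#if 0  /* illustrative only; not part of this commit */
struct frame_model {
    int join_counter;   /* outstanding references: children, owning worker */
};

static void model_add_reference(struct frame_model *ff)
{
    ff->join_counter++;                   /* e.g. a child was spawned      */
}

static int model_drop_reference(struct frame_model *ff)
{
    /* Nonzero means this was the last reference: the frame is now synched
       and whoever released it may resume the frame past its sync.          */
    return --ff->join_counter == 0;
}
#endif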
+ int match_found = replay_match_sync_pedigree(w); + if (match_found && (0 != simulate_decjoin(ff))) + return WAIT_FOR_CONTINUE; + + START_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL) { + if (decjoin(ff) == 0) { + provably_good_steal_reducers(w, ff); + provably_good_steal_exceptions(w, ff); + provably_good_steal_stacks(w, ff); + __cilkrts_mark_synched(ff); + + // If the original owner wants this frame back (to resume + // it on its original thread) pass it back now. + if (NULL != ff->sync_master) { + // The frame wants to go back and be executed by the original + // user thread. We can throw caution to the wind and push the + // frame straight onto its queue because the only way we have + // gotten to this point of being able to continue execution of + // the frame is if the original user worker is spinning without + // work. + + unset_sync_master(w->l->team, ff); + __cilkrts_push_next_frame(w->l->team, ff); + + // If this is the team leader we're not abandoning the work + if (w == w->l->team) + result = CONTINUE_EXECUTION; + } else { + __cilkrts_push_next_frame(w, ff); + result = CONTINUE_EXECUTION; // Continue working on this thread + } + + // The __cilkrts_push_next_frame() call changes ownership + // of ff to the specified worker. + } + } STOP_INTERVAL(w, INTERVAL_PROVABLY_GOOD_STEAL); + + // Only write a SYNC record if: + // - We're recording a log *AND* + // - We're the worker continuing from this sync + replay_record_sync(w, result == CONTINUE_EXECUTION); + + // If we're replaying a log, and matched a sync from the log, mark the + // sync record seen if the sync isn't going to be abandoned. + replay_advance_from_sync (w, match_found, result == CONTINUE_EXECUTION); + + return result; +} + +static void unconditional_steal(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold ff->lock + + START_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL) { + decjoin(ff); + __cilkrts_push_next_frame(w, ff); + } STOP_INTERVAL(w, INTERVAL_UNCONDITIONAL_STEAL); +} + + +/* CHILD is about to die. Give its exceptions to a sibling or to the + parent. */ +static inline void splice_exceptions_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + // ASSERT: We own parent_ff->lock + CILK_ASSERT(child_ff->is_call_child); + CILK_ASSERT(NULL == child_ff->right_pending_exception); + CILK_ASSERT(NULL == parent_ff->pending_exception); + + parent_ff->pending_exception = child_ff->pending_exception; + child_ff->pending_exception = NULL; +} + +/** + * Merge exceptions for a dying child. + * + * @param w The currently executing worker. + * @param ff The child frame that is dying. + * @param left_exception_ptr Pointer to the exception that is to our left. + */ +static inline +void splice_exceptions_for_spawn(__cilkrts_worker *w, + full_frame *ff, + struct pending_exception_info **left_exception_ptr) +{ + // ASSERT: parent_ff == child_ff->parent. + // ASSERT: We own parent_ff->lock + + // Merge current exception into the slot where the left + // exception should go. + *left_exception_ptr = + __cilkrts_merge_pending_exceptions(w, + *left_exception_ptr, + ff->pending_exception); + ff->pending_exception = NULL; + + + // Merge right exception into the slot where the left exception + // should go. 
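/* [Editorial sketch] Both merges in this function funnel into
 * *left_exception_ptr so that, of all exceptions raised by this frame and by
 * its right siblings, the one that would have been raised first in a serial
 * execution (the leftmost) survives to the sync.  A minimal leftmost-wins
 * merge with a hypothetical record type; this is not
 * __cilkrts_merge_pending_exceptions: */
#if 0  /* illustrative only; not part of this commit */
struct exc_model {
    int valid;          /* plus whatever payload a real exception carries */
};

/* Keep the left exception if there is one; otherwise adopt the right one. */
static struct exc_model *leftmost_wins(struct exc_model *left,
                                       struct exc_model *right)
{
    if (left && left->valid)
        return left;    /* left is older in serial order: it wins         */
    return right;       /* otherwise the right-hand exception survives    */
}
#endif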
+ *left_exception_ptr = + __cilkrts_merge_pending_exceptions(w, + *left_exception_ptr, + ff->right_pending_exception); + ff->right_pending_exception = NULL; +} + + +static inline void splice_stacks_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ +#if CILK_LIB_DEBUG + if (parent_ff->call_stack) + CILK_ASSERT(!(parent_ff->call_stack->flags & CILK_FRAME_MBZ)); +#endif + + /* A synched frame does not have accumulated child reducers. */ + CILK_ASSERT(!child_ff->fiber_child); + CILK_ASSERT(child_ff->is_call_child); + + /* An attached parent has no self fiber. It may have + accumulated child fibers or child owners, which should be + ignored until sync. */ + CILK_ASSERT(!parent_ff->fiber_self); + parent_ff->fiber_self = child_ff->fiber_self; + child_ff->fiber_self = NULL; +} + +static void finalize_child_for_call(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + // ASSERT: we hold w->lock and parent_ff->lock + + START_INTERVAL(w, INTERVAL_FINALIZE_CHILD) { + CILK_ASSERT(child_ff->is_call_child); + CILK_ASSERT(child_ff->join_counter == 0); + CILK_ASSERT(!child_ff->rightmost_child); + CILK_ASSERT(child_ff == parent_ff->rightmost_child); + + // CHILD is about to die. + // Splicing out reducers is a no-op for a call since + // w->reducer_map should already store the correct + // reducer map. + + // ASSERT there are no maps left to reduce. + CILK_ASSERT(NULL == child_ff->children_reducer_map); + CILK_ASSERT(NULL == child_ff->right_reducer_map); + + splice_exceptions_for_call(w, parent_ff, child_ff); + + splice_stacks_for_call(w, parent_ff, child_ff); + + /* remove CHILD from list of children of PARENT */ + unlink_child(parent_ff, child_ff); + + /* continue with the parent. */ + unconditional_steal(w, parent_ff); + __cilkrts_destroy_full_frame(w, child_ff); + } STOP_INTERVAL(w, INTERVAL_FINALIZE_CHILD); +} + + +/** + * The invariant on ff->children_reducer_map is that when ff is + * synched and when we are about to resume execution of ff, at least + * one of ff->children_reducer_map and w->reducer_map must be NULL. + * + * Consider the two possibilities before resuming execution of ff: + * + * 1. Suppose ff is synched and suspended. Then either + * + * (a) ff->children_reducer_map stores the reducer map that w + * should use, where w is the worker resuming execution of ff, + * OR + * (b) w already has a user map, and ff->children_reducer_map is NULL. + * + * Case (a) happens when we are resuming execution of ff as a + * provably good steal. In this case, w->reducer_map should be + * NULL and ff->children_reducer_map is valid. To resume + * execution of ff on w, set w->reducer_map to + * ff->children_reducer_map. + * + * Case (b) occurs when we resume execution of ff because ff is a + * called child. Then, ff->children_reducer_map should be NULL, + * and w should already have a valid reducer map when resuming + * execution of ff. We resume execution of ff without changing + * w->reducer_map. + * + * 2. Suppose frame ff is not synched (i.e., it is active and might have + * active children). Then ff->children_reducer_map is the slot for + * storing the reducer map from ff's leftmost child, as in the reducer + * protocol. The runtime may resume execution of ff while it is not + * synched only because of a steal. + * In this case, while we are resuming ff, ff->children_reducer_map + * may be non-NULL (because one of ff's children has completed). + * We resume execution of ff without changing w->reducer_map. 
+ */ +static void setup_for_execution_reducers(__cilkrts_worker *w, + full_frame *ff) +{ + // We only need to move ff->children_reducer_map into + // w->reducer_map in case 1(a). + // + // First check whether ff is synched. + __cilkrts_stack_frame *sf = ff->call_stack; + if (!(sf->flags & CILK_FRAME_UNSYNCHED)) { + // In this case, ff is synched. (Case 1). + CILK_ASSERT(!ff->rightmost_child); + + // Test whether we are in case 1(a) and have + // something to do. Note that if both + // ff->children_reducer_map and w->reducer_map are NULL, we + // can't distinguish between cases 1(a) and 1(b) here. + if (ff->children_reducer_map) { + // We are in Case 1(a). + CILK_ASSERT(!w->reducer_map); + w->reducer_map = ff->children_reducer_map; + ff->children_reducer_map = NULL; + } + } +} + +static void setup_for_execution_exceptions(__cilkrts_worker *w, + full_frame *ff) +{ + CILK_ASSERT(NULL == w->l->pending_exception); + w->l->pending_exception = ff->pending_exception; + ff->pending_exception = NULL; +} + +#if 0 /* unused */ +static void setup_for_execution_stack(__cilkrts_worker *w, + full_frame *ff) +{ +} +#endif + +/* + * setup_for_execution_pedigree + * + * Copies the pedigree information from the frame we're resuming to the + * worker. Increments the pedigree if this is work that has been stolen + * to match the increment on a return from a spawn helper. + */ +static void setup_for_execution_pedigree(__cilkrts_worker *w) +{ + int pedigree_unsynched; + __cilkrts_stack_frame *sf = w->current_stack_frame; + + CILK_ASSERT(NULL != sf); + + // If this isn't an ABI 1 or later frame, there's no pedigree information + if (0 == CILK_FRAME_VERSION_VALUE(sf->flags)) + return; + + // Note whether the pedigree is unsynched and clear the flag before + // we forget + pedigree_unsynched = sf->flags & CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + sf->flags &= ~CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + + // If we're just marshalling onto this worker, do not increment + // the rank since that wouldn't happen in a sequential execution + if (w->l->work_stolen || pedigree_unsynched) + { + if (w->l->work_stolen) + w->pedigree.rank = sf->parent_pedigree.rank + 1; + else + w->pedigree.rank = sf->parent_pedigree.rank; + } + + w->pedigree.parent = sf->parent_pedigree.parent; + w->l->work_stolen = 0; +} + +static void setup_for_execution(__cilkrts_worker *w, + full_frame *ff, + int is_return_from_call) +{ + // ASSERT: We own w->lock and ff->lock || P == 1 + + setup_for_execution_reducers(w, ff); + setup_for_execution_exceptions(w, ff); + /*setup_for_execution_stack(w, ff);*/ + + ff->call_stack->worker = w; + w->current_stack_frame = ff->call_stack; + + // If this is a return from a call, leave the pedigree alone + if (! is_return_from_call) + setup_for_execution_pedigree(w); + + __cilkrts_setup_for_execution_sysdep(w, ff); + + w->head = w->tail = w->l->ltq; + reset_THE_exception(w); + + make_runnable(w, ff); +} + + +/* + * Called by the scheduling fiber, right before + * resuming a sf/ff for user code. + * + * This method associates the specified sf with the worker. + * + * It also asserts that w, ff, and sf all have the expected properties + * for resuming user code. + */ +void scheduling_fiber_prepare_to_resume_user_code(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ + w->current_stack_frame = sf; + sf->worker = w; + + // Lots of debugging checks on the state of the fiber we might be + // resuming. +#if FIBER_DEBUG >= 1 +# if FIBER_DEBUG >= 3 + { + fprintf(stderr, "w=%d: ff=%p, sf=%p. 
about to resume user code\n", + w->self, ff, sf); + } +# endif + + const int flags = sf->flags; + CILK_ASSERT(flags & CILK_FRAME_SUSPENDED); + CILK_ASSERT(!sf->call_parent); + CILK_ASSERT(w->head == w->tail); + + /* A frame can not be resumed unless it was suspended. */ + CILK_ASSERT(ff->sync_sp != NULL); + + /* The leftmost frame has no allocated stack */ + if (ff->simulated_stolen) + CILK_ASSERT(flags & CILK_FRAME_UNSYNCHED); + else if (flags & CILK_FRAME_UNSYNCHED) + /* XXX By coincidence sync_sp could be null. */ + CILK_ASSERT(ff->fiber_self != NULL); + else + /* XXX This frame could be resumed unsynched on the leftmost stack */ + CILK_ASSERT((ff->sync_master == 0 || ff->sync_master == w)); + CILK_ASSERT(w->l->frame_ff == ff); +#endif +} + + +/** + * This method is the first method that should execute after we've + * switched to a scheduling fiber from user code. + * + * @param fiber The scheduling fiber for the current worker. + * @param wptr The current worker. + */ +static void enter_runtime_transition_proc(cilk_fiber *fiber) +{ + // We can execute this method for one of three reasons: + // 1. Undo-detach finds parent stolen. + // 2. Sync suspends frame. + // 3. Return from Cilk entry point. + // + // + // In cases 1 and 2, the frame may be truly suspended or + // may be immediately executed by this worker after provably_good_steal. + // + // + // There is a fourth case, which can, but does not need to execute + // this function: + // 4. Starting up the scheduling loop on a user or + // system worker. In this case, we won't have + // a scheduling stack function to run. + __cilkrts_worker* w = cilk_fiber_get_owner(fiber); + if (w->l->post_suspend) { + // Run the continuation function passed to longjmp_into_runtime + run_scheduling_stack_fcn(w); + + // After we have jumped into the runtime and run the + // scheduling function, any reducer map the worker had before entering the runtime + // should have already been saved into the appropriate full + // frame. + CILK_ASSERT(NULL == w->reducer_map); + + // There shouldn't be any uncaught exceptions. + // + // In Windows, the OS catches any exceptions not caught by the + // user code. Thus, we are omitting the check on Windows. + // + // On Android, calling std::uncaught_exception with the stlport + // library causes a seg fault. Since we're not supporting + // exceptions there at this point, just don't do the check + // + // TBD: Is this check also safe to do on Windows? + CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); + } +} + + +/** + * Method called to jump back to executing user code. + * + * A normal return from the runtime back to resuming user code calls + * this method. A computation executed using force_reduce also calls + * this method to return to user code. + * + * This function should not contain any code that depends on a fiber. + * In a force-reduce case, the user worker may not have a fiber. In + * the force-reduce case, we call this method directly instead of + * calling @c user_code_resume_after_switch_into_runtime. + */ +static inline NORETURN +cilkrts_resume(__cilkrts_stack_frame *sf, full_frame *ff) +{ + // Save the sync stack pointer, and do the bookkeeping + char* sync_sp = ff->sync_sp; + __cilkrts_take_stack(ff, sync_sp); // leaves ff->sync_sp null + + sf->flags &= ~CILK_FRAME_SUSPENDED; + // Actually longjmp to the user code. + // We may have exceptions to deal with, since we are resuming + // a previous-suspended frame. 
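/* [Editorial sketch] sysdep_longjmp_to_sf() below behaves like a longjmp()
 * whose target also has its stack pointer redirected to sync_sp, so the frame
 * resumes on the fiber that now owns its stack.  Plain ISO C cannot retarget
 * the stack, but the control-transfer half of the idea is the ordinary
 * setjmp/longjmp pattern sketched here (standalone, not runtime code): */
#if 0  /* illustrative only; not part of this commit */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf resume_point;

static void runtime_like_code(void)
{
    /* Jump back to the point captured below; the 1 is an arbitrary
       nonzero value delivered as setjmp()'s return value. */
    longjmp(resume_point, 1);
}

int main(void)
{
    if (setjmp(resume_point) == 0) {
        printf("suspending into the runtime\n");
        runtime_like_code();          /* does not return normally */
    } else {
        printf("resumed by the runtime\n");
    }
    return 0;
}
#endif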
+ sysdep_longjmp_to_sf(sync_sp, sf, ff); +} + + +/** + * Called by the user-code fiber right before resuming a full frame + * (sf/ff). + * + * This method pulls sf/ff out of the worker, and then calls + * cilkrts_resume to jump to user code. + */ +static NORETURN +user_code_resume_after_switch_into_runtime(cilk_fiber *fiber) +{ + __cilkrts_worker *w = cilk_fiber_get_owner(fiber); + __cilkrts_stack_frame *sf; + full_frame *ff; + sf = w->current_stack_frame; + ff = sf->worker->l->frame_ff; + +#if FIBER_DEBUG >= 1 + CILK_ASSERT(ff->fiber_self == fiber); + cilk_fiber_data *fdata = cilk_fiber_get_data(fiber); + DBGPRINTF ("%d-%p: resume_after_switch_into_runtime, fiber=%p\n", + w->self, w, fiber); + CILK_ASSERT(sf == fdata->resume_sf); +#endif + + // Notify the Intel tools that we're stealing code + ITT_SYNC_ACQUIRED(sf->worker); + NOTIFY_ZC_INTRINSIC("cilk_continue", sf); + cilk_fiber_invoke_tbb_stack_op(fiber, CILK_TBB_STACK_ADOPT); + + // Actually jump to user code. + cilkrts_resume(sf, ff); + } + + +/* The current stack is about to either be suspended or destroyed. This + * function will switch to the stack on which the scheduler is suspended and + * resume running the scheduler within function do_work(). Upon waking up, + * the scheduler will run the 'cont' function, using the supplied worker and + * frame. + */ +static NORETURN +longjmp_into_runtime(__cilkrts_worker *w, + scheduling_stack_fcn_t fcn, + __cilkrts_stack_frame *sf) +{ + full_frame *ff, *ff2; + + CILK_ASSERT(!w->l->post_suspend); + ff = w->l->frame_ff; + + // If we've got only one worker, stealing shouldn't be possible. + // Assume that this is a steal or return from spawn in a force-reduce case. + // We don't have a scheduling stack to switch to, so call the continuation + // function directly. + if (1 == w->g->P) { + fcn(w, ff, sf); + + /* The call to function c() will have pushed ff as the next frame. If + * this were a normal (non-forced-reduce) execution, there would have + * been a pop_next_frame call in a separate part of the runtime. We + * must call pop_next_frame here to complete the push/pop cycle. */ + ff2 = pop_next_frame(w); + + setup_for_execution(w, ff2, 0); + scheduling_fiber_prepare_to_resume_user_code(w, ff2, w->current_stack_frame); + cilkrts_resume(w->current_stack_frame, ff2); + +// Suppress clang warning that the expression result is unused +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-value" +#endif // __clang__ + /* no return */ + CILK_ASSERT(((void)"returned from __cilkrts_resume", 0)); +#if defined(__clang__) && (! defined(__INTEL_COMPILER)) +# pragma clang diagnostic pop +#endif // __clang__ + } + + w->l->post_suspend = fcn; + w->l->suspended_stack = sf; + + ITT_SYNC_RELEASING(w); + ITT_SYNC_PREPARE(w); + +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime... w->l->frame_ff = %p, sf=%p\n", + cilkos_get_current_thread_id(), + w->self, w->l->frame_ff, + sf); +#endif + + // Current fiber is either the (1) one we are about to free, + // or (2) it has been passed up to the parent. + cilk_fiber *current_fiber = ( w->l->fiber_to_free ? + w->l->fiber_to_free : + w->l->frame_ff->parent->fiber_child ); + cilk_fiber_data* fdata = cilk_fiber_get_data(current_fiber); + CILK_ASSERT(NULL == w->l->frame_ff->fiber_self); + + // Clear the sf in the current fiber for cleanliness, to prevent + // us from accidentally resuming a bad sf. 
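/* [Editorial sketch] The two exits from longjmp_into_runtime() differ only in
 * what happens to current_fiber: case 1 drops the last reference so the fiber
 * can be recycled, case 2 merely suspends it because it has been passed to
 * the parent frame and will be resumed later.  A rough model of that decision
 * (field and function names are illustrative, not the cilk_fiber API): */
#if 0  /* illustrative only; not part of this commit */
struct fiber_model {
    int refcount;
    int suspended;
};

static void leave_user_fiber(struct fiber_model *f, int we_hold_last_reference)
{
    if (we_hold_last_reference) {
        if (--f->refcount == 0) {
            /* case 1: nobody will ever resume this fiber; recycle it */
        }
    } else {
        /* case 2: the parent still references the fiber; just park it */
        f->suspended = 1;
    }
    /* ...and in either case switch to the worker's scheduling fiber.  */
}
#endif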
+ // Technically, resume_sf gets overwritten for a fiber when + // we are about to resume it anyway. + fdata->resume_sf = NULL; + CILK_ASSERT(fdata->owner == w); + + // Set the function to execute immediately after switching to the + // scheduling fiber, but before freeing any fibers. + cilk_fiber_set_post_switch_proc(w->l->scheduling_fiber, + enter_runtime_transition_proc); + cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_ORPHAN); + + if (w->l->fiber_to_free) { + // Case 1: we are freeing this fiber. We never + // resume this fiber again after jumping into the runtime. + w->l->fiber_to_free = NULL; + + // Extra check. Normally, the fiber we are about to switch to + // should have a NULL owner. + CILK_ASSERT(NULL == cilk_fiber_get_data(w->l->scheduling_fiber)->owner); +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p, W=%d: about to switch into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, w->l->scheduling_fiber); +#endif + cilk_fiber_invoke_tbb_stack_op(current_fiber, CILK_TBB_STACK_RELEASE); + NOTE_INTERVAL(w, INTERVAL_DEALLOCATE_RESUME_OTHER); + cilk_fiber_remove_reference_from_self_and_resume_other(current_fiber, + &w->l->fiber_pool, + w->l->scheduling_fiber); + // We should never come back here! + CILK_ASSERT(0); + } + else { + // Case 2: We are passing the fiber to our parent because we + // are leftmost. We should come back later to + // resume execution of user code. + // + // If we are not freeing a fiber, there we must be + // returning from a spawn or processing an exception. The + // "sync" path always frees a fiber. + // + // We must be the leftmost child, and by left holder logic, we + // have already moved the current fiber into our parent full + // frame. +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p, W=%d: about to suspend self into runtime.. current_fiber = %p, deallcoate, switch to fiber %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, w->l->scheduling_fiber); +#endif + + NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); + + cilk_fiber_suspend_self_and_resume_other(current_fiber, + w->l->scheduling_fiber); + // Resuming this fiber returns control back to + // this function because our implementation uses OS fibers. + // + // On Unix, we could have the choice of passing the + // user_code_resume_after_switch_into_runtime as an extra "resume_proc" + // that resumes execution of user code instead of the + // jumping back here, and then jumping back to user code. +#if FIBER_DEBUG >= 2 + CILK_ASSERT(fdata->owner == __cilkrts_get_tls_worker()); +#endif + user_code_resume_after_switch_into_runtime(current_fiber); + } +} + +/* + * Send a message to the children of the specified worker: run or wait. + */ +static void notify_children(__cilkrts_worker *w, unsigned int msg) +{ + int child_num; + __cilkrts_worker *child; + int num_sys_workers = w->g->P - 1; + + // If worker is "n", then its children are 2n + 1, and 2n + 2. + child_num = (w->self << 1) + 1; + if (child_num < num_sys_workers) { + child = w->g->workers[child_num]; + CILK_ASSERT(child->l->signal_node); + signal_node_msg(child->l->signal_node, msg); + child_num++; + if (child_num < num_sys_workers) { + child = w->g->workers[child_num]; + CILK_ASSERT(child->l->signal_node); + signal_node_msg(child->l->signal_node, msg); + } + } +} + +/* + * Notify this worker's children that they need to wait. 
+ */ +static void notify_children_wait(__cilkrts_worker *w) +{ + notify_children(w, 0); +} + +/* + * Notify this worker's children to run and start trying to steal. + */ +static void notify_children_run(__cilkrts_worker *w) +{ + notify_children(w, 1); +} + +/** + * A single "check" to find work, either on our queue or through a + * steal attempt. This method checks our local queue once, and + * performs one steal attempt. + */ +static full_frame* check_for_work(__cilkrts_worker *w) +{ + full_frame *ff = NULL; + ff = pop_next_frame(w); + // If there is no work on the queue, try to steal some. + if (NULL == ff) { + START_INTERVAL(w, INTERVAL_STEALING) { + if (w->l->type != WORKER_USER && w->l->team != NULL) { + // At this point, the worker knows for certain that it has run + // out of work. Therefore, it loses its team affiliation. User + // workers never change teams, of course. + __cilkrts_worker_lock(w); + w->l->team = NULL; + __cilkrts_worker_unlock(w); + } + + // If we are about to do a random steal, we should have no + // full frame... + CILK_ASSERT(NULL == w->l->frame_ff); + random_steal(w); + } STOP_INTERVAL(w, INTERVAL_STEALING); + + // If the steal was successful, then the worker has populated its next + // frame with the work to resume. + ff = pop_next_frame(w); + if (NULL == ff) { + // Punish the worker for failing to steal. + // No quantum for you! + __cilkrts_yield(); + w->l->steal_failure_count++; + } else { + // Reset steal_failure_count since there is obviously still work to + // be done. + w->l->steal_failure_count = 0; + } + } + return ff; +} + +/** + * Keep stealing or looking on our queue. + * + * Returns either when a full frame is found, or NULL if the + * computation is done. + */ +static full_frame* search_until_work_found_or_done(__cilkrts_worker *w) +{ + full_frame *ff = NULL; + // Find a full frame to execute (either through random stealing, + // or because we pull it off w's 1-element queue). + while (!ff) { + // Check worker state to figure out our next action. + switch (worker_runnable(w)) + { + case SCHEDULE_RUN: // One attempt at checking for work. + ff = check_for_work(w); + break; + case SCHEDULE_WAIT: // go into wait-mode. + CILK_ASSERT(WORKER_SYSTEM == w->l->type); + // If we are about to wait, then we better not have + // a frame that we should execute... + CILK_ASSERT(NULL == w->l->next_frame_ff); + notify_children_wait(w); + signal_node_wait(w->l->signal_node); + // ... + // Runtime is waking up. + notify_children_run(w); + w->l->steal_failure_count = 0; + break; + case SCHEDULE_EXIT: // exit the scheduler. + CILK_ASSERT(WORKER_USER != w->l->type); + return NULL; + default: + CILK_ASSERT(0); + abort(); + } + } + return ff; +} + +/** + * The proc method for a scheduling fiber on a user worker. + * + * When a user worker jumps into the runtime, it jumps into this + * method by either starting it if the scheduling fiber has never run + * before, or resuming the fiber if it was previously suspended. + */ +COMMON_PORTABLE +void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber) +{ + __cilkrts_worker* w = cilk_fiber_get_owner(fiber); + CILK_ASSERT(w); + + // This must be a user worker + CILK_ASSERT(WORKER_USER == w->l->type); + + // If we aren't the current worker, then something is very wrong + // here.. + verify_current_wkr(w); + + __cilkrts_run_scheduler_with_exceptions(w); +} + + +/** + * The body of the runtime scheduling loop. This function executes in + * 4 stages: + * + * 1. 
Transitions from the user code into the runtime by + * executing any scheduling-stack functions. + * 2. Looks for a full frame enqueued from a successful provably + * good steal. + * 3. If no full frame is found in step 2, steal until + * a frame is found or we are done. If we are done, finish + * the scheduling loop. + * 4. When a frame is found, setup to resume user code. + * In particular, suspend the current fiber and resume the + * user fiber to execute the frame. + * + * Returns a fiber object that we should switch to after completing + * the body of the loop, or NULL if we should continue executing on + * this fiber. + * + * @pre @c current_fiber should equal @c wptr->l->scheduling_fiber + * + * @param current_fiber The currently executing (scheduling_ fiber + * @param wptr The currently executing worker. + * @param return The next fiber we should switch to. + */ +static cilk_fiber* worker_scheduling_loop_body(cilk_fiber* current_fiber, + void* wptr) +{ + __cilkrts_worker *w = (__cilkrts_worker*) wptr; + CILK_ASSERT(current_fiber == w->l->scheduling_fiber); + + // Stage 1: Transition from executing user code to the runtime code. + // We don't need to do this call here any more, because + // every switch to the scheduling fiber should make this call + // using a post_switch_proc on the fiber. + // + // enter_runtime_transition_proc(w->l->scheduling_fiber, wptr); + + // After Stage 1 is complete, w should no longer have + // an associated full frame. + CILK_ASSERT(NULL == w->l->frame_ff); + + // Stage 2. First do a quick check of our 1-element queue. + full_frame *ff = pop_next_frame(w); + + if (!ff) { + // Stage 3. We didn't find anything from our 1-element + // queue. Now go through the steal loop to find work. + ff = search_until_work_found_or_done(w); + if (!ff) { + CILK_ASSERT(w->g->work_done); + return NULL; + } + } + + // Stage 4. Now that we have found a full frame to work on, + // actually execute it. + __cilkrts_stack_frame *sf; + + // There shouldn't be any uncaught exceptions. + // + // In Windows, the OS catches any exceptions not caught by the + // user code. Thus, we are omitting the check on Windows. + // + // On Android, calling std::uncaught_exception with the stlport + // library causes a seg fault. Since we're not supporting + // exceptions there at this point, just don't do the check + CILKBUG_ASSERT_NO_UNCAUGHT_EXCEPTION(); + + BEGIN_WITH_WORKER_LOCK(w) { + CILK_ASSERT(!w->l->frame_ff); + BEGIN_WITH_FRAME_LOCK(w, ff) { + sf = ff->call_stack; + CILK_ASSERT(sf && !sf->call_parent); + setup_for_execution(w, ff, 0); + } END_WITH_FRAME_LOCK(w, ff); + } END_WITH_WORKER_LOCK(w); + + /* run it */ + // + // Prepare to run the full frame. To do so, we need to: + // (a) Execute some code on this fiber (the scheduling + // fiber) to set up data structures, and + // (b) Suspend the scheduling fiber, and resume the + // user-code fiber. + + // Part (a). Set up data structures. + scheduling_fiber_prepare_to_resume_user_code(w, ff, sf); + + cilk_fiber *other = w->l->frame_ff->fiber_self; + cilk_fiber_data* other_data = cilk_fiber_get_data(other); + cilk_fiber_data* current_fiber_data = cilk_fiber_get_data(current_fiber); + + // I believe two cases are possible here, both of which + // should have other_data->resume_sf as NULL. + // + // 1. Resuming a fiber that was previously executing + // user code (i.e., a provably-good-steal). + // In this case, resume_sf should have been + // set to NULL when it was suspended. + // + // 2. Resuming code on a steal. 
In this case, since we + // grabbed a new fiber, resume_sf should be NULL. + CILK_ASSERT(NULL == other_data->resume_sf); + +#if FIBER_DEBUG >= 2 + fprintf(stderr, "W=%d: other fiber=%p, setting resume_sf to %p\n", + w->self, other, other_data->resume_sf); +#endif + // Update our own fiber's data. + current_fiber_data->resume_sf = NULL; + // The scheduling fiber should have the right owner from before. + CILK_ASSERT(current_fiber_data->owner == w); + other_data->resume_sf = sf; + + +#if FIBER_DEBUG >= 3 + fprintf(stderr, "ThreadId=%p (about to suspend self resume other), W=%d: current_fiber=%p, other=%p, current_fiber->resume_sf = %p, other->resume_sf = %p\n", + cilkos_get_current_thread_id(), + w->self, + current_fiber, other, + current_fiber_data->resume_sf, + other_data->resume_sf); +#endif + return other; +} + + +/** + * This function is executed once by each worker, to initialize its + * scheduling loop. + */ +static void worker_scheduler_init_function(__cilkrts_worker *w) +{ + // First, execute the startup tasks that must happen for all + // worker types. + ITT_SYNC_PREPARE(w); + /* Notify tools about the new worker. Inspector needs this, but we + don't want to confuse Cilkscreen with system threads. User threads + do this notification in bind_thread */ + if (! w->g->under_ptool) + __cilkrts_cilkscreen_establish_worker(w); + + // Seed the initial random number generator. + // If we forget to do this, then the worker always steals from 0. + // Programs will still execute correctly, but + // you may see a subtle performance bug... + mysrand(w, (w->self + 1)); + + // The startup work varies, depending on the worker type. + switch (w->l->type) { + case WORKER_USER: + // Stop working once we've entered the scheduler. + // For user workers, INTERVAL_IN_SCHEDULER counts the time + // since we called bind_thread. + break; + + case WORKER_SYSTEM: + // If a system worker is starting, we must also be starting + // the runtime. + + // Runtime begins in a wait-state and is woken up by the first user + // worker when the runtime is ready. + signal_node_wait(w->l->signal_node); + // ... + // Runtime is waking up. + notify_children_run(w); + w->l->steal_failure_count = 0; + + // For system threads, count all the time this thread is + // alive in the scheduling loop. + START_INTERVAL(w, INTERVAL_IN_SCHEDULER); + START_INTERVAL(w, INTERVAL_WORKING); + break; + default: + __cilkrts_bug("Unknown worker %p of type %d entering scheduling loop\n", + w, w->l->type); + } +} + +/** + * This function is executed once by each worker, to finish its + * scheduling loop. + * + * @note Currently, only system workers finish their loops. User + * workers will jump away to user code without exiting their + * scheduling loop. + */ +static void worker_scheduler_terminate_function(__cilkrts_worker *w) +{ + // A user worker should never finish by falling through the + // scheduling loop. + CILK_ASSERT(WORKER_USER != w->l->type); + STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); + STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); +} + +/** + * The main scheduler function executed by a worker's scheduling + * fiber. + * + * This method is started by either a new system worker, or a user + * worker that has stalled and just been imported into the runtime. + */ +static void worker_scheduler_function(__cilkrts_worker *w) +{ + worker_scheduler_init_function(w); + + // The main scheduling loop body. + + while (!w->g->work_done) { + // Set intervals. Now we are in the runtime instead of working. 
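/* [Editorial sketch] Stripped of interval statistics, the loop below has a
 * very small shape: run the loop body to find work, then either hand the CPU
 * to the user fiber that will execute it or keep looking.  A condensed
 * restatement of worker_scheduler_function() for readability (a schematic,
 * not a replacement): */
#if 0  /* illustrative only; not part of this commit */
static void scheduler_loop_shape(__cilkrts_worker *w)
{
    worker_scheduler_init_function(w);
    while (!w->g->work_done) {
        cilk_fiber *next =
            worker_scheduling_loop_body(w->l->scheduling_fiber, w);
        if (next) {
            /* Hand the CPU to user code; control returns here only when
               that code re-enters the runtime. */
            cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber,
                                                     next);
        }
    }
    worker_scheduler_terminate_function(w);
}
#endif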
+ START_INTERVAL(w, INTERVAL_IN_RUNTIME); + STOP_INTERVAL(w, INTERVAL_WORKING); + + // Execute the "body" of the scheduling loop, and figure + // out the fiber to jump to next. + cilk_fiber* fiber_to_resume + = worker_scheduling_loop_body(w->l->scheduling_fiber, w); + + if (fiber_to_resume) { + // Suspend the current fiber and resume next one. + NOTE_INTERVAL(w, INTERVAL_SUSPEND_RESUME_OTHER); + STOP_INTERVAL(w, INTERVAL_IN_RUNTIME); + START_INTERVAL(w, INTERVAL_WORKING); + cilk_fiber_suspend_self_and_resume_other(w->l->scheduling_fiber, + fiber_to_resume); + + // Return here only when this (scheduling) fiber is + // resumed (i.e., this worker wants to reenter the runtime). + } + } + + // Finish the scheduling loop. + worker_scheduler_terminate_function(w); +} + + +/************************************************************* + Forward declarations for reduction protocol. +*************************************************************/ + +static __cilkrts_worker* +execute_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf_at_sync); + +static __cilkrts_worker* +execute_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf); + + + +/************************************************************* + Scheduler functions that are callable by client code +*************************************************************/ +static full_frame *disown(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + const char *why) +{ + CILK_ASSERT(ff); + make_unrunnable(w, ff, sf, sf != 0, why); + w->l->frame_ff = 0; + return ff->parent; +} + +/** + * Called when ff is returning from a spawn, and we need to execute a + * reduction. + * + * @param w The currently executing worker. + * @param ff The full frame for w. + * @param returning_sf The stack frame for the spawn helper that is returning. + * + * Normally, by the time we gain control in the runtime, the worker + * has already popped off the __cilkrts_stack_frame "returning_sf" + * from its call chain. + * + * When we have only serial reductions, w->current_stack_frame is not + * needed any more, because w is about to enter the runtime scheduling + * loop anyway. Similarly, the frame "ff" is slated to be destroyed + * after the runtime finishes the return from spawn and splices ff out + * of the tree of full frames. + * + * To execute a parallel reduction, however, we still want + * w->current_stack_frame == returning_sf, and we are going to use the + * frame ff for a little bit longer. + * + * This method: + * + * 1. Puts returning_sf back as w's current stack frame. + * 2. Makes "ff" runnable again on w. + */ +static inline +void restore_frame_for_spawn_return_reduction(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf) { +#if REDPAR_DEBUG >= 2 + CILK_ASSERT(returning_sf); + CILK_ASSERT(returning_sf->worker == w); +#endif + // Change w's current stack frame back to "returning_sf". + // + // Intuitively, w->current_stack_frame should be + // returning_sf->call_parent at this point. + // + // We can not assert this, however, because the pop of + // returning_sf from the call chain has already cleared + // returning_sf->call_parent. We don't want to restore the call + // parent of returning_sf, because its parent has been stolen, and + // the runtime assumes that steals break this link. 
+ + // We cannot assert call_parent is NULL either, since that's not true for + // Win64 exception handling +// CILK_ASSERT(returning_sf->call_parent == NULL); + w->current_stack_frame = returning_sf; + + // Make the full frame "ff" runnable again, in preparation for + // executing the reduction. + make_runnable(w, ff); +} + + +NORETURN __cilkrts_c_sync(__cilkrts_worker *w, + __cilkrts_stack_frame *sf_at_sync) +{ + full_frame *ff; + + // Claim: This read of w->l->frame_ff can occur without + // holding the worker lock because when w has reached a sync + // and entered the runtime (because it stalls), w's deque is empty + // and no one else can steal and change w->l->frame_ff. + + ff = w->l->frame_ff; +#ifdef _WIN32 + __cilkrts_save_exception_state(w, ff); +#else + // Move any pending exceptions into the full frame + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +#endif + + w = execute_reductions_for_sync(w, ff, sf_at_sync); + +#if FIBER_DEBUG >= 3 + fprintf(stderr, "ThreadId=%p, w->self = %d. about to longjmp_into_runtim[c_sync] with ff=%p\n", + cilkos_get_current_thread_id(), w->self, ff); +#endif + + longjmp_into_runtime(w, do_sync, sf_at_sync); +} + +static void do_sync(__cilkrts_worker *w, full_frame *ff, + __cilkrts_stack_frame *sf) +{ + //int abandoned = 1; + enum provably_good_steal_t steal_result = ABANDON_EXECUTION; + + START_INTERVAL(w, INTERVAL_SYNC_CHECK) { + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + + CILK_ASSERT(ff); + BEGIN_WITH_FRAME_LOCK(w, ff) { + CILK_ASSERT(sf->call_parent == 0); + CILK_ASSERT(sf->flags & CILK_FRAME_UNSYNCHED); + + // Before switching into the scheduling fiber, we should have + // already taken care of deallocating the current + // fiber. + CILK_ASSERT(NULL == ff->fiber_self); + + // Update the frame's pedigree information if this is an ABI 1 + // or later frame + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) + { + sf->parent_pedigree.rank = w->pedigree.rank; + sf->parent_pedigree.parent = w->pedigree.parent; + + // Note that the pedigree rank needs to be updated + // when setup_for_execution_pedigree runs + sf->flags |= CILK_FRAME_SF_PEDIGREE_UNSYNCHED; + } + + /* the decjoin() occurs in provably_good_steal() */ + steal_result = provably_good_steal(w, ff); + + } END_WITH_FRAME_LOCK(w, ff); + // set w->l->frame_ff = NULL after checking abandoned + if (WAIT_FOR_CONTINUE != steal_result) { + w->l->frame_ff = NULL; + } + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } STOP_INTERVAL(w, INTERVAL_SYNC_CHECK); + + // Now, if we are in a replay situation and provably_good_steal() returned + // WAIT_FOR_CONTINUE, we should sleep, reacquire locks, call + // provably_good_steal(), and release locks until we get a value other + // than WAIT_FOR_CONTINUE from the function. +#ifdef CILK_RECORD_REPLAY + // We don't have to explicitly check for REPLAY_LOG below because + // steal_result can only be set to WAIT_FOR_CONTINUE during replay + while(WAIT_FOR_CONTINUE == steal_result) + { + __cilkrts_sleep(); + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) + { + ff = w->l->frame_ff; + BEGIN_WITH_FRAME_LOCK(w, ff) + { + steal_result = provably_good_steal(w, ff); + } END_WITH_FRAME_LOCK(w, ff); + if (WAIT_FOR_CONTINUE != steal_result) + w->l->frame_ff = NULL; + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } +#endif // CILK_RECORD_REPLAY + +#ifdef ENABLE_NOTIFY_ZC_INTRINSIC + // If we can't make any further progress on this thread, tell Inspector + // that we're abandoning the work and will go find something else to do. 
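/* [Editorial sketch] What the caller of provably_good_steal() does with each
 * outcome of the sync attempt above, stated schematically (the enum values
 * are the ones defined by this runtime; the switch itself is illustrative): */
#if 0  /* illustrative only; not part of this commit */
static void after_sync_attempt(enum provably_good_steal_t r)
{
    switch (r) {
    case CONTINUE_EXECUTION:
        /* The last child has already returned: ff was pushed back onto
           this worker's 1-element queue and will be resumed past the sync. */
        break;
    case ABANDON_EXECUTION:
        /* Either children are still outstanding or the frame was handed
           back to the original user worker: ff is not resumed here, and
           this worker returns to the scheduling loop to steal other work.  */
        break;
    case WAIT_FOR_CONTINUE:
        /* Replay mode only: sleep and retry until one of the cases above
           applies, as in the loop earlier in do_sync().                    */
        break;
    }
}
#endif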
+ if (ABANDON_EXECUTION == steal_result) + { + NOTIFY_ZC_INTRINSIC("cilk_sync_abandon", 0); + } +#endif // defined ENABLE_NOTIFY_ZC_INTRINSIC + + return; /* back to scheduler loop */ +} + +/* worker W completely promotes its own deque, simulating the case + where the whole deque is stolen. We use this mechanism to force + the allocation of new storage for reducers for race-detection + purposes. */ +void __cilkrts_promote_own_deque(__cilkrts_worker *w) +{ + // Remember the fiber we start this method on. + CILK_ASSERT(w->l->frame_ff); + cilk_fiber* starting_fiber = w->l->frame_ff->fiber_self; + + BEGIN_WITH_WORKER_LOCK(w) { + while (dekker_protocol(w)) { + /* PLACEHOLDER_FIBER is used as non-null marker to tell detach() + and make_child() that this frame should be treated as a spawn + parent, even though we have not assigned it a stack. */ + detach_for_steal(w, w, PLACEHOLDER_FIBER); + } + } END_WITH_WORKER_LOCK(w); + + + // TBD: The management of full frames and fibers is a bit + // sketchy here. We are promoting stack frames into full frames, + // and pretending they are stolen away, but no other worker is + // actually working on them. Some runtime invariants + // may be broken here. + // + // Technically, if we are simulating a steal from w + // w should get a new full frame, but + // keep the same fiber. A real thief would be taking the + // loot frame away, get a new fiber, and starting executing the + // loot frame. + // + // What should a fake thief do? Where does the frame go? + + // In any case, we should be finishing the promotion process with + // the same fiber with. + CILK_ASSERT(w->l->frame_ff); + CILK_ASSERT(w->l->frame_ff->fiber_self == starting_fiber); +} + + + +/* the client code calls this function after a spawn when the dekker + protocol fails. The function may either return or longjmp + into the rts + + This function takes in a "returning_sf" argument which corresponds + to the __cilkrts_stack_frame that we are finishing (i.e., the + argument to __cilkrts_leave_frame). + */ +void __cilkrts_c_THE_exception_check(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf) +{ + full_frame *ff; + int stolen_p; + __cilkrts_stack_frame *saved_sf = NULL; + + START_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); + + BEGIN_WITH_WORKER_LOCK(w) { + ff = w->l->frame_ff; + CILK_ASSERT(ff); + /* This code is called only upon a normal return and never + upon an exceptional return. Assert that this is the + case. */ + CILK_ASSERT(!w->l->pending_exception); + + reset_THE_exception(w); + stolen_p = !(w->head < (w->tail + 1)); /* +1 because tail was + speculatively + decremented by the + compiled code */ + + if (stolen_p) { + /* XXX This will be charged to THE for accounting purposes */ + __cilkrts_save_exception_state(w, ff); + + // Save the value of the current stack frame. + saved_sf = w->current_stack_frame; + + // Reverse the decrement from undo_detach. + // This update effectively resets the deque to be + // empty (i.e., changes w->tail back to equal w->head). + // We need to reset the deque to execute parallel + // reductions. When we have only serial reductions, it + // does not matter, since serial reductions do not + // change the deque. + w->tail++; +#if REDPAR_DEBUG > 1 + // ASSERT our deque is empty. + CILK_ASSERT(w->head == w->tail); +#endif + } + } END_WITH_WORKER_LOCK(w); + + STOP_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK); + + if (stolen_p) + { + w = execute_reductions_for_spawn_return(w, ff, returning_sf); + + // "Mr. Policeman? 
My parent always told me that if I was in trouble + // I should ask a nice policeman for help. I can't find my parent + // anywhere..." + // + // Write a record to the replay log for an attempt to return to a stolen parent + replay_record_orphaned(w); + + // Update the pedigree only after we've finished the + // reductions. + update_pedigree_on_leave_frame(w, returning_sf); + + // Notify Inspector that the parent has been stolen and we're + // going to abandon this work and go do something else. This + // will match the cilk_leave_begin in the compiled code + NOTIFY_ZC_INTRINSIC("cilk_leave_stolen", saved_sf); + + DBGPRINTF ("%d: longjmp_into_runtime from __cilkrts_c_THE_exception_check\n", w->self); + longjmp_into_runtime(w, do_return_from_spawn, 0); + DBGPRINTF ("%d: returned from longjmp_into_runtime from __cilkrts_c_THE_exception_check?!\n", w->self); + } + else + { + NOTE_INTERVAL(w, INTERVAL_THE_EXCEPTION_CHECK_USELESS); + return; + } +} + +/* Return an exception to a stolen parent. */ +NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf) +{ + full_frame *ff = w->l->frame_ff; + // This is almost the same as THE_exception_check, except + // the detach didn't happen, we don't need to undo the tail + // update. + CILK_ASSERT(w->head == w->tail); + w = execute_reductions_for_spawn_return(w, ff, returning_sf); + + longjmp_into_runtime(w, do_return_from_spawn, 0); + CILK_ASSERT(0); +} + +static void do_return_from_spawn(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ + full_frame *parent_ff; + enum provably_good_steal_t steal_result = ABANDON_EXECUTION; + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + CILK_ASSERT(ff); + CILK_ASSERT(!ff->is_call_child); + CILK_ASSERT(sf == NULL); + parent_ff = ff->parent; + + BEGIN_WITH_FRAME_LOCK(w, ff) { + decjoin(ff); + } END_WITH_FRAME_LOCK(w, ff); + + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + if (parent_ff->simulated_stolen) + unconditional_steal(w, parent_ff); + else + steal_result = provably_good_steal(w, parent_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + // Loop here in replay mode +#ifdef CILK_RECORD_REPLAY + // We don't have to explicitly check for REPLAY_LOG below because + // steal_result can only get set to WAIT_FOR_CONTINUE during replay. + // We also don't have to worry about the simulated_stolen flag + // because steal_result can only be set to WAIT_FOR_CONTINUE by + // provably_good_steal(). + while(WAIT_FOR_CONTINUE == steal_result) + { + __cilkrts_sleep(); + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) + { + BEGIN_WITH_FRAME_LOCK(w, parent_ff) + { + steal_result = provably_good_steal(w, parent_ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + } END_WITH_WORKER_LOCK_OPTIONAL(w); + } +#endif // CILK_RECORD_REPLAY + + // Cleanup the child frame. + __cilkrts_destroy_full_frame(w, ff); + return; +} + +#ifdef _WIN32 +/* migrate an exception across fibers. Call this function when an exception has + * been thrown and has to traverse across a steal. The exception has already + * been wrapped up, so all that remains is to longjmp() into the continuation, + * sync, and re-raise it. + */ +void __cilkrts_migrate_exception(__cilkrts_stack_frame *sf) { + + __cilkrts_worker *w = sf->worker; + full_frame *ff; + + BEGIN_WITH_WORKER_LOCK(w) { + ff = w->l->frame_ff; + reset_THE_exception(w); + /* there is no need to check for a steal because we wouldn't be here if + there weren't a steal. 
*/ + __cilkrts_save_exception_state(w, ff); + + CILK_ASSERT(w->head == w->tail); + } END_WITH_WORKER_LOCK(w); + + { + // TBD(jsukha): This function emulates the + // the "do_return_from_spawn" path. + w = execute_reductions_for_spawn_return(w, ff, sf); + } + + longjmp_into_runtime(w, do_return_from_spawn, 0); /* does not return. */ + CILK_ASSERT(! "Shouldn't be here..."); +} +#endif + + +/* Pop a call stack from TAIL. Return the call stack, or NULL if the + queue is empty */ +__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w) +{ + __cilkrts_stack_frame *sf; + BEGIN_WITH_WORKER_LOCK(w) { + __cilkrts_stack_frame *volatile *tail = w->tail; + if (w->head < tail) { + --tail; + sf = *tail; + w->tail = tail; + } else { + sf = 0; + } + } END_WITH_WORKER_LOCK(w); + return sf; +} + +#ifdef CILK_RECORD_REPLAY +__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w) +{ + __cilkrts_stack_frame *sf; + BEGIN_WITH_WORKER_LOCK(w) { + if (w->head < w->tail) { + sf = *(w->tail-1); + } else { + sf = 0; + } + } END_WITH_WORKER_LOCK(w); + return sf; +} +#endif + + +/* Return from a call, not a spawn. */ +void __cilkrts_return(__cilkrts_worker *w) +{ + full_frame *ff, *parent_ff; + START_INTERVAL(w, INTERVAL_RETURNING); + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + ff = w->l->frame_ff; + CILK_ASSERT(ff); + CILK_ASSERT(ff->join_counter == 1); + /* This path is not used to return from spawn. */ + CILK_ASSERT(ff->is_call_child); + + BEGIN_WITH_FRAME_LOCK(w, ff) { + // After this call, w->l->frame_ff != ff. + // Technically, w will "own" ff until ff is freed, + // however, because ff is a dying leaf full frame. + parent_ff = disown(w, ff, 0, "return"); + decjoin(ff); + +#ifdef _WIN32 + __cilkrts_save_exception_state(w, ff); +#else + // Move the pending exceptions into the full frame + // This should always be NULL if this isn't a + // return with an exception + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; +#endif // _WIN32 + + } END_WITH_FRAME_LOCK(w, ff); + + __cilkrts_fence(); /* redundant */ + + CILK_ASSERT(parent_ff); + + BEGIN_WITH_FRAME_LOCK(w, parent_ff) { + finalize_child_for_call(w, parent_ff, ff); + } END_WITH_FRAME_LOCK(w, parent_ff); + + ff = pop_next_frame(w); + /* ff will be non-null except when the parent frame is owned + by another worker. + CILK_ASSERT(ff) + */ + CILK_ASSERT(!w->l->frame_ff); + if (ff) { + BEGIN_WITH_FRAME_LOCK(w, ff) { + __cilkrts_stack_frame *sf = ff->call_stack; + CILK_ASSERT(sf && !sf->call_parent); + setup_for_execution(w, ff, 1); + } END_WITH_FRAME_LOCK(w, ff); + } + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + STOP_INTERVAL(w, INTERVAL_RETURNING); +} + +static void __cilkrts_unbind_thread() +{ + int stop_cilkscreen = 0; + global_state_t *g; + + // Take out the global OS mutex to protect accesses to the table of workers + global_os_mutex_lock(); + + if (cilkg_is_published()) { + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + if (w) { + g = w->g; + + // If there's only 1 worker, the counts will be stopped in + // __cilkrts_scheduler + if (g->P > 1) + { + STOP_INTERVAL(w, INTERVAL_WORKING); + STOP_INTERVAL(w, INTERVAL_IN_SCHEDULER); + } + + __cilkrts_set_tls_worker(0); + + if (w->self == -1) { + // This worker is an overflow worker. I.e., it was created on- + // demand when the global pool ran out of workers. 
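+            // A self index of -1 means this worker was allocated from the
+            // heap rather than from the contiguous worker array (see the
+            // make_worker() documentation in scheduler.h), so destroy it and
+            // free its memory here instead of returning it to the pool as a
+            // WORKER_FREE worker.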
+ destroy_worker(w); + __cilkrts_free(w); + } else { + // This is a normal user worker and needs to be counted by the + // global state for the purposes of throttling system workers. + w->l->type = WORKER_FREE; + __cilkrts_leave_cilk(g); + } + + stop_cilkscreen = (0 == g->Q); + } + } + global_os_mutex_unlock(); + + /* Turn off Cilkscreen. This needs to be done when we are NOT holding the + * os mutex. */ + if (stop_cilkscreen) + __cilkrts_cilkscreen_disable_instrumentation(); +} + +/* special return from the initial frame */ + +void __cilkrts_c_return_from_initial(__cilkrts_worker *w) +{ + struct cilkred_map *rm; + + /* This is only called on a user thread worker. */ + CILK_ASSERT(w->l->type == WORKER_USER); + + #if REDPAR_DEBUG >= 3 + fprintf(stderr, "[W=%d, desc=cilkrts_c_return_from_initial, ff=%p]\n", + w->self, w->l->frame_ff); + #endif + + BEGIN_WITH_WORKER_LOCK_OPTIONAL(w) { + full_frame *ff = w->l->frame_ff; + CILK_ASSERT(ff); + CILK_ASSERT(ff->join_counter == 1); + w->l->frame_ff = 0; + + CILK_ASSERT(ff->fiber_self); + // Save any TBB interop data for the next time this thread enters Cilk + cilk_fiber_tbb_interop_save_info_from_stack(ff->fiber_self); + + // Deallocate cilk_fiber that mapped to the user stack. The stack + // itself does not get deallocated (of course) but our data + // structure becomes divorced from it. + +#if FIBER_DEBUG >= 1 + fprintf(stderr, "ThreadId=%p: w=%d: We are about to deallocate ff->fiber_self = %p here. w->l->scheduling_fiber = %p. w->l->type = %d\n", + cilkos_get_current_thread_id(), + w->self, + ff->fiber_self, + w->l->scheduling_fiber, + w->l->type); +#endif + // The fiber in ff is a user-code fiber. The fiber in + // w->l->scheduling_fiber is a scheduling fiber. These fibers should + // never be equal. When a user worker returns (and will unbind), we + // should destroy only the fiber in ff. The scheduling fiber will be + // re-used. + + CILK_ASSERT(ff->fiber_self != w->l->scheduling_fiber); + + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE) { + // This fiber might not be deallocated here if there + // is a pending exception on Windows that refers + // to this fiber. + // + // First "suspend" the fiber, and then try to delete it. + cilk_fiber_deallocate_from_thread(ff->fiber_self); + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE); + ff->fiber_self = NULL; + + /* Save reducer map into global_state object */ + rm = w->reducer_map; + w->reducer_map = NULL; + +#if REDPAR_DEBUG >= 3 + fprintf(stderr, "W=%d, reducer_map_to_delete=%p, was in ff=%p\n", + w->self, + rm, + ff); +#endif + __cilkrts_destroy_full_frame(w, ff); + + + /* Work is never done. w->g->work_done = 1; __cilkrts_fence(); */ + } END_WITH_WORKER_LOCK_OPTIONAL(w); + + + save_pedigree_leaf_from_user_worker(w); + + // Workers can have NULL reducer maps now. + if (rm) { + __cilkrts_destroy_reducer_map(w, rm); + } + + +#if FIBER_DEBUG >= 1 + __cilkrts_worker* tmp = w; + int tmp_id = w->self; + fprintf(stderr, "w=%d: We are about unbind thread (w= %p)\n", + w->self, + w); +#endif + + w = NULL; + + __cilkrts_unbind_thread(); + +#if FIBER_DEBUG >= 1 + + fprintf(stderr, "w=%p, %d: Finished unbind\n", + tmp, tmp_id); +#endif + + /* Other workers will stop trying to steal if this was the last worker. */ + + return; +} + + +/* + * __cilkrts_restore_stealing + * + * Restore the protected_tail to a previous state, possibly allowing frames + * to be stolen. The dekker_protocol has been extended to steal only if + * head+1 is < protected_tail. 
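+ *
+ * As a rough illustration only (not the runtime's own steal path), the
+ * thief-side test can be pictured as:
+ *
+ *     if (victim->head + 1 < victim->protected_tail)
+ *         ... the normal Dekker head/tail handshake may proceed ...
+ *     else
+ *         ... stealing from this deque is currently disallowed ...
+ *
+ * __cilkrts_disallow_stealing() below saves the old protected_tail and,
+ * if needed, lowers it; this function writes the saved value back and
+ * issues a fence so the change becomes visible to potential thieves.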
+ */ + +void __cilkrts_restore_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *saved_protected_tail) +{ + /* On most x86 this pair of operations would be slightly faster + as an atomic exchange due to the implicit memory barrier in + an atomic instruction. */ + w->protected_tail = saved_protected_tail; + __cilkrts_fence(); +} + +/* + * __cilkrts_disallow_stealing + * + * Move the protected_tail to NEW_PROTECTED_TAIL, preventing any + * frames from being stolen. If NEW_PROTECTED_TAIL is NULL, prevent + * stealing from the whole queue. The dekker_protocol has been + * extended to only steal if head+1 is also < protected_tail. + */ + +__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *new_protected_tail) +{ + __cilkrts_stack_frame *volatile *saved_protected_tail = w->protected_tail; + + if (!new_protected_tail) + new_protected_tail = w->l->ltq; + + if (w->protected_tail > new_protected_tail) { + w->protected_tail = new_protected_tail; + /* Issue a store-store barrier. The update to protected_tail + here must precede the update to tail in the next spawn. + On x86 this is probably not needed. */ +#if defined __GNUC__ && __ICC >= 1200 && !(__MIC__ ||__MIC2__) + _mm_sfence(); +#else + __cilkrts_fence(); +#endif + } + + return saved_protected_tail; +} + +/************************************************************* + Initialization and startup +*************************************************************/ + +__cilkrts_worker *make_worker(global_state_t *g, + int self, __cilkrts_worker *w) +{ + w->self = self; + w->g = g; + + w->pedigree.rank = 0; // Initial rank is 0 + w->pedigree.parent = NULL; + + w->l = (local_state *)__cilkrts_malloc(sizeof(*w->l)); + + __cilkrts_frame_malloc_per_worker_init(w); + + w->reducer_map = NULL; + w->current_stack_frame = NULL; + w->reserved = NULL; + + w->l->worker_magic_0 = WORKER_MAGIC_0; + w->l->team = NULL; + w->l->type = WORKER_FREE; + + __cilkrts_mutex_init(&w->l->lock); + __cilkrts_mutex_init(&w->l->steal_lock); + w->l->do_not_steal = 0; + w->l->frame_ff = 0; + w->l->next_frame_ff = 0; + w->l->last_full_frame = NULL; + + w->l->ltq = (__cilkrts_stack_frame **) + __cilkrts_malloc(g->ltqsize * sizeof(*w->l->ltq)); + w->ltq_limit = w->l->ltq + g->ltqsize; + w->head = w->tail = w->l->ltq; + + cilk_fiber_pool_init(&w->l->fiber_pool, + &g->fiber_pool, + g->stack_size, + g->fiber_pool_size, + 0, // alloc_max is 0. We don't allocate from the heap directly without checking the parent pool. + 0); +#if FIBER_DEBUG >= 2 + fprintf(stderr, "ThreadId=%p: Making w=%d (%p), pool = %p\n", + cilkos_get_current_thread_id(), + w->self, w, + &w->l->fiber_pool); +#endif + w->l->scheduling_fiber = NULL; + w->l->original_pedigree_leaf = NULL; + w->l->rand_seed = 0; /* the scheduler will overwrite this field */ + + w->l->post_suspend = 0; + w->l->suspended_stack = 0; + w->l->fiber_to_free = NULL; + w->l->pending_exception = NULL; + +#if CILK_PROFILE + w->l->stats = __cilkrts_malloc(sizeof(statistics)); + __cilkrts_init_stats(w->l->stats); +#else + w->l->stats = NULL; +#endif + w->l->steal_failure_count = 0; + + w->l->work_stolen = 0; + + // Initialize record/replay assuming we're doing neither + w->l->record_replay_fptr = NULL; + w->l->replay_list_root = NULL; + w->l->replay_list_entry = NULL; + w->l->signal_node = NULL; + // Nothing's been stolen yet + w->l->worker_magic_1 = WORKER_MAGIC_1; + + /*w->parallelism_disabled = 0;*/ + + // Allow stealing all frames. 
Sets w->saved_protected_tail + __cilkrts_restore_stealing(w, w->ltq_limit); + + __cilkrts_init_worker_sysdep(w); + + reset_THE_exception(w); + + return w; +} + +void destroy_worker(__cilkrts_worker *w) +{ + CILK_ASSERT (NULL == w->l->pending_exception); + + // Deallocate the scheduling fiber + if (NULL != w->l->scheduling_fiber) + { + // The scheduling fiber is the main fiber for system workers and must + // be deallocated by the thread that created it. Thus, we can + // deallocate only free workers' (formerly user workers) scheduling + // fibers here. + CILK_ASSERT(WORKER_FREE == w->l->type); + +#if FIBER_DEBUG >=1 + fprintf(stderr, "ThreadId=%p, w=%p, %d, deallocating scheduling fiber = %p, \n", + cilkos_get_current_thread_id(), + w, + w->self, + w->l->scheduling_fiber); +#endif + int ref_count = cilk_fiber_remove_reference(w->l->scheduling_fiber, NULL); + // Scheduling fiber should never have extra references because of exceptions. + CILK_ASSERT(0 == ref_count); + w->l->scheduling_fiber = NULL; + } + +#if CILK_PROFILE + if (w->l->stats) { + __cilkrts_free(w->l->stats); + } +#else + CILK_ASSERT(NULL == w->l->stats); +#endif + + /* Free any cached fibers. */ + cilk_fiber_pool_destroy(&w->l->fiber_pool); + + __cilkrts_destroy_worker_sysdep(w); + + if (w->l->signal_node) { + CILK_ASSERT(WORKER_SYSTEM == w->l->type); + signal_node_destroy(w->l->signal_node); + } + + __cilkrts_free(w->l->ltq); + __cilkrts_mutex_destroy(0, &w->l->lock); + __cilkrts_mutex_destroy(0, &w->l->steal_lock); + __cilkrts_frame_malloc_per_worker_cleanup(w); + + __cilkrts_free(w->l); + + // The caller is responsible for freeing the worker memory +} + +/* + * Make a worker into a system worker. + */ +static void make_worker_system(__cilkrts_worker *w) { + CILK_ASSERT(WORKER_FREE == w->l->type); + w->l->type = WORKER_SYSTEM; + w->l->signal_node = signal_node_create(); +} + +void __cilkrts_deinit_internal(global_state_t *g) +{ + int i; + __cilkrts_worker *w; + + // If there's no global state then we're done + if (NULL == g) + return; + +#ifdef CILK_PROFILE + __cilkrts_dump_stats_to_stderr(g); +#endif + + w = g->workers[0]; + if (w->l->frame_ff) { + __cilkrts_destroy_full_frame(w, w->l->frame_ff); + w->l->frame_ff = 0; + } + + // Release any resources used for record/replay + replay_term(g); + + // Destroy any system dependent global state + __cilkrts_destroy_global_sysdep(g); + + for (i = 0; i < g->total_workers; ++i) + destroy_worker(g->workers[i]); + + // Free memory for all worker blocks which were allocated contiguously + __cilkrts_free(g->workers[0]); + + __cilkrts_free(g->workers); + + cilk_fiber_pool_destroy(&g->fiber_pool); + __cilkrts_frame_malloc_global_cleanup(g); + + cilkg_deinit_global_state(); +} + +/* + * Wake the runtime by notifying the system workers that they can steal. The + * first user worker into the runtime should call this. + */ +static void wake_runtime(global_state_t *g) +{ + __cilkrts_worker *root; + if (g->P > 1) { + // Send a message to the root node. The message will propagate. + root = g->workers[0]; + CILK_ASSERT(root->l->signal_node); + signal_node_msg(root->l->signal_node, 1); + } +} + +/* + * Put the runtime to sleep. The last user worker out of the runtime should + * call this. Like Dad always said, turn out the lights when nobody's in the + * room. + */ +static void sleep_runtime(global_state_t *g) +{ + __cilkrts_worker *root; + if (g->P > 1) { + // Send a message to the root node. The message will propagate. 
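+        // A zero message clears the "run" flag on the root's signal node;
+        // once that state propagates, system workers polling their own
+        // signal nodes through worker_runnable() see signal_node_should_wait()
+        // return true, after which the scheduler can park them in
+        // signal_node_wait() instead of spending cycles on futile steal
+        // attempts.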
+ root = g->workers[0]; + CILK_ASSERT(root->l->signal_node); + signal_node_msg(root->l->signal_node, 0); + } +} + +/* Called when a user thread joins Cilk. + Global lock must be held. */ +void __cilkrts_enter_cilk(global_state_t *g) +{ + if (g->Q++ == 0) { + // If this is the first user thread to enter Cilk wake + // up all the workers. + wake_runtime(g); + } +} + +/* Called when a user thread leaves Cilk. + Global lock must be held. */ +void __cilkrts_leave_cilk(global_state_t *g) +{ + if (--g->Q == 0) { + // Put the runtime to sleep. + sleep_runtime(g); + } +} + +/* + * worker_runnable + * + * Return true if the worker should continue to try to steal. False, otherwise. + */ + +NOINLINE +static enum schedule_t worker_runnable(__cilkrts_worker *w) +{ + global_state_t *g = w->g; + + /* If this worker has something to do, do it. + Otherwise the work would be lost. */ + if (w->l->next_frame_ff) + return SCHEDULE_RUN; + + // If Cilk has explicitly (by the user) been told to exit (i.e., by + // __cilkrts_end_cilk() -> __cilkrts_stop_workers(g)), then return 0. + if (g->work_done) + return SCHEDULE_EXIT; + + if (0 == w->self) { + // This worker is the root node and is the only one that may query the + // global state to see if there are still any user workers in Cilk. + if (w->l->steal_failure_count > g->max_steal_failures) { + if (signal_node_should_wait(w->l->signal_node)) { + return SCHEDULE_WAIT; + } else { + // Reset the steal_failure_count since we have verified that + // user workers are still in Cilk. + w->l->steal_failure_count = 0; + } + } + } else if (WORKER_SYSTEM == w->l->type && + signal_node_should_wait(w->l->signal_node)) { + // This worker has been notified by its parent that it should stop + // trying to steal. + return SCHEDULE_WAIT; + } + + return SCHEDULE_RUN; +} + + + +// Initialize the worker structs, but don't start the workers themselves. +static void init_workers(global_state_t *g) +{ + int total_workers = g->total_workers; + int i; + struct CILK_ALIGNAS(256) buffered_worker { + __cilkrts_worker w; + char buf[64]; + } *workers_memory; + + /* not needed if only one worker */ + cilk_fiber_pool_init(&g->fiber_pool, + NULL, + g->stack_size, + g->global_fiber_pool_size, // buffer_size + g->max_stacks, // maximum # to allocate + 1); + + cilk_fiber_pool_set_fiber_limit(&g->fiber_pool, + (g->max_stacks ? g->max_stacks : INT_MAX)); + + g->workers = (__cilkrts_worker **) + __cilkrts_malloc(total_workers * sizeof(*g->workers)); + + // Allocate 1 block of memory for workers to make life easier for tools + // like Inspector which run multithreaded and need to know the memory + // range for all the workers that will be accessed in a user's program + workers_memory = (struct buffered_worker*) + __cilkrts_malloc(sizeof(*workers_memory) * total_workers); + + // Notify any tools that care (Cilkscreen and Inspector) that they should + // ignore memory allocated for the workers + __cilkrts_cilkscreen_ignore_block(&workers_memory[0], + &workers_memory[total_workers]); + + // Initialize worker structs, including unused worker slots. + for (i = 0; i < total_workers; ++i) { + g->workers[i] = make_worker(g, i, &workers_memory[i].w); + } + + // Set the workers in the first P - 1 slots to be system workers. + // Remaining worker structs already have type == 0. 
+ for (i = 0; i < g->system_workers; ++i) { + make_worker_system(g->workers[i]); + } +} + +void __cilkrts_init_internal(int start) +{ + global_state_t *g = NULL; + + if (cilkg_is_published()) { + g = cilkg_init_global_state(); + } + else { + + // We think the state has not been published yet. + // Grab the lock and try to initialize/publish. + global_os_mutex_lock(); + + if (cilkg_is_published()) { + // Some other thread must have snuck in and published. + g = cilkg_init_global_state(); + } + else { + // Initialize and retrieve global state + g = cilkg_init_global_state(); + + // Set the scheduler pointer + g->scheduler = worker_scheduler_function; + + // If we're running under a sequential P-Tool (Cilkscreen or + // Cilkview) then there's only one worker and we need to tell + // the tool about the extent of the stack + if (g->under_ptool) + __cilkrts_establish_c_stack(); + init_workers(g); + + // Initialize per-work record/replay logging + replay_init_workers(g); + + // Initialize any system dependent global state + __cilkrts_init_global_sysdep(g); + + + cilkg_publish_global_state(g); + } + + global_os_mutex_unlock(); + } + + CILK_ASSERT(g); + + if (start && !g->workers_running) + { + // Acquire the global OS mutex while we're starting the workers + global_os_mutex_lock(); + if (!g->workers_running) + // Start P - 1 system workers since P includes the first user + // worker. + __cilkrts_start_workers(g, g->P - 1); + global_os_mutex_unlock(); + } +} + + +/************************************************************************ + Methods for reducer protocol. + + Reductions occur in two places: + A. A full frame "ff" is returning from a spawn with a stolen parent. + B. A full frame "ff" is stalling at a sync. + + To support parallel reductions, reduction functions need to be + executed while control is on a user stack, before jumping into the + runtime. These reductions can not occur while holding a worker or + frame lock. + + Before a worker w executes a reduction in either Case A or B, w's + deque is empty. + + Since parallel reductions push work onto the deque, we must do extra + work to set up runtime data structures properly before reductions + begin to allow stealing. ( Normally, when we have only serial + reductions, once a worker w starts a reduction, its deque remains + empty until w either steals another frame or resumes a suspended + frame. Thus, we don't care about the state of the deque, since w + will reset its deque when setting up execution of a frame. ) + + To allow for parallel reductions, we coerce the runtime data + structures so that, from their perspective, it looks as though we + have spliced in an "execute_reductions()" function. Consider the + two cases for reductions: + + Case A: Return from a spawn with a stolen parent. + Consider a spawned function g is returning on a worker w. + Assume: + - g was spawned from a parent function f. + - ff is the full frame for g's spawn helper + - sf be the __cilkrts_stack_frame for g's spawn helper. + + We are conceptually splicing "execute_reductions()" so that it + occurs immediately before the spawn helper of g returns to f. + + We do so by creating two different world views --- one for the + runtime data structures, and one for the actual control flow. + + - Before reductions begin, the runtime data structures should + look as though the spawn helper of g is calling + "execute_reductions()", in terms of both the user stack and + worker deque. 
More precisely, w should satisfy the + following properties: + + (a) w has ff as its full frame, + (b) w has sf as its __cilkrts_stack_frame, and + (c) w has an empty deque. + + If the runtime satisfies these properties, then if w + encounters a spawn in a parallel reduction, it can push onto + a valid deque. Also, when a steal from w occurs, it will + build the correct tree of full frames when w is stolen from. + + - In actual control flow, however, once the + "execute_reductions()" function returns, it is actually + returning to runtime code instead of g's spawn helper. + + At the point a worker w began executing reductions, the + control flow / compiled code had already finished g's spawn + helper, and w was about to enter the runtime. With parallel + reductions, some worker v (which might be different from w) + is the one returning to the runtime. + + + The reduction logic consists of 4 steps: + + A1. Restore runtime data structures to make it look as though + the spawn helper of g() is still the currently executing + frame for w. + + A2. Execute reductions on the user stack. Reductions also + includes the logic for exceptions and stacks. Note that + reductions start on w, but may finish on a different + worker if there is parallelism in the reduce. + + A3. Splice out ff from the tree of full frames. + + A4. Jump into the runtime/scheduling stack and execute + "do_return_from_spawn". This method + + (a) Frees the user stack we were just on if it is no longer needed. + (b) Decrement the join counter on ff->parent, and tries to do a + provably good steal. + (c) Clean up the full frame ff. + + + Case B: Stalling at a sync. + + Consider a function g(), with full frame ff and + __cilkrts_stack_frame sf. Suppose g() stalls at a sync, and we + are executing reductions. + + Conceptually, we are splicing in an "execute_reductions()" + function into g() as the last action that g() takes immediately + before it executes the cilk_sync. + + The reduction logic for this case is similar to Case A. + + B1. Restore the runtime data structures. + + The main difference from Case A is that ff/sf is still a + frame that needs to be executed later (since it is stalling + at a cilk_sync). Thus, we also need to save the current + stack information into "ff" so that we can correctly resume + execution of "ff" after the sync. + + B2. Execute reductions on the user stack. + + B3. No frame to splice out of the tree. + + B4. Jump into the runtime/scheduling stack and execute "do_sync". + This method: + (a) Frees the user stack we were just on if it is no longer needed. + (b) Tries to execute a provably good steal. + + Finally, for the reducer protocol, we consider two reduction paths, + namely a "fast" and "slow" path. On a fast path, only trivial + merges of reducer maps happen (i.e., one or both of the maps are + NULL). Otherwise, on the slow path, a reduction actually needs to + happen. + +*****************************************************************/ + +/** + * @brief Locations to store the result of a reduction. + * + * Struct storing pointers to the fields in our "left" sibling that we + * should update when splicing out a full frame or stalling at a sync. + */ +typedef struct { + /** A pointer to the location of our left reducer map. */ + struct cilkred_map **map_ptr; + + /** A pointer to the location of our left exception. 
*/ + struct pending_exception_info **exception_ptr; +} splice_left_ptrs; + +/** + * For a full frame returning from a spawn, calculate the pointers to + * the maps and exceptions to my left. + * + * @param w The currently executing worker. + * @param ff Full frame that is dying + * @return Pointers to our "left" for reducers and exceptions. + */ +static inline +splice_left_ptrs compute_left_ptrs_for_spawn_return(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold the lock on ff->parent + + splice_left_ptrs left_ptrs; + if (ff->left_sibling) { + left_ptrs.map_ptr = &ff->left_sibling->right_reducer_map; + left_ptrs.exception_ptr = &ff->left_sibling->right_pending_exception; + } + else { + full_frame *parent_ff = ff->parent; + left_ptrs.map_ptr = &parent_ff->children_reducer_map; + left_ptrs.exception_ptr = &parent_ff->child_pending_exception; + } + return left_ptrs; +} + +/** + * For a full frame at a sync, calculate the pointers to the maps and + * exceptions to my left. + * + * @param w The currently executing worker. + * @param ff Full frame that is stalling at a sync. + * @return Pointers to our "left" for reducers and exceptions. + */ +static inline +splice_left_ptrs compute_left_ptrs_for_sync(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold the lock on ff + splice_left_ptrs left_ptrs; + + // Figure out which map to the left we should merge into. + if (ff->rightmost_child) { + CILK_ASSERT(ff->rightmost_child->parent == ff); + left_ptrs.map_ptr = &(ff->rightmost_child->right_reducer_map); + left_ptrs.exception_ptr = &(ff->rightmost_child->right_pending_exception); + } + else { + // We have no children. Then, we should be the last + // worker at the sync... "left" is our child map. + left_ptrs.map_ptr = &(ff->children_reducer_map); + left_ptrs.exception_ptr = &(ff->child_pending_exception); + } + return left_ptrs; +} + +/** + * After we have completed all reductions on a spawn return, call this + * method to finish up before jumping into the runtime. + * + * 1. Perform the "reduction" on stacks, i.e., execute the left + * holder logic to pass the leftmost stack up. + * + * w->l->fiber_to_free holds any stack that needs to be freed + * when control switches into the runtime fiber. + * + * 2. Unlink and remove child_ff from the tree of full frames. + * + * @param w The currently executing worker. + * @param parent_ff The parent of child_ff. + * @param child_ff The full frame returning from a spawn. + */ +static inline +void finish_spawn_return_on_user_stack(__cilkrts_worker *w, + full_frame *parent_ff, + full_frame *child_ff) +{ + CILK_ASSERT(w->l->fiber_to_free == NULL); + + // Execute left-holder logic for stacks. + if (child_ff->left_sibling || parent_ff->fiber_child) { + // Case where we are not the leftmost stack. + CILK_ASSERT(parent_ff->fiber_child != child_ff->fiber_self); + + // Remember any fiber we need to free in the worker. + // After we jump into the runtime, we will actually do the + // free. + w->l->fiber_to_free = child_ff->fiber_self; + } + else { + // We are leftmost, pass stack/fiber up to parent. + // Thus, no stack/fiber to free. + parent_ff->fiber_child = child_ff->fiber_self; + w->l->fiber_to_free = NULL; + } + + child_ff->fiber_self = NULL; + + unlink_child(parent_ff, child_ff); +} + + +/** + * Executes any fast reductions necessary to splice ff out of the tree + * of full frames. + * + * This "fast" path performs only trivial merges of reducer maps, + * i.e,. when one of them is NULL. 
+ * (See slow_path_reductions_for_spawn_return() for slow path.) + * + * Returns: 1 if we finished all reductions. + * Returns: 0 if there are still reductions to execute, and + * we should execute the slow path. + * + * This method assumes w holds the frame lock on parent_ff. + * After this method completes: + * 1. We have spliced ff out of the tree of full frames. + * 2. The reducer maps of child_ff have been deposited + * "left" according to the reducer protocol. + * 3. w->l->stack_to_free stores the stack + * that needs to be freed once we jump into the runtime. + * + * We have not, however, decremented the join counter on ff->parent. + * This prevents any other workers from resuming execution of the parent. + * + * @param w The currently executing worker. + * @param ff The full frame returning from a spawn. + * @return NULL if we finished all reductions. + * @return The address where the left map is stored (which should be passed to + * slow_path_reductions_for_spawn_return()) if there are + * still reductions to execute. + */ +struct cilkred_map** +fast_path_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff) +{ + // ASSERT: we hold ff->parent->lock. + splice_left_ptrs left_ptrs; + + CILK_ASSERT(NULL == w->l->pending_exception); + + // Figure out the pointers to the left where I want + // to put reducers and exceptions. + left_ptrs = compute_left_ptrs_for_spawn_return(w, ff); + + // Go ahead and merge exceptions while holding the lock. + splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr); + + // Now check if we have any reductions to perform. + // + // Consider all the cases of left, middle and right maps. + // 0. (-, -, -) : finish and return 1 + // 1. (L, -, -) : finish and return 1 + // 2. (-, M, -) : slide over to left, finish, and return 1. + // 3. (L, M, -) : return 0 + // 4. (-, -, R) : slide over to left, finish, and return 1. + // 5. (L, -, R) : return 0 + // 6. (-, M, R) : return 0 + // 7. (L, M, R) : return 0 + // + // In terms of code: + // L == *left_ptrs.map_ptr + // M == w->reducer_map + // R == f->right_reducer_map. + // + // The goal of the code below is to execute the fast path with + // as few branches and writes as possible. + + int case_value = (*(left_ptrs.map_ptr) != NULL); + case_value += ((w->reducer_map != NULL) << 1); + case_value += ((ff->right_reducer_map != NULL) << 2); + + // Fastest path is case_value == 0 or 1. + if (case_value >=2) { + switch (case_value) { + case 2: + *(left_ptrs.map_ptr) = w->reducer_map; + w->reducer_map = NULL; + return NULL; + break; + case 4: + *(left_ptrs.map_ptr) = ff->right_reducer_map; + ff->right_reducer_map = NULL; + return NULL; + default: + // If we have to execute the slow path, then + // return the pointer to the place to deposit the left + // map. + return left_ptrs.map_ptr; + } + } + + // Do nothing + return NULL; +} + + +/** + * Executes any reductions necessary to splice "ff" frame out of + * the steal tree. + * + * This method executes the "slow" path for reductions on a spawn + * return, i.e., there are non-NULL maps that need to be merged + * together. + * + * This method should execute only if + * fast_path_reductions_for_spawn_return() returns a non-NULL + * left_map_ptr. + * + * Upon entry, left_map_ptr should be the location of the left map + * at the start of the reduction, as calculated by + * fast_path_reductions_for_spawn_return(). + * + * After this method completes: + * 1. We have spliced ff out of the tree of full frames. + * 2. 
The reducer maps of child_ff have been deposited + * "left" according to the reducer protocol. + * 3. w->l->stack_to_free stores the stack + * that needs to be freed once we jump into the runtime. + * We have not, however, decremented the join counter on ff->parent, + * so no one can resume execution of the parent yet. + * + * WARNING: + * This method assumes the lock on ff->parent is held upon entry, and + * Upon exit, the worker that returns still holds a lock on ff->parent + * This method can, however, release and reacquire the lock on ff->parent. + * + * @param w The currently executing worker. + * @param ff The full frame returning from a spawn. + * @param left_map_ptr Pointer to our initial left map. + * @return The worker that this method returns on. + */ +static __cilkrts_worker* +slow_path_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + struct cilkred_map **left_map_ptr) +{ + + // CILK_ASSERT: w is holding frame lock on parent_ff. +#if REDPAR_DEBUG > 0 + CILK_ASSERT(!ff->rightmost_child); + CILK_ASSERT(!ff->is_call_child); +#endif + + // Loop invariant: + // When beginning this loop, we should + // 1. Be holding the lock on ff->parent. + // 2. left_map_ptr should be the address of the pointer to the left map. + // 3. All maps should be slid over left by one, if possible. + // 4. All exceptions should be merged so far. + while (1) { + + // Slide middle map left if possible. + if (!(*left_map_ptr)) { + *left_map_ptr = w->reducer_map; + w->reducer_map = NULL; + } + // Slide right map to middle if possible. + if (!w->reducer_map) { + w->reducer_map = ff->right_reducer_map; + ff->right_reducer_map = NULL; + } + + // Since we slid everything left by one, + // we are finished if there is no middle map. + if (!w->reducer_map) { + verify_current_wkr(w); + return w; + } + else { + struct cilkred_map* left_map; + struct cilkred_map* middle_map; + struct cilkred_map* right_map; + + // Take all the maps from their respective locations. + // We can't leave them in place and execute a reduction because these fields + // might change once we release the lock. + left_map = *left_map_ptr; + *left_map_ptr = NULL; + middle_map = w->reducer_map; + w->reducer_map = NULL; + right_map = ff->right_reducer_map; + ff->right_reducer_map = NULL; + + // WARNING!!! Lock release here. + // We have reductions to execute (and we can't hold locks). + __cilkrts_frame_unlock(w, ff->parent); + + // Merge all reducers into the left map. + left_map = repeated_merge_reducer_maps(&w, + left_map, + middle_map); + verify_current_wkr(w); + left_map = repeated_merge_reducer_maps(&w, + left_map, + right_map); + verify_current_wkr(w); + CILK_ASSERT(NULL == w->reducer_map); + // Put the final answer back into w->reducer_map. + w->reducer_map = left_map; + + // Save any exceptions generated because of the reduction + // process from the returning worker. These get merged + // the next time around the loop. + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + + // Lock ff->parent for the next loop around. + __cilkrts_frame_lock(w, ff->parent); + + // Once we have the lock again, recompute who is to our + // left. + splice_left_ptrs left_ptrs; + left_ptrs = compute_left_ptrs_for_spawn_return(w, ff); + + // Update the pointer for the left map. + left_map_ptr = left_ptrs.map_ptr; + // Splice the exceptions for spawn. + splice_exceptions_for_spawn(w, ff, left_ptrs.exception_ptr); + } + } + // We should never break out of this loop. 
+ + CILK_ASSERT(0); + return NULL; +} + + + +/** + * Execute reductions when returning from a spawn whose parent has + * been stolen. + * + * Execution may start on w, but may finish on a different worker. + * This method acquires/releases the lock on ff->parent. + * + * @param w The currently executing worker. + * @param ff The full frame of the spawned function that is returning. + * @param returning_sf The __cilkrts_stack_frame for this returning function. + * @return The worker returning from this method. + */ +static __cilkrts_worker* +execute_reductions_for_spawn_return(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *returning_sf) +{ + // Step A1 from reducer protocol described above. + // + // Coerce the runtime into thinking that + // ff/returning_sf are still on the bottom of + // w's deque. + restore_frame_for_spawn_return_reduction(w, ff, returning_sf); + + // Step A2 and A3: Execute reductions on user stack. + BEGIN_WITH_FRAME_LOCK(w, ff->parent) { + struct cilkred_map **left_map_ptr; + left_map_ptr = fast_path_reductions_for_spawn_return(w, ff); + + // Pointer will be non-NULL if there are + // still reductions to execute. + if (left_map_ptr) { + // WARNING: This method call may release the lock + // on ff->parent and re-acquire it (possibly on a + // different worker). + // We can't hold locks while actually executing + // reduce functions. + w = slow_path_reductions_for_spawn_return(w, + ff, + left_map_ptr); + verify_current_wkr(w); + } + + finish_spawn_return_on_user_stack(w, ff->parent, ff); + // WARNING: the use of this lock macro is deceptive. + // The worker may have changed here. + } END_WITH_FRAME_LOCK(w, ff->parent); + return w; +} + + + +/** + * Execute fast "reductions" when ff stalls at a sync. + * + * @param w The currently executing worker. + * @param ff The full frame stalling at a sync. + * @return 1 if we are finished with all reductions after calling this method. + * @return 0 if we still need to execute the slow path reductions. + */ +static inline +int fast_path_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff) { + // Return 0 if there is some reduction that needs to happen. + return !(w->reducer_map || ff->pending_exception); +} + +/** + * Executes slow reductions when ff stalls at a sync. + * This method should execute only if + * fast_path_reductions_for_sync(w, ff) returned 0. + * + * After this method completes: + * 1. ff's current reducer map has been deposited into + * right_reducer_map of ff's rightmost child, or + * ff->children_reducer_map if ff has no children. + * 2. Similarly for ff's current exception. + * 3. Nothing to calculate for stacks --- if we are stalling + * we will always free a stack. + * + * This method may repeatedly acquire/release the lock on ff. + * + * @param w The currently executing worker. + * @param ff The full frame stalling at a sync. + * @return The worker returning from this method. + */ +static __cilkrts_worker* +slow_path_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff) +{ + struct cilkred_map *left_map; + struct cilkred_map *middle_map; + +#if (REDPAR_DEBUG > 0) + CILK_ASSERT(ff); + CILK_ASSERT(w->head == w->tail); +#endif + + middle_map = w->reducer_map; + w->reducer_map = NULL; + + // Loop invariant: middle_map should be valid (the current map to reduce). + // left_map is junk. + // w->reducer_map == NULL. 
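+    //
+    // Each pass of the loop below grabs the current "left" slot under the
+    // frame lock, slides the maps left, and merges exceptions in place.
+    // Any nontrivial reducer merge is then done by
+    // repeated_merge_reducer_maps() outside the lock, since reduce
+    // functions run user code and may leave us on a different worker.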
+ while (1) { + BEGIN_WITH_FRAME_LOCK(w, ff) { + splice_left_ptrs left_ptrs = compute_left_ptrs_for_sync(w, ff); + + // Grab the "left" map and store pointers to those locations. + left_map = *(left_ptrs.map_ptr); + *(left_ptrs.map_ptr) = NULL; + + // Slide the maps in our struct left as far as possible. + if (!left_map) { + left_map = middle_map; + middle_map = NULL; + } + + *(left_ptrs.exception_ptr) = + __cilkrts_merge_pending_exceptions(w, + *left_ptrs.exception_ptr, + ff->pending_exception); + ff->pending_exception = NULL; + + // If there is no middle map, then we are done. + // Deposit left and return. + if (!middle_map) { + *(left_ptrs).map_ptr = left_map; + #if (REDPAR_DEBUG > 0) + CILK_ASSERT(NULL == w->reducer_map); + #endif + // Sanity check upon leaving the loop. + verify_current_wkr(w); + // Make sure to unlock before we return! + __cilkrts_frame_unlock(w, ff); + return w; + } + } END_WITH_FRAME_LOCK(w, ff); + + // If we get here, we have a nontrivial reduction to execute. + middle_map = repeated_merge_reducer_maps(&w, + left_map, + middle_map); + verify_current_wkr(w); + + // Save any exceptions generated because of the reduction + // process. These get merged the next time around the + // loop. + CILK_ASSERT(NULL == ff->pending_exception); + ff->pending_exception = w->l->pending_exception; + w->l->pending_exception = NULL; + } + + // We should never break out of the loop above. + CILK_ASSERT(0); + return NULL; +} + + +/** + * Execute reductions when ff stalls at a sync. + * + * Execution starts on w, but may finish on a different worker. + * This method may acquire/release the lock on ff. + * + * @param w The currently executing worker. + * @param ff The full frame of the spawned function at the sync + * @param sf_at_sync The __cilkrts_stack_frame stalling at a sync + * @return The worker returning from this method. + */ +static __cilkrts_worker* +execute_reductions_for_sync(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf_at_sync) +{ + int finished_reductions; + // Step B1 from reducer protocol above: + // Restore runtime invariants. + // + // The following code for this step is almost equivalent to + // the following sequence: + // 1. disown(w, ff, sf_at_sync, "sync") (which itself + // calls make_unrunnable(w, ff, sf_at_sync)) + // 2. make_runnable(w, ff, sf_at_sync). + // + // The "disown" will mark the frame "sf_at_sync" + // as stolen and suspended, and save its place on the stack, + // so it can be resumed after the sync. + // + // The difference is, that we don't want the disown to + // break the following connections yet, since we are + // about to immediately make sf/ff runnable again anyway. + // sf_at_sync->worker == w + // w->l->frame_ff == ff. + // + // These connections are needed for parallel reductions, since + // we will use sf / ff as the stack frame / full frame for + // executing any potential reductions. + // + // TBD: Can we refactor the disown / make_unrunnable code + // to avoid the code duplication here? + + ff->call_stack = NULL; + + // Normally, "make_unrunnable" would add CILK_FRAME_STOLEN and + // CILK_FRAME_SUSPENDED to sf_at_sync->flags and save the state of + // the stack so that a worker can resume the frame in the correct + // place. + // + // But on this path, CILK_FRAME_STOLEN should already be set. + // Also, we technically don't want to suspend the frame until + // the reduction finishes. 
+ // We do, however, need to save the stack before + // we start any reductions, since the reductions might push more + // data onto the stack. + CILK_ASSERT(sf_at_sync->flags | CILK_FRAME_STOLEN); + + __cilkrts_put_stack(ff, sf_at_sync); + __cilkrts_make_unrunnable_sysdep(w, ff, sf_at_sync, 1, + "execute_reductions_for_sync"); + CILK_ASSERT(w->l->frame_ff == ff); + + // Step B2: Execute reductions on user stack. + // Check if we have any "real" reductions to do. + finished_reductions = fast_path_reductions_for_sync(w, ff); + + if (!finished_reductions) { + // Still have some real reductions to execute. + // Run them here. + + // This method may acquire/release the lock on ff. + w = slow_path_reductions_for_sync(w, ff); + + // The previous call may return on a different worker. + // than what we started on. + verify_current_wkr(w); + } + +#if REDPAR_DEBUG >= 0 + CILK_ASSERT(w->l->frame_ff == ff); + CILK_ASSERT(ff->call_stack == NULL); +#endif + + // Now we suspend the frame ff (since we've + // finished the reductions). Roughly, we've split apart the + // "make_unrunnable" call here --- we've already saved the + // stack info earlier before the reductions execute. + // All that remains is to restore the call stack back into the + // full frame, and mark the frame as suspended. + ff->call_stack = sf_at_sync; + sf_at_sync->flags |= CILK_FRAME_SUSPENDED; + + // At a nontrivial sync, we should always free the current fiber, + // because it can not be leftmost. + w->l->fiber_to_free = ff->fiber_self; + ff->fiber_self = NULL; + return w; +} + + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/scheduler.h b/libcilkrts/runtime/scheduler.h new file mode 100644 index 00000000000..543adaf68e0 --- /dev/null +++ b/libcilkrts/runtime/scheduler.h @@ -0,0 +1,421 @@ +/* scheduler.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file scheduler.h + * + * @brief scheduler.h declares routines for the Intel Cilk Plus scheduler, + * making it the heart of the Intel Cilk Plus implementation. + */ + +#ifndef INCLUDED_SCHEDULER_DOT_H +#define INCLUDED_SCHEDULER_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "rts-common.h" +#include "full_frame.h" +#include "reducer_impl.h" +#include "global_state.h" + +#ifdef CILK_RECORD_REPLAY +#include "record-replay.h" +#endif + +__CILKRTS_BEGIN_EXTERN_C + + +/** + * @brief Flag to disable parallel reductions. + * + * Set to 0 to allow parallel reductions. + */ +#define DISABLE_PARALLEL_REDUCERS 0 + +/** + * @brief Debugging level for parallel reductions. + * + * Print debugging messages and assertions for parallel reducers. 0 is + * no debugging. A higher value generates more output. + */ +#define REDPAR_DEBUG 0 + +/** + * @brief Lock the worker mutex to allow exclusive access to the + * values in the @c __cilkrts_worker and local_state structures. + * + * @pre @c w->l->do_not_steal must not be set. Essentially this + * condition asserts that the worker is not locked recursively. + * + * @param w The worker to lock. + */ +COMMON_PORTABLE +void __cilkrts_worker_lock(__cilkrts_worker *w); + +/** + * @brief Unlock the worker mutex. + * + * @pre @c w->l->do_not_steal must be set. Essentially this condition + * asserts that the worker has been previously locked. + * + * @param w The worker to unlock. + */ +COMMON_PORTABLE +void __cilkrts_worker_unlock(__cilkrts_worker *w); + +/** + * @brief Push the next full frame to be made active in this worker + * and increment its join counter. + * + * __cilkrts_push_next_frame and pop_next_frame work on a one-element queue. + * This queue is used to communicate across the runtime from the code that + * wants to activate a frame to the code that can actually begin execution + * on that frame. They are asymetrical in that push increments the join + * counter but pop does not decrement it. Rather, a single push/pop + * combination makes a frame active and increments its join counter once. + * + * @note A system worker may chose to push work onto a user worker if + * the work is the continuation from a sync which only the user worker + * may complete. + * + * @param w The worker which the frame is to be pushed onto. + * @param ff The full_frame which is to be continued by the worker. + */ +COMMON_PORTABLE +void __cilkrts_push_next_frame(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief Sync on this worker. + * + * If this worker is the last to reach the sync, execution may resume + * on this worker after the sync. + * + * If this worker is not the last spawned child to reach the sync, + * then execution is suspended and the worker will re-enter the + * scheduling loop, looking for work it can steal. 
+ * + * This function will jump into the runtime to switch to the scheduling + * stack to implement most of its logic. + * + * @param w The worker which is executing the sync. + * @param sf The __cilkrts_stack_frame containing the sync. + */ +COMMON_PORTABLE +NORETURN __cilkrts_c_sync(__cilkrts_worker *w, + __cilkrts_stack_frame *sf); + +/** + * @brief Worker @c w completely promotes its own deque, simulating the case + * where the whole deque is stolen. + * + * We use this mechanism to force the allocation of new storage for + * reducers for race-detection purposes. + * + * This method is called from the reducer lookup logic when + * @c g->force_reduce is set. + * + * @warning Use of "force_reduce" is known to have bugs when run with + * more than 1 worker. + * + * @param w The worker which is to have all entries in its deque + * promoted to full frames. + */ +COMMON_PORTABLE +void __cilkrts_promote_own_deque(__cilkrts_worker *w); + +/** + * Called when a spawned function attempts to return and + * __cilkrts_undo_detach() fails. This can happen for two reasons: + * + * @li If another worker is considering stealing our parent, it bumps the + * exception pointer while it did so, which will cause __cilkrts_undo_detach() + * to fail. If the other worker didn't complete the steal of our parent, we + * still may be able to return to it, either because the steal attempt failed, + * or we won the race for the tail pointer. + * + * @li If the function's parent has been stolen then we cannot return. Instead + * we'll longjmp into the runtime to switch onto the scheduling stack to + * execute do_return_from_spawn() and determine what to do. Either this + * worker is the last one to the sync, in which case we need to jump to the + * sync, or this worker is not the last one to the sync, in which case we'll + * abandon this work and jump to the scheduling loop to search for more work + * we can steal. + * + * @param w The worker which attempting to return from a spawn to + * a stolen parent. + * @param returning_sf The stack frame which is returning. + */ +COMMON_PORTABLE +void __cilkrts_c_THE_exception_check(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf); + +/** + * @brief Return an exception to an stolen parent. + * + * Used by the gcc implementation of exceptions to return an exception + * to a stolen parent + * + * @param w The worker which attempting to return from a spawn with an + * exception to a stolen parent. + * @param returning_sf The stack frame which is returning. + */ +COMMON_PORTABLE +NORETURN __cilkrts_exception_from_spawn(__cilkrts_worker *w, + __cilkrts_stack_frame *returning_sf); + +/** + * @brief Used by the Windows implementations of exceptions to migrate an exception + * across fibers. + * + * Call this function when an exception has been thrown and has to + * traverse across a steal. The exception has already been wrapped + * up, so all that remains is to longjmp() into the continuation, + * sync, and re-raise it. + * + * @param sf The __cilkrts_stack_frame for the frame that is attempting to + * return an exception to a stolen parent. + */ +void __cilkrts_migrate_exception (__cilkrts_stack_frame *sf); + +/** + * @brief Return from a call, not a spawn, where this frame has ever + * been stolen. + * + * @param w The worker that is returning from a frame which was ever stolen. + */ +COMMON_PORTABLE +void __cilkrts_return(__cilkrts_worker *w); + +/** + * @brief Special return from the initial frame. 
+ * + * This method will be called from @c __cilkrts_leave_frame if + * @c CILK_FRAME_LAST is set. + * + * This function will do the things necessary to cleanup, and unbind the + * thread from the Intel Cilk Plus runtime. If this is the last user + * worker unbinding from the runtime, all system worker threads will be + * suspended. + * + * @pre @c w must be the currently executing worker, and must be a user + * worker. + * + * @param w The worker that's returning from the initial frame. + */ +COMMON_PORTABLE +void __cilkrts_c_return_from_initial(__cilkrts_worker *w); + +/** + * @brief Used by exception handling code to pop an entry from the + * worker's deque. + * + * @param w Worker to pop the entry from + * + * @return __cilkrts_stack_frame of parent call + * @return NULL if the deque is empty + */ +COMMON_PORTABLE +__cilkrts_stack_frame *__cilkrts_pop_tail(__cilkrts_worker *w); + +/** + * @brief Modifies the worker's protected_tail to prevent frames from + * being stolen. + * + * The Dekker protocol has been extended to only steal if head+1 is also + * less than protected_tail. + * + * @param w The worker to be modified. + * @param new_protected_tail The new setting for protected_tail, or NULL if the + * entire deque is to be protected + * + * @return Previous value of protected tail. + */ +COMMON_PORTABLE +__cilkrts_stack_frame *volatile *__cilkrts_disallow_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *new_protected_tail); + +/** + * @brief Restores the protected tail to a previous state, possibly + * allowing frames to be stolen. + * + * @param w The worker to be modified. + * @param saved_protected_tail A previous setting for protected_tail that is + * to be restored + */ +COMMON_PORTABLE +void __cilkrts_restore_stealing( + __cilkrts_worker *w, + __cilkrts_stack_frame *volatile *saved_protected_tail); + +/** + * @brief Initialize a @c __cilkrts_worker. + * + * @note The memory for the worker must have been allocated outside + * this call. + * + * @param g The global_state_t. + * @param self The index into the global_state's array of workers for this + * worker, or -1 if this worker was allocated from the heap and cannot be + * stolen from. + * @param w The worker to be initialized. + * + * @return The initialized __cilkrts_worker. + */ +COMMON_PORTABLE +__cilkrts_worker *make_worker(global_state_t *g, + int self, + __cilkrts_worker *w); + +/** + * @brief Free up any resources allocated for a worker. + * + * @note The memory for the @c __cilkrts_worker itself must be + * deallocated outside this call. + * + * @param w The worker to be destroyed. + */ +COMMON_PORTABLE +void destroy_worker (__cilkrts_worker *w); + +/** + * @brief Initialize the runtime. + * + * If necessary, allocates and initializes the global state. If + * necessary, unsuspends the system workers. + * + * @param start Specifies whether the workers are to be unsuspended if + * they are suspended. Allows __cilkrts_init() to start up the runtime without + * releasing the system threads. + */ +COMMON_PORTABLE +void __cilkrts_init_internal(int start); + +/** + * @brief Part of the sequence to shutdown the runtime. + * + * Specifically, this call frees the @c global_state_t for the runtime. + * + * @param g The global_state_t. + */ +COMMON_PORTABLE +void __cilkrts_deinit_internal(global_state_t *g); + +/** + * Obsolete. We no longer need to import or export reducer maps. 
+ */ +COMMON_PORTABLE +cilkred_map *__cilkrts_xchg_reducer( + __cilkrts_worker *w, cilkred_map *newmap) cilk_nothrow; + +/** + * @brief Called when a user thread is bound to the runtime. + * + * If this action increments the count of bound user threads from 0 to + * 1, the system worker threads are unsuspended. + * + * If this action increments the count of bound user threads from 0 to + * 1, the system worker threads are unsuspended. + * + * @pre Global lock must be held. + * @param g The runtime global state. + */ +COMMON_PORTABLE +void __cilkrts_enter_cilk(global_state_t *g); + +/** + * @brief Called when a user thread is unbound from the runtime. + * + * If this action decrements the count of bound user threads to 0, the + * system worker threads are suspended. + * + * + * @pre Global lock must be held. + * + * @param g The runtime global state. + */ +COMMON_PORTABLE +void __cilkrts_leave_cilk(global_state_t *g); + + +/** + * @brief cilk_fiber_proc that runs the main scheduler loop on a + * user worker. + * + * @pre fiber's owner field should be set to the correct __cilkrts_worker + * @pre fiber must be a user worker. + * + * @param fiber The scheduling fiber object. + */ +void scheduler_fiber_proc_for_user_worker(cilk_fiber *fiber); + + +/** + * @brief Prints out Cilk runtime statistics. + * + * @param g The runtime global state. + * + * This method is useful only for debugging purposes. No guarantees + * are made as to the validity of this data. :) + */ +COMMON_PORTABLE +void __cilkrts_dump_stats_to_stderr(global_state_t *g); + +#ifdef CILK_RECORD_REPLAY +COMMON_PORTABLE +char * walk_pedigree_nodes(char *p, const __cilkrts_pedigree *pnode); + +/** + * @brief Used by exception handling code to simulate the popping of + * an entry from the worker's deque. + * + * @param w Worker whose deque we want to check + * + * @return @c __cilkrts_stack_frame of parent call + * @return NULL if the deque is empty + */ +COMMON_PORTABLE +__cilkrts_stack_frame *simulate_pop_tail(__cilkrts_worker *w); + +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_SCHEDULER_DOT_H) diff --git a/libcilkrts/runtime/signal_node.c b/libcilkrts/runtime/signal_node.c new file mode 100644 index 00000000000..92c404b482c --- /dev/null +++ b/libcilkrts/runtime/signal_node.c @@ -0,0 +1,241 @@ +/* signal_node.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2011-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************/ + +#include "signal_node.h" +#include <stdlib.h> + +/* Define cilk_semaphore_t for all of the respective systems. */ +#if defined __APPLE__ +# include <mach/mach_init.h> +# include <mach/semaphore.h> +# include <mach/task.h> + typedef semaphore_t cilk_semaphore_t; +#elif defined _WIN32 +# include "windows-clean.h" + typedef HANDLE cilk_semaphore_t; +#else // Linux/MIC +# include <errno.h> +# include <semaphore.h> +# include <stdio.h> + typedef sem_t cilk_semaphore_t; +#endif // Linux/MIC + +#include "bug.h" +#include "cilk_malloc.h" +#include "signal_node.h" + +/** + * Interface within the tree to notify workers to wait without consuming cycles + * to expend cycles trying to steal. + * + * cilk_semaphore_t is implemented as an auto-reset event on Windows, and + * as a semaphore_t on Linux and MacOS. + */ +struct signal_node_t +{ + /** 0 if the worker should wait, 1 if it should be running. */ + volatile unsigned int run; + + /** OS-specific semaphore on which the worker can wait. */ + cilk_semaphore_t sem; +}; + +/******************************************************************************/ +/* Semaphore-abstraction functions */ +/******************************************************************************/ + +/* + * All of these functions are simple wrappers for the system-specific semaphore + * functions. This keeps the rest of the code reasonably clean and readable. + */ + +#if defined __APPLE__ +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus + = semaphore_create(mach_task_self(), sem, SYNC_POLICY_FIFO, 0); + assert(kstatus == KERN_SUCCESS); +} +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_destroy(mach_task_self(), *sem); + assert(kstatus == KERN_SUCCESS); +} +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_wait(*sem); + assert(kstatus == KERN_SUCCESS); +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + kern_return_t kstatus = semaphore_signal(*sem); + assert(kstatus == KERN_SUCCESS); +} +#elif defined _WIN32 +// Note: Windows only provides counting semaphores, and we don't really +// care about the count. 
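All three platform variants expose the same binary "sleep until poked" semantics through the initialize/wait/signal wrappers; the count of a counting semaphore is never relied upon. A standalone POSIX sketch of that usage pattern (illustrative only, not part of the runtime; error checking and the EINTR retry loop used in this file are omitted):

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>
    #include <unistd.h>

    static sem_t demo_sem;

    static void *sleeper(void *arg)
    {
        (void)arg;
        sem_wait(&demo_sem);          /* block without burning cycles */
        printf("worker woken\n");
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        sem_init(&demo_sem, 0, 0);    /* initially "asleep" */
        pthread_create(&t, NULL, sleeper, NULL);
        sleep(1);
        sem_post(&demo_sem);          /* wake the sleeping worker */
        pthread_join(t, NULL);
        sem_destroy(&demo_sem);
        return 0;
    }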
So this is implemented using an auto-reset +// event which will automatically reset after the WaitForSingleObject +// call +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + // Create an auto-reset event + *sem = CreateEvent(NULL, // Security attributes + FALSE, // Manual reset + FALSE, // Initial state (initially reset) + NULL); // Name (anonymous) + CILK_ASSERT (NULL != *sem); +} + +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + BOOL result = CloseHandle(*sem); + CILK_ASSERT (0 != result); +} + +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + // WaitForSingleObject will reset the event + DWORD result = WaitForSingleObject (*sem, INFINITE); + CILK_ASSERT (WAIT_OBJECT_0 == result); +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + BOOL result = SetEvent (*sem); + CILK_ASSERT (0 != result); +} +#else // Linux/MIC +static void initialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status = sem_init(sem, 0, 0); + assert(0 == status); +} +static void deinitialize_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status = sem_destroy(sem); + assert(0 == status); +} +static void wait_on_cilk_semaphore (cilk_semaphore_t *sem) +{ + int status; + + do { + status = sem_wait(sem); + } while (status != 0 && errno == EINTR); + + if (status != 0) { + perror("sem_wait"); + abort(); + } +} +static void signal_cilk_semaphore (cilk_semaphore_t *sem) +{ + sem_post(sem); +} +#endif // Linux/MIC + +/******************************************************************************/ +/* Runtime interface functions */ +/******************************************************************************/ + +/* + * Return a newly malloc'd and initialized signal_node_t. + */ +COMMON_SYSDEP +signal_node_t *signal_node_create(void) +{ + signal_node_t *node; + + node = ( signal_node_t*) + __cilkrts_malloc(sizeof( signal_node_t)); + node->run = 0; + initialize_cilk_semaphore(&node->sem); + + return node; +} + +/* + * Clean and free a signal_node_t. + */ +void signal_node_destroy(signal_node_t *node) +{ + CILK_ASSERT(node); + deinitialize_cilk_semaphore(&node->sem); + __cilkrts_free(node); +} + +/* + * Return 1 if the node thinks the worker should go to sleep, 0 otherwise. + */ +unsigned int signal_node_should_wait(signal_node_t *node) +{ + CILK_ASSERT(node); + return !node->run; +} + +/* + * Send a message to the node that the worker will eventually read. + */ +void signal_node_msg(signal_node_t *node, unsigned int msg) +{ + CILK_ASSERT(node); + switch (msg) { + case 0: // worker should go to sleep. + node->run = msg; + break; + case 1: // worker should be awake. + node->run = msg; + signal_cilk_semaphore(&node->sem); + break; + default: // error. + CILK_ASSERT(0 == "Bad signal_node_t message."); + } +} + +/* + * The current worker will wait on the semaphore. + */ +void signal_node_wait(signal_node_t *node) +{ + CILK_ASSERT(node); + while (signal_node_should_wait(node)) { + // The loop is here to consume extra semaphore signals that might have + // accumulated. No point in passing on the accumulation. + wait_on_cilk_semaphore(&node->sem); + } +} diff --git a/libcilkrts/runtime/signal_node.h b/libcilkrts/runtime/signal_node.h new file mode 100644 index 00000000000..0a1fe200201 --- /dev/null +++ b/libcilkrts/runtime/signal_node.h @@ -0,0 +1,109 @@ +/* signal_node.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. 
+ * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file signal_node.h + * + * @brief Signal nodes allow coordinated waking and sleeping of the runtime + * without hammering on a single location in memory. + * + * The workers are logically arranged in a binary tree and propagate messages + * leaf-ward. User workers notify the root about waking and sleeping, so only + * that one node need share a cache line with a user worker. + */ + +#ifndef INCLUDED_SIGNAL_NODE_DOT_H +#define INCLUDED_SIGNAL_NODE_DOT_H + +#include "rts-common.h" +#include <cilk/common.h> + +__CILKRTS_BEGIN_EXTERN_C + +/** Opaque type. */ +typedef struct signal_node_t signal_node_t; + +/** + * Allocate and initialize a signal_node_t + * + * @return The initialized signal_node_t + */ +COMMON_SYSDEP +signal_node_t *signal_node_create(void); + +/** + * Free any resources and deallocate a signal_node_t + * + * @param node The node to be deallocated. + */ +COMMON_SYSDEP void signal_node_destroy(signal_node_t *node); + +/** + * Test whether the node thinks the worker should go to sleep + * + * @param node The node to be tested. + * + * @return 1 If the worker should go to sleep + * @return 0 If the worker should not go to sleep + */ +COMMON_SYSDEP +unsigned int signal_node_should_wait(signal_node_t *node); + +/** + * Specify whether the worker should go to sleep + * + * @param node The node to be set. + * @param msg The value to be set. Valid values are: + * - 0 - the worker should go to sleep + * - 1 - the worker should stay active + */ +COMMON_SYSDEP +void signal_node_msg(signal_node_t *node, unsigned int msg); + + +/** + * Wait for the node to be set + * + * @param node The node to wait on + */ +COMMON_SYSDEP +void signal_node_wait(signal_node_t *node); + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_SIGNAL_NODE_DOT_H) diff --git a/libcilkrts/runtime/spin_mutex.c b/libcilkrts/runtime/spin_mutex.c new file mode 100644 index 00000000000..03908f26322 --- /dev/null +++ b/libcilkrts/runtime/spin_mutex.c @@ -0,0 +1,109 @@ +/* spin_mutex.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "spin_mutex.h" +#include "bug.h" +#include "os.h" +#include "stats.h" + +// TBD (11/30/12): We should be doing a conditional test-xchg instead +// of an unconditional xchg operation for the spin mutex. + +/* m->lock == 1 means that mutex M is locked */ +#define TRY_ACQUIRE(m) (__cilkrts_xchg(&(m)->lock, 1) == 0) + +/* ICC 11.1+ understands release semantics and generates an + ordinary store with a software memory barrier. */ +#if __ICC >= 1110 +#define RELEASE(m) __sync_lock_release(&(m)->lock) +#else +#define RELEASE(m) __cilkrts_xchg(&(m)->lock, 0) +#endif + + +spin_mutex* spin_mutex_create() +{ + spin_mutex* mutex = (spin_mutex*)__cilkrts_malloc(sizeof(spin_mutex)); + spin_mutex_init(mutex); + return mutex; +} + +void spin_mutex_init(struct spin_mutex *m) +{ + // Use a simple assignment so Inspector doesn't bug us about the + // interlocked exchange doing a read of an uninitialized variable. + // By definition there can't be a race when we're initializing the + // lock... 
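The TRY_ACQUIRE and RELEASE macros above boil down to an atomic exchange on acquire and a store with release semantics on release. A self-contained illustration of the same pair using the GCC __sync builtins, as stand-ins for the runtime's own __cilkrts_xchg (a sketch, not the runtime implementation):

    typedef struct { volatile int lock; } demo_mutex;

    static int demo_trylock(demo_mutex *m)
    {
        /* Nonzero if the exchange observed 0, i.e. we took the lock. */
        return __sync_lock_test_and_set(&m->lock, 1) == 0;
    }

    static void demo_unlock(demo_mutex *m)
    {
        __sync_lock_release(&m->lock);   /* store 0 with release semantics */
    }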
+ m->lock = 0; +} + +void spin_mutex_lock(struct spin_mutex *m) +{ + int count; + const int maxspin = 1000; /* SWAG */ + if (!TRY_ACQUIRE(m)) { + count = 0; + do { + do { + __cilkrts_short_pause(); + if (++count >= maxspin) { + /* let the OS reschedule every once in a while */ + __cilkrts_yield(); + count = 0; + } + } while (m->lock != 0); + } while (!TRY_ACQUIRE(m)); + } +} + +int spin_mutex_trylock(struct spin_mutex *m) +{ + return TRY_ACQUIRE(m); +} + +void spin_mutex_unlock(struct spin_mutex *m) +{ + RELEASE(m); +} + +void spin_mutex_destroy(struct spin_mutex *m) +{ + __cilkrts_free(m); +} + +/* End spin_mutex.c */ diff --git a/libcilkrts/runtime/spin_mutex.h b/libcilkrts/runtime/spin_mutex.h new file mode 100644 index 00000000000..b0045ab9313 --- /dev/null +++ b/libcilkrts/runtime/spin_mutex.h @@ -0,0 +1,129 @@ +/* spin_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file spin_mutex.h + * + * @brief Support for Cilk runtime mutexes. + * + * Cilk runtime mutexes are implemented as simple spin loops. + * + * This file is similar to a worker_mutex, except it does not have an + * owner field. + * + * TBD: This class, worker_mutex, and os_mutex overlap quite a bit in + * functionality. Can we unify these mutexes somehow? + */ +#ifndef INCLUDED_SPIN_MUTEX_DOT_H +#define INCLUDED_SPIN_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" +#include "cilk_malloc.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Mutexes are treated as an abstract data type within the Cilk + * runtime system. They are implemented as simple spin loops. + */ +typedef struct spin_mutex { + /** Mutex spin loop variable. 0 if unowned, 1 if owned. */ + volatile int lock; + + /** Padding so the mutex takes up a cache line. 
*/ + char pad[64/sizeof(int) - 1]; +} spin_mutex; + + +/** + * @brief Create a new Cilk spin_mutex. + * + * @return Returns an initialized spin mutex. + */ +COMMON_PORTABLE +spin_mutex* spin_mutex_create(); + +/** + * @brief Initialize a Cilk spin_mutex. + * + * @param m Spin_Mutex to be initialized. + */ +COMMON_PORTABLE +void spin_mutex_init(spin_mutex *m); + +/** + * @brief Acquire a Cilk spin_mutex. + * + * If statistics are being gathered, the time spent + * acquiring the spin_mutex will be attributed to the specified worker. + * + * @param m Spin_Mutex to be initialized. + */ +COMMON_PORTABLE +void spin_mutex_lock(struct spin_mutex *m); +/** + * @brief Attempt to lock a Cilk spin_mutex and fail if it isn't available. + * + * @param m Spin_Mutex to be acquired. + * + * @return 1 if the spin_mutex was acquired. + * @return 0 if the spin_mutex was not acquired. + */ +COMMON_PORTABLE +int spin_mutex_trylock(struct spin_mutex *m); + +/** + * @brief Release a Cilk spin_mutex. + * + * @param m Spin_Mutex to be released. + */ +COMMON_PORTABLE +void spin_mutex_unlock(struct spin_mutex *m); + +/** + * @brief Deallocate a Cilk spin_mutex. Currently does nothing. + * + * @param m Spin_Mutex to be deallocated. + */ +COMMON_PORTABLE +void spin_mutex_destroy(struct spin_mutex *m); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_SPIN_MUTEX_DOT_H) diff --git a/libcilkrts/runtime/stats.c b/libcilkrts/runtime/stats.c new file mode 100644 index 00000000000..3a420745039 --- /dev/null +++ b/libcilkrts/runtime/stats.c @@ -0,0 +1,172 @@ +/* stats.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ **************************************************************************/ + +#include "stats.h" +#include "bug.h" +#include "os.h" +#include "local_state.h" + +#include <stdio.h> + +#define INVALID_START (0ULL - 1ULL) + +#ifdef CILK_PROFILE +/* MSVC does not support designated initializers, grrrr... */ +static const char *names[] = { + /*[INTERVAL_IN_SCHEDULER]*/ "in scheduler", + /*[INTERVAL_WORKING]*/ " of which: working", + /*[INTERVAL_IN_RUNTIME]*/ " of which: in runtime", + /*[INTERVAL_STEALING]*/ " of which: stealing", + /*[INTERVAL_STEAL_SUCCESS]*/ "steal success: detach", + /*[INTERVAL_STEAL_FAIL_EMPTYQ]*/ "steal fail: empty queue", + /*[INTERVAL_STEAL_FAIL_LOCK]*/ "steal fail: victim locked", + /*[INTERVAL_STEAL_FAIL_USER_WORKER]*/ "steal fail: user worker", + /*[INTERVAL_STEAL_FAIL_DEKKER]*/ "steal fail: dekker", + /*[INTERVAL_SYNC_CHECK]*/ "sync check", + /*[INTERVAL_THE_EXCEPTION_CHECK]*/ "THE exception check", + /*[INTERVAL_THE_EXCEPTION_CHECK_USELESS]*/ " of which: useless", + /*[INTERVAL_RETURNING]*/ "returning", + /*[INTERVAL_FINALIZE_CHILD]*/ "finalize child", + /*[INTERVAL_PROVABLY_GOOD_STEAL]*/ "provably good steal", + /*[INTERVAL_UNCONDITIONAL_STEAL]*/ "unconditional steal", + /*[INTERVAL_ALLOC_FULL_FRAME]*/ "alloc full frame", + /*[INTERVAL_FRAME_ALLOC_LARGE]*/ "large frame alloc", + /*[INTERVAL_FRAME_ALLOC]*/ "small frame alloc", + /*[INTERVAL_FRAME_ALLOC_GLOBAL]*/ " of which: to global pool", + /*[INTERVAL_FRAME_FREE_LARGE]*/ "large frame free", + /*[INTERVAL_FRAME_FREE]*/ "small frame free", + /*[INTERVAL_FRAME_FREE_GLOBAL]*/ " of which: to global pool", + /*[INTERVAL_MUTEX_LOCK]*/ "mutex lock", + /*[INTERVAL_MUTEX_LOCK_SPINNING]*/ " spinning", + /*[INTERVAL_MUTEX_LOCK_YIELDING]*/ " yielding", + /*[INTERVAL_MUTEX_TRYLOCK]*/ "mutex trylock", + /*[INTERVAL_FIBER_ALLOCATE]*/ "fiber_allocate", + /*[INTERVAL_FIBER_DEALLOCATE]*/ "fiber_deallocate", + /*[INTERVAL_FIBER_ALLOCATE_FROM_THREAD]*/ "fiber_allocate_from_thread", + /*[INTERVAL_FIBER_DEALLOCATE_FROM_THREAD]*/ "fiber_deallocate (thread)", + /*[INTERVAL_SUSPEND_RESUME_OTHER]*/ "fiber suspend self + resume", + /*[INTERVAL_DEALLOCATE_RESUME_OTHER]*/ "fiber deallocate self + resume", +}; +#endif + +void __cilkrts_init_stats(statistics *s) +{ + int i; + for (i = 0; i < INTERVAL_N; ++i) { + s->start[i] = INVALID_START; + s->accum[i] = 0; + s->count[i] = 0; + } + + s->stack_hwm = 0; +} + +#ifdef CILK_PROFILE +void __cilkrts_accum_stats(statistics *to, statistics *from) +{ + int i; + + for (i = 0; i < INTERVAL_N; ++i) { + to->accum[i] += from->accum[i]; + to->count[i] += from->count[i]; + from->accum[i] = 0; + from->count[i] = 0; + } + + if (from->stack_hwm > to->stack_hwm) + to->stack_hwm = from->stack_hwm; + from->stack_hwm = 0; +} + +void __cilkrts_note_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] == INVALID_START); + s->count[i]++; + } +} + +void __cilkrts_start_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] == INVALID_START); + s->start[i] = __cilkrts_getticks(); + s->count[i]++; + } +} + +void __cilkrts_stop_interval(__cilkrts_worker *w, enum interval i) +{ + if (w) { + statistics *s = w->l->stats; + CILK_ASSERT(s->start[i] != INVALID_START); + s->accum[i] += __cilkrts_getticks() - s->start[i]; + s->start[i] = INVALID_START; + } +} + +void dump_stats_to_file(FILE *stat_file, statistics *s) +{ + int i; + fprintf(stat_file, "\nCILK PLUS RUNTIME SYSTEM STATISTICS:\n\n"); + + 
fprintf(stat_file, + " %-32s: %15s %10s %12s %10s\n", + "event", + "count", + "ticks", + "ticks/count", + "%total" + ); + for (i = 0; i < INTERVAL_N; ++i) { + fprintf(stat_file, " %-32s: %15llu", names[i], s->count[i]); + if (s->accum[i]) { + fprintf(stat_file, " %10.3g %12.3g %10.2f", + (double)s->accum[i], + (double)s->accum[i] / (double)s->count[i], + 100.0 * (double)s->accum[i] / + (double)s->accum[INTERVAL_IN_SCHEDULER]); + } + fprintf(stat_file, "\n"); + } +} +#endif // CILK_PROFILE + +/* End stats.c */ diff --git a/libcilkrts/runtime/stats.h b/libcilkrts/runtime/stats.h new file mode 100644 index 00000000000..aaa99274765 --- /dev/null +++ b/libcilkrts/runtime/stats.h @@ -0,0 +1,208 @@ +/* stats.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file stats.h + * + * @brief Support for gathering and reporting statistics on Cilk applications. + * + * Note that stats are normally NOT compiled in because it increases the + * overhead of stealing. To compile in profiling support, define CILK_PROFILE. + */ + +#ifndef INCLUDED_STATS_DOT_H +#define INCLUDED_STATS_DOT_H + +/* #define CILK_PROFILE 1 */ +// @note The CILK_PROFILE flag and intervals is known to be broken +// in at least programs with Windows exceptions. +// Enable this flag at your own peril. :) + +#include <cilk/common.h> +#include "rts-common.h" +#include "internal/abi.h" + +#ifdef CILK_PROFILE +#include <stdio.h> // Define FILE * +#endif + +__CILKRTS_BEGIN_EXTERN_C + +/** @brief Events that we measure. 
*/ +enum interval +{ + INTERVAL_IN_SCHEDULER, ///< Time threads spend "bound" to Cilk + INTERVAL_WORKING, ///< Time spent working + INTERVAL_IN_RUNTIME, ///< Time spent executing runtime scheduling loop + INTERVAL_STEALING, ///< Time spent stealing work + INTERVAL_STEAL_SUCCESS, ///< Time to do a successful steal + INTERVAL_STEAL_FAIL_EMPTYQ, ///< Count of steal failures due to lack of stealable work + INTERVAL_STEAL_FAIL_LOCK, ///< Count of steal failures due to failure to lock worker + INTERVAL_STEAL_FAIL_USER_WORKER, ///< Count of steal failures by user workers which attempt to steal from another team + INTERVAL_STEAL_FAIL_DEKKER, ///< Count of steal failures due to Dekker protocol failure + INTERVAL_SYNC_CHECK, ///< Time spent processing syncs + INTERVAL_THE_EXCEPTION_CHECK, ///< Time spent performing THE exception checks + INTERVAL_THE_EXCEPTION_CHECK_USELESS, ///< Count of useless THE exception checks + INTERVAL_RETURNING, ///< Time spent returning from calls + INTERVAL_FINALIZE_CHILD, ///< Time spent in finalize_child + INTERVAL_PROVABLY_GOOD_STEAL, ///< Time spent in provably_good_steal + INTERVAL_UNCONDITIONAL_STEAL, ///< Time spent in unconditional_steal + INTERVAL_ALLOC_FULL_FRAME, ///< Time spent in __cilkrts_make_full_frame + INTERVAL_FRAME_ALLOC_LARGE, ///< Count of calls to __cilkrts_frame_malloc for buffers bigger than FRAME_MALLOC_MAX_SIZE or with a NULL worker + INTERVAL_FRAME_ALLOC, ///< Time spent allocating memory from worker buckets + INTERVAL_FRAME_ALLOC_GLOBAL, ///< Time spent calling memory allocator when buckets are empty + INTERVAL_FRAME_FREE_LARGE, ///< Count of calls to __cilkrts_frame_malloc for buffers bigger than FRAME_MALLOC_MAX_SIZE or with a NULL worker + INTERVAL_FRAME_FREE, ///< Time spent freeing memory to worker buckets + INTERVAL_FRAME_FREE_GLOBAL, ///< Time spent calling memory deallocator when buckets are full + INTERVAL_MUTEX_LOCK, ///< Count of calls to __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_LOCK_SPINNING, ///< Time spent spinning in __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_LOCK_YIELDING, ///< Time spent yielding in __cilkrts_mutex_lock for a worker + INTERVAL_MUTEX_TRYLOCK, ///< Count of calls to __cilkrts_mutex_trylock + INTERVAL_FIBER_ALLOCATE, ///< Time spent calling cilk_fiber_allocate + INTERVAL_FIBER_DEALLOCATE, ///< Time spent calling cilk_fiber_deallocate (not from thread) + INTERVAL_FIBER_ALLOCATE_FROM_THREAD, ///< Time spent calling cilk_fiber_allocate_from_thread + INTERVAL_FIBER_DEALLOCATE_FROM_THREAD, ///< Time spent calling cilk_fiber_deallocate (from thread) + INTERVAL_SUSPEND_RESUME_OTHER, ///< Count of fiber suspend_self_and_resume_other + INTERVAL_DEALLOCATE_RESUME_OTHER, ///< Count of fiber deallocate_self_and_resume_other + INTERVAL_N ///< Number of intervals, must be last +}; + +/** + * @brief Struct that collects of all runtime statistics. + * + * There is an instance of this structure in each worker's + * local_state, as well as one in the @c global_state_t which will be + * used to accumulate the per-worker stats. + */ +typedef struct statistics +{ + /** Number of times each interval is entered */ + unsigned long long count[INTERVAL_N]; + + /** + * Time when the system entered each interval, in system-dependent + * "ticks" + */ + unsigned long long start[INTERVAL_N]; + + /** Total time spent in each interval, in system-dependent "ticks" */ + unsigned long long accum[INTERVAL_N]; + + /** + * Largest global number of stacks seen by this worker. 
+ * The true maximum at end of execution is the max of the + * worker maxima. + */ + long stack_hwm; +} statistics; + +/** + * Initializes a statistics structure + * + * @param s The statistics structure to be initialized. + */ +COMMON_PORTABLE void __cilkrts_init_stats(statistics *s); + +/** + * @brief Sums statistics from worker to the global struct + * + * @param to The statistics structure that will accumulate the information. + * This structure is usually @c g->stats. + * @param from The statistics structure that will be accumulated. + * This structure is usually statistics kept per worker. + */ +COMMON_PORTABLE +void __cilkrts_accum_stats(statistics *to, statistics *from); + +/** + * @brief Mark the start of an interval by saving the current tick count. + * + * @pre Start time == INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_start_interval(__cilkrts_worker *w, enum interval i); + +/** + * @brief Mark the end of an interval by adding the ticks since the + * start to the accumulated time. + * + * @pre Start time != INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_stop_interval(__cilkrts_worker *w, enum interval i); + +/** + * @brief Start and stop interval I, charging zero time against it + * + * Precondition: + * - Start time == INVALID_START + * + * @param w The worker we're accumulating stats for. + * @param i The interval we're accumulating stats for. + */ +COMMON_PORTABLE +void __cilkrts_note_interval(__cilkrts_worker *w, enum interval i); + +#ifdef CILK_PROFILE +COMMON_PORTABLE +void dump_stats_to_file(FILE *stat_file, statistics *s); +#endif + + +#ifdef CILK_PROFILE +# define START_INTERVAL(w, i) __cilkrts_start_interval(w, i); +# define STOP_INTERVAL(w, i) __cilkrts_stop_interval(w, i); +# define NOTE_INTERVAL(w, i) __cilkrts_note_interval(w, i); +#else +/** Start an interval. No effect unless CILK_PROFILE is defined. */ +# define START_INTERVAL(w, i) +/** End an interval. No effect unless CILK_PROFILE is defined. */ +# define STOP_INTERVAL(w, i) +/** Increment a counter. No effect unless CILK_PROFILE is defined. */ +# define NOTE_INTERVAL(w, i) +#endif + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_STATS_DOT_H) diff --git a/libcilkrts/runtime/symbol_test.c b/libcilkrts/runtime/symbol_test.c new file mode 100644 index 00000000000..1113ecd44cd --- /dev/null +++ b/libcilkrts/runtime/symbol_test.c @@ -0,0 +1,62 @@ +/* symbol_test.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/* simple program to verify that there are no undefined symbols in the runtime. + * If the runtime uses any symbols that are not defined, compiling this program + * will cause a linker error. + */ + +extern void* __cilkrts_global_state; +void *volatile p; + +void foo () { } +int main () +{ + int i; + long long j; + + _Cilk_spawn foo(); + _Cilk_for (i = 0; i < 2; ++i) + foo(); + _Cilk_for (j = 0; j < 2; ++j) + foo(); + p = __cilkrts_global_state; + return 0; +} + +/* End symbol_test.c */ diff --git a/libcilkrts/runtime/sysdep-unix.c b/libcilkrts/runtime/sysdep-unix.c new file mode 100644 index 00000000000..194681fffc5 --- /dev/null +++ b/libcilkrts/runtime/sysdep-unix.c @@ -0,0 +1,794 @@ +/* + * sysdep-unix.c + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2010-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + ************************************************************************** + */ + +#ifdef __linux__ + // define _GNU_SOURCE before *any* #include. 
+ // Even <stdint.h> will break later #includes if this macro is not + // already defined when it is #included. +# define _GNU_SOURCE +#endif + +#include "sysdep.h" +#include "os.h" +#include "bug.h" +#include "local_state.h" +#include "signal_node.h" +#include "full_frame.h" +#include "jmpbuf.h" +#include "cilk_malloc.h" +#include "reducer_impl.h" +#include "metacall_impl.h" + + +// On x86 processors (but not MIC processors), the compiler generated code to +// save the FP state (rounding mode and the like) before calling setjmp. We +// will need to restore that state when we resume. +#ifndef __MIC__ +# if defined(__i386__) || defined(__x86_64) +# define RESTORE_X86_FP_STATE +# endif // defined(__i386__) || defined(__x86_64) +#endif // __MIC__ + +// contains notification macros for VTune. +#include "cilk-ittnotify.h" + +#include <stddef.h> + +#ifdef __CYGWIN__ +// On Cygwin, string.h doesnt declare strcasecmp if __STRICT_ANSI__ is defined +# undef __STRICT_ANSI__ +#endif + +#include <string.h> +#include <pthread.h> +#include <unistd.h> +#include <alloca.h> + +#ifdef __APPLE__ +//# include <scheduler.h> // Angle brackets include Apple's scheduler.h, not ours. +#endif + +#ifdef __linux__ +# include <sys/resource.h> +# include <sys/sysinfo.h> +#endif + +#ifdef __FreeBSD__ +# include <sys/resource.h> +// BSD does not define MAP_ANONYMOUS, but *does* define MAP_ANON. Aren't standards great! +# define MAP_ANONYMOUS MAP_ANON +#endif + +#ifdef __VXWORKS__ +# include <vxWorks.h> +# include <vxCpuLib.h> +#endif + +struct global_sysdep_state +{ + pthread_t *threads; ///< Array of pthreads for system workers + size_t pthread_t_size; ///< for cilk_db +}; + +static void internal_enforce_global_visibility(); + + +COMMON_SYSDEP +void __cilkrts_init_worker_sysdep(struct __cilkrts_worker *w) +{ + ITT_SYNC_CREATE(w, "Scheduler"); +} + +COMMON_SYSDEP +void __cilkrts_destroy_worker_sysdep(struct __cilkrts_worker *w) +{ +} + +COMMON_SYSDEP +void __cilkrts_init_global_sysdep(global_state_t *g) +{ + internal_enforce_global_visibility(); + + __cilkrts_init_tls_variables(); + + CILK_ASSERT(g->total_workers >= g->P - 1); + g->sysdep = __cilkrts_malloc(sizeof (struct global_sysdep_state)); + CILK_ASSERT(g->sysdep); + g->sysdep->pthread_t_size = sizeof (pthread_t); + + // TBD: Should this value be g->total_workers, or g->P? + // Need to check what we are using this field for. + g->sysdep->threads = __cilkrts_malloc(sizeof(pthread_t) * g->total_workers); + CILK_ASSERT(g->sysdep->threads); + + return; +} + +COMMON_SYSDEP +void __cilkrts_destroy_global_sysdep(global_state_t *g) +{ + if (g->sysdep->threads) + __cilkrts_free(g->sysdep->threads); + __cilkrts_free(g->sysdep); +} + +/************************************************************* + Creation of worker threads: +*************************************************************/ + +static void internal_run_scheduler_with_exceptions(__cilkrts_worker *w) +{ + /* We assume the stack grows down. */ + char var; + __cilkrts_cilkscreen_establish_c_stack(&var - 1000000, &var); + + __cilkrts_run_scheduler_with_exceptions(w); +} + + + +/* + * scheduler_thread_proc_for_system_worker + * + * Thread start function called when we start a new worker. + * + */ +NON_COMMON void* scheduler_thread_proc_for_system_worker(void *arg) +{ + /*int status;*/ + __cilkrts_worker *w = (__cilkrts_worker *)arg; + +#ifdef __INTEL_COMPILER +#ifdef USE_ITTNOTIFY + // Name the threads for Advisor. They don't want a worker number. 
+ __itt_thread_set_name("Cilk Worker"); +#endif // defined USE_ITTNOTIFY +#endif // defined __INTEL_COMPILER + + /* Worker startup is serialized + status = pthread_mutex_lock(&__cilkrts_global_mutex); + CILK_ASSERT(status == 0);*/ + CILK_ASSERT(w->l->type == WORKER_SYSTEM); + /*status = pthread_mutex_unlock(&__cilkrts_global_mutex); + CILK_ASSERT(status == 0);*/ + + __cilkrts_set_tls_worker(w); + + // Create a cilk fiber for this worker on this thread. + START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE_FROM_THREAD) { + w->l->scheduling_fiber = cilk_fiber_allocate_from_thread(); + cilk_fiber_set_owner(w->l->scheduling_fiber, w); + } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE_FROM_THREAD); + + internal_run_scheduler_with_exceptions(w); + + START_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD) { + // Deallocate the scheduling fiber. This operation reverses the + // effect cilk_fiber_allocate_from_thread() and must be done in this + // thread before it exits. + int ref_count = cilk_fiber_deallocate_from_thread(w->l->scheduling_fiber); + // Scheduling fibers should never have extra references to them. + // We only get extra references into fibers because of Windows + // exceptions. + CILK_ASSERT(0 == ref_count); + w->l->scheduling_fiber = NULL; + } STOP_INTERVAL(w, INTERVAL_FIBER_DEALLOCATE_FROM_THREAD); + + return 0; +} + + +/* + * __cilkrts_user_worker_scheduling_stub + * + * Routine for the scheduling fiber created for an imported user + * worker thread. This method is analogous to + * scheduler_thread_proc_for_system_worker. + * + */ +void __cilkrts_user_worker_scheduling_stub(cilk_fiber* fiber, void* null_arg) +{ + __cilkrts_worker *w = __cilkrts_get_tls_worker(); + + // Sanity check. + CILK_ASSERT(WORKER_USER == w->l->type); + + // Enter the scheduling loop on the user worker. + // This function will never return. + __cilkrts_run_scheduler_with_exceptions(w); + + // A WORKER_USER, at some point, will resume on the original stack and leave + // Cilk. Under no circumstances do we ever exit off of the bottom of this + // stack. + CILK_ASSERT(0); +} + +/** + * We are exporting a function with this name to Inspector? + * What a confusing name... + * + * This function is exported so Piersol's stack trace displays + * reasonable information. + */ +void* __cilkrts_worker_stub(void* arg) +{ + return scheduler_thread_proc_for_system_worker(arg); +} + + + +// /* Return the lesser of the argument and the operating system +// limit on the number of workers (threads) that may or ought +// to be created. */ +// int sysdep_thread_limit(int n, int physical_cpus) +// { +// /* On Linux thread creation fails somewhere short of the +// number of available processes. */ +// struct rlimit lim; + +// if (n > 256 + 2 * physical_cpus) +// n = 256 + 2 * physical_cpus; + +// if (getrlimit(RLIMIT_NPROC, &lim) == 0 && lim.rlim_cur != RLIM_INFINITY) +// { +// /* If the limit reads 0 or absurdly small, ignore it. */ +// unsigned int maxproc = (lim.rlim_cur * 3 + 3) / 4; +// if (maxproc > 8 + 2 * physical_cpus && maxproc < n) +// n = maxproc; +// } +// return n; +// } + + + +static void write_version_file (global_state_t *, int); + +/* Create n worker threads from base..top-1 + */ +static void create_threads(global_state_t *g, int base, int top) +{ + // TBD(11/30/12): We want to insert code providing the option of + // pinning system workers to cores. 
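One hedged sketch of what such pinning could look like on Linux, where pthread_setaffinity_np requires _GNU_SOURCE (already defined at the top of this file for Linux builds); the helper below is hypothetical and not part of the committed runtime:

    #include <sched.h>
    #include <pthread.h>

    static int pin_thread_to_cpu(pthread_t thread, int cpu)
    {
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        /* Returns 0 on success, an errno value otherwise. */
        return pthread_setaffinity_np(thread, sizeof(set), &set);
    }

A caller could, for example, invoke pin_thread_to_cpu(g->sysdep->threads[i], i) after each successful pthread_create in the loop below.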
+ for (int i = base; i < top; i++) { + int status = pthread_create(&g->sysdep->threads[i], + NULL, + scheduler_thread_proc_for_system_worker, + g->workers[i]); + if (status != 0) + __cilkrts_bug("Cilk runtime error: thread creation (%d) failed: %d\n", i, status); + } +} + +#if PARALLEL_THREAD_CREATE +static int volatile threads_created = 0; + +// Create approximately half of the worker threads, and then become a worker +// ourselves. +static void * create_threads_and_work (void * arg) +{ + global_state_t *g = ((__cilkrts_worker *)arg)->g; + + create_threads(g, g->P/2, g->P-1); + // Let the initial thread know that we're done. + threads_created = 1; + + // Ideally this turns into a tail call that wipes out this stack frame. + return scheduler_thread_proc_for_system_worker(arg); +} +#endif +void __cilkrts_start_workers(global_state_t *g, int n) +{ + g->workers_running = 1; + g->work_done = 0; + + if (!g->sysdep->threads) + return; + + // Do we actually have any threads to create? + if (n > 0) + { +#if PARALLEL_THREAD_CREATE + int status; + // We create (a rounded up) half of the threads, thread one creates the rest + int half_threads = (n+1)/2; + + // Create the first thread passing a different thread function, so that it creates threads itself + status = pthread_create(&g->sysdep->threads[0], NULL, create_threads_and_work, g->workers[0]); + + if (status != 0) + __cilkrts_bug("Cilk runtime error: thread creation (0) failed: %d\n", status); + + // Then the rest of the ones we have to create + create_threads(g, 1, half_threads); + + // Now wait for the first created thread to tell us it's created all of its threads. + // We could maybe drop this a bit lower and overlap with write_version_file. + while (!threads_created) + __cilkrts_yield(); +#else + // Simply create all the threads linearly here. + create_threads(g, 0, n); +#endif + } + // write the version information to a file if the environment is configured + // for it (the function makes the check). + write_version_file(g, n); + + + return; +} + +void __cilkrts_stop_workers(global_state_t *g) +{ + int i; + + // Tell the workers to give up + + g->work_done = 1; + + if (g->workers_running == 0) + return; + + if (!g->sysdep->threads) + return; + + /* Make them all runnable. */ + if (g->P > 1) { + CILK_ASSERT(g->workers[0]->l->signal_node); + signal_node_msg(g->workers[0]->l->signal_node, 1); + } + + for (i = 0; i < g->P - 1; ++i) { + int sc_status; + void *th_status; + + sc_status = pthread_join(g->sysdep->threads[i], &th_status); + if (sc_status != 0) + __cilkrts_bug("Cilk runtime error: thread join (%d) failed: %d\n", i, sc_status); + } + + g->workers_running = 0; + + + return; +} + + +/* + * @brief Returns the stack address for resuming execution of sf. + * + * This method takes in the top of the stack to use, and then returns + * a properly aligned address for resuming execution of sf. + * + * @param sf - The stack frame we want to resume executing. + * @param stack_base - The top of the stack we want to execute sf on. + * + */ +static char* get_sp_for_executing_sf(char* stack_base, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ +// The original calculation that had been done to correct the stack +// pointer when resuming execution. +// +// But this code was never getting called in the eng branch anyway... +// +// TBD(11/30/12): This logic needs to be revisited to make sure that +// we are doing the proper calculation in reserving space for outgoing +// arguments on all platforms and architectures. 
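For reference, the alignment step performed by the live code further below is the standard align-down-to-a-power-of-two computation. A minimal stand-alone version, assuming the same 256-byte boundary:

    #include <stdint.h>

    static inline char *align_down_256(char *p)
    {
        /* Clear the low 8 bits: rounds p down to the next 256-byte boundary. */
        return (char *)((uintptr_t)p & ~(uintptr_t)(256 - 1));
    }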
+#if 0 + /* Preserve outgoing argument space and stack alignment on steal. + Outgoing argument space is bounded by the difference between + stack and frame pointers. Some user code is known to rely on + 16 byte alignment. Maintain 32 byte alignment for future + compatibility. */ +#define SMASK 31 /* 32 byte alignment */ + if (sf) { + char *fp = FP(sf), *sp = SP(sf); + int fp_align = (int)(size_t)fp & SMASK; + ptrdiff_t space = fp - sp; + + fprintf(stderr, "Here: fp = %p, sp = %p\n", fp, sp); + char *top_aligned = (char *)((((size_t)stack_base - SMASK) & ~(size_t)SMASK) | fp_align); + /* Don't allocate an unreasonable amount of stack space. */ + + fprintf(stderr, "Here: stack_base = %p, top_aligned=%p, space=%ld\n", + stack_base, top_aligned, space); + if (space < 32) + space = 32 + (space & SMASK); + else if (space > 40 * 1024) + space = 40 * 1024 + (space & SMASK); + + return top_aligned - space; + } +#endif + +#define PERFORM_FRAME_SIZE_CALCULATION 0 + + char* new_stack_base = stack_base - 256; + +#if PERFORM_FRAME_SIZE_CALCULATION + // If there is a frame size saved, then use that as the + // correction instead of 256. + if (ff->frame_size > 0) { + if (ff->frame_size < 40*1024) { + new_stack_base = stack_base - ff->frame_size; + } + else { + // If for some reason, our frame size calculation is giving us + // a number which is bigger than about 10 pages, then + // there is likely something wrong here? Don't allocate + // an unreasonable amount of space. + new_stack_base = stack_base - 40*1024; + } + } +#endif + + // Whatever correction we choose, align the final stack top. + // This alignment seems to be necessary in particular on 32-bit + // Linux, and possibly Mac. (Is 32-byte alignment is sufficient?) + /* 256-byte alignment. Why not? */ + const uintptr_t align_mask = ~(256 -1); + new_stack_base = (char*)((size_t)new_stack_base & align_mask); + return new_stack_base; +} + +char* sysdep_reset_jump_buffers_for_resume(cilk_fiber* fiber, + full_frame *ff, + __cilkrts_stack_frame *sf) +{ +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p (fiber_proc_to_resume), Fiber %p. sf = %p. ff=%p, ff->sync_sp=%p\n", + cilkos_get_current_thread_id(), + fiber, + sf, + ff, ff->sync_sp); +#endif + + CILK_ASSERT(fiber); + void* sp = (void*)get_sp_for_executing_sf(cilk_fiber_get_stack_base(fiber), ff, sf); + SP(sf) = sp; + + /* Debugging: make sure stack is accessible. */ + ((volatile char *)sp)[-1]; + + // Adjust the saved_sp to account for the SP we're about to run. This will + // allow us to track fluctations in the stack +#if FIBER_DEBUG >= 4 + fprintf(stderr, "ThreadId=%p, about to take stack ff=%p, sp=%p, sync_sp=%p\n", + cilkos_get_current_thread_id(), + ff, + sp, + ff->sync_sp); +#endif + __cilkrts_take_stack(ff, sp); + return sp; +} + + +NORETURN sysdep_longjmp_to_sf(char* new_sp, + __cilkrts_stack_frame *sf, + full_frame *ff_for_exceptions /* UNUSED on Unix */) +{ +#if FIBER_DEBUG >= 3 + fprintf(stderr, + "ThreadId=%p. resume user code, sf=%p, new_sp = %p, original SP(sf) = %p, FP(sf) = %p\n", + cilkos_get_current_thread_id(), sf, new_sp, SP(sf), FP(sf)); +#endif + + // Set the stack pointer. + SP(sf) = new_sp; + +#ifdef RESTORE_X86_FP_STATE + if (CILK_FRAME_VERSION_VALUE(sf->flags) >= 1) { + // Restore the floating point state that was set in this frame at the + // last spawn. + // + // This feature is only available in ABI 1 or later frames, and only + // needed on IA64 or Intel64 processors. 
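To make the note above concrete: on x86 the floating point state in question is essentially the SSE control/status register (MXCSR) and the x87 control word, which hold the rounding mode and exception masks. A hedged sketch of the SSE half using the standard intrinsics (illustrative only; the runtime saves this state at spawn time and restore_x86_fp_state() is its restoring counterpart):

    #include <xmmintrin.h>

    static unsigned int saved_mxcsr;

    static void save_sse_control(void)    { saved_mxcsr = _mm_getcsr(); }
    static void restore_sse_control(void) { _mm_setcsr(saved_mxcsr); }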
+ restore_x86_fp_state(sf); + } +#endif + + CILK_LONGJMP(sf->ctx); +} + + +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <sys/mman.h> +#include <errno.h> + + +void __cilkrts_make_unrunnable_sysdep(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int is_loot, + const char *why) +{ + (void)w; /* unused */ + sf->except_data = 0; + + if (is_loot) + { + if (ff->frame_size == 0) + ff->frame_size = __cilkrts_get_frame_size(sf); + + // Null loot's sp for debugging purposes (so we'll know it's not valid) + SP(sf) = 0; + } +} + +/* + * __cilkrts_sysdep_is_worker_thread_id + * + * Returns true if the thread ID specified matches the thread ID we saved + * for a worker. + */ + +int __cilkrts_sysdep_is_worker_thread_id(global_state_t *g, + int i, + void *thread_id) +{ +#if defined( __linux__) || defined(__VXWORKS__) + pthread_t tid = *(pthread_t *)thread_id; + if (i < 0 || i > g->total_workers) + return 0; + return g->sysdep->threads[i] == tid; +#else + // Needs to be implemented + return 0; +#endif +} + + + + +/************************************************************* + Version information: +*************************************************************/ + +#include <dlfcn.h> +#include "internal/cilk_version.h" +#include <stdio.h> +#include <sys/utsname.h> + +#ifdef __VXWORKS__ +#include <version.h> +# endif + +/* (Non-static) dummy function is used by get_runtime_path() to find the path + * to the .so containing the Cilk runtime. + */ +void dummy_function() { } + +/* return a string with the path to the Cilk runtime, or "unknown" if the path + * cannot be determined. + */ +static const char *get_runtime_path () +{ +#ifdef __CYGWIN__ + // Cygwin doesn't support dladdr, which sucks + return "unknown"; +#else + Dl_info info; + if (0 == dladdr(dummy_function, &info)) return "unknown"; + return info.dli_fname; +#endif +} + +/* if the environment variable, CILK_VERSION, is defined, writes the version + * information to the specified file. + * g is the global state that was just created, and n is the number of workers + * that were made (or requested from RML) for it. + */ +static void write_version_file (global_state_t *g, int n) +{ + const char *env; // environment variable. + char buf[256]; // print buffer. + time_t t; + FILE *fp; + struct utsname sys_info; + int err; // error code from system calls. + + // if CILK_VERSION is not set, or if the file cannot be opened, fail + // silently. Otherwise open the file for writing (or use stderr or stdout + // if the user specifies). + if (NULL == (env = getenv("CILK_VERSION"))) return; + if (0 == strcasecmp(env, "stderr")) fp = stderr; + else if (0 == strcasecmp(env, "stdout")) fp = stdout; + else if (NULL == (fp = fopen(env, "w"))) return; + + // get a string for the current time. E.g., + // Cilk runtime initialized: Thu Jun 10 13:28:00 2010 + t = time(NULL); + strftime(buf, 256, "%a %b %d %H:%M:%S %Y", localtime(&t)); + fprintf(fp, "Cilk runtime initialized: %s\n", buf); + + // Print runtime info. 
E.g., + // Cilk runtime information + // ======================== + // Cilk version: 2.0.0 Build 9184 + // Built by willtor on host willtor-desktop + // Compilation date: Thu Jun 10 13:27:42 2010 + // Compiled with ICC V99.9.9, ICC build date: 20100610 + + fprintf(fp, "\nCilk runtime information\n"); + fprintf(fp, "========================\n"); + fprintf(fp, "Cilk version: %d.%d.%d Build %d\n", + VERSION_MAJOR, + VERSION_MINOR, + VERSION_REV, + VERSION_BUILD); +#ifdef __VXWORKS__ + char * vxWorksVer = VXWORKS_VERSION; + fprintf(fp, "Cross compiled for %s\n",vxWorksVer); + // user and host not avalible if VxWorks cross compiled on windows build host +#else + + // User and host are not available for GCC builds +#ifdef BUILD_USER + fprintf(fp, "Built by "BUILD_USER" on host "BUILD_HOST"\n"); +#endif // BUILD_USER +#endif // __VXWORKS__ + + // GCC has requested that this be removed for GCC builds +#ifdef BUILD_USER + fprintf(fp, "Compilation date: "__DATE__" "__TIME__"\n"); +#endif // BUILD_USER + +#ifdef __INTEL_COMPILER + // Compiled by the Intel C/C++ compiler. + fprintf(fp, "Compiled with ICC V%d.%d.%d, ICC build date: %d\n", + __INTEL_COMPILER / 100, + (__INTEL_COMPILER / 10) % 10, + __INTEL_COMPILER % 10, + __INTEL_COMPILER_BUILD_DATE); +#else + // Compiled by GCC. + fprintf(fp, "Compiled with GCC V%d.%d.%d\n", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__); +#endif // defined __INTEL_COMPILER + + // Print system info. E.g., + // System information + // ================== + // Cilk runtime path: /opt/icc/64/lib/libcilkrts.so.5 + // System OS: Linux, release 2.6.28-19-generic + // System architecture: x86_64 + + err = uname(&sys_info); + fprintf(fp, "\nSystem information\n"); + fprintf(fp, "==================\n"); + fprintf(fp, "Cilk runtime path: %s\n", get_runtime_path()); + fprintf(fp, "System OS: %s, release %s\n", + err < 0 ? "unknown" : sys_info.sysname, + err < 0 ? "?" : sys_info.release); + fprintf(fp, "System architecture: %s\n", + err < 0 ? "unknown" : sys_info.machine); + + // Print thread info. E.g., + // Thread information + // ================== + // System cores: 8 + // Cilk workers requested: 8 + // Thread creator: Private + + fprintf(fp, "\nThread information\n"); + fprintf(fp, "==================\n"); +#ifdef __VXWORKS__ + fprintf(fp, "System cores: %d\n", (int)__builtin_popcount(vxCpuEnabledGet())); +#else + fprintf(fp, "System cores: %d\n", (int)sysconf(_SC_NPROCESSORS_ONLN)); +#endif + fprintf(fp, "Cilk workers requested: %d\n", n); +#if (PARALLEL_THREAD_CREATE) + fprintf(fp, "Thread creator: Private (parallel)\n"); +#else + fprintf(fp, "Thread creator: Private\n"); +#endif + + if (fp != stderr && fp != stdout) fclose(fp); + else fflush(fp); // flush the handle buffer if it is stdout or stderr. +} + + +/* + * __cilkrts_establish_c_stack + * + * Tell Cilkscreen about the user stack bounds. + * + * Note that the Cilk V1 runtime only included the portion of the stack from + * the entry into Cilk, down. We don't appear to be able to find that, but + * I think this will be sufficient. + */ + +void __cilkrts_establish_c_stack(void) +{ + /* FIXME: Not implemented. 
*/ + + /* TBD: Do we need this */ + /* + void __cilkrts_cilkscreen_establish_c_stack(char *begin, char *end); + + size_t r; + MEMORY_BASIC_INFORMATION mbi; + + r = VirtualQuery (&mbi, + &mbi, + sizeof(mbi)); + + __cilkrts_cilkscreen_establish_c_stack((char *)mbi.BaseAddress, + (char *)mbi.BaseAddress + mbi.RegionSize); + */ +} + + +/* + * internal_enforce_global_visibility + * + * Ensure global visibility of public symbols, for proper Cilk-TBB interop. + * + * If Cilk runtime is loaded dynamically, its symbols might remain unavailable + * for global search with dladdr; that might prevent TBB from finding Cilk + * in the process address space and initiating the interop protocol. + * The workaround is for the library to open itself with RTLD_GLOBAL flag. + */ + +static __attribute__((noinline)) +void internal_enforce_global_visibility() +{ + void* handle = dlopen( get_runtime_path(), RTLD_GLOBAL|RTLD_LAZY ); + + /* For proper reference counting, close the handle immediately. */ + if( handle) dlclose(handle); +} + +/* + Local Variables: ** + c-file-style:"bsd" ** + c-basic-offset:4 ** + indent-tabs-mode:nil ** + End: ** +*/ diff --git a/libcilkrts/runtime/sysdep.h b/libcilkrts/runtime/sysdep.h new file mode 100644 index 00000000000..ea939acc124 --- /dev/null +++ b/libcilkrts/runtime/sysdep.h @@ -0,0 +1,285 @@ +/* sysdep.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file sysdep.h + * + * @brief Common system-dependent functions + */ + +#ifndef INCLUDED_SYSDEP_DOT_H +#define INCLUDED_SYSDEP_DOT_H + +#include <cilk/common.h> +#include <internal/abi.h> + +#include "global_state.h" +#include "full_frame.h" +#include "os.h" +#include "os_mutex.h" + +/** + * @brief Default page size for Cilk stacks. 
+ * + * All Cilk stacks should have size that is a multiple of this value. + */ +#define PAGE 4096 + +/** + * @brief Size of a scheduling stack. + * + * A scheduling stack is used to by system workers to execute runtime + * code. Since this stack is only executing runtime functions, we + * don't need it to be a full size stack. + * + * The number "18" should be small since the runtime doesn't require a + * large stack, but large enough to call "printf" for debugging. + */ +#define CILK_SCHEDULING_STACK_SIZE (18*PAGE) + +__CILKRTS_BEGIN_EXTERN_C + + +/** + * Code to initialize the system-dependent portion of the global_state_t + * + * @param g Pointer to the global state. + */ +COMMON_SYSDEP +void __cilkrts_init_global_sysdep(global_state_t *g); + +/** + * Code to clean up the system-dependent portion of the global_state_t + * + * @param g Pointer to the global state. + */ +COMMON_SYSDEP +void __cilkrts_destroy_global_sysdep(global_state_t *g); + +/** + * Passes stack range to Cilkscreen. This functionality should be moved + * into Cilkscreen. + */ +COMMON_SYSDEP +void __cilkrts_establish_c_stack(void); + + +/** + * Save system dependent information in the full_frame and + * __cilkrts_stack_frame. Part of promoting a + * __cilkrts_stack_frame to a full_frame. + * + * @param w The worker the frame was running on. Not used. + * @param ff The full frame that is being created for the + * __cilkrts_stack_frame. + * @param sf The __cilkrts_stack_frame that's being promoted + * to a full frame. + * @param state_valid ? + * @param why A description of why make_unrunnable was called. + * Used for debugging. + */ +COMMON_SYSDEP +void __cilkrts_make_unrunnable_sysdep(__cilkrts_worker *w, + full_frame *ff, + __cilkrts_stack_frame *sf, + int state_valid, + const char *why); + + +/** + * OS-specific code to spawn worker threads. + * + * @param g The global state. + * @param n Number of worker threads to start. + */ +COMMON_SYSDEP +void __cilkrts_start_workers(global_state_t *g, int n); + +/** + * @brief OS-specific code to stop worker threads. + * + * @param g The global state. + */ +COMMON_SYSDEP +void __cilkrts_stop_workers(global_state_t *g); + +/** + * @brief Imports a user thread the first time it returns to a stolen parent. + * + * The thread has been bound to a worker, but additional steps need to + * be taken to start running a scheduling loop. + * + * @param w The worker bound to the thread. + */ +COMMON_SYSDEP +void __cilkrts_sysdep_import_user_thread(__cilkrts_worker *w); + +/** + * @brief Function to be run for each of the system worker threads. + * + * This declaration also appears in cilk/cilk_undocumented.h -- don't + * change one declaration without also changing the other. + * + * @param arg The context value passed to the thread creation routine for + * the OS we're running on. + * + * @returns OS dependent. + */ +#ifdef _WIN32 +/* Do not use CILK_API because __cilkrts_worker_stub must be __stdcall */ +CILK_EXPORT unsigned __CILKRTS_NOTHROW __stdcall +__cilkrts_worker_stub(void *arg); +#else +/* Do not use CILK_API because __cilkrts_worker_stub have default visibility */ +__attribute__((visibility("default"))) +void* __CILKRTS_NOTHROW __cilkrts_worker_stub(void *arg); +#endif + +/** + * Initialize any OS-depenendent portions of a newly created + * __cilkrts_worker. + * + * Exported for Piersol. Without the export, Piersol doesn't display + * useful information in the stack trace. 
This declaration also appears in + * cilk/cilk_undocumented.h -- do not modify one without modifying the other. + * + * @param w The worker being initialized. + */ +COMMON_SYSDEP +CILK_EXPORT +void __cilkrts_init_worker_sysdep(__cilkrts_worker *w); + +/** + * Deallocate any OS-depenendent portions of a __cilkrts_worker. + * + * @param w The worker being deallocaed. + */ +COMMON_SYSDEP +void __cilkrts_destroy_worker_sysdep(__cilkrts_worker *w); + +/** + * Called to do any OS-dependent setup before starting execution on a + * frame. Mostly deals with exception handling data. + * + * @param w The worker the frame will run on. + * @param ff The full_frame that is about to be resumed. + */ +COMMON_SYSDEP +void __cilkrts_setup_for_execution_sysdep(__cilkrts_worker *w, + full_frame *ff); + +/** + * @brief OS-specific implementaton of resetting fiber and frame state + * to resume exeuction. + * + * This method: + * 1. Calculates the value of stack pointer where we should resume + * execution of "sf". This calculation uses info stored in the + * fiber, and takes into account alignment and frame size. + * 2. Updates sf and ff to match the calculated stack pointer. + * + * On Unix, the stack pointer calculation looks up the base of the + * stack from the fiber. + * + * On Windows, this calculation is calls "alloca" to find a stack + * pointer on the currently executing stack. Thus, the Windows code + * assumes @c fiber is the currently executing fiber. + * + * @param fiber fiber to resume execution on. + * @param ff full_frame for the frame we're resuming. + * @param sf __cilkrts_stack_frame that we should resume + * @return The calculated stack pointer. + */ +COMMON_SYSDEP +char* sysdep_reset_jump_buffers_for_resume(cilk_fiber* fiber, + full_frame *ff, + __cilkrts_stack_frame *sf); + +/** + * @brief System-dependent longjmp to user code for resuming execution + * of a @c __cilkrts_stack_frame. + * + * This method: + * - Changes the stack pointer in @c sf to @c new_sp. + * - If @c ff_for_exceptions is not NULL, changes fields in @c sf and + * @c ff_for_exceptions for exception processing. + * - Restores any floating point state + * - Finishes with a longjmp to user code, never to return. + * + * @param new_sp stack pointer where we should resume execution + * @param sf @c __cilkrts_stack_frame for the frame we're resuming. + * @param ff_for_exceptions full_frame to safe exception info into, if necessary + */ +COMMON_SYSDEP +NORETURN +sysdep_longjmp_to_sf(char* new_sp, + __cilkrts_stack_frame *sf, + full_frame *ff_for_exceptions); + +/** + * @brief System-dependent code to save floating point control information + * to a @c __cilkrts_stack_frame. This function will be called by compilers + * that cannot inline the code. + * + * Note that this function does *not* save the current floating point + * registers. It saves the floating point control words that control + * precision and rounding and stuff like that. + * + * This function will be a noop for architectures that don't have warts + * like the floating point control words, or where the information is + * already being saved by the setjmp. + * + * @param sf @c __cilkrts_stack_frame for the frame we're + * saving the floating point control information in. + */ +COMMON_SYSDEP +void +sysdep_save_fp_ctrl_state(__cilkrts_stack_frame *sf); + + +/** + * @brief restore x86 floating point state + * + * Only used for x86 and Intel64 processors + */ +COMMON_SYSDEP +void restore_x86_fp_state(__cilkrts_stack_frame *sf); + +__CILKRTS_END_EXTERN_C + +#endif // ! 
defined(INCLUDED_SYSDEP_DOT_H) diff --git a/libcilkrts/runtime/worker_mutex.c b/libcilkrts/runtime/worker_mutex.c new file mode 100644 index 00000000000..380d6255a0c --- /dev/null +++ b/libcilkrts/runtime/worker_mutex.c @@ -0,0 +1,121 @@ +/* worker_mutex.c -*-C-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +#include "worker_mutex.h" +#include "bug.h" +#include "os.h" +#include "stats.h" + +/* m->lock == 1 means that mutex M is locked */ +#define TRY_ACQUIRE(m) (__cilkrts_xchg(&(m)->lock, 1) == 0) + +/* ICC 11.1+ understands release semantics and generates an + ordinary store with a software memory barrier. */ +#if __ICC >= 1110 +#define RELEASE(m) __sync_lock_release(&(m)->lock) +#else +#define RELEASE(m) __cilkrts_xchg(&(m)->lock, 0) +#endif + +void __cilkrts_mutex_init(struct mutex *m) +{ + m->owner = 0; + + // Use a simple assignment so Inspector doesn't bug us about the + // interlocked exchange doing a read of an uninitialized variable. + // By definition there can't be a race when we're initializing the + // lock... 
+ m->lock = 0; +} + +void __cilkrts_mutex_lock(__cilkrts_worker *w, struct mutex *m) +{ + int count; + const int maxspin = 1000; /* SWAG */ + + NOTE_INTERVAL(w, INTERVAL_MUTEX_LOCK); + if (!TRY_ACQUIRE(m)) { + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + count = 0; + do { + do { + __cilkrts_short_pause(); + if (++count >= maxspin) { + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_YIELDING); + /* let the OS reschedule every once in a while */ + __cilkrts_yield(); + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_YIELDING); + START_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + count = 0; + } + } while (m->lock != 0); + } while (!TRY_ACQUIRE(m)); + STOP_INTERVAL(w, INTERVAL_MUTEX_LOCK_SPINNING); + } + + CILK_ASSERT(m->owner == 0); + m->owner = w; +} + +int __cilkrts_mutex_trylock(__cilkrts_worker *w, struct mutex *m) +{ + NOTE_INTERVAL(w, INTERVAL_MUTEX_TRYLOCK); + if (TRY_ACQUIRE(m)) { + CILK_ASSERT(m->owner == 0); + m->owner = w; + return 1; + } else { + return 0; + } +} + +void __cilkrts_mutex_unlock(__cilkrts_worker *w, struct mutex *m) +{ + CILK_ASSERT(m->owner == w); + m->owner = 0; + RELEASE(m); +} + +void __cilkrts_mutex_destroy(__cilkrts_worker *w, struct mutex *m) +{ + (void)w; /* unused */ + (void)m; /* unused */ +} + +/* End worker_mutex.c */ diff --git a/libcilkrts/runtime/worker_mutex.h b/libcilkrts/runtime/worker_mutex.h new file mode 100644 index 00000000000..c2c68247e0b --- /dev/null +++ b/libcilkrts/runtime/worker_mutex.h @@ -0,0 +1,131 @@ +/* worker_mutex.h -*-C++-*- + * + ************************************************************************* + * + * @copyright + * Copyright (C) 2009-2013, Intel Corporation + * All rights reserved. + * + * @copyright + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * @copyright + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + **************************************************************************/ + +/** + * @file worker_mutex.h + * + * @brief Support for Cilk runtime mutexes. + * + * Cilk runtime mutexes are implemented as simple spin loops. 
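+ *
+ * To acquire a mutex, a worker spins on the lock word with
+ * __cilkrts_short_pause() and, after a bounded number of unsuccessful
+ * iterations, calls __cilkrts_yield() so the OS can reschedule before
+ * it resumes spinning (see __cilkrts_mutex_lock() in worker_mutex.c).
+ * The owning worker is recorded in the mutex so the CILK_ASSERT checks
+ * in worker_mutex.c can verify ownership on unlock.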
+ */ + +#ifndef INCLUDED_WORKER_MUTEX_DOT_H +#define INCLUDED_WORKER_MUTEX_DOT_H + +#include <cilk/common.h> +#include "rts-common.h" + +__CILKRTS_BEGIN_EXTERN_C + +/** + * Mutexes are treated as an abstract data type within the Cilk + * runtime system. They are implemented as simple spin loops and + * owned by a __cilkrts_worker. + */ +typedef struct mutex { + /** Mutex spin loop variable. 0 if unowned, 1 if owned. */ + volatile int lock; + + /** Worker that owns the mutex. Must be 0 if mutex is unowned. */ + __cilkrts_worker *owner; +} mutex; + +/** + * @brief Initialize a Cilk mutex. + * + * @param m Mutex to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_mutex_init(struct mutex *m); + +/** + * @brief Acquire a Cilk mutex. + * + * If statistics are being gathered, the time spent + * acquiring the mutex will be attributed to the specified worker. + * + * @param w Worker that will become the owner of this mutex. + * @param m Mutex to be initialized. + */ +COMMON_PORTABLE +void __cilkrts_mutex_lock(__cilkrts_worker *w, + struct mutex *m); +/** + * @brief Attempt to lock a Cilk mutex and fail if it isn't available. + * + * If statistics are being gathered, the time spent acquiring the + * mutex will be attributed to the specified worker. + * + * @param w Worker that will become the owner of this mutex. + * @param m Mutex to be acquired. + * + * @return 1 if the mutex was acquired. + * @return 0 if the mutex was not acquired. + */ +COMMON_PORTABLE +int __cilkrts_mutex_trylock(__cilkrts_worker *w, + struct mutex *m); + +/** + * @brief Release a Cilk mutex. + * + * If statistics are being gathered, the time spent + * acquiring the mutex will be attributed to the specified worker. + * + * @pre The mutex must be owned by the worker. + * + * @param w Worker that owns this mutex. + * @param m Mutex to be released. + */ +COMMON_PORTABLE +void __cilkrts_mutex_unlock(__cilkrts_worker *w, + struct mutex *m); + +/** + * @brief Deallocate a Cilk mutex. Currently does nothing. + * + * @param w Unused. + * @param m Mutex to be deallocated. + */ +COMMON_PORTABLE +void __cilkrts_mutex_destroy(__cilkrts_worker *w, + struct mutex *m); + +__CILKRTS_END_EXTERN_C + +#endif // ! defined(INCLUDED_WORKER_MUTEX_DOT_H) |
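
The worker_mutex.h header above documents the mutex API, but the patch itself contains no standalone usage example. As a minimal sketch (not part of the patch; the worker pointer w, the guard mutex, and the protected counter are placeholder names), runtime code running on a __cilkrts_worker would use the API roughly like this:

    #include "worker_mutex.h"

    static struct mutex guard;     /* protects shared_count (hypothetical) */
    static long shared_count;

    /* One-time setup: the lock starts unowned and unlocked. */
    void guard_init(void)
    {
        __cilkrts_mutex_init(&guard);
    }

    /* Blocking update: spins (and eventually yields) until the lock is held. */
    void bump(__cilkrts_worker *w)
    {
        __cilkrts_mutex_lock(w, &guard);
        ++shared_count;
        __cilkrts_mutex_unlock(w, &guard);   /* precondition: w owns guard */
    }

    /* Non-blocking update: returns 0 without touching the counter if
       another worker currently holds the lock. */
    int try_bump(__cilkrts_worker *w)
    {
        if (!__cilkrts_mutex_trylock(w, &guard))
            return 0;
        ++shared_count;
        __cilkrts_mutex_unlock(w, &guard);
        return 1;
    }

Because the lock is a simple spin loop rather than an OS mutex, critical sections guarded this way are expected to be short; the owner field exists only so the CILK_ASSERT checks in worker_mutex.c can catch a worker unlocking a mutex it does not own, and so lock-acquisition time can be attributed to that worker when statistics are enabled.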