Diffstat (limited to 'rts/include/stg/SMP.h')
-rw-r--r-- | rts/include/stg/SMP.h | 579
1 file changed, 579 insertions, 0 deletions
diff --git a/rts/include/stg/SMP.h b/rts/include/stg/SMP.h
new file mode 100644
index 0000000000..de706da290
--- /dev/null
+++ b/rts/include/stg/SMP.h
@@ -0,0 +1,579 @@
+/* ----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2005-2011
+ *
+ * Macros for multi-CPU support
+ *
+ * Do not #include this file directly: #include "Rts.h" instead.
+ *
+ * To understand the structure of the RTS headers, see the wiki:
+ *   https://gitlab.haskell.org/ghc/ghc/wikis/commentary/source-tree/includes
+ *
+ * -------------------------------------------------------------------------- */
+
+#pragma once
+
+#if defined(arm_HOST_ARCH) && defined(arm_HOST_ARCH_PRE_ARMv6)
+void arm_atomic_spin_lock(void);
+void arm_atomic_spin_unlock(void);
+#endif
+
+#if defined(THREADED_RTS)
+
+/* ----------------------------------------------------------------------------
+   Atomic operations
+   ------------------------------------------------------------------------- */
+
+#if !IN_STG_CODE || IN_STGCRUN
+// We only want the barriers, e.g. write_barrier(), declared in .hc
+// files.  Defining the other inline functions here causes type
+// mismatch errors from gcc, because the generated C code is assuming
+// that there are no prototypes in scope.
+
+/*
+ * The atomic exchange operation: xchg(p,w) exchanges the value
+ * pointed to by p with the value w, returning the old value.
+ *
+ * Used for locking closures during updates (see lockClosure()
+ * in rts/include/rts/storage/SMPClosureOps.h) and the MVar primops.
+ */
+EXTERN_INLINE StgWord xchg(StgPtr p, StgWord w);
+
+/*
+ * Compare-and-swap.  Atomically does this:
+ *
+ * cas(p,o,n) {
+ *    r = *p;
+ *    if (r == o) { *p = n };
+ *    return r;
+ * }
+ */
+EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
+EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);
+
+/*
+ * Compare-and-swap.
+ * This uses a seq_cst success memory order and a relaxed failure memory order.
+ */
+EXTERN_INLINE StgWord cas_seq_cst_relaxed(StgVolatilePtr p, StgWord o, StgWord n);
+
+/*
+ * Atomic addition by the provided quantity
+ *
+ * atomic_inc(p, n) {
+ *   return ((*p) += n);
+ * }
+ */
+EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord n);
+
+/*
+ * Atomic decrement
+ *
+ * atomic_dec(p) {
+ *   return --(*p);
+ * }
+ */
+EXTERN_INLINE StgWord atomic_dec(StgVolatilePtr p);
+
+/*
+ * Busy-wait nop: this is a hint to the CPU that we are currently in a
+ * busy-wait loop waiting for another CPU to change something.  On a
+ * hyperthreaded CPU it should yield to another thread, for example.
+ */
+EXTERN_INLINE void busy_wait_nop(void);
+
+#endif // !IN_STG_CODE
+
+/*
+ * Various kinds of memory barrier.
+ *  write_barrier: prevents future stores occurring before preceding stores.
+ *  store_load_barrier: prevents future loads occurring before preceding stores.
+ *  load_load_barrier: prevents future loads occurring before earlier loads.
+ *
+ * Reference for these: "The JSR-133 Cookbook for Compiler Writers"
+ * http://gee.cs.oswego.edu/dl/jmm/cookbook.html
+ *
+ * To check whether you got these right, try the test in
+ *   testsuite/tests/rts/testwsdeque.c
+ * This tests the work-stealing deque implementation, which relies on
+ * properly working store_load and load_load memory barriers.
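+ *
+ * As a schematic illustration of the store_load case (this is not real RTS
+ * code, just a sketch): with two flag words that both start at 0,
+ *
+ *     // thread A                       // thread B
+ *     a_wants = 1;                      b_wants = 1;
+ *     store_load_barrier();             store_load_barrier();
+ *     if (!b_wants) { ...enter... }     if (!a_wants) { ...enter... }
+ *
+ * the barrier stops each thread's load from being reordered before its own
+ * store, so at most one of the two threads can see the other's flag still
+ * clear and enter.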
+ */
+EXTERN_INLINE void write_barrier(void);
+EXTERN_INLINE void store_load_barrier(void);
+EXTERN_INLINE void load_load_barrier(void);
+
+/*
+ * Note [Heap memory barriers]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * Machines with weak memory ordering semantics have consequences for how
+ * closures are observed and mutated. For example, consider a thunk that needs
+ * to be updated to an indirection. In order for the indirection to be safe for
+ * concurrent observers to enter, said observers must read the indirection's
+ * info table before they read the indirectee. Furthermore, the indirectee must
+ * be set before the info table pointer. This ensures that if the observer sees
+ * an IND info table then the indirectee is valid.
+ *
+ * When a closure is updated with an indirection, both its info table and its
+ * indirectee must be written. With weak memory ordering, these two writes can
+ * be arbitrarily reordered, and perhaps even interleaved with other threads'
+ * reads and writes (in the absence of memory barrier instructions). Consider
+ * this example of a bad reordering:
+ *
+ * - An updater writes to a closure's info table (INFO_TYPE is now IND).
+ * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
+ * - A concurrent observer reads the closure's indirectee and enters it.
+ * - An updater writes the closure's indirectee.
+ *
+ * Here the update to the indirectee comes too late and the concurrent observer
+ * has jumped off into the abyss. Speculative execution can also cause us
+ * issues, consider:
+ *
+ * - An observer is about to case on a value in a closure's info table.
+ * - The observer speculatively reads one or more of the closure's fields.
+ * - An updater writes to the closure's info table.
+ * - The observer takes a branch based on the new info table value, but with the
+ *   old closure fields!
+ * - The updater writes to the closure's other fields, but it's too late.
+ *
+ * Because of these effects, reads and writes to a closure's info table must be
+ * ordered carefully with respect to reads and writes to the closure's other
+ * fields, and memory barriers must be placed to ensure that reads and writes
+ * occur in program order. Specifically, updates to an already existing closure
+ * must follow the following pattern:
+ *
+ * - Update the closure's (non-info table) fields.
+ * - Write barrier.
+ * - Update the closure's info table.
+ *
+ * Observing the fields of an updateable closure (e.g. a THUNK) must follow the
+ * following pattern:
+ *
+ * - Read the closure's info pointer.
+ * - Read barrier.
+ * - Read the closure's (non-info table) fields.
+ *
+ * We must also take care when we expose a newly-allocated closure to other cores
+ * by writing a pointer to it to some shared data structure (e.g. an MVar#, a Message,
+ * or MutVar#). Specifically, we need to ensure that all writes constructing the
+ * closure are visible *before* the write exposing the new closure is made visible:
+ *
+ * - Allocate memory for the closure.
+ * - Write the closure's info pointer and fields (ordering between these doesn't
+ *   matter since the closure isn't yet visible to anyone else).
+ * - Write barrier.
+ * - Make the closure visible to other cores.
+ *
+ * Note that thread stacks are inherently thread-local and consequently allocating an
+ * object and introducing a reference to it to our stack needs no barrier.
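+ *
+ * To make the patterns above concrete, a rough sketch (schematic only, not a
+ * verbatim copy of the RTS update code; tagging and error handling omitted):
+ *
+ *     // updater: overwrite a THUNK with an indirection to `value`
+ *     ((StgInd *) thunk)->indirectee = value;   // payload first
+ *     write_barrier();                          // order payload before info pointer
+ *     SET_INFO(thunk, &stg_IND_info);           // publish by writing the info pointer
+ *
+ *     // observer: enter a possibly-updated closure
+ *     const StgInfoTable *info = GET_INFO(thunk);   // info pointer first
+ *     load_load_barrier();                          // order info read before field reads
+ *     if (INFO_PTR_TO_STRUCT(info)->type == IND) {
+ *         StgClosure *v = ((StgInd *) thunk)->indirectee;  // now safe to read
+ *         ...
+ *     }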
+ *
+ * There are several ways in which the mutator may make a newly-allocated
+ * closure visible to other cores:
+ *
+ * - Eager blackholing a THUNK:
+ *   This is protected by an explicit write barrier in the eager blackholing
+ *   code produced by the codegen. See GHC.StgToCmm.Bind.emitBlackHoleCode.
+ *
+ * - Lazy blackholing a THUNK:
+ *   This is protected by an explicit write barrier in the thread suspension
+ *   code. See ThreadPaused.c:threadPaused.
+ *
+ * - Updating a BLACKHOLE:
+ *   This case is protected by explicit write barriers in the update frame
+ *   entry code (see rts/Updates.h).
+ *
+ * - Blocking on an MVar# (e.g. takeMVar#):
+ *   In this case the appropriate MVar primops (e.g. stg_takeMVarzh) include
+ *   explicit memory barriers to ensure that the newly-allocated
+ *   MVAR_TSO_QUEUE is visible to other cores.
+ *
+ * - Write to an MVar# (e.g. putMVar#):
+ *   This is protected by the full barrier implied by the CAS in putMVar#.
+ *
+ * - Write to a TVar#:
+ *   This is protected by the full barrier implied by the CAS in STM.c:lock_stm.
+ *
+ * - Write to an Array#, ArrayArray#, or SmallArray#:
+ *   This case is protected by an explicit write barrier in the code produced
+ *   for these primops by the codegen. See GHC.StgToCmm.Prim.doWritePtrArrayOp and
+ *   GHC.StgToCmm.Prim.doWriteSmallPtrArrayOp. Relevant issue: #12469.
+ *
+ * - Write to MutVar# via writeMutVar#:
+ *   This case is protected by an explicit write barrier in the code produced
+ *   for this primop by the codegen.
+ *
+ * - Write to MutVar# via atomicModifyMutVar# or casMutVar#:
+ *   This is protected by the full barrier implied by the cmpxchg operations
+ *   in these primops.
+ *
+ * - Sending a Message to another capability:
+ *   This is protected by the acquisition and release of the target capability's
+ *   lock in Messages.c:sendMessage.
+ *
+ * Finally, we must ensure that we flush all cores' store buffers before
+ * entering and leaving GC, since stacks may be read by other cores. This
+ * happens as a side-effect of taking and releasing mutexes (which implies
+ * acquire and release barriers, respectively).
+ *
+ * N.B. recordClosureMutated places a reference to the mutated object on
+ * the capability-local mut_list. Consequently this does not require any memory
+ * barrier.
+ *
+ * During parallel GC we need to be careful during evacuation: before replacing
+ * a closure with a forwarding pointer we must issue a write barrier to ensure
+ * that the copy we made in to-space is visible to other cores.
+ *
+ * However, we can be a bit lax when *reading* during GC. Specifically, the GC
+ * can only make a very limited set of changes to existing closures:
+ *
+ *  - it can replace a closure's info table with stg_WHITEHOLE.
+ *  - it can replace a previously-whitehole'd closure's info table with a
+ *    forwarding pointer
+ *  - it can replace a previously-whitehole'd closure's info table with a
+ *    valid info table pointer (done in eval_thunk_selector)
+ *  - it can update the value of a pointer field after evacuating it
+ *
+ * This is quite nice since we don't need to worry about an interleaving
+ * of writes producing an invalid state: a closure's fields remain valid after
+ * an update of its info table pointer and vice-versa.
+ *
+ * After a round of parallel scavenging we must also ensure that any writes the
+ * GC thread workers made are visible to the main GC thread. This is ensured by
+ * the full barrier implied by the atomic decrement in
+ * GC.c:scavenge_until_all_done.
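+ *
+ * Schematically (an illustrative sketch; the helper names here are made up
+ * and the real code lives in rts/sm/Evac.c), the evacuation ordering is:
+ *
+ *     copy_fields_to_to_space(to, from, size);   // make the to-space copy
+ *     write_barrier();                           // copy must be visible before...
+ *     write_forwarding_pointer(from, to);        // ...the forwarding pointer is published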
+ *
+ * The work-stealing queue (WSDeque) also requires barriers; these are
+ * documented in WSDeque.c.
+ *
+ */
+
+/* ----------------------------------------------------------------------------
+   Implementations
+   ------------------------------------------------------------------------- */
+
+#if !IN_STG_CODE || IN_STGCRUN
+
+/*
+ * Exchange the value pointed to by p with w and return the former. This
+ * function is used to acquire a lock. An acquire memory barrier is sufficient
+ * for a lock operation because the corresponding unlock operation issues a
+ * store-store barrier (write_barrier()) immediately before releasing the lock.
+ */
+EXTERN_INLINE StgWord
+xchg(StgPtr p, StgWord w)
+{
+#if defined(HAVE_C11_ATOMICS)
+    return __atomic_exchange_n(p, w, __ATOMIC_SEQ_CST);
+#else
+    // When porting GHC to a new platform check that
+    // __sync_lock_test_and_set() actually stores w in *p.
+    // Use the test rts/atomicxchg to verify that the correct value is stored.
+    // From the gcc manual:
+    // (https://gcc.gnu.org/onlinedocs/gcc-4.4.3/gcc/Atomic-Builtins.html)
+    //   This built-in function, as described by Intel, is not
+    //   a traditional test-and-set operation, but rather an atomic
+    //   exchange operation.
+    //   [...]
+    //   Many targets have only minimal support for such locks,
+    //   and do not support a full exchange operation. In this case,
+    //   a target may support reduced functionality here by which the
+    //   only valid value to store is the immediate constant 1. The
+    //   exact value actually stored in *ptr is implementation defined.
+    return __sync_lock_test_and_set(p, w);
+#endif
+}
+
+/*
+ * CMPXCHG - the single-word atomic compare-and-exchange instruction.  Used
+ * in the STM implementation.
+ */
+EXTERN_INLINE StgWord
+cas(StgVolatilePtr p, StgWord o, StgWord n)
+{
+#if defined(HAVE_C11_ATOMICS)
+    __atomic_compare_exchange_n(p, &o, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+    return o;
+#else
+    return __sync_val_compare_and_swap(p, o, n);
+#endif
+}
+
+EXTERN_INLINE StgWord8
+cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
+{
+#if defined(HAVE_C11_ATOMICS)
+    __atomic_compare_exchange_n(p, &o, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+    return o;
+#else
+    return __sync_val_compare_and_swap(p, o, n);
+#endif
+}
+
+EXTERN_INLINE StgWord
+cas_seq_cst_relaxed(StgVolatilePtr p, StgWord o, StgWord n)
+{
+#if defined(HAVE_C11_ATOMICS)
+    __atomic_compare_exchange_n(p, &o, n, 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
+    return o;
+#else
+    return __sync_val_compare_and_swap(p, o, n);
+#endif
+}
+
+// RRN: Generalized to arbitrary increments to enable fetch-and-add in
+// Haskell code (fetchAddIntArray#).
+// PT: add-and-fetch, returns new value
+EXTERN_INLINE StgWord
+atomic_inc(StgVolatilePtr p, StgWord incr)
+{
+#if defined(HAVE_C11_ATOMICS)
+    return __atomic_add_fetch(p, incr, __ATOMIC_SEQ_CST);
+#else
+    return __sync_add_and_fetch(p, incr);
+#endif
+}
+
+EXTERN_INLINE StgWord
+atomic_dec(StgVolatilePtr p)
+{
+#if defined(HAVE_C11_ATOMICS)
+    return __atomic_sub_fetch(p, 1, __ATOMIC_SEQ_CST);
+#else
+    return __sync_sub_and_fetch(p, (StgWord) 1);
+#endif
+}
+
+/*
+ * Some architectures have a way to tell the CPU that we're in a
+ * busy-wait loop, and the processor should look for something else to
+ * do (such as run another hardware thread).
+ */
+EXTERN_INLINE void
+busy_wait_nop(void)
+{
+#if defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
+    // On Intel, the busy-wait-nop instruction is called "pause",
+    // which is actually represented as a nop with the rep prefix.
+    // On processors before the P4 this behaves as a nop; on P4 and
+    // later it might do something clever like yield to another
+    // hyperthread. In any case, Intel recommends putting one
+    // of these in a spin lock loop.
+    __asm__ __volatile__ ("rep; nop");
+#else
+    // nothing
+#endif
+}
+
+#endif // !IN_STG_CODE
+
+/*
+ * We need to tell both the compiler AND the CPU about the barriers.
+ * It's no good preventing the CPU from reordering the operations if
+ * the compiler has already done so - hence the "memory" restriction
+ * on each of the barriers below.
+ */
+EXTERN_INLINE void
+write_barrier(void) {
+#if defined(NOSMP)
+    return;
+#elif defined(TSAN_ENABLED)
+    // RELEASE is a bit stronger than the store-store barrier provided by
+    // write_barrier, consequently we only use this case as a conservative
+    // approximation when using ThreadSanitizer. See Note [ThreadSanitizer].
+    __atomic_thread_fence(__ATOMIC_RELEASE);
+#elif defined(i386_HOST_ARCH) || defined(x86_64_HOST_ARCH)
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
+    || defined(powerpc64le_HOST_ARCH)
+    __asm__ __volatile__ ("lwsync" : : : "memory");
+#elif defined(s390x_HOST_ARCH)
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(sparc_HOST_ARCH)
+    /* Sparc in TSO mode does not require store/store barriers. */
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(arm_HOST_ARCH) || defined(aarch64_HOST_ARCH)
+    __asm__ __volatile__ ("dmb st" : : : "memory");
+#elif defined(riscv64_HOST_ARCH)
+    __asm__ __volatile__ ("fence w,w" : : : "memory");
+#else
+#error memory barriers unimplemented on this architecture
+#endif
+}
+
+EXTERN_INLINE void
+store_load_barrier(void) {
+#if defined(NOSMP)
+    return;
+#elif defined(i386_HOST_ARCH)
+    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory");
+#elif defined(x86_64_HOST_ARCH)
+    __asm__ __volatile__ ("lock; addq $0,0(%%rsp)" : : : "memory");
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
+    || defined(powerpc64le_HOST_ARCH)
+    __asm__ __volatile__ ("sync" : : : "memory");
+#elif defined(s390x_HOST_ARCH)
+    __asm__ __volatile__ ("bcr 14,0" : : : "memory");
+#elif defined(sparc_HOST_ARCH)
+    __asm__ __volatile__ ("membar #StoreLoad" : : : "memory");
+#elif defined(arm_HOST_ARCH)
+    __asm__ __volatile__ ("dmb" : : : "memory");
+#elif defined(aarch64_HOST_ARCH)
+    __asm__ __volatile__ ("dmb sy" : : : "memory");
+#elif defined(riscv64_HOST_ARCH)
+    __asm__ __volatile__ ("fence w,r" : : : "memory");
+#else
+#error memory barriers unimplemented on this architecture
+#endif
+}
+
+EXTERN_INLINE void
+load_load_barrier(void) {
+#if defined(NOSMP)
+    return;
+#elif defined(i386_HOST_ARCH)
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(x86_64_HOST_ARCH)
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(powerpc_HOST_ARCH) || defined(powerpc64_HOST_ARCH) \
+    || defined(powerpc64le_HOST_ARCH)
+    __asm__ __volatile__ ("lwsync" : : : "memory");
+#elif defined(s390x_HOST_ARCH)
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(sparc_HOST_ARCH)
+    /* Sparc in TSO mode does not require load/load barriers. */
+    __asm__ __volatile__ ("" : : : "memory");
+#elif defined(arm_HOST_ARCH)
+    __asm__ __volatile__ ("dmb" : : : "memory");
+#elif defined(aarch64_HOST_ARCH)
+    __asm__ __volatile__ ("dmb sy" : : : "memory");
+#elif defined(riscv64_HOST_ARCH)
+    __asm__ __volatile__ ("fence r,r" : : : "memory");
+#else
+#error memory barriers unimplemented on this architecture
+#endif
+}
+
+// Load a pointer from a memory location that might be being modified
+// concurrently.  This prevents the compiler from optimising away
+// multiple loads of the memory location, as it might otherwise do in
+// a busy wait loop for example.
+#define VOLATILE_LOAD(p) (*((StgVolatilePtr)(p)))
+
+// Relaxed atomic operations.
+#define RELAXED_LOAD(ptr) __atomic_load_n(ptr, __ATOMIC_RELAXED)
+#define RELAXED_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)
+#define RELAXED_ADD(ptr,val) __atomic_add_fetch(ptr, val, __ATOMIC_RELAXED)
+
+// Acquire/release atomic operations
+#define ACQUIRE_LOAD(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE)
+#define RELEASE_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_RELEASE)
+
+// Sequentially consistent atomic operations
+#define SEQ_CST_LOAD(ptr) __atomic_load_n(ptr, __ATOMIC_SEQ_CST)
+#define SEQ_CST_STORE(ptr,val) __atomic_store_n(ptr, val, __ATOMIC_SEQ_CST)
+#define SEQ_CST_ADD(ptr,val) __atomic_add_fetch(ptr, val, __ATOMIC_SEQ_CST)
+
+// Non-atomic addition for "approximate" counters that can be lossy
+#define NONATOMIC_ADD(ptr,val) RELAXED_STORE(ptr, RELAXED_LOAD(ptr) + val)
+
+// Compare-and-swap atomic operations
+#define SEQ_CST_RELAXED_CAS(p,o,n) cas_seq_cst_relaxed(p,o,n)
+
+// Explicit fences
+//
+// These are typically necessary only in very specific cases (e.g. WSDeque)
+// where the ordered operations aren't expressive enough to capture the desired
+// ordering.
+#define RELEASE_FENCE() __atomic_thread_fence(__ATOMIC_RELEASE)
+#define SEQ_CST_FENCE() __atomic_thread_fence(__ATOMIC_SEQ_CST)
+
+/* ---------------------------------------------------------------------- */
+#else /* !THREADED_RTS */
+
+EXTERN_INLINE void write_barrier(void);
+EXTERN_INLINE void store_load_barrier(void);
+EXTERN_INLINE void load_load_barrier(void);
+EXTERN_INLINE void write_barrier     () {} /* nothing */
+EXTERN_INLINE void store_load_barrier() {} /* nothing */
+EXTERN_INLINE void load_load_barrier () {} /* nothing */
+
+// Relaxed atomic operations
+#define RELAXED_LOAD(ptr) *ptr
+#define RELAXED_STORE(ptr,val) *ptr = val
+#define RELAXED_ADD(ptr,val) *ptr += val
+
+// Acquire/release atomic operations
+#define ACQUIRE_LOAD(ptr) *ptr
+#define RELEASE_STORE(ptr,val) *ptr = val
+
+// Sequentially consistent atomic operations
+#define SEQ_CST_LOAD(ptr) *ptr
+#define SEQ_CST_STORE(ptr,val) *ptr = val
+#define SEQ_CST_ADD(ptr,val) *ptr += val
+
+// Non-atomic addition for "approximate" counters that can be lossy
+#define NONATOMIC_ADD(ptr,val) *ptr += val
+
+// Compare-and-swap atomic operations
+#define SEQ_CST_RELAXED_CAS(p,o,n) cas(p,o,n)
+
+// Fences
+#define RELEASE_FENCE()
+#define SEQ_CST_FENCE()
+
+#if !IN_STG_CODE || IN_STGCRUN
+INLINE_HEADER StgWord
+xchg(StgPtr p, StgWord w)
+{
+    StgWord old = *p;
+    *p = w;
+    return old;
+}
+
+EXTERN_INLINE StgWord cas(StgVolatilePtr p, StgWord o, StgWord n);
+EXTERN_INLINE StgWord
+cas(StgVolatilePtr p, StgWord o, StgWord n)
+{
+    StgWord result;
+    result = *p;
+    if (result == o) {
+        *p = n;
+    }
+    return result;
+}
+
+EXTERN_INLINE StgWord8 cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n);
+EXTERN_INLINE StgWord8
+cas_word8(StgWord8 *volatile p, StgWord8 o, StgWord8 n)
+{
+    StgWord8 result;
+    result = *p;
+    if (result == o) {
+        *p = n;
+    }
+    return result;
+}
+
+EXTERN_INLINE StgWord atomic_inc(StgVolatilePtr p, StgWord incr);
+EXTERN_INLINE StgWord
+atomic_inc(StgVolatilePtr p, StgWord incr)
+{
+    return ((*p) += incr);
+}
+
+INLINE_HEADER StgWord
+atomic_dec(StgVolatilePtr p)
+{
+    return --(*p);
+}
+#endif
+
+/* An alias for the C11 declspec */
+#define ATOMIC
+
+#define VOLATILE_LOAD(p) ((StgWord)*((StgWord*)(p)))
+
+#endif /* !THREADED_RTS */
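
For reference, a minimal sketch of how these primitives combine into the
locking discipline described in the xchg() comment above (spin_lock and
spin_unlock are hypothetical helper names for illustration, not part of this
header):

    // Acquire: xchg()'s implied acquire barrier is enough, because the
    // corresponding unlock issues write_barrier() before the releasing store.
    static void spin_lock(StgWord *lock)
    {
        while (xchg(lock, 1) != 0) {
            busy_wait_nop();   // hint to the CPU that we are busy-waiting
        }
    }

    static void spin_unlock(StgWord *lock)
    {
        write_barrier();       // order the protected stores before the release
        *lock = 0;
    }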