authorBen Gamari <ben@smart-cactus.org>2022-11-07 22:50:53 -0500
committerMarge Bot <ben+marge-bot@smart-cactus.org>2022-12-16 16:12:44 -0500
commit99269b9fd817262a686867383bf0fe88fdc64fb0 (patch)
tree3f2651fdf0f004cbd769d9b7e418e3e778aa78ef
parent2228c999059f6bd10fb85476174180da2a7190da (diff)
downloadhaskell-99269b9fd817262a686867383bf0fe88fdc64fb0.tar.gz
Improve heap memory barrier Note
Also introduce MUT_FIELD marker in Closures.h to document mutable fields.
-rw-r--r--rts/Capability.h3
-rw-r--r--rts/Schedule.c2
-rw-r--r--rts/include/rts/storage/Closures.h35
-rw-r--r--rts/include/stg/SMP.h254
-rw-r--r--rts/sm/Storage.c4
5 files changed, 193 insertions, 105 deletions
diff --git a/rts/Capability.h b/rts/Capability.h
index 884d281fa2..168f8e565e 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -493,7 +493,8 @@ contextSwitchCapability (Capability *cap, bool immediately)
INLINE_HEADER bool emptyInbox(Capability *cap)
{
- // This may race with writes to putMVars and inbox but this harmless for the
+ // See Note [Heap memory barriers], section "Barriers on Messages".
+ // This may race with writes to putMVars but this is harmless for the
// intended uses of this function.
TSAN_ANNOTATE_BENIGN_RACE(&cap->putMVars, "emptyInbox(cap->putMVars)");
return (RELAXED_LOAD(&cap->inbox) == (Message*)END_TSO_QUEUE &&
diff --git a/rts/Schedule.c b/rts/Schedule.c
index d4b1bf345f..306a709883 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -1030,6 +1030,8 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
// never goes idle if the inbox is non-empty, which is why we
// use cap->lock (cap->lock is released as the last thing
// before going idle; see Capability.c:releaseCapability()).
+ //
+ // See Note [Heap memory barriers], section "Barriers on Messages".
r = TRY_ACQUIRE_LOCK(&cap->lock);
if (r != 0) return;
diff --git a/rts/include/rts/storage/Closures.h b/rts/include/rts/storage/Closures.h
index a2b6eb079f..01ae438a43 100644
--- a/rts/include/rts/storage/Closures.h
+++ b/rts/include/rts/storage/Closures.h
@@ -13,6 +13,13 @@
* compiling for: profiling, parallel, ticky, etc.
*/
+/*
+ * Used to mark GC-pointer fields which can be modified by the mutator after
+ * an object is made visible on the heap. See Note [Heap memory barriers] in
+ * SMP.h for details.
+ */
+#define MUT_FIELD
+
/* -----------------------------------------------------------------------------
The profiling header
-------------------------------------------------------------------------- */
@@ -43,7 +50,7 @@ typedef struct {
-------------------------------------------------------------------------- */
typedef struct {
- StgWord pad;
+ StgWord pad MUT_FIELD;
} StgSMPThunkHeader;
/* -----------------------------------------------------------------------------
@@ -210,7 +217,7 @@ typedef struct _StgMutArrPtrs {
StgHeader header;
StgWord ptrs;
StgWord size; // ptrs plus card table
- StgClosure *payload[];
+ StgClosure *payload[] MUT_FIELD;
// see also: StgMutArrPtrs macros in ClosureMacros.h
} StgMutArrPtrs;
@@ -222,7 +229,7 @@ typedef struct _StgMutArrPtrs {
typedef struct {
StgHeader header;
StgWord ptrs;
- StgClosure *payload[];
+ StgClosure *payload[] MUT_FIELD;
} StgSmallMutArrPtrs;
@@ -231,7 +238,7 @@ typedef struct {
// Closure types: MUT_VAR_CLEAN, MUT_VAR_DIRTY
typedef struct {
StgHeader header;
- StgClosure *var;
+ StgClosure *var MUT_FIELD;
} StgMutVar;
@@ -349,7 +356,7 @@ typedef struct _StgWeak {
// C finalizers, see StgCFinalizerList below
//
// Points to stg_NO_FINALIZER_closure to indicate no c finalizers.
- StgClosure *cfinalizers;
+ StgClosure *cfinalizers MUT_FIELD;
StgClosure *key;
StgClosure *value; // the actual value referenced by the weak reference
@@ -371,7 +378,7 @@ typedef struct _StgWeak {
// Closure type: CONSTR
typedef struct _StgCFinalizerList {
StgHeader header;
- StgClosure *link; // the next finaliser
+ StgClosure *link MUT_FIELD; // the next finaliser
// function to call
//
@@ -448,11 +455,11 @@ typedef struct {
StgHeader header;
// threads that are waiting on this MVar
- struct StgMVarTSOQueue_ *head;
- struct StgMVarTSOQueue_ *tail;
+ struct StgMVarTSOQueue_ *head MUT_FIELD;
+ struct StgMVarTSOQueue_ *tail MUT_FIELD;
// The value in the MVar if filled
- StgClosure* value;
+ StgClosure* value MUT_FIELD;
} StgMVar;
@@ -493,8 +500,8 @@ typedef struct StgTVarWatchQueue_ {
typedef struct {
StgHeader header;
- StgClosure *current_value; /* accessed via atomics */
- StgTVarWatchQueue *first_watch_queue_entry; /* accessed via atomics */
+ StgClosure *current_value MUT_FIELD; /* accessed via atomics */
+ StgTVarWatchQueue *first_watch_queue_entry MUT_FIELD; /* accessed via atomics */
StgInt num_updates; /* accessed via atomics */
} StgTVar;
@@ -533,7 +540,7 @@ typedef enum {
struct StgTRecHeader_ {
StgHeader header;
struct StgTRecHeader_ *enclosing_trec;
- StgTRecChunk *current_chunk;
+ StgTRecChunk *current_chunk MUT_FIELD;
TRecState state;
};
@@ -623,7 +630,7 @@ typedef struct StgCompactNFDataBlock_ {
// the fixup implementation.
struct StgCompactNFData_ *owner;
// the closure who owns this block (used in objectGetCompact)
- struct StgCompactNFDataBlock_ *next;
+ struct StgCompactNFDataBlock_ *next MUT_FIELD;
// chain of blocks used for serialization and freeing
} StgCompactNFDataBlock;
@@ -645,7 +652,7 @@ typedef struct StgCompactNFData_ {
// the nursery pointer below during compaction.
StgCompactNFDataBlock *nursery;
// where to (try to) allocate from when appending
- StgCompactNFDataBlock *last;
+ StgCompactNFDataBlock *last MUT_FIELD;
// the last block of the chain (to know where to append new
// blocks for resize)
struct hashtable *hash;
diff --git a/rts/include/stg/SMP.h b/rts/include/stg/SMP.h
index aaa54e8435..0800c87786 100644
--- a/rts/include/stg/SMP.h
+++ b/rts/include/stg/SMP.h
@@ -108,93 +108,128 @@ EXTERN_INLINE void load_load_barrier(void);
* Note [Heap memory barriers]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Machines with weak memory ordering semantics have consequences for how
- * closures are observed and mutated. For example, consider a thunk that needs
- * to be updated to an indirection. In order for the indirection to be safe for
- * concurrent observers to enter, said observers must read the indirection's
- * info table before they read the indirectee. Furthermore, the indirectee must
- * be set before the info table pointer. This ensures that if the observer sees
- * an IND info table then the indirectee is valid.
- *
- * When a closure is updated with an indirection, both its info table and its
- * indirectee must be written. With weak memory ordering, these two writes can
- * be arbitrarily reordered, and perhaps even interleaved with other threads'
- * reads and writes (in the absence of memory barrier instructions). Consider
- * this example of a bad reordering:
- *
- * - An updater writes to a closure's info table (INFO_TYPE is now IND).
- * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
- * - A concurrent observer reads the closure's indirectee and enters it.
- * - An updater writes the closure's indirectee.
- *
- * Here the update to the indirectee comes too late and the concurrent observer
- * has jumped off into the abyss. Speculative execution can also cause us
- * issues, consider:
- *
- * - an observer is about to case on a value in closure's info table.
- * - the observer speculatively reads one or more of closure's fields.
- * - an updater writes to closure's info table.
- * - the observer takes a branch based on the new info table value, but with the
- * old closure fields!
- * - the updater writes to the closure's other fields, but its too late.
- *
- * Because of these effects, reads and writes to a closure's info table must be
- * ordered carefully with respect to reads and writes to the closure's other
- * fields, and memory barriers must be placed to ensure that reads and writes
- * occur in program order. Specifically, updates to an already existing closure
- * must follow the following pattern:
- *
- * - Update the closure's (non-info table) fields.
- * - Write barrier.
- * - Update the closure's info table.
- *
- * Observing the fields of an updateable closure (e.g. a THUNK) must follow the
- * following pattern:
- *
- * - Read the closure's info pointer.
- * - Read barrier.
- * - Read the closure's (non-info table) fields.
- *
- * We must also take care when we expose a newly-allocated closure to other cores
- * by writing a pointer to it to some shared data structure (e.g. an MVar#, a Message,
- * or MutVar#). Specifically, we need to ensure that all writes constructing the
- * closure are visible *before* the write exposing the new closure is made visible:
- *
- * - Allocate memory for the closure
- * - Write the closure's info pointer and fields (ordering between this doesn't
- * matter since the closure isn't yet visible to anyone else).
- * - Write barrier
- * - Make closure visible to other cores
- *
- * Note that thread stacks are inherently thread-local and consequently allocating an
- * object and introducing a reference to it to our stack needs no barrier.
- *
- * There are several ways in which the mutator may make a newly-allocated
- * closure visible to other cores:
- *
- * - Eager blackholing a THUNK:
- * This is protected by an explicit write barrier in the eager blackholing
- * code produced by the codegen. See GHC.StgToCmm.Bind.emitBlackHoleCode.
- *
- * - Lazy blackholing a THUNK:
- * This is is protected by an explicit write barrier in the thread suspension
- * code. See ThreadPaused.c:threadPaused.
- *
- * - Updating a BLACKHOLE:
- * This case is protected by explicit write barriers in the update frame
- * entry code (see rts/Updates.h).
- *
- * - Blocking on an MVar# (e.g. takeMVar#):
- * In this case the appropriate MVar primops (e.g. stg_takeMVarzh). include
- * explicit memory barriers to ensure that the newly-allocated
- * MVAR_TSO_QUEUE is visible to other cores.
- *
- * - Write to an MVar# (e.g. putMVar#):
- * This protected by the full barrier implied by the CAS in putMVar#.
+ * closures are observed and mutated. In particular, when we
+ * expose a new (or mutated) object to another core we must ensure that the
+ * stores which formed the new object are visible (e.g. stores are flushed from
+ * cache and the relevant cachelines invalidated in other cores).
+ *
+ * To ensure this we must use memory barriers. Which barriers are required to
+ * access a field depends upon the type of the field. In general, fields come
+ * in three flavours:
+ *
+ * * Mutable GC Pointers (C type StgClosure* MUT_FIELD, Cmm type StgPtr)
+ * * Immutable GC Pointers (C type StgClosure*, Cmm type StgPtr)
+ * * Non-pointers (C type StgWord, Cmm type StgWord)
+ *
+ * Note that Addr# fields are *not* GC pointers and therefore are classified
+ * as non-pointers. Responsibility for barriers lies with the party
+ * dereferencing the pointer.
+ *
+ * Also note that we are only concerned with mutation by the mutator. The GC
+ * is free to change nearly any field as this is necessary for a moving GC.
+ * Naturally, doing this safely requires care, which we discuss in the
+ * sections below.
+ *
+ * Immutable pointer fields are those which the mutator cannot change after
+ * an object is made visible on the heap. Most objects' fields are of this
+ * flavour (e.g. all data constructor fields). As these fields are written
+ * precisely once, no barriers are needed on either writes or reads. This is
+ * safe due to an argument hinging on causality: suppose an immutable field F
+ * of an object O refers to an object O'. Naturally, O' must have been visible to
+ * the creator of O when O was constructed. Consequently, if O is visible to a
+ * reader, O' must also be visible.
+ *
+ * Mutable pointer fields are those which can be modified by the mutator. These
+ * require a bit more care as they may break the causality argument given
+ * above. For instance, consider an object O (e.g. a MUT_VAR) with a mutable
+ * field F. A thread may allocate a new object O' and store a reference to O'
+ * into F. Without explicit synchronization O' may not be visible to another
+ * thread attempting to dereference F.
+ *
+ * Mutable fields include:
+ *
+ * - StgMutVar: var
+ * - StgWeak: cfinalizers
+ * - StgMVar: head, tail, value
+ * - StgMVarTSOQueue: link
+ * - StgTVar: current_value, first_watch_queue_entry
+ * - StgTVarWatchQueue: {next,prev}_queue_entry
+ * - StgTRecChunk: TODO
+ * - StgMutArrPtrs: payload
+ * - StgSmallMutArrPtrs: payload
+ * - StgThunk, although this is a somewhat special case; see below
+ *
+ * Writing to a mutable pointer field must be done via a release-store.
+ * Reading from such a field is done via an acquire-load.
+ *
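+ * As an illustrative sketch (not the code of any particular primop; `mv` is
+ * a hypothetical StgMutVar*), publishing a freshly-allocated closure through
+ * a mutable field with the atomic accessors defined in this header:
+ *
+ *     // Writer: initialise the new closure with plain stores, then publish
+ *     // it with a release-store so those stores become visible first.
+ *     StgClosure *val = ...;            // allocate and fill in fields
+ *     RELEASE_STORE(&mv->var, val);
+ *
+ *     // Reader: the acquire-load pairs with the release-store above, so the
+ *     // fields of the loaded closure are guaranteed to be visible.
+ *     StgClosure *v = ACQUIRE_LOAD(&mv->var);
+ *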
+ * Finally, non-pointer fields can be safely mutated without barriers as
+ * they do not refer to other memory. Technically, concurrent accesses to
+ * non-pointer fields still do need to be atomic in many cases to avoid torn
+ * accesses. However, this is something that we generally avoid by locking
+ * closures prior to mutating non-pointer fields (see Locking closures below).
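+ *
+ * For instance, a word-sized field such as StgTVar.num_updates can be read
+ * and written with relaxed atomics; this rules out torn accesses by
+ * concurrent readers while the update itself remains serialised by locking
+ * (a sketch, not the STM.c code):
+ *
+ *     StgInt n = RELAXED_LOAD(&tvar->num_updates);
+ *     RELAXED_STORE(&tvar->num_updates, n + 1);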
+ *
+ * Note that MUT_VARs offer both synchronized and unsynchronized primops.
+ * Consequently, in these cases there is a burden on the user to ensure that
+ * synchronization is provided where necessary.
+ *
+ * Locking closures
+ * ----------------
+ * Several primops temporarily turn closures into WHITEHOLEs to ensure that
+ * they have exclusive access (see SMPClosureOps.h:reallyLockClosure).
+ * Locking is done via an atomic exchange operation on the closure's info table
+ * pointer with sequential consistency (although only acquire ordering is
+ * needed). This acquire ensures that we synchronize with any previous thread
+ * that had locked the closure. Consequently, it is important that we take great
+ * care in examining the mutable fields of a lockable closure prior to having
+ * locked it.
+ *
+ * Naturally, unlocking is done via a release-store to restore the closure's
+ * original info table pointer.
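+ *
+ * A simplified sketch of this pattern (the real code, including the
+ * spin/yield loop, is SMPClosureOps.h:reallyLockClosure/unlockClosure):
+ *
+ *     StgWord info;
+ *     do {
+ *         // swap in WHITEHOLE; sequentially consistent exchange
+ *         info = xchg((StgPtr)(void *)&p->header.info,
+ *                     (StgWord)&stg_WHITEHOLE_info);
+ *     } while (info == (StgWord)&stg_WHITEHOLE_info);  // already locked
+ *
+ *     // ... we now have exclusive access to p's mutable fields ...
+ *
+ *     // unlock: release-store restores the original info table pointer
+ *     RELEASE_STORE(&p->header.info, (const StgInfoTable *)info);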
+ *
+ * Thunks
+ * ------
+ * As noted above, thunks are a rather special (yet quite common) case. In
+ * particular, they have the unique property of being updatable, transforming
+ * from a thunk to an indirection. This transformation requires its own
+ * synchronization protocol. In particular, we must ensure that a reader
+ * examining a thunk being updated can see the indirectee. Consequently, a
+ * thunk update (see rts/Updates.h) does the following:
+ *
+ * 1. Use a release-fence to ensure that the indirectee is visible
+ * 2. Use a relaxed-store to place the new indirectee into the thunk's
+ * indirectee field
+ * 3. Use a release-store to set the info table to stg_BLACKHOLE (which
+ * represents an indirection)
+ *
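+ * A sketch of the updater's side in C (`thunk` and `result` are
+ * illustrative; the real code is in rts/Updates.h, and the fence/store
+ * helpers are those defined in this header):
+ *
+ *     RELEASE_FENCE();                                            // step 1
+ *     RELAXED_STORE(&((StgInd *)thunk)->indirectee, result);      // step 2
+ *     RELEASE_STORE(&thunk->header.info, &stg_BLACKHOLE_info);    // step 3
+ *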
+ * Blackholing a thunk (either eagerly, by GHC.StgToCmm.Bind.emitBlackHoleCode,
+ * or lazily, by ThreadPaused.c:threadPaused) is done similarly.
+ *
+ * Conversely, thunk entry (see the entry code of stg_BLACKHOLE in
+ * rts/StgMiscClosures.cmm) does the following:
+ *
+ * 1. We jump into the entry code for stg_BLACKHOLE; this of course implies
+ * that we have already read the thunk's info table pointer, which is done
+ * with a relaxed load.
+ * 2. Use an acquire-fence to ensure that our view on the thunk is
+ * up-to-date. This synchronizes with step (3) in the update
+ * procedure.
+ * 3. Relaxed-load the indirectee. Since thunks are updated at most
+ * once, we know that the fence in the last step has given us
+ * an up-to-date view of the indirectee closure.
+ * 4. Enter the indirectee (or block if the indirectee is a TSO)
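+ *
+ * Sketched in C for clarity (the actual entry code is Cmm; `node` is the
+ * closure being entered):
+ *
+ *     const StgInfoTable *info = RELAXED_LOAD(&node->header.info); // step 1
+ *     if (info == &stg_BLACKHOLE_info) {
+ *         ACQUIRE_FENCE();                                         // step 2
+ *         StgClosure *ind =
+ *             RELAXED_LOAD(&((StgInd *)node)->indirectee);         // step 3
+ *         // step 4: enter ind, or block if it is a TSO/BLOCKING_QUEUE
+ *     }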
+ *
+ * Other closures
+ * --------------
+ * Below is a summary of the barriers which are necessary to make GHC's
+ * primitive types respect the above rules:
+ *
+ * - MVar# operations:
+ * The MVar# primops lock the MVar prior to reading or writing any MVar field.
*
* - Write to a TVar#:
* This is protected by the full barrier implied by the CAS in STM.c:lock_stm.
*
- * - Write to an Array#, ArrayArray#, or SmallArray#:
+ * - Write to an Array# or SmallArray#:
* This case is protected by an explicit write barrier in the code produced
* for this primop by the codegen. See GHC.StgToCmm.Prim.doWritePtrArrayOp and
* GHC.StgToCmm.Prim.doWriteSmallPtrArrayOp. Relevant issue: #12469.
@@ -211,15 +246,58 @@ EXTERN_INLINE void load_load_barrier(void);
* This is protected by the acquisition and release of the target capability's
* lock in Messages.c:sendMessage.
*
- * Finally, we must ensure that we flush all cores store buffers before
- * entering and leaving GC, since stacks may be read by other cores. This
- * happens as a side-effect of taking and release mutexes (which implies
- * acquire and release barriers, respectively).
- *
* N.B. recordClosureMutated places a reference to the mutated object on
* the capability-local mut_list. Consequently this does not require any memory
* barrier.
*
+ * Barriers in thread migration
+ * ----------------------------
+ * When a thread is migrated from one capability to another we must take care
+ * that the new capability synchronizes with the old (e.g. to ensure that
+ * the state of the thread's stack is visible to the new executor).
+ * This is ensured by the barriers implied in sending the MessageWakeup.
+ *
+ * Barriers on Messages
+ * --------------------
+ * The RTS uses the Message mechanism to convey information between capabilities.
+ * To send a message (see Messages.c:sendMessage) the sender must first take
+ * the `lock` of the recipient capability. The sender can then link the Message
+ * onto the recipient's `inbox` list and release the lock (implying a release
+ * barrier).
+ *
+ * To process its inbox (see Schedule.c:scheduleProcessInbox) the recipient must
+ * take the capability lock before examining the inbox. This implies an acquire
+ * barrier, ensuring that the messages in the inbox are visible.
+ *
+ * Under the above story there is no need for accesses to `inbox` to be
+ * atomic at all as all accesses are under the capability lock. However, there
+ * is a slight wrinkle here:
+ * Capability.h:emptyInbox wants to test whether `inbox` is empty without
+ * holding the capability lock. Consequently, accesses to `inbox` must be
+ * atomic, although they may use relaxed ordering.
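+ *
+ * A sketch of the sender's side (the real code is Messages.c:sendMessage;
+ * error handling and waking the recipient are omitted):
+ *
+ *     ACQUIRE_LOCK(&cap->lock);
+ *     msg->link = cap->inbox;           // plain store; we hold the lock
+ *     RELAXED_STORE(&cap->inbox, msg);  // relaxed: emptyInbox may race
+ *     RELEASE_LOCK(&cap->lock);         // provides the release barrier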
+ *
+ * Barriers during GC
+ * ------------------
+ * Entering the garbage collector implies an acquire barrier across all
+ * capabilities as all capabilities obey the following protocol when entering
+ * `gcWorkerThread`:
+ *
+ * 1. acquire gc_entry_mutex
+ * 2. signal gc_entry_arrived_cv to let the GC leader know
+ * that we have started to synchronize
+ * 3. wait on gc_entry_start_now_cv, releasing
+ * gc_entry_mutex; this implies a release barrier
+ * 4. after all threads have synchronized, the leader will
+ * broadcast on gc_entry_start_now_cv
+ * 5. each of the waiting workers will acquire
+ * gc_entry_mutex, implying an acquire barrier which will
+ * synchronize with all of the other workers' releases in
+ * (3).
+ *
+ * Similarly, leaving the garbage collector implies a release
+ * barrier as gcWorkerThread blocks on gc_exit_leave_now_cv
+ * with a similar protocol.
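+ *
+ * The worker's side of the entry protocol, roughly (the real code is in
+ * gcWorkerThread and its callers in rts/sm/GC.c):
+ *
+ *     ACQUIRE_LOCK(&gc_entry_mutex);                            // (1)
+ *     signalCondition(&gc_entry_arrived_cv);                    // (2)
+ *     waitCondition(&gc_entry_start_now_cv, &gc_entry_mutex);   // (3) and (5)
+ *     RELEASE_LOCK(&gc_entry_mutex);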
+ *
* During parallel GC we need to be careful during evacuation: before replacing
* a closure with a forwarding pointer we must commit a write barrier to ensure
* that the copy we made in to-space is visible to other cores.
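+ *
+ * Schematically (a simplified sketch of the copy loop in sm/Evac.c; the
+ * parallel case actually claims the closure with a CAS):
+ *
+ *     // ... copy the closure's words into the to-space allocation `to` ...
+ *     // then publish the forwarding pointer; release ordering ensures the
+ *     // to-space copy is visible before the forwarding pointer can be seen
+ *     RELEASE_STORE(&src->header.info,
+ *                   (const StgInfoTable *)MK_FORWARDING_PTR(to));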
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 896b408ff9..dc27284500 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -1408,7 +1408,7 @@ dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
Capability *cap = regTableToCapability(reg);
// No barrier required here as no other heap object fields are read. See
- // note [Heap memory barriers] in SMP.h.
+ // Note [Heap memory barriers] in SMP.h.
SET_INFO((StgClosure*) mvar, &stg_MUT_VAR_DIRTY_info);
recordClosureMutated(cap, (StgClosure *) mvar);
IF_NONMOVING_WRITE_BARRIER_ENABLED {
@@ -1429,7 +1429,7 @@ dirty_TVAR(Capability *cap, StgTVar *p,
StgClosure *old)
{
// No barrier required here as no other heap object fields are read. See
- // note [Heap memory barriers] in SMP.h.
+ // Note [Heap memory barriers] in SMP.h.
if (RELAXED_LOAD(&p->header.info) == &stg_TVAR_CLEAN_info) {
SET_INFO((StgClosure*) p, &stg_TVAR_DIRTY_info);
recordClosureMutated(cap,(StgClosure*)p);