From 99269b9fd817262a686867383bf0fe88fdc64fb0 Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Mon, 7 Nov 2022 22:50:53 -0500
Subject: Improve heap memory barrier Note

Also introduce MUT_FIELD marker in Closures.h to document mutable fields.
---
 rts/Capability.h | 3 +-
 rts/Schedule.c | 2 +
 rts/include/rts/storage/Closures.h | 35 +++--
 rts/include/stg/SMP.h | 254 ++++++++++++++++++++++++-------------
 rts/sm/Storage.c | 4 +-
 5 files changed, 193 insertions(+), 105 deletions(-)

diff --git a/rts/Capability.h b/rts/Capability.h
index 884d281fa2..168f8e565e 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -493,7 +493,8 @@ contextSwitchCapability (Capability *cap, bool immediately)
 INLINE_HEADER bool emptyInbox(Capability *cap)
 {
-    // This may race with writes to putMVars and inbox but this harmless for the
+    // See Note [Heap memory barriers], section "Barriers on Messages".
+    // This may race with writes to putMVars but this is harmless for the
     // intended uses of this function.
     TSAN_ANNOTATE_BENIGN_RACE(&cap->putMVars, "emptyInbox(cap->putMVars)");
     return (RELAXED_LOAD(&cap->inbox) == (Message*)END_TSO_QUEUE &&
diff --git a/rts/Schedule.c b/rts/Schedule.c
index d4b1bf345f..306a709883 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -1030,6 +1030,8 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
     // never goes idle if the inbox is non-empty, which is why we
     // use cap->lock (cap->lock is released as the last thing
     // before going idle; see Capability.c:releaseCapability()).
+    //
+    // See Note [Heap memory barriers], section "Barriers on Messages"
     r = TRY_ACQUIRE_LOCK(&cap->lock);
     if (r != 0) return;
diff --git a/rts/include/rts/storage/Closures.h b/rts/include/rts/storage/Closures.h
index a2b6eb079f..01ae438a43 100644
--- a/rts/include/rts/storage/Closures.h
+++ b/rts/include/rts/storage/Closures.h
@@ -13,6 +13,13 @@
  * compiling for: profiling, parallel, ticky, etc.
  */
+/*
+ * Used to mark GC-pointer fields which can be modified by the mutator after
+ * an object is made visible on the heap. See Note [Heap memory barriers] in
+ * SMP.h for details.
+ */
+#define MUT_FIELD
+
 /* -----------------------------------------------------------------------------
    The profiling header
    -------------------------------------------------------------------------- */
@@ -43,7 +50,7 @@ typedef struct {
    -------------------------------------------------------------------------- */
 typedef struct {
-    StgWord pad;
+    StgWord pad MUT_FIELD;
 } StgSMPThunkHeader;
 /* -----------------------------------------------------------------------------
@@ -210,7 +217,7 @@ typedef struct _StgMutArrPtrs {
     StgHeader header;
     StgWord ptrs;
     StgWord size; // ptrs plus card table
-    StgClosure *payload[];
+    StgClosure *payload[] MUT_FIELD;
     // see also: StgMutArrPtrs macros in ClosureMacros.h
 } StgMutArrPtrs;
@@ -222,7 +229,7 @@ typedef struct _StgMutArrPtrs {
 typedef struct {
     StgHeader header;
     StgWord ptrs;
-    StgClosure *payload[];
+    StgClosure *payload[] MUT_FIELD;
 } StgSmallMutArrPtrs;
@@ -231,7 +238,7 @@ typedef struct {
 // Closure types: MUT_VAR_CLEAN, MUT_VAR_DIRTY
 typedef struct {
     StgHeader header;
-    StgClosure *var;
+    StgClosure *var MUT_FIELD;
 } StgMutVar;
@@ -349,7 +356,7 @@ typedef struct _StgWeak {
     // C finalizers, see StgCFinalizerList below
     //
     // Points to stg_NO_FINALIZER_closure to indicate no c finalizers.
-    StgClosure *cfinalizers;
+    StgClosure *cfinalizers MUT_FIELD;
     StgClosure *key;
     StgClosure *value; // the actual value referenced by the weak reference
@@ -371,7 +378,7 @@ typedef struct _StgWeak {
 // Closure type: CONSTR
 typedef struct _StgCFinalizerList {
     StgHeader header;
-    StgClosure *link; // the next finaliser
+    StgClosure *link MUT_FIELD; // the next finaliser
     // function to call
     //
@@ -448,11 +455,11 @@ typedef struct {
     StgHeader header;
     // threads that are waiting on this MVar
-    struct StgMVarTSOQueue_ *head;
-    struct StgMVarTSOQueue_ *tail;
+    struct StgMVarTSOQueue_ *head MUT_FIELD;
+    struct StgMVarTSOQueue_ *tail MUT_FIELD;
     // The value in the MVar if filled
-    StgClosure* value;
+    StgClosure* value MUT_FIELD;
 } StgMVar;
@@ -493,8 +500,8 @@ typedef struct StgTVarWatchQueue_ {
 typedef struct {
     StgHeader header;
-    StgClosure *current_value; /* accessed via atomics */
-    StgTVarWatchQueue *first_watch_queue_entry; /* accessed via atomics */
+    StgClosure *current_value MUT_FIELD; /* accessed via atomics */
+    StgTVarWatchQueue *first_watch_queue_entry MUT_FIELD; /* accessed via atomics */
     StgInt num_updates; /* accessed via atomics */
 } StgTVar;
@@ -533,7 +540,7 @@ typedef enum {
 struct StgTRecHeader_ {
     StgHeader header;
     struct StgTRecHeader_ *enclosing_trec;
-    StgTRecChunk *current_chunk;
+    StgTRecChunk *current_chunk MUT_FIELD;
     TRecState state;
 };
@@ -623,7 +630,7 @@ typedef struct StgCompactNFDataBlock_ {
     // the fixup implementation.
     struct StgCompactNFData_ *owner;
     // the closure who owns this block (used in objectGetCompact)
-    struct StgCompactNFDataBlock_ *next;
+    struct StgCompactNFDataBlock_ *next MUT_FIELD;
     // chain of blocks used for serialization and freeing
 } StgCompactNFDataBlock;
@@ -645,7 +652,7 @@ typedef struct StgCompactNFData_ {
     // the nursery pointer below during compaction.
     StgCompactNFDataBlock *nursery;
     // where to (try to) allocate from when appending
-    StgCompactNFDataBlock *last;
+    StgCompactNFDataBlock *last MUT_FIELD;
     // the last block of the chain (to know where to append new
     // blocks for resize)
     struct hashtable *hash;
diff --git a/rts/include/stg/SMP.h b/rts/include/stg/SMP.h
index aaa54e8435..0800c87786 100644
--- a/rts/include/stg/SMP.h
+++ b/rts/include/stg/SMP.h
@@ -108,93 +108,128 @@ EXTERN_INLINE void load_load_barrier(void);
  * Note [Heap memory barriers]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  * Machines with weak memory ordering semantics have consequences for how
- * closures are observed and mutated. For example, consider a thunk that needs
- * to be updated to an indirection. In order for the indirection to be safe for
- * concurrent observers to enter, said observers must read the indirection's
- * info table before they read the indirectee. Furthermore, the indirectee must
- * be set before the info table pointer. This ensures that if the observer sees
- * an IND info table then the indirectee is valid.
- *
- * When a closure is updated with an indirection, both its info table and its
- * indirectee must be written. With weak memory ordering, these two writes can
- * be arbitrarily reordered, and perhaps even interleaved with other threads'
- * reads and writes (in the absence of memory barrier instructions). Consider
- * this example of a bad reordering:
- *
- * - An updater writes to a closure's info table (INFO_TYPE is now IND).
- * - A concurrent observer branches upon reading the closure's INFO_TYPE as IND.
- * - A concurrent observer reads the closure's indirectee and enters it.
- * - An updater writes the closure's indirectee.
- *
- * Here the update to the indirectee comes too late and the concurrent observer
- * has jumped off into the abyss. Speculative execution can also cause us
- * issues, consider:
- *
- * - an observer is about to case on a value in closure's info table.
- * - the observer speculatively reads one or more of closure's fields.
- * - an updater writes to closure's info table.
- * - the observer takes a branch based on the new info table value, but with the
- *   old closure fields!
- * - the updater writes to the closure's other fields, but its too late.
- *
- * Because of these effects, reads and writes to a closure's info table must be
- * ordered carefully with respect to reads and writes to the closure's other
- * fields, and memory barriers must be placed to ensure that reads and writes
- * occur in program order. Specifically, updates to an already existing closure
- * must follow the following pattern:
- *
- * - Update the closure's (non-info table) fields.
- * - Write barrier.
- * - Update the closure's info table.
- *
- * Observing the fields of an updateable closure (e.g. a THUNK) must follow the
- * following pattern:
- *
- * - Read the closure's info pointer.
- * - Read barrier.
- * - Read the closure's (non-info table) fields.
- *
- * We must also take care when we expose a newly-allocated closure to other cores
- * by writing a pointer to it to some shared data structure (e.g. an MVar#, a Message,
- * or MutVar#). Specifically, we need to ensure that all writes constructing the
- * closure are visible *before* the write exposing the new closure is made visible:
- *
- * - Allocate memory for the closure
- * - Write the closure's info pointer and fields (ordering between this doesn't
- *   matter since the closure isn't yet visible to anyone else).
- * - Write barrier
- * - Make closure visible to other cores
- *
- * Note that thread stacks are inherently thread-local and consequently allocating an
- * object and introducing a reference to it to our stack needs no barrier.
- *
- * There are several ways in which the mutator may make a newly-allocated
- * closure visible to other cores:
- *
- * - Eager blackholing a THUNK:
- *   This is protected by an explicit write barrier in the eager blackholing
- *   code produced by the codegen. See GHC.StgToCmm.Bind.emitBlackHoleCode.
- *
- * - Lazy blackholing a THUNK:
- *   This is is protected by an explicit write barrier in the thread suspension
- *   code. See ThreadPaused.c:threadPaused.
- *
- * - Updating a BLACKHOLE:
- *   This case is protected by explicit write barriers in the update frame
- *   entry code (see rts/Updates.h).
- *
- * - Blocking on an MVar# (e.g. takeMVar#):
- *   In this case the appropriate MVar primops (e.g. stg_takeMVarzh). include
- *   explicit memory barriers to ensure that the newly-allocated
- *   MVAR_TSO_QUEUE is visible to other cores.
- *
- * - Write to an MVar# (e.g. putMVar#):
- *   This protected by the full barrier implied by the CAS in putMVar#.
+ * closures are observed and mutated. In particular, when we
+ * expose a new (or mutated) object to another core we must ensure that the
+ * stores which formed the new object are visible (e.g. stores are flushed from
+ * cache and the relevant cachelines invalidated in other cores).
+ *
+ * To ensure this we must use memory barriers. Which barriers are required to
+ * access a field depends upon the type of the field.
+ * In general, fields come
+ * in three flavours:
+ *
+ *  * Mutable GC Pointers (C type MUT_FIELD StgClosure*, Cmm type StgPtr)
+ *  * Immutable GC Pointers (C type StgClosure*, Cmm type StgPtr)
+ *  * Non-pointers (C type StgWord, Cmm type StgWord)
+ *
+ * Note that Addr# fields are *not* GC pointers and therefore are classified
+ * as non-pointers. Responsibility for barriers lies with the party
+ * dereferencing the pointer.
+ *
+ * Also note that we are only concerned with mutation by the mutator. The GC
+ * is free to change nearly any field as this is necessary for a moving GC.
+ * Naturally, doing this safely requires care which we discuss in the sections
+ * below.
+ *
+ * Immutable pointer fields are those which the mutator cannot change after
+ * an object is made visible on the heap. Most objects' fields are of this
+ * flavour (e.g. all data constructor fields). As these fields are written
+ * precisely once, no barriers are needed on writes or reads. This is
+ * safe due to an argument hinging on causality: Consider an immutable field F
+ * of an object O which refers to object O'. Naturally, O' must have been visible to
+ * the creator of O when O was constructed. Consequently, if O is visible to a
+ * reader, O' must also be visible.
+ *
+ * Mutable pointer fields are those which can be modified by the mutator. These
+ * require a bit more care as they may break the causality argument given
+ * above. For instance, consider an object O (e.g. a MUT_VAR) with a mutable
+ * field F. A thread may allocate a new object O' and store a reference to O'
+ * into F. Without explicit synchronization O' may not be visible to another
+ * thread attempting to dereference F.
+ *
+ * Mutable fields include:
+ *
+ * - StgMutVar: var
+ * - StgWeak: cfinalizers
+ * - StgMVar: head, tail, value
+ * - StgMVarTSOQueue: link
+ * - StgTVar: current_value, first_watch_queue_entry
+ * - StgTVarWatchQueue: {next,prev}_queue_entry
+ * - StgTRecChunk: TODO
+ * - StgMutArrPtrs: payload
+ * - StgSmallMutArrPtrs: payload
+ * - StgThunk, although this is a somewhat special case; see below
+ *
+ * Writing to a mutable pointer field must be done via a release-store.
+ * Reading from such a field is done via an acquire-load. A sketch of this
+ * pattern is given after the "Locking closures" section below.
+ *
+ * Finally, non-pointer fields can be safely mutated without barriers as
+ * they do not refer to other memory. Technically, concurrent accesses to
+ * non-pointer fields still do need to be atomic in many cases to avoid torn
+ * accesses. However, this is something that we generally avoid by locking
+ * closures prior to mutating non-pointer fields (see Locking closures below).
+ *
+ * Note that MUT_VARs offer both synchronized and unsynchronized primops.
+ * Consequently, in these cases there is a burden on the user to ensure that
+ * synchronization is provided where necessary.
+ *
+ * Locking closures
+ * ----------------
+ * Several primops temporarily turn closures into WHITEHOLEs to ensure that
+ * they have exclusive access (see SMPClosureOps.h:reallyLockClosure).
+ * Locking is done via an atomic exchange operation on the closure's info table
+ * pointer with sequential consistency (although only acquire ordering is
+ * needed). This acquire ensures that we synchronize with any previous thread
+ * that had locked the closure. Consequently, it is important that we take great
+ * care when examining the mutable fields of a lockable closure prior to having
+ * locked it.
+ *
+ * Naturally, unlocking is done via a release-store to restore the closure's
+ * original info table pointer.
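+ *
+ * For illustration, here is a minimal sketch of the mutable-field pattern
+ * described above. It is not part of the protocol itself; it assumes the
+ * ACQUIRE_LOAD/RELEASE_STORE helpers used elsewhere in the RTS and a
+ * hypothetical StgMutVar *mv:
+ *
+ *     // Writer: fully initialise the new object, then publish it with a
+ *     // release-store so that its fields are visible before the pointer is.
+ *     StgClosure *new_val = ...;   // allocate and initialise the closure
+ *     RELEASE_STORE(&mv->var, new_val);
+ *
+ *     // Reader: an acquire-load pairs with the writer's release-store,
+ *     // so everything reachable from the loaded pointer is visible.
+ *     StgClosure *val = ACQUIRE_LOAD(&mv->var);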
+ *
+ * Thunks
+ * ------
+ * As noted above, thunks are a rather special (yet quite common) case. In
+ * particular, they have the unique property of being updatable, transforming
+ * from a thunk to an indirection. This transformation requires its own
+ * synchronization protocol. In particular, we must ensure that a reader
+ * examining a thunk being updated can see the indirectee. Consequently, a
+ * thunk update (see rts/Updates.h) does the following:
+ *
+ * 1. Use a release-fence to ensure that the indirectee is visible
+ * 2. Use a relaxed-store to place the new indirectee into the thunk's
+ *    indirectee field
+ * 3. Use a release-store to set the info table to stg_BLACKHOLE (which
+ *    represents an indirection)
+ *
+ * Blackholing a thunk (either eagerly, by GHC.StgToCmm.Bind.emitBlackHoleCode,
+ * or lazily, by ThreadPaused.c:threadPaused) is done similarly.
+ *
+ * Conversely, thunk entry (see the entry code of stg_BLACKHOLE in
+ * rts/StgMiscClosures.cmm) does the following:
+ *
+ * 1. We jump into the entry code for stg_BLACKHOLE; this of course implies
+ *    that we have already read the thunk's info table pointer, which is done
+ *    with a relaxed load.
+ * 2. Use an acquire-fence to ensure that our view on the thunk is
+ *    up-to-date. This synchronizes with step (3) in the update
+ *    procedure.
+ * 3. Relaxed-load the indirectee. Since thunks are updated at most
+ *    once we know that the fence in the last step has given us
+ *    an up-to-date view of the indirectee closure.
+ * 4. Enter the indirectee (or block if the indirectee is a TSO)
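+ *
+ * For illustration, a rough sketch of the two sides of this protocol follows.
+ * This is not the actual code in rts/Updates.h or the stg_BLACKHOLE entry
+ * code; the RELEASE_FENCE/ACQUIRE_FENCE and RELAXED_LOAD/RELAXED_STORE/
+ * RELEASE_STORE helpers are assumed, as is a thunk p being updated with
+ * result v:
+ *
+ *     // Updater (steps 1-3 of the update protocol):
+ *     RELEASE_FENCE();                                        // step 1
+ *     RELAXED_STORE(&((StgInd *) p)->indirectee, v);          // step 2
+ *     RELEASE_STORE(&p->header.info, &stg_BLACKHOLE_info);    // step 3
+ *
+ *     // Observer (steps 1-4 of the entry protocol):
+ *     const StgInfoTable *info = RELAXED_LOAD(&p->header.info);        // step 1
+ *     if (info == &stg_BLACKHOLE_info) {
+ *         ACQUIRE_FENCE();                                             // step 2
+ *         StgClosure *ind = RELAXED_LOAD(&((StgInd *) p)->indirectee); // step 3
+ *         // step 4: enter ind (or block if it is a TSO)
+ *     }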
+ *
+ * Other closures
+ * --------------
+ * Below is a summary of the barriers which are necessary to make GHC's
+ * primitive types respect the above rules:
+ *
+ * - MVar# operations:
+ *   The MVar# primops lock the MVar prior to reading or writing any MVar field.
 *
 * - Write to a TVar#:
 *   This is protected by the full barrier implied by the CAS in STM.c:lock_stm.
 *
- * - Write to an Array#, ArrayArray#, or SmallArray#:
+ * - Write to an Array# or SmallArray#:
 *   This case is protected by an explicit write barrier in the code produced
 *   for this primop by the codegen. See GHC.StgToCmm.Prim.doWritePtrArrayOp and
 *   GHC.StgToCmm.Prim.doWriteSmallPtrArrayOp. Relevant issue: #12469.
@@ -211,15 +246,58 @@ EXTERN_INLINE void load_load_barrier(void);
 *   This is protected by the acquisition and release of the target capability's
 *   lock in Messages.c:sendMessage.
 *
- * Finally, we must ensure that we flush all cores store buffers before
- * entering and leaving GC, since stacks may be read by other cores. This
- * happens as a side-effect of taking and release mutexes (which implies
- * acquire and release barriers, respectively).
- *
 * N.B. recordClosureMutated places a reference to the mutated object on
 * the capability-local mut_list. Consequently this does not require any memory
 * barrier.
 *
+ * Barriers in thread migration
+ * ----------------------------
+ * When a thread is migrated from one capability to another we must take care
+ * that the new capability synchronizes with the old (e.g. to ensure that
+ * the state of the thread's stack is visible to the new executor).
+ * This is ensured by the barriers implied in sending the MessageWakeup.
+ *
+ * Barriers on Messages
+ * --------------------
+ * The RTS uses the Message mechanism to convey information between capabilities.
+ * To send a message (see Messages.c:sendMessage) the sender must first take
+ * the `lock` of the recipient capability. The Message can then be linked
+ * onto the recipient's `inbox` list and the lock released (implying a release
+ * barrier).
+ *
+ * To process its inbox (see Schedule.c:scheduleProcessInbox) the recipient must
+ * take the capability lock before examining the inbox. This implies an acquire
+ * barrier, ensuring that the messages in the inbox are visible.
+ *
+ * Under the above story there is no need for accesses to `inbox` to be
+ * atomic at all as all accesses are under the capability lock. However, there
+ * is a slight wrinkle here:
+ * Capability.h:emptyInbox wants to test whether `inbox` is empty without
+ * holding the capability lock. Consequently, accesses to `inbox` must be
+ * atomic, although they may use relaxed ordering.
+ *
+ * Barriers during GC
+ * ------------------
+ * Entering the garbage collector implies an acquire barrier across all
+ * capabilities as all capabilities obey the following protocol when entering
+ * `gcWorkerThread`:
+ *
+ * 1. acquire gc_entry_mutex
+ * 2. signal gc_entry_arrived_cv to let the GC leader know
+ *    that we have started to synchronize
+ * 3. wait on gc_entry_start_now_cv, releasing
+ *    gc_entry_mutex; this implies a release barrier
+ * 4. after all threads have synchronized, the leader will
+ *    broadcast gc_entry_start_now_cv
+ * 5. each of the waiting workers will acquire
+ *    gc_entry_mutex, implying an acquire barrier which will
+ *    synchronize with all of the other workers' releases in
+ *    (3).
+ *
+ * Similarly, leaving the garbage collector implies a release
+ * barrier as gcWorkerThread blocks on gc_exit_leave_now_cv
+ * with a similar protocol.
+ *
 * During parallel GC we need to be careful during evacuation: before replacing
 * a closure with a forwarding pointer we must commit a write barrier to ensure
 * that the copy we made in to-space is visible to other cores.
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 896b408ff9..dc27284500 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -1408,7 +1408,7 @@ dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
     Capability *cap = regTableToCapability(reg);
     // No barrier required here as no other heap object fields are read. See
-    // note [Heap memory barriers] in SMP.h.
+    // Note [Heap memory barriers] in SMP.h.
     SET_INFO((StgClosure*) mvar, &stg_MUT_VAR_DIRTY_info);
     recordClosureMutated(cap, (StgClosure *) mvar);
     IF_NONMOVING_WRITE_BARRIER_ENABLED {
@@ -1429,7 +1429,7 @@ dirty_TVAR(Capability *cap, StgTVar *p, StgClosure *old)
 {
     // No barrier required here as no other heap object fields are read. See
-    // note [Heap memory barriers] in SMP.h.
+    // Note [Heap memory barriers] in SMP.h.
     if (RELAXED_LOAD(&p->header.info) == &stg_TVAR_CLEAN_info) {
         SET_INFO((StgClosure*) p, &stg_TVAR_DIRTY_info);
         recordClosureMutated(cap,(StgClosure*)p);
--
cgit v1.2.1