Diffstat (limited to 'rts/sm/GC.c')
-rw-r--r-- | rts/sm/GC.c | 135 |
1 file changed, 80 insertions, 55 deletions
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index c39dcc2e89..a5aa7e1f4e 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -112,14 +112,8 @@ static W_ g0_pcnt_kept = 30; // percentage of g0 live at last minor GC
 
 /* Mut-list stats */
 #if defined(DEBUG)
-uint32_t mutlist_MUTVARS,
-    mutlist_MUTARRS,
-    mutlist_MVARS,
-    mutlist_TVAR,
-    mutlist_TVAR_WATCH_QUEUE,
-    mutlist_TREC_CHUNK,
-    mutlist_TREC_HEADER,
-    mutlist_OTHERS;
+// For lack of a better option we protect mutlist_scav_stats with oldest_gen->sync
+MutListScavStats mutlist_scav_stats;
 #endif
 
 /* Thread-local data for each GC thread
@@ -184,6 +178,36 @@ bdescr *mark_stack_top_bd; // topmost block in the mark stack
 bdescr *mark_stack_bd;     // current block in the mark stack
 StgPtr mark_sp;            // pointer to the next unallocated mark stack entry
 
+
+/* -----------------------------------------------------------------------------
+   Statistics from mut_list scavenging
+   -------------------------------------------------------------------------- */
+
+#if defined(DEBUG)
+void
+zeroMutListScavStats(MutListScavStats *src)
+{
+    memset(src, 0, sizeof(MutListScavStats));
+}
+
+void
+addMutListScavStats(const MutListScavStats *src,
+                    MutListScavStats *dest)
+{
+#define ADD_STATS(field) dest->field += src->field;
+    ADD_STATS(n_MUTVAR);
+    ADD_STATS(n_MUTARR);
+    ADD_STATS(n_MVAR);
+    ADD_STATS(n_TVAR);
+    ADD_STATS(n_TREC_CHUNK);
+    ADD_STATS(n_TVAR_WATCH_QUEUE);
+    ADD_STATS(n_TREC_HEADER);
+    ADD_STATS(n_OTHERS);
+#undef ADD_STATS
+}
+#endif /* DEBUG */
+
+
 /* -----------------------------------------------------------------------------
    GarbageCollect: the main entry point to the garbage collector.
 
@@ -250,14 +274,7 @@ GarbageCollect (uint32_t collect_gen,
     stablePtrLock();
 
 #if defined(DEBUG)
-    mutlist_MUTVARS = 0;
-    mutlist_MUTARRS = 0;
-    mutlist_MVARS = 0;
-    mutlist_TVAR = 0;
-    mutlist_TVAR_WATCH_QUEUE = 0;
-    mutlist_TREC_CHUNK = 0;
-    mutlist_TREC_HEADER = 0;
-    mutlist_OTHERS = 0;
+    zeroMutListScavStats(&mutlist_scav_stats);
 #endif
 
     // attribute any costs to CCS_GC
@@ -520,37 +537,37 @@ GarbageCollect (uint32_t collect_gen,
         const gc_thread* thread;
 
         for (i=0; i < n_gc_threads; i++) {
-            copied += gc_threads[i]->copied;
+            copied += RELAXED_LOAD(&gc_threads[i]->copied);
         }
 
        for (i=0; i < n_gc_threads; i++) {
            thread = gc_threads[i];
            if (n_gc_threads > 1) {
                debugTrace(DEBUG_gc,"thread %d:", i);
                debugTrace(DEBUG_gc,"   copied           %ld",
-                          thread->copied * sizeof(W_));
+                          RELAXED_LOAD(&thread->copied) * sizeof(W_));
                debugTrace(DEBUG_gc,"   scanned          %ld",
-                          thread->scanned * sizeof(W_));
+                          RELAXED_LOAD(&thread->scanned) * sizeof(W_));
                debugTrace(DEBUG_gc,"   any_work         %ld",
-                          thread->any_work);
+                          RELAXED_LOAD(&thread->any_work));
                debugTrace(DEBUG_gc,"   no_work          %ld",
-                          thread->no_work);
+                          RELAXED_LOAD(&thread->no_work));
                debugTrace(DEBUG_gc,"   scav_find_work %ld",
-                          thread->scav_find_work);
+                          RELAXED_LOAD(&thread->scav_find_work));
 
 #if defined(THREADED_RTS) && defined(PROF_SPIN)
-               gc_spin_spin += thread->gc_spin.spin;
-               gc_spin_yield += thread->gc_spin.yield;
-               mut_spin_spin += thread->mut_spin.spin;
-               mut_spin_yield += thread->mut_spin.yield;
+               gc_spin_spin += RELAXED_LOAD(&thread->gc_spin.spin);
+               gc_spin_yield += RELAXED_LOAD(&thread->gc_spin.yield);
+               mut_spin_spin += RELAXED_LOAD(&thread->mut_spin.spin);
+               mut_spin_yield += RELAXED_LOAD(&thread->mut_spin.yield);
 #endif
-               any_work += thread->any_work;
-               no_work += thread->no_work;
-               scav_find_work += thread->scav_find_work;
+               any_work += RELAXED_LOAD(&thread->any_work);
+               no_work += RELAXED_LOAD(&thread->no_work);
+               scav_find_work += RELAXED_LOAD(&thread->scav_find_work);
 
-               par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+               par_max_copied = stg_max(RELAXED_LOAD(&thread->copied), par_max_copied);
                par_balanced_copied_acc +=
-                   stg_min(n_gc_threads * gc_threads[i]->copied, copied);
+                   stg_min(n_gc_threads * RELAXED_LOAD(&thread->copied), copied);
            }
        }
        if (n_gc_threads > 1) {
@@ -590,10 +607,14 @@ GarbageCollect (uint32_t collect_gen,
         debugTrace(DEBUG_gc,
                    "mut_list_size: %lu (%d vars, %d arrays, %d MVARs, %d TVARs, %d TVAR_WATCH_QUEUEs, %d TREC_CHUNKs, %d TREC_HEADERs, %d others)",
                    (unsigned long)(mut_list_size * sizeof(W_)),
-                   mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS,
-                   mutlist_TVAR, mutlist_TVAR_WATCH_QUEUE,
-                   mutlist_TREC_CHUNK, mutlist_TREC_HEADER,
-                   mutlist_OTHERS);
+                   mutlist_scav_stats.n_MUTVAR,
+                   mutlist_scav_stats.n_MUTARR,
+                   mutlist_scav_stats.n_MVAR,
+                   mutlist_scav_stats.n_TVAR,
+                   mutlist_scav_stats.n_TVAR_WATCH_QUEUE,
+                   mutlist_scav_stats.n_TREC_CHUNK,
+                   mutlist_scav_stats.n_TREC_HEADER,
+                   mutlist_scav_stats.n_OTHERS);
     }
 
     bdescr *next, *prev;
@@ -1109,7 +1130,7 @@ inc_running (void)
 static StgWord
 dec_running (void)
 {
-    ASSERT(gc_running_threads != 0);
+    ASSERT(RELAXED_LOAD(&gc_running_threads) != 0);
     return atomic_dec(&gc_running_threads);
 }
 
@@ -1119,7 +1140,7 @@ any_work (void)
     int g;
     gen_workspace *ws;
 
-    gct->any_work++;
+    NONATOMIC_ADD(&gct->any_work, 1);
 
     write_barrier();
 
@@ -1193,7 +1214,7 @@ loop:
 
     debugTrace(DEBUG_gc, "%d GC threads still running", r);
 
-    while (gc_running_threads != 0) {
+    while (SEQ_CST_LOAD(&gc_running_threads) != 0) {
         // usleep(1);
         if (any_work()) {
             inc_running();
@@ -1257,10 +1278,13 @@ gcWorkerThread (Capability *cap)
 
     // Wait until we're told to continue
     RELEASE_SPIN_LOCK(&gct->gc_spin);
-    stat_endGCWorker (cap, gct); // write stats before setting gct->wakeup (#17964,#18717)
-    gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
     debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
               gct->thread_index);
+    stat_endGCWorker (cap, gct);
+    // This must come *after* stat_endGCWorker since it serves to
+    // synchronize us with the GC leader, which will later aggregate the
+    // GC statistics (#17964,#18717)
+    SEQ_CST_STORE(&gct->wakeup, GC_THREAD_WAITING_TO_CONTINUE);
     ACQUIRE_SPIN_LOCK(&gct->mut_spin);
     debugTrace(DEBUG_gc, "GC thread %d on my way...",
               gct->thread_index);
@@ -1285,7 +1309,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
     while(retry) {
         for (i=0; i < n_threads; i++) {
             if (i == me || idle_cap[i]) continue;
-            if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
+            if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
                 prodCapability(capabilities[i], cap->running_task);
             }
         }
@@ -1295,7 +1319,7 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             if (i == me || idle_cap[i]) continue;
             write_barrier();
             interruptCapability(capabilities[i]);
-            if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) {
+            if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
                 retry = true;
             }
         }
@@ -1352,10 +1376,10 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
         if (i == me || idle_cap[i]) continue;
         inc_running();
         debugTrace(DEBUG_gc, "waking up gc thread %d", i);
-        if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY)
+        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY)
             barf("wakeup_gc_threads");
 
-        gc_threads[i]->wakeup = GC_THREAD_RUNNING;
+        SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_RUNNING);
         ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
         RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
     }
@@ -1376,9 +1400,8 @@ shutdown_gc_threads (uint32_t me USED_IF_THREADS,
 
     for (i=0; i < n_gc_threads; i++) {
         if (i == me || idle_cap[i]) continue;
-        while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) {
+        while (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE) {
             busy_wait_nop();
-            write_barrier();
         }
     }
 #endif
@@ -1393,10 +1416,10 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
     uint32_t i;
     for (i=0; i < n_threads; i++) {
         if (i == me || idle_cap[i]) continue;
-        if (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE)
+        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE)
             barf("releaseGCThreads");
 
-        gc_threads[i]->wakeup = GC_THREAD_INACTIVE;
+        SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_INACTIVE);
         ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
         RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
     }
@@ -1412,7 +1435,7 @@ static void
 stash_mut_list (Capability *cap, uint32_t gen_no)
 {
     cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
-    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
+    RELEASE_STORE(&cap->mut_lists[gen_no], allocBlockOnNode_sync(cap->node));
 }
 
 /* ----------------------------------------------------------------------------
@@ -1438,9 +1461,11 @@ prepare_collected_gen (generation *gen)
         // mutable list always has at least one block; this means we can avoid
         // a check for NULL in recordMutable().
         for (i = 0; i < n_capabilities; i++) {
-            freeChain(capabilities[i]->mut_lists[g]);
-            capabilities[i]->mut_lists[g] =
-                allocBlockOnNode(capNoToNumaNode(i));
+            bdescr *old = RELAXED_LOAD(&capabilities[i]->mut_lists[g]);
+            freeChain(old);
+
+            bdescr *new = allocBlockOnNode(capNoToNumaNode(i));
+            RELAXED_STORE(&capabilities[i]->mut_lists[g], new);
         }
     }
 
@@ -1654,7 +1679,7 @@ collect_pinned_object_blocks (void)
        bdescr *last = NULL;
        if (use_nonmoving && gen == oldest_gen) {
            // Mark objects as belonging to the nonmoving heap
-           for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+           for (bdescr *bd = RELAXED_LOAD(&capabilities[n]->pinned_object_blocks); bd != NULL; bd = bd->link) {
               bd->flags |= BF_NONMOVING;
               bd->gen = oldest_gen;
               bd->gen_no = oldest_gen->no;
@@ -1673,8 +1698,8 @@ collect_pinned_object_blocks (void)
           if (gen->large_objects != NULL) {
               gen->large_objects->u.back = last;
           }
-          gen->large_objects = capabilities[n]->pinned_object_blocks;
-          capabilities[n]->pinned_object_blocks = NULL;
+          g0->large_objects = RELAXED_LOAD(&capabilities[n]->pinned_object_blocks);
+          RELAXED_STORE(&capabilities[n]->pinned_object_blocks, NULL);
        }
    }
 }
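
The patch replaces the eight separate mutlist_* counters with a single MutListScavStats struct, zeroed at the start of each GC and, per the new comment, protected by oldest_gen->sync. The following is a small standalone sketch (not part of rts/sm/GC.c) of how a scavenging thread might use the two new helpers: count into a private copy while walking a mutable list, then fold the result into the shared struct once, under a lock. The field names and the two helpers mirror the diff; the pthread lock standing in for oldest_gen->sync, the walk loop, and main() are hypothetical.

    /* Sketch only: field names and helpers mirror the patch; the lock,
     * the walk loop and main() are stand-ins for illustration. */
    #include <stdio.h>
    #include <string.h>
    #include <pthread.h>

    typedef struct {
        unsigned long n_MUTVAR, n_MUTARR, n_MVAR, n_TVAR,
                      n_TVAR_WATCH_QUEUE, n_TREC_CHUNK, n_TREC_HEADER, n_OTHERS;
    } MutListScavStats;

    static MutListScavStats mutlist_scav_stats;        /* shared, lock-protected */
    static pthread_mutex_t  oldest_gen_sync = PTHREAD_MUTEX_INITIALIZER;

    static void zeroMutListScavStats(MutListScavStats *s)
    {
        memset(s, 0, sizeof(MutListScavStats));
    }

    static void addMutListScavStats(const MutListScavStats *src, MutListScavStats *dest)
    {
        dest->n_MUTVAR += src->n_MUTVAR;
        dest->n_MUTARR += src->n_MUTARR;
        dest->n_MVAR   += src->n_MVAR;
        dest->n_TVAR   += src->n_TVAR;
        dest->n_TVAR_WATCH_QUEUE += src->n_TVAR_WATCH_QUEUE;
        dest->n_TREC_CHUNK  += src->n_TREC_CHUNK;
        dest->n_TREC_HEADER += src->n_TREC_HEADER;
        dest->n_OTHERS += src->n_OTHERS;
    }

    /* A GC thread counts into a private copy while scavenging, then merges
     * it into the shared struct once, so the hot loop never touches shared
     * state and the lock is taken only once per mutable list. */
    static void scavenge_mut_list_sketch(void)
    {
        MutListScavStats local;
        zeroMutListScavStats(&local);

        local.n_MUTVAR += 3;      /* ...pretend we scavenged three MUT_VARs... */
        local.n_OTHERS += 1;

        pthread_mutex_lock(&oldest_gen_sync);
        addMutListScavStats(&local, &mutlist_scav_stats);
        pthread_mutex_unlock(&oldest_gen_sync);
    }

    int main(void)
    {
        scavenge_mut_list_sketch();
        printf("MUT_VARs seen: %lu\n", mutlist_scav_stats.n_MUTVAR);
        return 0;
    }

Keeping the per-thread tally private until the final merge is what lets the debug counters stay cheap even with many GC threads; only the aggregate needs the lock.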
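
Most of the remaining changes swap plain loads and stores of shared GC state (gct->copied, gct->wakeup, gc_running_threads, the capability mut_lists) for the RTS's RELAXED_/SEQ_CST_/RELEASE_ memory-ordering macros. As an illustration of why the wakeup store is sequentially consistent and must come after stat_endGCWorker, here is a standalone C11 sketch; it assumes those macros behave like atomics with the corresponding memory_order, which is an assumption, not GHC's actual definitions. The names wakeup, copied and GC_THREAD_WAITING_TO_CONTINUE echo the diff; the thread setup and main() are invented for the example.

    /* Illustration only: a worker publishes its stats, then flags completion
     * with a seq_cst store; the leader waits on the flag and only then reads
     * the stats, mirroring the gcWorkerThread / shutdown_gc_threads pattern. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <pthread.h>

    enum { GC_THREAD_RUNNING, GC_THREAD_WAITING_TO_CONTINUE };

    static _Atomic int  wakeup = GC_THREAD_RUNNING;
    static _Atomic long copied = 0;

    static void *gc_worker(void *arg)
    {
        (void)arg;
        /* ...do GC work, count words copied... */
        atomic_store_explicit(&copied, 12345, memory_order_relaxed);      /* like RELAXED_STORE */
        /* Publish "I am done": must come after the stats writes. */
        atomic_store_explicit(&wakeup, GC_THREAD_WAITING_TO_CONTINUE,
                              memory_order_seq_cst);                      /* like SEQ_CST_STORE */
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, gc_worker, NULL);

        /* GC leader: shutdown_gc_threads()-style wait, then aggregate stats. */
        while (atomic_load_explicit(&wakeup, memory_order_seq_cst)        /* like SEQ_CST_LOAD */
                   != GC_THREAD_WAITING_TO_CONTINUE) {
            /* busy_wait_nop(); */
        }
        long total = atomic_load_explicit(&copied, memory_order_relaxed); /* like RELAXED_LOAD */
        printf("copied %ld words\n", total);

        pthread_join(t, NULL);
        return 0;
    }

Because a seq_cst store also has release semantics and the matching seq_cst load has acquire semantics, once the leader observes GC_THREAD_WAITING_TO_CONTINUE it is guaranteed to see the stats written before that store. That is the ordering the reordered code in gcWorkerThread relies on, and why the patch's comment insists the wakeup store follow stat_endGCWorker (#17964, #18717).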