-rw-r--r-- | includes/RtsAPI.h |  14
-rw-r--r-- | rts/Stats.c       |  30
-rw-r--r-- | rts/Stats.h       |   4
-rw-r--r-- | rts/sm/GC.c       | 185
-rw-r--r-- | rts/sm/GCThread.h |   4
5 files changed, 113 insertions, 124 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index c896b5b244..e2d1845819 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -251,20 +251,6 @@ typedef struct _RTSStats {
     // -----------------------------------
     // Internal Counters

-    // The number of times a GC thread spun on its 'gc_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t gc_spin_spin;
-    // The number of times a GC thread yielded on its 'gc_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t gc_spin_yield;
-    // The number of times a GC thread spun on its 'mut_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t mut_spin_spin;
-    // The number of times a GC thread yielded on its 'mut_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t mut_spin_yield;
-    // The number of times a GC thread has checked for work across all parallel
-    // GCs
     uint64_t any_work;
     // The number of times a GC thread has iterated it's outer loop across all
     // parallel GCs
diff --git a/rts/Stats.c b/rts/Stats.c
index 45d40ddcad..53251bb7d6 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -154,10 +154,6 @@ initStats0(void)
         .par_copied_bytes = 0,
         .cumulative_par_max_copied_bytes = 0,
         .cumulative_par_balanced_copied_bytes = 0,
-        .gc_spin_spin = 0,
-        .gc_spin_yield = 0,
-        .mut_spin_spin = 0,
-        .mut_spin_yield = 0,
         .any_work = 0,
         .scav_find_work = 0,
         .max_n_todo_overflow = 0,
@@ -460,9 +456,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
 void
 stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
             uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
-            W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
-            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
-            W_ max_n_todo_overflow)
+            W_ par_max_copied, W_ par_balanced_copied, W_ any_work,
+            W_ scav_find_work, W_ max_n_todo_overflow)
 {
     ACQUIRE_LOCK(&stats_mutex);

@@ -544,10 +539,6 @@ stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ s
             stats.any_work += any_work;
             stats.scav_find_work += scav_find_work;
             stats.max_n_todo_overflow += stg_max(max_n_todo_overflow, stats.max_n_todo_overflow);
-            stats.gc_spin_spin += gc_spin_spin;
-            stats.gc_spin_yield += gc_spin_yield;
-            stats.mut_spin_spin += mut_spin_spin;
-            stats.mut_spin_yield += mut_spin_yield;
         }
         stats.gc_cpu_ns += stats.gc.cpu_ns;
         stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -968,16 +959,6 @@ static void report_summary(const RTSSummaryStats* sum)
                     , col_width[1], "gc_alloc_block_sync"
                     , col_width[2], gc_alloc_block_sync.spin
                     , col_width[3], gc_alloc_block_sync.yield);
-        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                    , col_width[0], ""
-                    , col_width[1], "gc_spin"
-                    , col_width[2], stats.gc_spin_spin
-                    , col_width[3], stats.gc_spin_yield);
-        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                    , col_width[0], ""
-                    , col_width[1], "mut_spin"
-                    , col_width[2], stats.mut_spin_spin
-                    , col_width[3], stats.mut_spin_yield);
         statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                     , col_width[0], ""
                     , col_width[1], "whitehole_gc"
@@ -1154,10 +1135,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
     MR_STAT("gc_alloc_block_sync_yield", FMT_Word64, gc_alloc_block_sync.yield);
     MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
-    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
-    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
-    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
-    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
     MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
     MR_STAT("waitForGcThreads_yield", FMT_Word64, waitForGcThreads_yield);

@@ -1587,9 +1564,6 @@ Actual SpinLocks:
 * gc_alloc_block:
     This SpinLock protects the block allocator and free list manager. See
     BlockAlloc.c.
-* gc_spin and mut_spin:
-    These SpinLocks are used to herd gc worker threads during parallel garbage
-    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
 * gen[g].sync:
     These SpinLocks, one per generation, protect the generations[g] data
     structure during garbage collection.
diff --git a/rts/Stats.h b/rts/Stats.h
index 19bd707302..5268c3ae63 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -36,9 +36,7 @@ void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ li
                  W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
                  struct gc_thread_ **gc_threads,
                  W_ par_max_copied, W_ par_balanced_copied,
-                 W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
-                 W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
-                 W_ max_n_todo_overflow);
+                 W_ any_work, W_ scav_find_work, W_ max_n_todo_overflow);

 void stat_startNonmovingGcSync(void);
 void stat_endNonmovingGcSync(void);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 5e986f2296..4ddbcfdafd 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -269,7 +269,6 @@ GarbageCollect (uint32_t collect_gen,
   bdescr *bd;
   generation *gen;
   StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
-      gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
       any_work, scav_find_work, max_n_todo_overflow;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
@@ -391,7 +390,10 @@ GarbageCollect (uint32_t collect_gen,
   }
   work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
                   N >= RtsFlags.ParFlags.parGcLoadBalancingGen &&
-                  n_gc_threads > 1;
+                  (gc_type == SYNC_GC_PAR) &&
+                  (n_gc_threads - n_gc_idle_threads) > 1;
+                  // ^ if we are the only non-idle gc thread, turn off work stealing
+                  //
       // It's not always a good idea to do load balancing in parallel
       // GC. In particular, for a parallel program we don't want to
       // lose locality by moving cached data into another CPU's cache
@@ -588,10 +590,6 @@ GarbageCollect (uint32_t collect_gen,
   copied = 0;
   par_max_copied = 0;
   par_balanced_copied = 0;
-  gc_spin_spin = 0;
-  gc_spin_yield = 0;
-  mut_spin_spin = 0;
-  mut_spin_yield = 0;
   any_work = 0;
   scav_find_work = 0;
   max_n_todo_overflow = 0;
@@ -617,13 +615,6 @@ GarbageCollect (uint32_t collect_gen,
         debugTrace(DEBUG_gc," scav_find_work %ld",
                    RELAXED_LOAD(&thread->scav_find_work));

-#if defined(THREADED_RTS) && defined(PROF_SPIN)
-        gc_spin_spin += RELAXED_LOAD(&thread->gc_spin.spin);
-        gc_spin_yield += RELAXED_LOAD(&thread->gc_spin.yield);
-        mut_spin_spin += RELAXED_LOAD(&thread->mut_spin.spin);
-        mut_spin_yield += RELAXED_LOAD(&thread->mut_spin.yield);
-#endif
-
         any_work += RELAXED_LOAD(&thread->any_work);
         scav_find_work += RELAXED_LOAD(&thread->scav_find_work);
         max_n_todo_overflow = stg_max(RELAXED_LOAD(&thread->max_n_todo_overflow), max_n_todo_overflow);
@@ -1044,7 +1035,6 @@ GarbageCollect (uint32_t collect_gen,
                 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
                 N, n_gc_threads, gc_threads,
                 par_max_copied, par_balanced_copied,
-                gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
                 any_work, scav_find_work, max_n_todo_overflow);

 #if defined(RTS_USER_SIGNALS)
@@ -1083,10 +1073,6 @@ new_gc_thread (uint32_t n, gc_thread *t)

 #if defined(THREADED_RTS)
     t->id = 0;
-    initSpinLock(&t->gc_spin);
-    initSpinLock(&t->mut_spin);
-    ACQUIRE_SPIN_LOCK(&t->gc_spin);
-    ACQUIRE_SPIN_LOCK(&t->mut_spin);
     SEQ_CST_STORE(&t->wakeup, GC_THREAD_INACTIVE); // starts true, so we can wait for the
                               // thread to start up, see wakeup_gc_threads
 #endif
@@ -1149,6 +1135,12 @@ initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
     } else {
         gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
                                      "initGcThreads");
+        initMutex(&gc_entry_mutex);
+        initCondition(&gc_entry_arrived_cv);
+        initCondition(&gc_entry_start_now_cv);
+        initMutex(&gc_exit_mutex);
+        initCondition(&gc_exit_arrived_cv);
+        initCondition(&gc_exit_leave_now_cv);
         initMutex(&gc_running_mutex);
         initCondition(&gc_running_cv);
     }
@@ -1185,6 +1177,12 @@ freeGcThreads (void)
         }
         closeCondition(&gc_running_cv);
         closeMutex(&gc_running_mutex);
+        closeCondition(&gc_exit_leave_now_cv);
+        closeCondition(&gc_exit_arrived_cv);
+        closeMutex(&gc_exit_mutex);
+        closeCondition(&gc_entry_start_now_cv);
+        closeCondition(&gc_entry_arrived_cv);
+        closeMutex(&gc_entry_mutex);
         stgFree (gc_threads);
 #else
     for (g = 0; g < RtsFlags.GcFlags.generations; g++)
@@ -1339,16 +1337,16 @@ gcWorkerThread (Capability *cap)
     gct->id = osThreadId();
     stat_startGCWorker (cap, gct);

-    // Wait until we're told to wake up
-    RELEASE_SPIN_LOCK(&gct->mut_spin);
-    // yieldThread();
-    // Strangely, adding a yieldThread() here makes the CPU time
-    // measurements more accurate on Linux, perhaps because it syncs
-    // the CPU time across the multiple cores. Without this, CPU time
-    // is heavily skewed towards GC rather than MUT.
     SEQ_CST_STORE(&gct->wakeup, GC_THREAD_STANDING_BY);
     debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gct->gc_spin);
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
+    SEQ_CST_ADD(&n_gc_entered, 1);
+    signalCondition(&gc_entry_arrived_cv);
+    while(SEQ_CST_LOAD(&n_gc_entered) != 0) {
+        waitCondition(&gc_entry_start_now_cv, &gc_entry_mutex);
+    }
+    RELEASE_LOCK(&gc_entry_mutex);

     init_gc_thread(gct);

@@ -1372,15 +1370,22 @@ gcWorkerThread (Capability *cap)
 #endif

     // Wait until we're told to continue
-    RELEASE_SPIN_LOCK(&gct->gc_spin);
     debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
                gct->thread_index);
     stat_endGCWorker (cap, gct);
+
     // This must come *after* stat_endGCWorker since it serves to
     // synchronize us with the GC leader, which will later aggregate the
     // GC statistics (#17964,#18717)
+    ACQUIRE_LOCK(&gc_exit_mutex);
     SEQ_CST_STORE(&gct->wakeup, GC_THREAD_WAITING_TO_CONTINUE);
-    ACQUIRE_SPIN_LOCK(&gct->mut_spin);
+    SEQ_CST_ADD(&n_gc_exited, 1);
+    signalCondition(&gc_exit_arrived_cv);
+    while(SEQ_CST_LOAD(&n_gc_exited) != 0) {
+        waitCondition(&gc_exit_leave_now_cv, &gc_exit_mutex);
+    }
+    RELEASE_LOCK(&gc_exit_mutex);
+
     debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);

     SET_GCT(saved_gct);
@@ -1391,56 +1396,53 @@ gcWorkerThread (Capability *cap)
 #if defined(THREADED_RTS)

 void
-waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
+waitForGcThreads (Capability *cap, bool idle_cap[])
 {
-    const uint32_t n_threads = n_capabilities;
+    // n_gc_threads is not valid here, we're too early
+    uint32_t n_threads = n_capabilities;
     const uint32_t me = cap->no;
-    uint32_t i, j;
-    bool retry = true;
+    uint32_t i, cur_n_gc_entered;
     Time t0, t1, t2;

     t0 = t1 = t2 = getProcessElapsedTime();

-    while(retry) {
-        for (i=0; i < n_threads; i++) {
-            if (i == me || idle_cap[i]) continue;
+    for(i = 0; i < n_capabilities; ++i) {
+        if (i == me || idle_cap[i]) {
+            --n_threads;
+        }
+    }
+
+    ASSERT(n_threads < n_capabilities); // must be less becasue we don't count ourself
+    if(n_threads == 0) { return; }
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
+    while((cur_n_gc_entered = SEQ_CST_LOAD(&n_gc_entered)) != n_threads) {
+        ASSERT(cur_n_gc_entered < n_threads);
+        for(i = 0; i < n_capabilities; ++i) {
+            if (i == me || idle_cap[i]) { continue; }
             if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
                 prodCapability(capabilities[i], cap->running_task);
-            }
-        }
-        for (j=0; j < 10; j++) {
-            retry = false;
-            for (i=0; i < n_threads; i++) {
-                if (i == me || idle_cap[i]) continue;
                 write_barrier();
                 interruptCapability(capabilities[i]);
-                if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
-                    retry = true;
-                }
             }
-            if (!retry) break;
-#if defined(PROF_SPIN)
-            waitForGcThreads_yield++;
-#endif
-            yieldThread();
         }
-
+        // this 1ms timeout is not well justified. It's the shortest timeout we
+        // can use on windows. It seems to work well for me.
+        timedWaitCondition(&gc_entry_arrived_cv, &gc_entry_mutex, USToTime(1000));
         t2 = getProcessElapsedTime();
         if (RtsFlags.GcFlags.longGCSync != 0 &&
             t2 - t1 > RtsFlags.GcFlags.longGCSync) {
+            // best not to hold the mutex while we call a hook function
+            RELEASE_LOCK(&gc_entry_mutex);
+
             /* call this every longGCSync of delay */
             rtsConfig.longGCSync(cap->no, t2 - t0);
             t1 = t2;
-        }
-        if (retry) {
-#if defined(PROF_SPIN)
-            // This is a bit strange, we'll get more yields than spins.
-            // I guess that means it's not a spin-lock at all, but these
-            // numbers are still useful (I think).
-            waitForGcThreads_spin++;
-#endif
+
+            ACQUIRE_LOCK(&gc_entry_mutex);
         }
     }
+    RELEASE_LOCK(&gc_entry_mutex);

     if (RtsFlags.GcFlags.longGCSync != 0 &&
         t2 - t0 > RtsFlags.GcFlags.longGCSync) {
@@ -1459,17 +1461,28 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
     if (n_gc_threads == 1) return;

+#if defined(DEBUG)
+    StgInt num_idle = 0;
+    for(i=0; i < n_gc_threads; ++i) {
+        ASSERT(!(i==me && idle_cap[i]));
+        if (idle_cap[i]) { ++num_idle;}
+    }
+    ASSERT(num_idle == n_gc_idle_threads);
+#endif
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
     for (i=0; i < n_gc_threads; i++) {
         if (i == me || idle_cap[i]) continue;
         inc_running();
         debugTrace(DEBUG_gc, "waking up gc thread %d", i);
-        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY)
-            barf("wakeup_gc_threads");
-
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_STANDING_BY);
         SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_RUNNING);
-        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
-        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
     }
+    ASSERT(SEQ_CST_LOAD(&n_gc_entered) ==
+        (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads);
+    SEQ_CST_STORE(&n_gc_entered, 0);
+    broadcastCondition(&gc_entry_start_now_cv);
+    RELEASE_LOCK(&gc_entry_mutex);
 #endif
 }

@@ -1477,21 +1490,31 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
 // standby state, otherwise they may still be executing inside
 // scavenge_until_all_done(), and may even remain awake until the next GC starts.
 static void
-shutdown_gc_threads (uint32_t me USED_IF_THREADS,
-                     bool idle_cap[] USED_IF_THREADS)
+shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG,
+                     bool idle_cap[] USED_IF_THREADS USED_IF_DEBUG)
 {
 #if defined(THREADED_RTS)
-    uint32_t i;

     if (n_gc_threads == 1) return;

-    for (i=0; i < n_gc_threads; i++) {
+    // we need to wait for `n_threads` threads. -1 because that's ourself
+    StgInt n_threads = (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads;
+    StgInt cur_n_gc_exited;
+    ACQUIRE_LOCK(&gc_exit_mutex);
+    while((cur_n_gc_exited = SEQ_CST_LOAD(&n_gc_exited)) != n_threads) {
+        ASSERT(cur_n_gc_exited >= 0);
+        ASSERT(cur_n_gc_exited < n_threads);
+        waitCondition(&gc_exit_arrived_cv, &gc_exit_mutex);
+    }
+#if defined(DEBUG)
+    uint32_t i;
+    for (i=0; i < n_capabilities; i++) {
         if (i == me || idle_cap[i]) continue;
-        while (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE) {
-            busy_wait_nop();
-        }
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
     }
-#endif
+#endif // DEBUG
+    RELEASE_LOCK(&gc_exit_mutex);
+#endif // THREADED_RTS
 }

 #if defined(THREADED_RTS)
@@ -1501,15 +1524,25 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
     const uint32_t n_threads = n_capabilities;
     const uint32_t me = cap->no;
     uint32_t i;
+#if defined(DEBUG)
+    uint32_t num_idle = 0;
+    for(i=0; i < n_threads; ++i) {
+        ASSERT(!(i==me && idle_cap[i]));
+        if (idle_cap[i]) { ++num_idle;}
+    }
+#endif
+
     for (i=0; i < n_threads; i++) {
         if (i == me || idle_cap[i]) continue;
-        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE)
-            barf("releaseGCThreads");
-
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
         SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_INACTIVE);
-        ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
-        RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
     }
+
+    ACQUIRE_LOCK(&gc_exit_mutex);
+    ASSERT(SEQ_CST_LOAD(&n_gc_exited) == (StgInt)n_threads - 1 - (StgInt)num_idle);
+    SEQ_CST_STORE(&n_gc_exited, 0);
+    broadcastCondition(&gc_exit_leave_now_cv);
+    RELEASE_LOCK(&gc_exit_mutex);
 }
 #endif
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 31719ca020..84cf2be939 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -125,9 +125,7 @@ typedef struct gc_thread_ {

 #if defined(THREADED_RTS)
     OSThreadId id;                 // The OS thread that this struct belongs to
-    SpinLock   gc_spin;
-    SpinLock   mut_spin;
-    StgWord    wakeup;             // This should only be accessed via atomic accesses
+    volatile StgWord wakeup;       // NB not StgWord8; only StgWord is guaranteed atomic
 #endif
     uint32_t thread_index;         // a zero based index identifying the thread
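Note on the synchronization pattern: the patch replaces the per-thread gc_spin/mut_spin SpinLocks with two counting "gates", one at GC entry (gc_entry_mutex, gc_entry_arrived_cv, gc_entry_start_now_cv, n_gc_entered) and one at GC exit (the analogous gc_exit_* variables). Workers check in under the mutex and sleep until the leader resets the counter and broadcasts. Below is a minimal, self-contained sketch of that handshake using plain POSIX threads rather than the RTS's Mutex/Condition wrappers; the names gate_t, gate_arrive and gate_release are hypothetical and exist only for illustration, and the sketch omits details of the real code such as the capability prodding and the 1 ms timed wait in waitForGcThreads.

/* Sketch (assumed illustration, not RTS code): a counting gate built from a
 * mutex and two condition variables, mirroring the entry/exit gates added by
 * this patch.  Fields would be initialized with pthread_mutex_init /
 * pthread_cond_init (or the static initializers), much as initGcThreads now
 * calls initMutex and initCondition. */
#include <pthread.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  arrived;    /* a worker has checked in              */
    pthread_cond_t  start_now;  /* the leader has reset the counter     */
    int             n_entered;  /* number of workers currently checked in */
} gate_t;

/* Worker side: check in, wake the leader, then sleep until the leader zeroes
 * the counter.  This mirrors what gcWorkerThread now does at both gates. */
static void gate_arrive(gate_t *g)
{
    pthread_mutex_lock(&g->mutex);
    g->n_entered++;
    pthread_cond_signal(&g->arrived);
    while (g->n_entered != 0) {               /* loop tolerates spurious wakeups */
        pthread_cond_wait(&g->start_now, &g->mutex);
    }
    pthread_mutex_unlock(&g->mutex);
}

/* Leader side: wait until n_threads workers have checked in, then release
 * them all at once.  This mirrors waitForGcThreads/wakeup_gc_threads on entry
 * and shutdown_gc_threads/releaseGCThreads on exit. */
static void gate_release(gate_t *g, int n_threads)
{
    pthread_mutex_lock(&g->mutex);
    while (g->n_entered != n_threads) {
        pthread_cond_wait(&g->arrived, &g->mutex);
    }
    g->n_entered = 0;                         /* reset for the next GC cycle */
    pthread_cond_broadcast(&g->start_now);    /* let every worker proceed    */
    pthread_mutex_unlock(&g->mutex);
}

Resetting the counter on the leader side before broadcasting is what keeps a worker from racing through the same gate twice, and broadcasting to all workers at once replaces the old scheme in which the leader herded each worker individually by acquiring and releasing its gc_spin/mut_spin pair.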