Diffstat (limited to 'rts')
-rw-r--r-- | rts/Messages.c          |   3 |
-rw-r--r-- | rts/Messages.h          |   4 |
-rw-r--r-- | rts/RtsFlags.c          |   6 |
-rw-r--r-- | rts/SMPClosureOps.h     |  12 |
-rw-r--r-- | rts/Stats.c             | 189 |
-rw-r--r-- | rts/Stats.h             |   5 |
-rw-r--r-- | rts/StgMiscClosures.cmm |  10 |
-rw-r--r-- | rts/ThreadPaused.c      |   4 |
-rw-r--r-- | rts/ThreadPaused.h      |   8 |
-rw-r--r-- | rts/sm/Evac.c           |   9 |
-rw-r--r-- | rts/sm/GC.c             |  71 |
-rw-r--r-- | rts/sm/GC.h             |   2 |
12 files changed, 293 insertions, 30 deletions
diff --git a/rts/Messages.c b/rts/Messages.c
index 8fab314bc4..a9c794d823 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -129,6 +129,9 @@ loop:
     }
     else if (i == &stg_WHITEHOLE_info)
     {
+#if defined(PROF_SPIN)
+        ++whitehole_executeMessage_spin;
+#endif
        goto loop;
     }
     else
diff --git a/rts/Messages.h b/rts/Messages.h
index e60f19dc1d..18371564c4 100644
--- a/rts/Messages.h
+++ b/rts/Messages.h
@@ -31,3 +31,7 @@ doneWithMsgThrowTo (MessageThrowTo *m)
 }
 
 #include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_executeMessage_spin;
+#endif
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 7b38bbd6fd..b674e9b685 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -232,6 +232,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.MiscFlags.generate_stack_trace = true;
     RtsFlags.MiscFlags.generate_dump_file = false;
     RtsFlags.MiscFlags.machineReadable = false;
+    RtsFlags.MiscFlags.internalCounters = false;
     RtsFlags.MiscFlags.linkerMemBase = 0;
 
 #if defined(THREADED_RTS)
@@ -888,6 +889,11 @@ error = true;
                       OPTION_UNSAFE;
                       RtsFlags.MiscFlags.machineReadable = true;
                   }
+                  else if (strequal("internal-counters",
+                                    &rts_argv[arg][2])) {
+                      OPTION_SAFE;
+                      RtsFlags.MiscFlags.internalCounters = true;
+                  }
                   else if (strequal("info",
                                     &rts_argv[arg][2])) {
                       OPTION_SAFE;
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h
index 4ea1c55976..1d18e1b018 100644
--- a/rts/SMPClosureOps.h
+++ b/rts/SMPClosureOps.h
@@ -38,6 +38,11 @@ EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
 
 #if defined(THREADED_RTS)
 
+#if defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_lockClosure_spin;
+extern volatile StgWord64 whitehole_lockClosure_yield;
+#endif
+
 /* -----------------------------------------------------------------------------
  * Locking/unlocking closures
  *
@@ -56,7 +61,14 @@ EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
         do {
             info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
             if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+#if defined(PROF_SPIN)
+            ++whitehole_lockClosure_spin;
+#endif
+            busy_wait_nop();
         } while (++i < SPIN_COUNT);
+#if defined(PROF_SPIN)
+        ++whitehole_lockClosure_yield;
+#endif
         yieldThread();
     } while (1);
 }
diff --git a/rts/Stats.c b/rts/Stats.c
index 26bdac0ea5..7eb93bef33 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -16,10 +16,15 @@
 #include "Profiling.h"
 #include "GetTime.h"
 #include "sm/Storage.h"
-#include "sm/GC.h" // gc_alloc_block_sync, whitehole_gc_spin
 #include "sm/GCThread.h"
 #include "sm/BlockAlloc.h"
 
+// for spin/yield counters
+#include "sm/GC.h"
+#include "ThreadPaused.h"
+#include "Messages.h"
+
+
 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
 static Time
@@ -43,6 +48,13 @@ static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
 #define PROF_VAL(x)   0
 #endif
 
+#if defined(PROF_SPIN)
+volatile StgWord64 whitehole_lockClosure_spin = 0;
+volatile StgWord64 whitehole_lockClosure_yield = 0;
+volatile StgWord64 whitehole_threadPaused_spin = 0;
+volatile StgWord64 whitehole_executeMessage_spin = 0;
+#endif
+
 //
 // All the stats!
 //
@@ -150,6 +162,13 @@ initStats0(void)
         .par_copied_bytes = 0,
         .cumulative_par_max_copied_bytes = 0,
         .cumulative_par_balanced_copied_bytes = 0,
+        .gc_spin_spin = 0,
+        .gc_spin_yield = 0,
+        .mut_spin_spin = 0,
+        .mut_spin_yield = 0,
+        .any_work = 0,
+        .no_work = 0,
+        .scav_find_work = 0,
         .mutator_cpu_ns = 0,
         .mutator_elapsed_ns = 0,
         .gc_cpu_ns = 0,
@@ -283,10 +302,11 @@ stat_startGC (Capability *cap, gc_thread *gct)
    -------------------------------------------------------------------------- */
 
 void
-stat_endGC (Capability *cap, gc_thread *gct,
-            W_ live, W_ copied, W_ slop, uint32_t gen,
-            uint32_t par_n_threads, W_ par_max_copied,
-            W_ par_balanced_copied)
+stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
+            uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
+            W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
+            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
+            W_ scav_find_work)
 {
     // -------------------------------------------------
     // Collect all the stats about this GC in stats.gc. We always do this since
@@ -350,6 +370,13 @@ stat_endGC (Capability *cap, gc_thread *gct,
             stats.gc.par_max_copied_bytes;
         stats.cumulative_par_balanced_copied_bytes +=
             stats.gc.par_balanced_copied_bytes;
+        stats.any_work += any_work;
+        stats.no_work += no_work;
+        stats.scav_find_work += scav_find_work;
+        stats.gc_spin_spin += gc_spin_spin;
+        stats.gc_spin_yield += gc_spin_yield;
+        stats.mut_spin_spin += mut_spin_spin;
+        stats.mut_spin_yield += mut_spin_yield;
     }
     stats.gc_cpu_ns += stats.gc.cpu_ns;
     stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -764,18 +791,96 @@ stat_exit (void)
                     PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
                     / TimeToSecondsDbl(tot_elapsed));
 
+    // See Note [Internal Counter Stats] for a description of the
+    // following counters. If you add a counter here, please remember
+    // to update the Note.
+    if (RtsFlags.MiscFlags.internalCounters) {
 #if defined(THREADED_RTS) && defined(PROF_SPIN)
-    {
         uint32_t g;
+        const int32_t col_width[] = {4, -30, 14, 14};
+        statsPrintf("Internal Counters:\n");
+        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+                    , col_width[0], ""
+                    , col_width[1], "SpinLock"
+                    , col_width[2], "Spins"
+                    , col_width[3], "Yields");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_alloc_block_sync"
+                    , col_width[2], gc_alloc_block_sync.spin
+                    , col_width[3], gc_alloc_block_sync.yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_spin"
+                    , col_width[2], stats.gc_spin_spin
+                    , col_width[3], stats.gc_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "mut_spin"
+                    , col_width[2], stats.mut_spin_spin
+                    , col_width[3], stats.mut_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_gc"
+                    , col_width[2], whitehole_gc_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_threadPaused"
+                    , col_width[2], whitehole_threadPaused_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_executeMessage"
+                    , col_width[2], whitehole_executeMessage_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_lockClosure"
+                    , col_width[2], whitehole_lockClosure_spin
+                    , col_width[3], whitehole_lockClosure_yield);
+        // waitForGcThreads isn't really spin-locking (see the function),
+        // but these numbers still seem useful.
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "waitForGcThreads"
+                    , col_width[2], waitForGcThreads_spin
+                    , col_width[3], waitForGcThreads_yield);
 
-        statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
-        statsPrintf("whitehole_gc_spin: %"FMT_Word64"\n"
-                    , whitehole_gc_spin);
         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-            statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
+            int prefix_length = 0;
+            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+                        col_width[0], "", g, &prefix_length);
+            prefix_length -= col_width[0];
+            int suffix_length = col_width[1] + prefix_length;
+            suffix_length =
+                suffix_length > 0 ? col_width[1] : suffix_length;
+
+            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                        , suffix_length, "].sync"
+                        , col_width[2], generations[g].sync.spin
+                        , col_width[3], generations[g].sync.yield);
         }
-    }
+        statsPrintf("\n");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "any_work"
+                    , col_width[2], stats.any_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "no_work"
+                    , col_width[2], stats.no_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "scav_find_work"
+                    , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+        statsPrintf("Internal Counters require the RTS to be built "
+                    "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+        statsPrintf("Internal Counters require the threaded RTS");
 #endif
+    }
 }
 
 if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
@@ -917,6 +1022,68 @@
 the number of gc threads is limited to the number of cores. See #13830
 */
 
+/*
+Note [Internal Counter Stats]
+-----------------------------
+What do the counts at the end of a '+RTS -s --internal-counters' report mean?
+They are detailed below. Most of these counters are used by multiple threads
+with no attempt at synchronisation. This means that reported values may be
+lower than the true value, and this becomes more likely and more severe as
+contention increases.
+
+The first counters are for various SpinLock-like constructs in the RTS. See
+SpinLock.h for the definition of a SpinLock. We maintain up to two counters
+per SpinLock:
+* spin: The number of busy-spins over the length of the program.
+* yield: The number of times the SpinLock spun SPIN_COUNT times without
+  success and then called yieldThread().
+Not all of these are actual SpinLocks; see the details below.
+
+Actual SpinLocks:
+* gc_alloc_block_sync:
+    This SpinLock protects the block allocator and free list manager. See
+    BlockAlloc.c.
+* gc_spin and mut_spin:
+    These SpinLocks are used to herd gc worker threads during parallel garbage
+    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
+* gen[g].sync:
+    These SpinLocks, one per generation, protect the generations[g] data
+    structure during garbage collection.
+
+waitForGcThreads:
+    These counters are incremented while we wait for all threads to be ready
+    for a parallel garbage collection. We yield more than we spin in this
+    case.
+
+In several places in the runtime we must take a lock on a closure. To do this,
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
+a white-hole. Sometimes we yieldThread() if we spin too long, and sometimes we
+don't. We count these white-hole spins and include them in the SpinLocks
+table. If a particular loop does not yield, we put "n/a" in the table. They
+are named for the function that has the spinning loop, except that several
+loops in the garbage collector accumulate into whitehole_gc.
+TODO: Should these counters be more or less granular?
+
+White-hole spin counters:
+* whitehole_gc
+* whitehole_lockClosure
+* whitehole_executeMessage
+* whitehole_threadPaused
+
+We count the number of calls of several functions in the parallel garbage
+collector.
+
+Parallel garbage collector counters:
+* any_work:
+    A cheap function called whenever a gc_thread is ready for work. Does
+    not do any work.
+* no_work:
+    Incremented whenever any_work finds no work.
+* scav_find_work:
+    Called to do work when any_work returns true.
+
+*/
+
 /* -----------------------------------------------------------------------------
    stat_describe_gens
diff --git a/rts/Stats.h b/rts/Stats.h
index 5d9cf04fa7..1c56344f81 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,7 +30,10 @@ void stat_startGCSync(struct gc_thread_ *_gct);
 void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
 void stat_endGC  (Capability *cap, struct gc_thread_ *_gct, W_ live,
                   W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
-                  W_ par_max_copied, W_ par_balanced_copied);
+                  W_ par_max_copied, W_ par_balanced_copied,
+                  W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
+                  W_ mut_spin_yield, W_ any_work, W_ no_work,
+                  W_ scav_find_work);
 
 #if defined(PROFILING)
 void stat_startRP(void);
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 361989d0d2..595d3ce6c2 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -375,11 +375,21 @@ loop:
     // spin until the WHITEHOLE is updated
     info = StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
+#if defined(PROF_SPIN)
+        W_[whitehole_lockClosure_spin] =
+            W_[whitehole_lockClosure_spin] + 1;
+#endif
         i = i + 1;
         if (i == SPIN_COUNT) {
             i = 0;
+#if defined(PROF_SPIN)
+            W_[whitehole_lockClosure_yield] =
+                W_[whitehole_lockClosure_yield] + 1;
+#endif
             ccall yieldThread();
         }
+        // TODO: We should busy_wait_nop() here, but that's not currently
+        // defined in CMM.
         goto loop;
     }
     jump %ENTRY_CODE(info) (node);
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index 7ade0a64c5..3f7bddeb79 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -329,6 +329,10 @@ threadPaused(Capability *cap, StgTSO *tso)
 
         if (cur_bh_info != bh_info) {
             bh_info = cur_bh_info;
+#if defined(PROF_SPIN)
+            ++whitehole_threadPaused_spin;
+#endif
+            busy_wait_nop();
             goto retry;
         }
 #endif
diff --git a/rts/ThreadPaused.h b/rts/ThreadPaused.h
index 4d762f9aed..ee25189c20 100644
--- a/rts/ThreadPaused.h
+++ b/rts/ThreadPaused.h
@@ -8,4 +8,12 @@
 
 #pragma once
 
+#include "BeginPrivate.h"
+
 RTS_PRIVATE void threadPaused ( Capability *cap, StgTSO * );
+
+#include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_threadPaused_spin;
+#endif
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index bb54c7efc3..27f280665e 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1058,9 +1058,14 @@ selector_chain:
         // In threaded mode, we'll use WHITEHOLE to lock the selector
         // thunk while we evaluate it.
         {
-            do {
+            while(true) {
                 info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info);
-            } while (info_ptr == (W_)&stg_WHITEHOLE_info);
+                if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; }
+#if defined(PROF_SPIN)
+                ++whitehole_gc_spin;
+#endif
+                busy_wait_nop();
+            }
 
             // make sure someone else didn't get here first...
             if (IS_FORWARDING_PTR(info_ptr) ||
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 54797ba0f0..d61ca41a6b 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -139,6 +139,9 @@ uint32_t n_gc_threads;
 static long copied;        // *words* copied & scavenged during this GC
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
+// spin and yield counts for the quasi-SpinLock in waitForGcThreads
+volatile StgWord64 waitForGcThreads_spin = 0;
+volatile StgWord64 waitForGcThreads_yield = 0;
 volatile StgWord64 whitehole_gc_spin = 0;
 #endif
 
@@ -198,7 +201,9 @@ GarbageCollect (uint32_t collect_gen,
 {
   bdescr *bd;
   generation *gen;
-  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
+  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
+      gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+      any_work, no_work, scav_find_work;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
 #endif
@@ -471,32 +476,53 @@ GarbageCollect (uint32_t collect_gen,
   copied = 0;
   par_max_copied = 0;
   par_balanced_copied = 0;
+  gc_spin_spin = 0;
+  gc_spin_yield = 0;
+  mut_spin_spin = 0;
+  mut_spin_yield = 0;
+  any_work = 0;
+  no_work = 0;
+  scav_find_work = 0;
  {
      uint32_t i;
      uint64_t par_balanced_copied_acc = 0;
+     const gc_thread* thread;
 
      for (i=0; i < n_gc_threads; i++) {
          copied += gc_threads[i]->copied;
      }
      for (i=0; i < n_gc_threads; i++) {
+         thread = gc_threads[i];
          if (n_gc_threads > 1) {
              debugTrace(DEBUG_gc,"thread %d:", i);
-             debugTrace(DEBUG_gc,"   copied           %ld", gc_threads[i]->copied * sizeof(W_));
-             debugTrace(DEBUG_gc,"   scanned          %ld", gc_threads[i]->scanned * sizeof(W_));
-             debugTrace(DEBUG_gc,"   any_work         %ld", gc_threads[i]->any_work);
-             debugTrace(DEBUG_gc,"   no_work          %ld", gc_threads[i]->no_work);
-             debugTrace(DEBUG_gc,"   scav_find_work   %ld", gc_threads[i]->scav_find_work);
+             debugTrace(DEBUG_gc,"   copied           %ld",
+                        thread->copied * sizeof(W_));
+             debugTrace(DEBUG_gc,"   scanned          %ld",
+                        thread->scanned * sizeof(W_));
+             debugTrace(DEBUG_gc,"   any_work         %ld",
+                        thread->any_work);
+             debugTrace(DEBUG_gc,"   no_work          %ld",
+                        thread->no_work);
+             debugTrace(DEBUG_gc,"   scav_find_work   %ld",
+                        thread->scav_find_work);
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+             gc_spin_spin += thread->gc_spin.spin;
+             gc_spin_yield += thread->gc_spin.yield;
+             mut_spin_spin += thread->mut_spin.spin;
+             mut_spin_yield += thread->mut_spin.yield;
+#endif
+
+             any_work += thread->any_work;
+             no_work += thread->no_work;
+             scav_find_work += thread->scav_find_work;
+
+             par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+             par_balanced_copied_acc +=
+                 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
          }
-         par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
-         par_balanced_copied_acc +=
-             stg_min(n_gc_threads * gc_threads[i]->copied, copied);
-     }
-     if (n_gc_threads == 1) {
-         par_max_copied = 0;
-         par_balanced_copied = 0;
      }
-     else
-     {
+     if (n_gc_threads > 1) {
          // See Note [Work Balance] for an explanation of this computation
          par_balanced_copied =
              (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
@@ -834,7 +860,9 @@ GarbageCollect (uint32_t collect_gen,
 
   // ok, GC over: tell the stats department what happened.
   stat_endGC(cap, gct, live_words, copied,
              live_blocks * BLOCK_SIZE_W - live_words /* slop */,
-             N, n_gc_threads, par_max_copied, par_balanced_copied);
+             N, n_gc_threads, par_max_copied, par_balanced_copied,
+             gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+             any_work, no_work, scav_find_work);
 
 #if defined(RTS_USER_SIGNALS)
   if (RtsFlags.MiscFlags.install_signal_handlers) {
@@ -1186,6 +1214,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             }
         }
         if (!retry) break;
+#if defined(PROF_SPIN)
+        waitForGcThreads_yield++;
+#endif
         yieldThread();
     }
 
@@ -1196,6 +1227,14 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             rtsConfig.longGCSync(cap->no, t2 - t0);
             t1 = t2;
         }
+        if (retry) {
+#if defined(PROF_SPIN)
+            // This is a bit strange, we'll get more yields than spins.
+            // I guess that means it's not a spin-lock at all, but these
+            // numbers are still useful (I think).
+            waitForGcThreads_spin++;
+#endif
+        }
     }
 
     if (RtsFlags.GcFlags.longGCSync != 0 &&
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 78f054931a..7fce87edd4 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -47,6 +47,8 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS,
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
 extern volatile StgWord64 whitehole_gc_spin;
+extern volatile StgWord64 waitForGcThreads_spin;
+extern volatile StgWord64 waitForGcThreads_yield;
 #endif
 
 void gcWorkerThread (Capability *cap);
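
The patch repeats one idiom in several places: spin on an atomic exchange, bump a PROF_SPIN counter on every failed attempt, and after SPIN_COUNT failures bump a second counter and yield. The standalone C11 sketch below is purely illustrative and is not RTS code: atomic_flag and sched_yield() are stand-ins for the RTS's xchg() on a closure's info pointer and for yieldThread(), and the names lock_spin/lock_yield are hypothetical echoes of whitehole_lockClosure_spin/_yield.

/* Hypothetical standalone sketch of the spin/yield counting pattern --
 * not the RTS implementation. Requires C11 atomics and POSIX sched_yield(). */
#include <stdatomic.h>
#include <stdint.h>
#include <sched.h>

#define SPIN_COUNT 1000   /* plays the same role as the RTS's SPIN_COUNT */

/* Updated without synchronisation, exactly like the patch's counters,
 * so totals can undercount when threads race on the increment. */
static volatile uint64_t lock_spin  = 0;  /* c.f. whitehole_lockClosure_spin  */
static volatile uint64_t lock_yield = 0;  /* c.f. whitehole_lockClosure_yield */

static void lock_with_counters(atomic_flag *lock)
{
    for (;;) {
        /* Inner loop: busy-spin up to SPIN_COUNT times, counting each
         * failed attempt. (The patch also inserts a busy_wait_nop()
         * pipeline hint here, omitted in this sketch.) */
        for (unsigned i = 0; i < SPIN_COUNT; i++) {
            if (!atomic_flag_test_and_set(lock)) {
                return;          /* acquired the lock */
            }
            ++lock_spin;         /* one busy-spin */
        }
        /* Spun SPIN_COUNT times without success: count a yield and give
         * up the rest of our time slice, as reallyLockClosure does. */
        ++lock_yield;
        sched_yield();
    }
}

As Note [Internal Counter Stats] stresses, the increments are deliberately unsynchronised, so a racing increment can be lost and the printed totals are a lower bound that gets looser exactly when contention is high. The resulting table is printed only when the program runs with '+RTS -s --internal-counters' on a threaded RTS built with PROF_SPIN.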