2 files changed, 37 insertions, 14 deletions
diff --git a/rts/Stats.c b/rts/Stats.c
index 55aff39196..654edebb55 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -507,10 +507,18 @@ stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ s
             initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
         stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
         stats.gc.cpu_ns = 0;
-        for (unsigned int i=0; i < par_n_threads; i++) {
-            gc_thread *gct = gc_threads[i];
-            ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
-            stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
+        // see Note [n_gc_threads]
+        // par_n_threads is set to n_gc_threads at the single callsite of this
+        // function
+        if (par_n_threads == 1) {
+            ASSERT(initiating_gct->gc_end_cpu >= initiating_gct->gc_start_cpu);
+            stats.gc.cpu_ns += initiating_gct->gc_end_cpu - initiating_gct->gc_start_cpu;
+        } else {
+            for (unsigned int i=0; i < par_n_threads; i++) {
+                gc_thread *gct = gc_threads[i];
+                ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
+                stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
+            }
         }
     }
     // -------------------------------------------------
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 78be848c3f..d8ce3a1377 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -136,8 +136,22 @@ StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
     ATTRIBUTE_ALIGNED(64);
 #endif
 
-// Number of threads running in *this* GC.  Affects how many
-// step->todos[] lists we have to look in to find work.
+/* Note [n_gc_threads]
+This is a global variable that originally tracked the number of threads
+participating in the current gc. It's meaing has diverged from this somewhate.
+In practise, it now takes one of the values {1, n_capabilities}. Don't be
+tricked into thinking this means garbage collections must have 1 or
+n_capabilities participating: idle capabilities (idle_cap[cap->no]) are included
+in the n_gc_thread count.
+
+Clearly this is in need of some tidying up, but for now we tread carefully. We
+check n_gc_threads > 1 to see whether we are in a parallel or sequential. We
+ensure n_gc_threads > 1 before iterating over gc_threads a la:
+
+for(i=0;i<n_gc_threads;++i) { foo(gc_thread[i]; )}
+
+Omitting this check has led to issues such as #19147.
+*/
 uint32_t n_gc_threads;
 
 // For stats:
@@ -552,12 +566,13 @@ GarbageCollect (uint32_t collect_gen,
       uint64_t par_balanced_copied_acc = 0;
       const gc_thread* thread;
 
-      for (i=0; i < n_gc_threads; i++) {
-          copied += RELAXED_LOAD(&gc_threads[i]->copied);
-      }
-      for (i=0; i < n_gc_threads; i++) {
-          thread = gc_threads[i];
-          if (n_gc_threads > 1) {
+      // see Note [n_gc_threads]
+      if (n_gc_threads > 1) {
+          for (i=0; i < n_gc_threads; i++) {
+              copied += RELAXED_LOAD(&gc_threads[i]->copied);
+          }
+          for (i=0; i < n_gc_threads; i++) {
+              thread = gc_threads[i];
               debugTrace(DEBUG_gc,"thread %d:", i);
               debugTrace(DEBUG_gc,"   copied           %ld",
                          RELAXED_LOAD(&thread->copied) * sizeof(W_));
@@ -585,12 +600,12 @@ GarbageCollect (uint32_t collect_gen,
               par_balanced_copied_acc +=
                   stg_min(n_gc_threads * RELAXED_LOAD(&thread->copied), copied);
           }
-      }
-      if (n_gc_threads > 1) {
           // See Note [Work Balance] for an explanation of this computation
           par_balanced_copied =
               (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
               (n_gc_threads - 1);
+      } else {
+          copied += gct->copied;
       }
   }