author    Douglas Wilson <douglas.wilson@gmail.com>  2021-04-13 07:31:42 +0100
committer Ben Gamari <ben@well-typed.com>            2021-12-09 18:18:04 +0000
commit    949f2d60f3b9cd10eae3330dc969cfde58415d1e (patch)
tree      06659c0d0e50f157bb7c39c04a854616d4b1cac8
parent    d6177cb5dac357ff15ae048556f03039dd6987d2 (diff)
rts: correct stats when running with +RTS -qn1 (wip/19685-count-par-gcs)
Despite the documented care having been taken, several bugs are fixed here.

When running with -qn1 and a SYNC_GC_PAR is requested, we will have

    n_gc_threads == n_capabilities && n_gc_idle_threads == (n_gc_threads - 1)

In this case we now:

* Don't increment par_collections
* Don't increment par_balanced_copied
* Don't emit debug traces for idle threads
* Take the fast path in scavenge_until_all_done, wakeup_gc_threads, and
  shutdown_gc_threads

Some ASSERTs have also been tightened.

Fixes #19685
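To make the -qn1 scenario concrete, here is a minimal standalone sketch (not RTS code; the function name, values, and output below are assumptions for illustration only) of the predicate this patch keys on: a collection counts as parallel only when more than one non-idle thread takes part.

    /* Standalone model of the condition the patch relies on.
     * With +RTS -N4 -qn1 and a SYNC_GC_PAR request, the RTS ends up with
     * n_gc_threads == n_capabilities == 4 and n_gc_idle_threads == 3, so only
     * one thread actually collects and the GC should be counted as sequential. */
    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool is_par_gc_model(uint32_t n_gc_threads, uint32_t n_gc_idle_threads)
    {
        if (n_gc_threads == 1) { return false; }
        assert(n_gc_threads > n_gc_idle_threads);
        return n_gc_threads - n_gc_idle_threads > 1;
    }

    int main(void)
    {
        // -N4 -qn1: four capabilities join the sync, three of them stay idle.
        printf("%d\n", is_par_gc_model(4, 3));   // 0: treat as sequential
        // -N4 -qn4: no idle GC threads.
        printf("%d\n", is_par_gc_model(4, 0));   // 1: genuinely parallel
        return 0;
    }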
-rw-r--r--  rts/sm/GC.c  70
1 file changed, 42 insertions, 28 deletions
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 1799cf98c4..64d0924059 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -159,24 +159,37 @@ StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]
/* Note [n_gc_threads]
This is a global variable that originally tracked the number of threads
-participating in the current gc. It's meaing has diverged from this somewhate.
-In practise, it now takes one of the values {1, n_capabilities}. Don't be
-tricked into thinking this means garbage collections must have 1 or
-n_capabilities participating: idle capabilities (idle_cap[cap->no]) are included
-in the n_gc_thread count.
+participating in the current gc. Its meaning has diverged from this somewhat,
+as it does not distinguish between idle and non-idle threads. An idle thread
+is a thread i where `idle[i]` is true, using the idle array passed to
+GarbageCollect.
-Clearly this is in need of some tidying up, but for now we tread carefully. We
-check n_gc_threads > 1 to see whether we are in a parallel or sequential. We
-ensure n_gc_threads > 1 before iterating over gc_threads a la:
+In practice, it now takes one of the values {1, n_capabilities}: when a SYNC_GC_SEQ
+is requested it takes 1; when a SYNC_GC_PAR is requested it takes n_capabilities.
-for(i=0;i<n_gc_threads;++i) { foo(gc_thread[i]; )}
+Clearly this is in need of some tidying up, but for now we tread carefully. We
+call is_par_gc() to see whether we are in a parallel or sequential collection.
+Of course this is valid only inside GarbageCollect ().
Omitting this check has led to issues such as #19147.
*/
+
+static bool is_par_gc(void);
+
uint32_t n_gc_threads;
static uint32_t n_gc_idle_threads;
bool work_stealing;
+static bool is_par_gc() {
+#if defined(THREADED_RTS)
+ if(n_gc_threads == 1) { return false; }
+ ASSERT(n_gc_threads > n_gc_idle_threads);
+ return n_gc_threads - n_gc_idle_threads > 1;
+#else
+ return false;
+#endif
+}
+
// For stats:
static long copied; // *words* copied & scavenged during this GC
@@ -393,10 +406,7 @@ GarbageCollect (uint32_t collect_gen,
}
work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
N >= RtsFlags.ParFlags.parGcLoadBalancingGen &&
- (gc_type == SYNC_GC_PAR) &&
- (n_gc_threads - n_gc_idle_threads) > 1;
- // ^ if we are the only non-idle gc thread, turn off work stealing
- //
+ is_par_gc();
// It's not always a good idea to do load balancing in parallel
// GC. In particular, for a parallel program we don't want to
// lose locality by moving cached data into another CPU's cache
@@ -408,8 +418,8 @@ GarbageCollect (uint32_t collect_gen,
// a flag.
#else
n_gc_threads = 1;
- n_gc_idle_threads = 0;
work_stealing = false;
+ n_gc_idle_threads = 0;
#endif
SEQ_CST_STORE(&gc_running_threads, 0);
@@ -484,7 +494,7 @@ GarbageCollect (uint32_t collect_gen,
// of markSomeCapabilities() because markSomeCapabilities() can only
// call back into the GC via mark_root() (due to the gct register
// variable).
- if (n_gc_threads == 1) {
+ if (!is_par_gc()) {
for (n = 0; n < n_capabilities; n++) {
#if defined(THREADED_RTS)
scavenge_capability_mut_Lists1(capabilities[n]);
@@ -509,7 +519,7 @@ GarbageCollect (uint32_t collect_gen,
// follow all the roots that the application knows about.
gct->evac_gen_no = 0;
- if (n_gc_threads == 1) {
+ if (!is_par_gc()) {
for (n = 0; n < n_capabilities; n++) {
markCapability(mark_root, gct, capabilities[n],
true/*don't mark sparks*/);
@@ -554,7 +564,7 @@ GarbageCollect (uint32_t collect_gen,
gcStableNameTable();
#if defined(THREADED_RTS)
- if (n_gc_threads == 1) {
+ if (!is_par_gc()) {
for (n = 0; n < n_capabilities; n++) {
pruneSparkQueue(false, capabilities[n]);
}
@@ -602,11 +612,16 @@ GarbageCollect (uint32_t collect_gen,
const gc_thread* thread;
// see Note [n_gc_threads]
- if (n_gc_threads > 1) {
+ if (is_par_gc()) {
+ int other_active_threads = n_gc_threads - n_gc_idle_threads - 1;
+ ASSERT(other_active_threads > 0);
+
for (i=0; i < n_gc_threads; i++) {
+ if(idle_cap[i]) { continue; }
copied += RELAXED_LOAD(&gc_threads[i]->copied);
}
for (i=0; i < n_gc_threads; i++) {
+ if(idle_cap[i]) { continue; }
thread = gc_threads[i];
debugTrace(DEBUG_gc,"thread %d:", i);
debugTrace(DEBUG_gc," copied %ld",
@@ -624,13 +639,13 @@ GarbageCollect (uint32_t collect_gen,
par_max_copied = stg_max(RELAXED_LOAD(&thread->copied), par_max_copied);
par_balanced_copied_acc +=
- stg_min(n_gc_threads * RELAXED_LOAD(&thread->copied), copied);
+ stg_min((other_active_threads + 1) * RELAXED_LOAD(&thread->copied), copied);
}
+
// See Note [Work Balance] for an explanation of this computation
par_balanced_copied =
- (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
- (n_gc_threads - 1);
- } else {
+ (par_balanced_copied_acc - copied + other_active_threads / 2) /
+ other_active_threads;
+ } else {
copied += gct->copied;
}
}
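For intuition about the revised work-balance statistic above, a self-contained sketch follows; the numbers and the STG_MIN helper are illustrative assumptions, not taken from the patch.

    /* Illustrative check of the revised work-balance formula.
     * Suppose 3 active GC threads copy 300, 200 and 100 words respectively. */
    #include <stdio.h>

    #define STG_MIN(a,b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        long per_thread[] = { 300, 200, 100 };
        int other_active_threads = 2;            // 3 active threads minus ourself
        long copied = 0, acc = 0;

        for (int i = 0; i < 3; i++) copied += per_thread[i];              // 600
        for (int i = 0; i < 3; i++)
            acc += STG_MIN((other_active_threads + 1) * per_thread[i], copied);
            // min(3*300,600) + min(3*200,600) + min(3*100,600) = 1500

        // Perfectly balanced work would give acc = 3 * copied = 1800;
        // one thread doing everything would give acc = copied = 600.
        long par_balanced_copied =
            (acc - copied + other_active_threads / 2) / other_active_threads; // 450
        printf("par_balanced_copied = %ld (0 = all work on one thread, %ld = perfect)\n",
               par_balanced_copied, copied);
        return 0;
    }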
@@ -649,7 +664,7 @@ GarbageCollect (uint32_t collect_gen,
if (g == N) {
generations[g].collections++; // for stats
- if (n_gc_threads > 1) generations[g].par_collections++;
+ if (is_par_gc()) generations[g].par_collections++;
}
// Count the mutable list as bytes "copied" for the purposes of
@@ -1227,7 +1242,6 @@ inc_running (void)
// See Note [Synchronising work stealing]
StgWord new;
new = atomic_inc(&gc_running_threads, 1);
- ASSERT(new <= n_gc_threads);
return new;
}
@@ -1289,7 +1303,7 @@ scavenge_until_all_done (void)
for(;;) {
#if defined(THREADED_RTS)
- if (n_gc_threads > 1) {
+ if (is_par_gc()) {
scavenge_loop();
} else {
scavenge_loop1();
@@ -1313,7 +1327,7 @@ scavenge_until_all_done (void)
// If there's no hope of stealing more work, then there's nowhere else
// work can come from and we are finished
#if defined(THREADED_RTS)
- if(n_gc_threads > 1 && work_stealing && r != 0) {
+ if(is_par_gc() && work_stealing && r != 0) {
NONATOMIC_ADD(&gct->any_work, 1);
ACQUIRE_LOCK(&gc_running_mutex);
// this is SEQ_CST because I haven't considered if it could be
@@ -1480,7 +1494,7 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
#if defined(THREADED_RTS)
uint32_t i;
- if (n_gc_threads == 1) return;
+ if (!is_par_gc()) return;
#if defined(DEBUG)
StgWord num_idle = 0;
@@ -1516,7 +1530,7 @@ shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG,
{
#if defined(THREADED_RTS)
- if (n_gc_threads == 1) return;
+ if (!is_par_gc()) return;
// we need to wait for `n_threads` threads. -1 because that's ourself
StgInt n_threads = (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads;