author     Douglas Wilson <douglas.wilson@gmail.com>   2020-12-14 13:55:54 +0000
committer  Marge Bot <ben+marge-bot@smart-cactus.org>  2021-01-17 05:49:54 -0500
commit     345ae06b3334a64e9d6db9ea69573ef3227e535a (patch)
tree       55fff9e7dccbaf5b8181ce2534c0624a00a5b161 /rts
parent     f2d118c0a018dccd3c82e885f500d4e57ff94f82 (diff)
download   haskell-345ae06b3334a64e9d6db9ea69573ef3227e535a.tar.gz
rts: add max_n_todo_overflow internal counter
I've never observed this counter taking a non-zero value; however, I do
think its existence is justified by the comment in grab_local_todo_block.
I've not added it to RTSStats in GHC.Stats, as it doesn't seem worth the
API churn.
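For orientation, the sketch below shows the high-water-mark pattern the patch follows, stripped of RTS types: each GC thread records the longest todo_overflow list it has seen, the per-thread maxima are folded together at the end of a collection, and the result is folded into the process-lifetime statistics. It is a simplified, self-contained illustration in plain C with hypothetical names (sketch_gc_thread, record_overflow_push, end_of_gc), not code taken from the patch itself.

/* Simplified sketch of the max_n_todo_overflow bookkeeping; plain C types
 * stand in for the RTS's W_, gc_thread and RTSStats. */
#include <stdint.h>
#include <stdio.h>

#define STG_MAX(a, b) ((a) > (b) ? (a) : (b))

typedef struct {
    uint64_t n_todo_overflow;      /* current length of this thread's overflow list */
    uint64_t max_n_todo_overflow;  /* per-thread high-water mark */
} sketch_gc_thread;

static uint64_t global_max_n_todo_overflow = 0;  /* process-lifetime maximum */

/* Called whenever a block lands on the overflow list. */
static void record_overflow_push(sketch_gc_thread *t)
{
    t->n_todo_overflow++;
    t->max_n_todo_overflow =
        STG_MAX(t->max_n_todo_overflow, t->n_todo_overflow);
}

/* At the end of a collection, fold the per-thread maxima together and then
 * fold the result into the global statistics. */
static void end_of_gc(sketch_gc_thread *threads, int n_threads)
{
    uint64_t gc_max = 0;
    for (int i = 0; i < n_threads; i++) {
        gc_max = STG_MAX(gc_max, threads[i].max_n_todo_overflow);
    }
    global_max_n_todo_overflow =
        STG_MAX(global_max_n_todo_overflow, gc_max);
}

int main(void)
{
    sketch_gc_thread threads[2] = {{0, 0}, {0, 0}};
    record_overflow_push(&threads[1]);
    record_overflow_push(&threads[1]);
    end_of_gc(threads, 2);
    printf("max_n_todo_overflow: %llu\n",
           (unsigned long long)global_max_n_todo_overflow);
    return 0;
}

In the patch the corresponding roles are played by push_todo_block in rts/sm/GCUtils.c, the per-thread accumulation loop in GarbageCollect (rts/sm/GC.c), and stat_endGC in rts/Stats.c.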
Diffstat (limited to 'rts')
-rw-r--r--  rts/Stats.c       | 25
-rw-r--r--  rts/Stats.h       |  3
-rw-r--r--  rts/sm/GC.c       |  7
-rw-r--r--  rts/sm/GCThread.h |  1
-rw-r--r--  rts/sm/GCUtils.c  | 12
5 files changed, 37 insertions, 11 deletions
diff --git a/rts/Stats.c b/rts/Stats.c
index 02616094b8..45d40ddcad 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -160,6 +160,7 @@ initStats0(void)
         .mut_spin_yield = 0,
         .any_work = 0,
         .scav_find_work = 0,
+        .max_n_todo_overflow = 0,
         .init_cpu_ns = 0,
         .init_elapsed_ns = 0,
         .mutator_cpu_ns = 0,
@@ -460,7 +461,8 @@ void
 stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
             uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
             W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
-            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ scav_find_work)
+            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
+            W_ max_n_todo_overflow)
 {
     ACQUIRE_LOCK(&stats_mutex);
 
@@ -541,6 +543,7 @@ stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ s
             stats.gc.par_balanced_copied_bytes;
         stats.any_work += any_work;
         stats.scav_find_work += scav_find_work;
+        stats.max_n_todo_overflow += stg_max(max_n_todo_overflow, stats.max_n_todo_overflow);
         stats.gc_spin_spin += gc_spin_spin;
         stats.gc_spin_yield += gc_spin_yield;
         stats.mut_spin_spin += mut_spin_spin;
@@ -1026,6 +1029,10 @@ static void report_summary(const RTSSummaryStats* sum)
                     , col_width[0], ""
                     , col_width[1], "scav_find_work"
                     , col_width[2], stats.scav_find_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "max_n_todo_overflow"
+                    , col_width[2], stats.max_n_todo_overflow);
 #elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
         statsPrintf("Internal Counters require the RTS to be built "
                 "with PROF_SPIN"); // PROF_SPIN is not #defined here
@@ -1167,6 +1174,8 @@ static void report_machine_readable (const RTSSummaryStats * sum)
             stats.any_work);
     MR_STAT("scav_find_work", FMT_Word64,
             stats.scav_find_work);
+    MR_STAT("max_n_todo_overflow", FMT_Word64,
+            stats.max_n_todo_overflow);
 #endif // PROF_SPIN
 #endif // THREADED_RTS
 
@@ -1558,7 +1567,7 @@ See #13830
 */
 
 /*
-Note [Internal Counter Stats]
+Note [Internal Counters Stats]
 -----------------------------
 What do the counts at the end of a '+RTS -s --internal-counters' report mean?
 They are detailed below. Most of these counters are used by multiple threads
@@ -1596,7 +1605,6 @@ don't. We count these white-hole spins and include them in the SpinLocks
 table. If a particular loop does not yield, we put "n/a" in the table. They are
 named for the function that has the spinning loop except that several loops in
 the garbage collector accumulate into whitehole_gc.
-TODO: Should these counters be more or less granular?
 
 white-hole spin counters:
 * whitehole_gc
@@ -1604,16 +1612,17 @@ white-hole spin counters:
 * whitehole_lockClosure
 * whitehole_executeMessage
 * whitehole_threadPaused
-
-We count the number of calls of several functions in the parallel garbage
-collector.
+We have several stats allowing us to observe the internals of the parallel
+garbage collector:
 
 Parallel garbage collector counters:
 * any_work:
   Incremented whenever a parallel GC looks for work to steal.
 * scav_find_work:
-  Called to do work when any_work return true.
-
+  Counts iterations of the scavenge loop.
+* max_n_todo_overflow:
+  Tracks the maximum length of the todo_overflow lists in the gc_thread structure.
+  See the comment in grab_local_todo_block.
 
 */
 /* -----------------------------------------------------------------------------
diff --git a/rts/Stats.h b/rts/Stats.h
index 64aec2d5ee..19bd707302 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -37,7 +37,8 @@ void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ li
                   uint32_t n_gc_threads, struct gc_thread_ **gc_threads,
                   W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin,
                   W_ gc_spin_yield, W_ mut_spin_spin,
-                  W_ mut_spin_yield, W_ any_work, W_ scav_find_work);
+                  W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
+                  W_ max_n_todo_overflow);
 
 void stat_startNonmovingGcSync(void);
 void stat_endNonmovingGcSync(void);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index b78f993260..5e986f2296 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -270,7 +270,7 @@ GarbageCollect (uint32_t collect_gen,
   generation *gen;
   StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
       gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
-      any_work, scav_find_work;
+      any_work, scav_find_work, max_n_todo_overflow;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
 #endif
@@ -594,6 +594,7 @@ GarbageCollect (uint32_t collect_gen,
   mut_spin_yield = 0;
   any_work = 0;
   scav_find_work = 0;
+  max_n_todo_overflow = 0;
   {
       uint32_t i;
       uint64_t par_balanced_copied_acc = 0;
@@ -625,6 +626,7 @@ GarbageCollect (uint32_t collect_gen,
 
           any_work += RELAXED_LOAD(&thread->any_work);
           scav_find_work += RELAXED_LOAD(&thread->scav_find_work);
+          max_n_todo_overflow = stg_max(RELAXED_LOAD(&thread->max_n_todo_overflow), max_n_todo_overflow);
 
           par_max_copied = stg_max(RELAXED_LOAD(&thread->copied), par_max_copied);
           par_balanced_copied_acc +=
@@ -1043,7 +1045,7 @@ GarbageCollect (uint32_t collect_gen,
               N, n_gc_threads, gc_threads,
               par_max_copied, par_balanced_copied,
               gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
-              any_work, scav_find_work);
+              any_work, scav_find_work, max_n_todo_overflow);
 
 #if defined(RTS_USER_SIGNALS)
   if (RtsFlags.MiscFlags.install_signal_handlers) {
@@ -1803,6 +1805,7 @@ init_gc_thread (gc_thread *t)
     t->scanned = 0;
     t->any_work = 0;
     t->scav_find_work = 0;
+    t->max_n_todo_overflow = 0;
 }
 
 /* -----------------------------------------------------------------------------
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 90d15c69c5..31719ca020 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -183,6 +183,7 @@ typedef struct gc_thread_ {
     W_ scanned;
     W_ any_work;
     W_ scav_find_work;
+    W_ max_n_todo_overflow;
 
     Time gc_start_cpu;   // thread CPU time
     Time gc_end_cpu;     // thread CPU time
diff --git a/rts/sm/GCUtils.c b/rts/sm/GCUtils.c
index 89a92bc837..52ea27f263 100644
--- a/rts/sm/GCUtils.c
+++ b/rts/sm/GCUtils.c
@@ -183,6 +183,18 @@ push_todo_block(bdescr *bd, gen_workspace *ws)
         bd->link = ws->todo_overflow;
         ws->todo_overflow = bd;
         ws->n_todo_overflow++;
+
+        // In theory, if a gc thread pushes more blocks to its todo_q than it
+        // pops, the todo_overflow list will continue to grow. Other gc threads
+        // can't steal from the todo_overflow list, so they may be idle as the
+        // first gc thread works diligently on its todo_overflow list. In
+        // practice this has not been observed to occur.
+        //
+        // The max_n_todo_overflow counter will allow us to observe large
+        // todo_overflow lists if they ever arise. As of now I've not observed
+        // any nonzero max_n_todo_overflow samples.
+        gct->max_n_todo_overflow =
+            stg_max(gct->max_n_todo_overflow, ws->n_todo_overflow);
     }
 
 #if defined(THREADED_RTS)