summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2018-03-25 14:04:02 -0400
committerBen Gamari <ben@smart-cactus.org>2018-03-25 14:33:27 -0400
commitf7bbc343a624710ecf8f8f5eda620c4f35c90fc8 (patch)
tree0c409ece9e40aa9e612064ac306d75068ee03b0c
parentcf809950efb744ca884e0e0833a80ffd50527ca1 (diff)
downloadhaskell-f7bbc343a624710ecf8f8f5eda620c4f35c90fc8.tar.gz
Run C finalizers incrementally during mutation
With a large heap it's possible to build up a lot of finalizers between GCs. We've observed GC spending up to 50% of its time running finalizers. But there's no reason we have to run finalizers during GC, and especially no reason we have to block *all* the mutator threads while *one* GC thread runs finalizers one by one. I thought about a bunch of alternative ways to handle this, which are documented along with runSomeFinalizers() in Weak.c. The approach I settled on is to have a capability run finalizers if it is idle. So running finalizers is like a low-priority background thread. This requires some minor scheduler changes, but not much. In the future we might be able to move more GC work into here (I have my eye on freeing large blocks, for example). Test Plan: * validate * tested on our system and saw reductions in GC pauses of 40-50%. Reviewers: bgamari, niteria, osa1, erikd Reviewed By: bgamari, osa1 Subscribers: rwbarton, thomie, carter Differential Revision: https://phabricator.haskell.org/D4521
-rw-r--r--rts/Schedule.c14
-rw-r--r--rts/Weak.c119
-rw-r--r--rts/Weak.h1
-rw-r--r--rts/sm/GC.c25
-rw-r--r--rts/sm/GC.h2
5 files changed, 149 insertions(+), 12 deletions(-)
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 5160cb495b..2dc850c50d 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -679,7 +679,11 @@ scheduleYield (Capability **pcap, Task *task)
// otherwise yield (sleep), and keep yielding if necessary.
do {
- didGcLast = yieldCapability(&cap,task, !didGcLast);
+ if (doIdleGCWork(cap, false)) {
+ didGcLast = false;
+ } else {
+ didGcLast = yieldCapability(&cap,task, !didGcLast);
+ }
}
while (shouldYieldCapability(cap,task,didGcLast));
@@ -1798,6 +1802,9 @@ delete_threads_and_gc:
}
#endif
+ // Do any remaining idle GC work from the previous GC
+ doIdleGCWork(cap, true /* all of it */);
+
#if defined(THREADED_RTS)
// reset pending_sync *before* GC, so that when the GC threads
// emerge they don't immediately re-enter the GC.
@@ -1807,6 +1814,11 @@ delete_threads_and_gc:
GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
#endif
+ // If we're shutting down, don't leave any idle GC work to do.
+ if (sched_state == SCHED_SHUTTING_DOWN) {
+ doIdleGCWork(cap, true /* all of it */);
+ }
+
traceSparkCounters(cap);
switch (recent_activity) {
diff --git a/rts/Weak.c b/rts/Weak.c
index 577d1cd7d8..80623409b5 100644
--- a/rts/Weak.c
+++ b/rts/Weak.c
@@ -17,6 +17,12 @@
#include "ThreadLabels.h"
#include "Trace.h"
+// List of dead weak pointers collected by the last GC
+static StgWeak *finalizer_list = NULL;
+
+// Count of the above list.
+static uint32_t n_finalizers = 0;
+
void
runCFinalizers(StgCFinalizerList *list)
{
@@ -84,15 +90,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
StgMutArrPtrs *arr;
StgWord size;
uint32_t n, i;
- Task *task;
- task = myTask();
- if (task != NULL) {
- task->running_finalizers = true;
- }
+ ASSERT(n_finalizers == 0);
+
+ finalizer_list = list;
- // count number of finalizers, and kill all the weak pointers first...
+ // Traverse the list and
+ // * count the number of Haskell finalizers
+ // * overwrite all the weak pointers with DEAD_WEAK
n = 0;
+ i = 0;
for (w = list; w; w = w->link) {
// Better not be a DEAD_WEAK at this stage; the garbage
// collector removes DEAD_WEAKs from the weak pointer list.
@@ -102,7 +109,8 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
n++;
}
- runCFinalizers((StgCFinalizerList *)w->cfinalizers);
+ // Remember the length of the list, for runSomeFinalizers() below
+ i++;
#if defined(PROFILING)
// A weak pointer is inherently used, so we do not need to call
@@ -113,14 +121,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
// no need to fill the slop, either. See stg_DEAD_WEAK_info
// in StgMiscClosures.cmm.
#endif
+
+ // We must overwrite the header with DEAD_WEAK, so that if
+ // there's a later call to finalizeWeak# on this weak pointer,
+ // we don't run the finalizer again.
SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs);
}
- if (task != NULL) {
- task->running_finalizers = false;
- }
+ n_finalizers = i;
- // No finalizers to run?
+ // No Haskell finalizers to run?
if (n == 0) return;
debugTrace(DEBUG_weak, "weak: batching %d finalizers", n);
@@ -156,3 +166,90 @@ scheduleFinalizers(Capability *cap, StgWeak *list)
scheduleThread(cap,t);
labelThread(cap, t, "weak finalizer thread");
}
+
+/* -----------------------------------------------------------------------------
+ Incrementally running C finalizers
+
+ The GC detects all the dead finalizers, but we don't want to run
+ them during the GC because that increases the time that the runtime
+ is paused.
+
+ What options are there?
+
+ 1. Parallelise running the C finalizers across the GC threads
+ - doesn't solve the pause problem, just reduces it (maybe by a lot)
+
+ 2. Make a Haskell thread to run the C finalizers, like we do for
+ Haskell finalizers.
+ + scheduling is handled for us
+ - no guarantee that we'll process finalizers in a timely manner
+
+ 3. Run finalizers when any capability is idle.
+ + reduces pause to 0
+ - requires scheduler modifications
+ - if the runtime is busy, finalizers wait until the next GC
+
+ 4. like (3), but also run finalizers incrementally between GCs.
+ - reduces the delay to run finalizers compared with (3)
+
+ For now we do (3). It would be easy to do (4) later by adding a
+ call to doIdleGCWork() in the scheduler loop, but I haven't found
+ that necessary so far.
+
+ -------------------------------------------------------------------------- */
+
+// Run this many finalizers before returning from
+// runSomeFinalizers(). This is so that we only tie up the capability
+// for a short time, and respond quickly if new work becomes
+// available.
+static const int32_t finalizer_chunk = 100;
+
+// non-zero if a thread is already in runSomeFinalizers(). This
+// protects the globals finalizer_list and n_finalizers.
+static volatile StgWord finalizer_lock = 0;
+
+//
+// Run some C finalizers. Returns true if there's more work to do.
+//
+bool runSomeFinalizers(bool all)
+{
+ if (n_finalizers == 0)
+ return false;
+
+ if (cas(&finalizer_lock, 0, 1) != 0) {
+ // another capability is doing the work, it's safe to say
+ // there's nothing to do, because the thread already in
+ // runSomeFinalizers() will call in again.
+ return false;
+ }
+
+ debugTrace(DEBUG_sched, "running C finalizers, %d remaining", n_finalizers);
+
+ Task *task = myTask();
+ if (task != NULL) {
+ task->running_finalizers = true;
+ }
+
+ StgWeak *w = finalizer_list;
+ int32_t count = 0;
+ while (w != NULL) {
+ runCFinalizers((StgCFinalizerList *)w->cfinalizers);
+ w = w->link;
+ ++count;
+ if (!all && count >= finalizer_chunk) break;
+ }
+
+ finalizer_list = w;
+ n_finalizers -= count;
+
+ if (task != NULL) {
+ task->running_finalizers = false;
+ }
+
+ debugTrace(DEBUG_sched, "ran %d C finalizers", count);
+
+ write_barrier();
+ finalizer_lock = 0;
+
+ return n_finalizers != 0;
+}
diff --git a/rts/Weak.h b/rts/Weak.h
index ab335424db..fb67981497 100644
--- a/rts/Weak.h
+++ b/rts/Weak.h
@@ -19,5 +19,6 @@ void runCFinalizers(StgCFinalizerList *list);
void runAllCFinalizers(StgWeak *w);
void scheduleFinalizers(Capability *cap, StgWeak *w);
void markWeakList(void);
+bool runSomeFinalizers(bool all);
#include "EndPrivate.h"
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index d61ca41a6b..d7d3723cd9 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -1875,3 +1875,28 @@ static void gcCAFs(void)
debugTrace(DEBUG_gccafs, "%d CAFs live", i);
}
#endif
+
+
+/* -----------------------------------------------------------------------------
+ The GC can leave some work for the mutator to do before the next
+ GC, provided the work can be safely overlapped with mutation. This
+ can help reduce the GC pause time.
+
+ The mutator can call doIdleGCWork() any time it likes, but
+ preferably when it is idle. It's safe for multiple capabilities to
+ call doIdleGCWork().
+
+ When 'all' is
+ * false: doIdleGCWork() should only take a short, bounded, amount
+ of time.
+ * true: doIdleGCWork() will complete all the outstanding GC work.
+
+ The return value is
+ * true if there's more to do (only if 'all' is false).
+ * false otherwise.
+ -------------------------------------------------------------------------- */
+
+bool doIdleGCWork(Capability *cap STG_UNUSED, bool all)
+{
+ return runSomeFinalizers(all);
+}
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 7fce87edd4..af662859ff 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -26,6 +26,8 @@ typedef void (*evac_fn)(void *user, StgClosure **root);
StgClosure * isAlive ( StgClosure *p );
void markCAFs ( evac_fn evac, void *user );
+bool doIdleGCWork(Capability *cap, bool all);
+
extern uint32_t N;
extern bool major_gc;