diff options
-rw-r--r-- | rts/Schedule.c | 14
-rw-r--r-- | rts/Weak.c | 119
-rw-r--r-- | rts/Weak.h | 1
-rw-r--r-- | rts/sm/GC.c | 25
-rw-r--r-- | rts/sm/GC.h | 2
5 files changed, 149 insertions(+), 12 deletions(-)
diff --git a/rts/Schedule.c b/rts/Schedule.c index 5160cb495b..2dc850c50d 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -679,7 +679,11 @@ scheduleYield (Capability **pcap, Task *task) // otherwise yield (sleep), and keep yielding if necessary. do { - didGcLast = yieldCapability(&cap,task, !didGcLast); + if (doIdleGCWork(cap, false)) { + didGcLast = false; + } else { + didGcLast = yieldCapability(&cap,task, !didGcLast); + } } while (shouldYieldCapability(cap,task,didGcLast)); @@ -1798,6 +1802,9 @@ delete_threads_and_gc: } #endif + // Do any remaining idle GC work from the previous GC + doIdleGCWork(cap, true /* all of it */); + #if defined(THREADED_RTS) // reset pending_sync *before* GC, so that when the GC threads // emerge they don't immediately re-enter the GC. @@ -1807,6 +1814,11 @@ delete_threads_and_gc: GarbageCollect(collect_gen, heap_census, 0, cap, NULL); #endif + // If we're shutting down, don't leave any idle GC work to do. + if (sched_state == SCHED_SHUTTING_DOWN) { + doIdleGCWork(cap, true /* all of it */); + } + traceSparkCounters(cap); switch (recent_activity) { diff --git a/rts/Weak.c b/rts/Weak.c index 577d1cd7d8..80623409b5 100644 --- a/rts/Weak.c +++ b/rts/Weak.c @@ -17,6 +17,12 @@ #include "ThreadLabels.h" #include "Trace.h" +// List of dead weak pointers collected by the last GC +static StgWeak *finalizer_list = NULL; + +// Count of the above list. +static uint32_t n_finalizers = 0; + void runCFinalizers(StgCFinalizerList *list) { @@ -84,15 +90,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list) StgMutArrPtrs *arr; StgWord size; uint32_t n, i; - Task *task; - task = myTask(); - if (task != NULL) { - task->running_finalizers = true; - } + ASSERT(n_finalizers == 0); + + finalizer_list = list; - // count number of finalizers, and kill all the weak pointers first... 
+ // Traverse the list and + // * count the number of Haskell finalizers + // * overwrite all the weak pointers with DEAD_WEAK n = 0; + i = 0; for (w = list; w; w = w->link) { // Better not be a DEAD_WEAK at this stage; the garbage // collector removes DEAD_WEAKs from the weak pointer list. @@ -102,7 +109,8 @@ scheduleFinalizers(Capability *cap, StgWeak *list) n++; } - runCFinalizers((StgCFinalizerList *)w->cfinalizers); + // Remember the length of the list, for runSomeFinalizers() below + i++; #if defined(PROFILING) // A weak pointer is inherently used, so we do not need to call @@ -113,14 +121,16 @@ scheduleFinalizers(Capability *cap, StgWeak *list) // no need to fill the slop, either. See stg_DEAD_WEAK_info // in StgMiscClosures.cmm. #endif + + // We must overwrite the header with DEAD_WEAK, so that if + // there's a later call to finalizeWeak# on this weak pointer, + // we don't run the finalizer again. SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs); } - if (task != NULL) { - task->running_finalizers = false; - } + n_finalizers = i; - // No finalizers to run? + // No Haskell finalizers to run? if (n == 0) return; debugTrace(DEBUG_weak, "weak: batching %d finalizers", n); @@ -156,3 +166,90 @@ scheduleFinalizers(Capability *cap, StgWeak *list) scheduleThread(cap,t); labelThread(cap, t, "weak finalizer thread"); } + +/* ----------------------------------------------------------------------------- + Incrementally running C finalizers + + The GC detects all the dead finalizers, but we don't want to run + them during the GC because that increases the time that the runtime + is paused. + + What options are there? + + 1. Parallelise running the C finalizers across the GC threads + - doesn't solve the pause problem, just reduces it (maybe by a lot) + + 2. Make a Haskell thread to run the C finalizers, like we do for + Haskell finalizers. + + scheduling is handled for us + - no guarantee that we'll process finalizers in a timely manner + + 3. 
Run finalizers when any capability is idle. + + reduces pause to 0 + - requires scheduler modifications + - if the runtime is busy, finalizers wait until the next GC + + 4. like (3), but also run finalizers incrementally between GCs. + - reduces the delay to run finalizers compared with (3) + + For now we do (3). It would be easy to do (4) later by adding a + call to doIdleGCWork() in the scheduler loop, but I haven't found + that necessary so far. + + -------------------------------------------------------------------------- */ + +// Run this many finalizers before returning from +// runSomeFinalizers(). This is so that we only tie up the capability +// for a short time, and respond quickly if new work becomes +// available. +static const int32_t finalizer_chunk = 100; + +// non-zero if a thread is already in runSomeFinalizers(). This +// protects the globals finalizer_list and n_finalizers. +static volatile StgWord finalizer_lock = 0; + +// +// Run some C finalizers. Returns true if there's more work to do. +// +bool runSomeFinalizers(bool all) +{ + if (n_finalizers == 0) + return false; + + if (cas(&finalizer_lock, 0, 1) != 0) { + // another capability is doing the work, it's safe to say + // there's nothing to do, because the thread already in + // runSomeFinalizers() will call in again. 
+ return false; + } + + debugTrace(DEBUG_sched, "running C finalizers, %d remaining", n_finalizers); + + Task *task = myTask(); + if (task != NULL) { + task->running_finalizers = true; + } + + StgWeak *w = finalizer_list; + int32_t count = 0; + while (w != NULL) { + runCFinalizers((StgCFinalizerList *)w->cfinalizers); + w = w->link; + ++count; + if (!all && count >= finalizer_chunk) break; + } + + finalizer_list = w; + n_finalizers -= count; + + if (task != NULL) { + task->running_finalizers = false; + } + + debugTrace(DEBUG_sched, "ran %d C finalizers", count); + + write_barrier(); + finalizer_lock = 0; + + return n_finalizers != 0; +} diff --git a/rts/Weak.h b/rts/Weak.h index ab335424db..fb67981497 100644 --- a/rts/Weak.h +++ b/rts/Weak.h @@ -19,5 +19,6 @@ void runCFinalizers(StgCFinalizerList *list); void runAllCFinalizers(StgWeak *w); void scheduleFinalizers(Capability *cap, StgWeak *w); void markWeakList(void); +bool runSomeFinalizers(bool all); #include "EndPrivate.h" diff --git a/rts/sm/GC.c b/rts/sm/GC.c index d61ca41a6b..d7d3723cd9 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -1875,3 +1875,28 @@ static void gcCAFs(void) debugTrace(DEBUG_gccafs, "%d CAFs live", i); } #endif + + +/* ----------------------------------------------------------------------------- + The GC can leave some work for the mutator to do before the next + GC, provided the work can be safely overlapped with mutation. This + can help reduce the GC pause time. + + The mutator can call doIdleGCWork() any time it likes, but + preferably when it is idle. It's safe for multiple capabilities to + call doIdleGCWork(). + + When 'all' is + * false: doIdleGCWork() should only take a short, bounded, amount + of time. + * true: doIdleGCWork() will complete all the outstanding GC work. + + The return value is + * true if there's more to do (only if 'all' is false). + * false otherwise. 
+ -------------------------------------------------------------------------- */ + +bool doIdleGCWork(Capability *cap STG_UNUSED, bool all) +{ + return runSomeFinalizers(all); +} diff --git a/rts/sm/GC.h b/rts/sm/GC.h index 7fce87edd4..af662859ff 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -26,6 +26,8 @@ typedef void (*evac_fn)(void *user, StgClosure **root); StgClosure * isAlive ( StgClosure *p ); void markCAFs ( evac_fn evac, void *user ); +bool doIdleGCWork(Capability *cap, bool all); + extern uint32_t N; extern bool major_gc; |