Fix a bug in parallel GC synchronisation

Summary: The problem boils down to global variables: in particular gc_threads[], which was being modified by a subsequent GC before the previous GC had finished with it. The fix is to not use global variables. This was causing setnumcapabilities001 to fail (again!). It's an old bug though. Test Plan: Ran setnumcapabilities001 in a loop for a couple of hours. Before this patch it had been failing after a few minutes. Not a very scientific test, but it's the best I have. Reviewers: bgamari, austin, fryguybob, niteria, erikd Subscribers: thomie Differential Revision: https://phabricator.haskell.org/D2654
author: Simon Marlow <smarlow@fb.com> 2016-10-28 08:30:14 -0700
committer: Simon Marlow <marlowsd@gmail.com> 2016-10-29 19:33:52 +0100
commit: 4e088b497edd83f361898fa9d2d62ff310b08945 (patch)
tree: f0ef3a8f2443cec7dcdce15493fd8ad92e197997 /rts/Schedule.c
parent: 4b300a32257c3ed272747f7e75709a26dd2d8407 (diff)
download: haskell-4e088b497edd83f361898fa9d2d62ff310b08945.tar.gz
1 files changed, 16 insertions, 14 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c
index a44512b3cb..7d5654fd03 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -1535,6 +1535,9 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
     uint32_t n_idle_caps = 0, n_failed_trygrab_idles = 0;
     StgTSO *tso;
     rtsBool *idle_cap;
+      // idle_cap is an array (allocated later) of size n_capabilities, where
+      // idle_cap[i] is rtsTrue if capability i will be idle during this GC
+      // cycle.
 #endif
 
     if (sched_state == SCHED_SHUTTING_DOWN) {
@@ -1735,23 +1738,13 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
         }
         debugTrace(DEBUG_sched, "%d idle caps", n_idle_caps);
 
-        // We set the gc_thread[i]->idle flag if that
-        // capability/thread is not participating in this collection.
-        // We also keep a local record of which capabilities are idle
-        // in idle_cap[], because scheduleDoGC() is re-entrant:
-        // another thread might start a GC as soon as we've finished
-        // this one, and thus the gc_thread[]->idle flags are invalid
-        // as soon as we release any threads after GC.  Getting this
-        // wrong leads to a rare and hard to debug deadlock!
-
         for (i=0; i < n_capabilities; i++) {
-            gc_threads[i]->idle = idle_cap[i];
             capabilities[i]->idle++;
         }
 
         // For all capabilities participating in this GC, wait until
         // they have stopped mutating and are standing by for GC.
-        waitForGcThreads(cap);
+        waitForGcThreads(cap, idle_cap);
 
 #if defined(THREADED_RTS)
         // Stable point where we can do a global check on our spark counters
@@ -1819,9 +1812,9 @@ delete_threads_and_gc:
     // reset pending_sync *before* GC, so that when the GC threads
     // emerge they don't immediately re-enter the GC.
     pending_sync = 0;
-    GarbageCollect(collect_gen, heap_census, gc_type, cap);
+    GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
 #else
-    GarbageCollect(collect_gen, heap_census, 0, cap);
+    GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
 #endif
 
     traceSparkCounters(cap);
@@ -1871,7 +1864,6 @@ delete_threads_and_gc:
 
     if (gc_type == SYNC_GC_PAR)
     {
-        releaseGCThreads(cap);
         for (i = 0; i < n_capabilities; i++) {
             if (i != cap->no) {
                 if (idle_cap[i]) {
@@ -1884,6 +1876,16 @@ delete_threads_and_gc:
             }
         }
         task->cap = cap;
+
+        // releaseGCThreads() happens *after* we have released idle
+        // capabilities.  Otherwise what can happen is one of the released
+        // threads starts a new GC, and finds that it can't acquire some of
+        // the disabled capabilities, because the previous GC still holds
+        // them, so those disabled capabilities will not be idle during the
+        // next GC round.  However, if we release the capabilities first,
+        // then they will be free (because they're disabled) when the next
+        // GC cycle happens.
+        releaseGCThreads(cap, idle_cap);
     }
 #endif
author	Simon Marlow <smarlow@fb.com>	2016-10-28 08:30:14 -0700
committer	Simon Marlow <marlowsd@gmail.com>	2016-10-29 19:33:52 +0100
commit	4e088b497edd83f361898fa9d2d62ff310b08945 (patch)
tree	f0ef3a8f2443cec7dcdce15493fd8ad92e197997 /rts/Schedule.c
parent	4b300a32257c3ed272747f7e75709a26dd2d8407 (diff)
download	haskell-4e088b497edd83f361898fa9d2d62ff310b08945.tar.gz