rts: Non-concurrent mark and sweep

This implements the core heap structure and a serial mark/sweep collector which can be used to manage the oldest-generation heap. This is the first step towards a concurrent mark-and-sweep collector aimed at low-latency applications. The full design of the collector implemented here is described in detail in a technical note B. Gamari. "A Concurrent Garbage Collector For the Glasgow Haskell Compiler" (2018) The basic heap structure used in this design is heavily inspired by K. Ueno & A. Ohori. "A fully concurrent garbage collector for functional programs on multicore processors." /ACM SIGPLAN Notices/ Vol. 51. No. 9 (presented by ICFP 2016) This design is intended to allow both marking and sweeping concurrent to execution of a multi-core mutator. Unlike the Ueno design, which requires no global synchronization pauses, the collector introduced here requires a stop-the-world pause at the beginning and end of the mark phase. To avoid heap fragmentation, the allocator consists of a number of fixed-size /sub-allocators/. Each of these sub-allocators allocators into its own set of /segments/, themselves allocated from the block allocator. Each segment is broken into a set of fixed-size allocation blocks (which back allocations) in addition to a bitmap (used to track the liveness of blocks) and some additional metadata (used also used to track liveness). This heap structure enables collection via mark-and-sweep, which can be performed concurrently via a snapshot-at-the-beginning scheme (although concurrent collection is not implemented in this patch). The mark queue is a fairly straightforward chunked-array structure. The representation is a bit more verbose than a typical mark queue to accomodate a combination of two features: * a mark FIFO, which improves the locality of marking, reducing one of the major overheads seen in mark/sweep allocators (see [1] for details) * the selector optimization and indirection shortcutting, which requires that we track where we found each reference to an object in case we need to update the reference at a later point (e.g. when we find that it is an indirection). See Note [Origin references in the nonmoving collector] (in `NonMovingMark.h`) for details. Beyond this the mark/sweep is fairly run-of-the-mill. [1] R. Garner, S.M. Blackburn, D. Frampton. "Effective Prefetch for Mark-Sweep Garbage Collection." ISMM 2007. Co-Authored-By: Ben Gamari <ben@well-typed.com>
author: Ömer Sinan Ağacan <omer@well-typed.com> 2019-02-05 00:18:44 -0500
committer: Ben Gamari <ben@smart-cactus.org> 2019-10-20 21:15:37 -0400
commit: 68e0647f432f9d79ae13a23f614ef293bfd297a9 (patch)
tree: 89211783fbd4d8d9502e2777b2719da493fef851 /rts/sm/GC.c
parent: b3ef2d1a861e9b892d64f22f6a233ea331db86d1 (diff)
download: haskell-68e0647f432f9d79ae13a23f614ef293bfd297a9.tar.gz
1 files changed, 193 insertions, 119 deletions
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 3f301aeb53..a3d0e4a164 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -51,6 +51,7 @@
 #include "CheckUnload.h"
 #include "CNF.h"
 #include "RtsFlags.h"
+#include "NonMoving.h"
 
 #include <string.h> // for memset()
 #include <unistd.h>
@@ -159,7 +160,6 @@ static void mark_root               (void *user, StgClosure **root);
 static void prepare_collected_gen   (generation *gen);
 static void prepare_uncollected_gen (generation *gen);
 static void init_gc_thread          (gc_thread *t);
-static void resize_generations      (void);
 static void resize_nursery          (void);
 static void start_gc_threads        (void);
 static void scavenge_until_all_done (void);
@@ -572,7 +572,7 @@ GarbageCollect (uint32_t collect_gen,
     gen = &generations[g];
 
     // for generations we collected...
-    if (g <= N) {
+    if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) {
 
         /* free old memory and shift to-space into from-space for all
          * the collected generations (except the allocation area).  These
@@ -710,8 +710,42 @@ GarbageCollect (uint32_t collect_gen,
     }
   } // for all generations
 
-  // update the max size of older generations after a major GC
-  resize_generations();
+  // Mark and sweep the oldest generation.
+  // N.B. This can only happen after we've moved
+  // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
+  ASSERT(oldest_gen->scavenged_large_objects == NULL);
+  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+      // All threads in non-moving heap should be found to be alive, becuase
+      // threads in the non-moving generation's list should live in the
+      // non-moving heap, and we consider non-moving objects alive during
+      // preparation.
+      ASSERT(oldest_gen->old_threads == END_TSO_QUEUE);
+      // For weaks, remember that we evacuated all weaks to the non-moving heap
+      // in markWeakPtrList(), and then moved the weak_ptr_list list to
+      // old_weak_ptr_list. We then moved weaks with live keys to the
+      // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in
+      // old_weak_ptr_list to dead_weak_ptr_list. So at this point
+      // old_weak_ptr_list should be empty.
+      ASSERT(oldest_gen->old_weak_ptr_list == NULL);
+
+      // we may need to take the lock to allocate mark queue blocks
+      RELEASE_SM_LOCK;
+      // dead_weak_ptr_list contains weak pointers with dead keys. Those need to
+      // be kept alive because we'll use them in finalizeSchedulers(). Similarly
+      // resurrected_threads are also going to be used in resurrectedThreads()
+      // so we need to mark those too.
+      // Note that in sequential case these lists will be appended with more
+      // weaks and threads found to be dead in mark.
+      nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
+      ACQUIRE_SM_LOCK;
+  }
+
+  // Update the max size of older generations after a major GC:
+  // We can't resize here in the case of the concurrent collector since we
+  // don't yet know how much live data we have. This will be instead done
+  // once we finish marking.
+  if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving)
+      resizeGenerations();
 
   // Free the mark stack.
   if (mark_stack_top_bd != NULL) {
@@ -735,7 +769,7 @@ GarbageCollect (uint32_t collect_gen,
 
  // mark the garbage collected CAFs as dead
 #if defined(DEBUG)
-  if (major_gc) { gcCAFs(); }
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); }
 #endif
 
   // Update the stable name hash table
@@ -768,8 +802,9 @@ GarbageCollect (uint32_t collect_gen,
   // check sanity after GC
   // before resurrectThreads(), because that might overwrite some
   // closures, which will cause problems with THREADED where we don't
-  // fill slop.
-  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc));
+  // fill slop. If we are using the nonmoving collector then we can't claim to
+  // be *after* the major GC; it's now running concurrently.
+  IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving));
 
   // If a heap census is due, we need to do it before
   // resurrectThreads(), for the same reason as checkSanity above:
@@ -942,6 +977,7 @@ new_gc_thread (uint32_t n, gc_thread *t)
         ws->todo_overflow = NULL;
         ws->n_todo_overflow = 0;
         ws->todo_large_objects = NULL;
+        ws->todo_seg = END_NONMOVING_TODO_LIST;
 
         ws->part_list = NULL;
         ws->n_part_blocks = 0;
@@ -1321,6 +1357,18 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
 #endif
 
 /* ----------------------------------------------------------------------------
+   Save the mutable lists in saved_mut_lists where it will be scavenged
+   during GC
+   ------------------------------------------------------------------------- */
+
+static void
+stash_mut_list (Capability *cap, uint32_t gen_no)
+{
+    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
+    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
+}
+
+/* ----------------------------------------------------------------------------
    Initialise a generation that is to be collected
    ------------------------------------------------------------------------- */
 
@@ -1331,11 +1379,17 @@ prepare_collected_gen (generation *gen)
     gen_workspace *ws;
     bdescr *bd, *next;
 
-    // Throw away the current mutable list.  Invariant: the mutable
-    // list always has at least one block; this means we can avoid a
-    // check for NULL in recordMutable().
     g = gen->no;
-    if (g != 0) {
+
+    if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) {
+        // Nonmoving heap's mutable list is always a root.
+        for (i = 0; i < n_capabilities; i++) {
+            stash_mut_list(capabilities[i], g);
+        }
+    } else if (g != 0) {
+        // Otherwise throw away the current mutable list. Invariant: the
+        // mutable list always has at least one block; this means we can avoid
+        // a check for NULL in recordMutable().
         for (i = 0; i < n_capabilities; i++) {
             freeChain(capabilities[i]->mut_lists[g]);
             capabilities[i]->mut_lists[g] =
@@ -1351,13 +1405,17 @@ prepare_collected_gen (generation *gen)
     gen->old_threads = gen->threads;
     gen->threads = END_TSO_QUEUE;
 
-    // deprecate the existing blocks
-    gen->old_blocks   = gen->blocks;
-    gen->n_old_blocks = gen->n_blocks;
-    gen->blocks       = NULL;
-    gen->n_blocks     = 0;
-    gen->n_words      = 0;
-    gen->live_estimate = 0;
+    // deprecate the existing blocks (except in the case of the nonmoving
+    // collector since these will be preserved in nonmovingCollect for the
+    // concurrent GC).
+    if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) {
+        gen->old_blocks   = gen->blocks;
+        gen->n_old_blocks = gen->n_blocks;
+        gen->blocks       = NULL;
+        gen->n_blocks     = 0;
+        gen->n_words      = 0;
+        gen->live_estimate = 0;
+    }
 
     // initialise the large object queues.
     ASSERT(gen->scavenged_large_objects == NULL);
@@ -1451,18 +1509,6 @@ prepare_collected_gen (generation *gen)
     }
 }
 
-
-/* ----------------------------------------------------------------------------
-   Save the mutable lists in saved_mut_lists
-   ------------------------------------------------------------------------- */
-
-static void
-stash_mut_list (Capability *cap, uint32_t gen_no)
-{
-    cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no];
-    cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node);
-}
-
 /* ----------------------------------------------------------------------------
    Initialise a generation that is *not* to be collected
    ------------------------------------------------------------------------- */
@@ -1531,31 +1577,57 @@ collect_gct_blocks (void)
 }
 
 /* -----------------------------------------------------------------------------
-   During mutation, any blocks that are filled by allocatePinned() are
-   stashed on the local pinned_object_blocks list, to avoid needing to
-   take a global lock.  Here we collect those blocks from the
-   cap->pinned_object_blocks lists and put them on the
-   main g0->large_object list.
+   During mutation, any blocks that are filled by allocatePinned() are stashed
+   on the local pinned_object_blocks list, to avoid needing to take a global
+   lock.  Here we collect those blocks from the cap->pinned_object_blocks lists
+   and put them on the g0->large_object or oldest_gen->large_objects.
+
+   How to decide which list to put them on?
+
+   - When non-moving heap is enabled and this is a major GC, we put them on
+     oldest_gen. This is because after preparation we really want no
+     old-to-young references, and we want to be able to reset mut_lists. For
+     this we need to promote every potentially live object to the oldest gen.
+
+   - Otherwise we put them on g0.
    -------------------------------------------------------------------------- */
 
 static void
 collect_pinned_object_blocks (void)
 {
-    uint32_t n;
-    bdescr *bd, *prev;
+    generation *gen;
+    const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving;
+    if (use_nonmoving && major_gc) {
+        gen = oldest_gen;
+    } else {
+        gen = g0;
+    }
 
-    for (n = 0; n < n_capabilities; n++) {
-        prev = NULL;
-        for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
-            prev = bd;
+    for (uint32_t n = 0; n < n_capabilities; n++) {
+        bdescr *last = NULL;
+        if (use_nonmoving && gen == oldest_gen) {
+            // Mark objects as belonging to the nonmoving heap
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                bd->flags |= BF_NONMOVING;
+                bd->gen = oldest_gen;
+                bd->gen_no = oldest_gen->no;
+                oldest_gen->n_large_words += bd->free - bd->start;
+                oldest_gen->n_large_blocks += bd->blocks;
+                last = bd;
+            }
+        } else {
+            for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) {
+                last = bd;
+            }
         }
-        if (prev != NULL) {
-            prev->link = g0->large_objects;
-            if (g0->large_objects != NULL) {
-                g0->large_objects->u.back = prev;
+
+        if (last != NULL) {
+            last->link = gen->large_objects;
+            if (gen->large_objects != NULL) {
+                gen->large_objects->u.back = last;
             }
-            g0->large_objects = capabilities[n]->pinned_object_blocks;
-            capabilities[n]->pinned_object_blocks = 0;
+            gen->large_objects = capabilities[n]->pinned_object_blocks;
+            capabilities[n]->pinned_object_blocks = NULL;
         }
     }
 }
@@ -1614,98 +1686,100 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root)
    percentage of the maximum heap size available to allocate into.
    ------------------------------------------------------------------------- */
 
-static void
-resize_generations (void)
+void
+resizeGenerations (void)
 {
     uint32_t g;
+    W_ live, size, min_alloc, words;
+    const W_ max  = RtsFlags.GcFlags.maxHeapSize;
+    const W_ gens = RtsFlags.GcFlags.generations;
 
-    if (major_gc && RtsFlags.GcFlags.generations > 1) {
-        W_ live, size, min_alloc, words;
-        const W_ max  = RtsFlags.GcFlags.maxHeapSize;
-        const W_ gens = RtsFlags.GcFlags.generations;
-
-        // live in the oldest generations
-        if (oldest_gen->live_estimate != 0) {
-            words = oldest_gen->live_estimate;
-        } else {
-            words = oldest_gen->n_words;
-        }
-        live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
-            oldest_gen->n_large_blocks +
-            oldest_gen->n_compact_blocks;
+    // live in the oldest generations
+    if (oldest_gen->live_estimate != 0) {
+        words = oldest_gen->live_estimate;
+    } else {
+        words = oldest_gen->n_words;
+    }
+    live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W +
+        oldest_gen->n_large_blocks +
+        oldest_gen->n_compact_blocks;
 
-        // default max size for all generations except zero
-        size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
-                       RtsFlags.GcFlags.minOldGenSize);
+    // default max size for all generations except zero
+    size = stg_max(live * RtsFlags.GcFlags.oldGenFactor,
+                   RtsFlags.GcFlags.minOldGenSize);
 
-        if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
-            if (max > 0) {
-                RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
-            } else {
-                RtsFlags.GcFlags.heapSizeSuggestion = size;
-            }
+    if (RtsFlags.GcFlags.heapSizeSuggestionAuto) {
+        if (max > 0) {
+            RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size);
+        } else {
+            RtsFlags.GcFlags.heapSizeSuggestion = size;
         }
+    }
 
-        // minimum size for generation zero
-        min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
-                            RtsFlags.GcFlags.minAllocAreaSize
-                            * (W_)n_capabilities);
-
-        // Auto-enable compaction when the residency reaches a
-        // certain percentage of the maximum heap size (default: 30%).
-        if (RtsFlags.GcFlags.compact ||
-            (max > 0 &&
-             oldest_gen->n_blocks >
-             (RtsFlags.GcFlags.compactThreshold * max) / 100)) {
-            oldest_gen->mark = 1;
-            oldest_gen->compact = 1;
+    // minimum size for generation zero
+    min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200,
+                        RtsFlags.GcFlags.minAllocAreaSize
+                        * (W_)n_capabilities);
+
+    // Auto-enable compaction when the residency reaches a
+    // certain percentage of the maximum heap size (default: 30%).
+    // Except when non-moving GC is enabled.
+    if (!RtsFlags.GcFlags.useNonmoving &&
+        (RtsFlags.GcFlags.compact ||
+         (max > 0 &&
+          oldest_gen->n_blocks >
+          (RtsFlags.GcFlags.compactThreshold * max) / 100))) {
+        oldest_gen->mark = 1;
+        oldest_gen->compact = 1;
 //        debugBelch("compaction: on\n", live);
-        } else {
-            oldest_gen->mark = 0;
-            oldest_gen->compact = 0;
+    } else {
+        oldest_gen->mark = 0;
+        oldest_gen->compact = 0;
 //        debugBelch("compaction: off\n", live);
-        }
+    }
 
-        if (RtsFlags.GcFlags.sweep) {
-            oldest_gen->mark = 1;
-        }
+    if (RtsFlags.GcFlags.sweep) {
+        oldest_gen->mark = 1;
+    }
 
-        // if we're going to go over the maximum heap size, reduce the
-        // size of the generations accordingly.  The calculation is
-        // different if compaction is turned on, because we don't need
-        // to double the space required to collect the old generation.
-        if (max != 0) {
+    // if we're going to go over the maximum heap size, reduce the
+    // size of the generations accordingly.  The calculation is
+    // different if compaction is turned on, because we don't need
+    // to double the space required to collect the old generation.
+    if (max != 0) {
+
+        // this test is necessary to ensure that the calculations
+        // below don't have any negative results - we're working
+        // with unsigned values here.
+        if (max < min_alloc) {
+            heapOverflow();
+        }
 
-            // this test is necessary to ensure that the calculations
-            // below don't have any negative results - we're working
-            // with unsigned values here.
-            if (max < min_alloc) {
-                heapOverflow();
+        if (oldest_gen->compact) {
+            if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2 - 1);
             }
-
-            if (oldest_gen->compact) {
-                if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2 - 1);
-                }
-            } else {
-                if ( (size * (gens - 1) * 2) + min_alloc > max ) {
-                    size = (max - min_alloc) / ((gens - 1) * 2);
-                }
+        } else {
+            if ( (size * (gens - 1) * 2) + min_alloc > max ) {
+                size = (max - min_alloc) / ((gens - 1) * 2);
             }
+        }
 
-            if (size < live) {
-                heapOverflow();
-            }
+        if (size < live) {
+            heapOverflow();
         }
+    }
 
 #if 0
-        debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
-                   min_alloc, size, max);
+    debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live,
+               min_alloc, size, max);
+    debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n",
+               oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks);
+    debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks);
 #endif
 
-        for (g = 0; g < gens; g++) {
-            generations[g].max_blocks = size;
-        }
+    for (g = 0; g < gens; g++) {
+        generations[g].max_blocks = size;
     }
 }
author	Ömer Sinan Ağacan <omer@well-typed.com>	2019-02-05 00:18:44 -0500
committer	Ben Gamari <ben@smart-cactus.org>	2019-10-20 21:15:37 -0400
commit	68e0647f432f9d79ae13a23f614ef293bfd297a9 (patch)
tree	89211783fbd4d8d9502e2777b2719da493fef851 /rts/sm/GC.c
parent	b3ef2d1a861e9b892d64f22f6a233ea331db86d1 (diff)
download	haskell-68e0647f432f9d79ae13a23f614ef293bfd297a9.tar.gz