diff options
author | Ömer Sinan Ağacan <omer@well-typed.com> | 2019-02-05 00:18:44 -0500 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2019-10-20 21:15:37 -0400 |
commit | 68e0647f432f9d79ae13a23f614ef293bfd297a9 (patch) | |
tree | 89211783fbd4d8d9502e2777b2719da493fef851 /rts/sm/GC.c | |
parent | b3ef2d1a861e9b892d64f22f6a233ea331db86d1 (diff) | |
download | haskell-68e0647f432f9d79ae13a23f614ef293bfd297a9.tar.gz |
rts: Non-concurrent mark and sweep
This implements the core heap structure and a serial mark/sweep
collector which can be used to manage the oldest-generation heap.
This is the first step towards a concurrent mark-and-sweep collector
aimed at low-latency applications.
The full design of the collector implemented here is described in detail
in a technical note
B. Gamari. "A Concurrent Garbage Collector For the Glasgow Haskell
Compiler" (2018)
The basic heap structure used in this design is heavily inspired by
K. Ueno & A. Ohori. "A fully concurrent garbage collector for
functional programs on multicore processors." /ACM SIGPLAN Notices/
Vol. 51. No. 9 (presented by ICFP 2016)
This design is intended to allow both marking and sweeping
concurrent to execution of a multi-core mutator. Unlike the Ueno design,
which requires no global synchronization pauses, the collector
introduced here requires a stop-the-world pause at the beginning and end
of the mark phase.
To avoid heap fragmentation, the allocator consists of a number of
fixed-size /sub-allocators/. Each of these sub-allocators allocators into
its own set of /segments/, themselves allocated from the block
allocator. Each segment is broken into a set of fixed-size allocation
blocks (which back allocations) in addition to a bitmap (used to track
the liveness of blocks) and some additional metadata (used also used
to track liveness).
This heap structure enables collection via mark-and-sweep, which can be
performed concurrently via a snapshot-at-the-beginning scheme (although
concurrent collection is not implemented in this patch).
The mark queue is a fairly straightforward chunked-array structure.
The representation is a bit more verbose than a typical mark queue to
accomodate a combination of two features:
* a mark FIFO, which improves the locality of marking, reducing one of
the major overheads seen in mark/sweep allocators (see [1] for
details)
* the selector optimization and indirection shortcutting, which
requires that we track where we found each reference to an object
in case we need to update the reference at a later point (e.g. when
we find that it is an indirection). See Note [Origin references in
the nonmoving collector] (in `NonMovingMark.h`) for details.
Beyond this the mark/sweep is fairly run-of-the-mill.
[1] R. Garner, S.M. Blackburn, D. Frampton. "Effective Prefetch for
Mark-Sweep Garbage Collection." ISMM 2007.
Co-Authored-By: Ben Gamari <ben@well-typed.com>
Diffstat (limited to 'rts/sm/GC.c')
-rw-r--r-- | rts/sm/GC.c | 312 |
1 files changed, 193 insertions, 119 deletions
diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 3f301aeb53..a3d0e4a164 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -51,6 +51,7 @@ #include "CheckUnload.h" #include "CNF.h" #include "RtsFlags.h" +#include "NonMoving.h" #include <string.h> // for memset() #include <unistd.h> @@ -159,7 +160,6 @@ static void mark_root (void *user, StgClosure **root); static void prepare_collected_gen (generation *gen); static void prepare_uncollected_gen (generation *gen); static void init_gc_thread (gc_thread *t); -static void resize_generations (void); static void resize_nursery (void); static void start_gc_threads (void); static void scavenge_until_all_done (void); @@ -572,7 +572,7 @@ GarbageCollect (uint32_t collect_gen, gen = &generations[g]; // for generations we collected... - if (g <= N) { + if (g <= N && !(RtsFlags.GcFlags.useNonmoving && gen == oldest_gen)) { /* free old memory and shift to-space into from-space for all * the collected generations (except the allocation area). These @@ -710,8 +710,42 @@ GarbageCollect (uint32_t collect_gen, } } // for all generations - // update the max size of older generations after a major GC - resize_generations(); + // Mark and sweep the oldest generation. + // N.B. This can only happen after we've moved + // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects. + ASSERT(oldest_gen->scavenged_large_objects == NULL); + if (RtsFlags.GcFlags.useNonmoving && major_gc) { + // All threads in non-moving heap should be found to be alive, becuase + // threads in the non-moving generation's list should live in the + // non-moving heap, and we consider non-moving objects alive during + // preparation. + ASSERT(oldest_gen->old_threads == END_TSO_QUEUE); + // For weaks, remember that we evacuated all weaks to the non-moving heap + // in markWeakPtrList(), and then moved the weak_ptr_list list to + // old_weak_ptr_list. We then moved weaks with live keys to the + // weak_ptr_list again. Then, in collectDeadWeakPtrs() we moved weaks in + // old_weak_ptr_list to dead_weak_ptr_list. So at this point + // old_weak_ptr_list should be empty. + ASSERT(oldest_gen->old_weak_ptr_list == NULL); + + // we may need to take the lock to allocate mark queue blocks + RELEASE_SM_LOCK; + // dead_weak_ptr_list contains weak pointers with dead keys. Those need to + // be kept alive because we'll use them in finalizeSchedulers(). Similarly + // resurrected_threads are also going to be used in resurrectedThreads() + // so we need to mark those too. + // Note that in sequential case these lists will be appended with more + // weaks and threads found to be dead in mark. + nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads); + ACQUIRE_SM_LOCK; + } + + // Update the max size of older generations after a major GC: + // We can't resize here in the case of the concurrent collector since we + // don't yet know how much live data we have. This will be instead done + // once we finish marking. + if (major_gc && RtsFlags.GcFlags.generations > 1 && ! RtsFlags.GcFlags.useNonmoving) + resizeGenerations(); // Free the mark stack. if (mark_stack_top_bd != NULL) { @@ -735,7 +769,7 @@ GarbageCollect (uint32_t collect_gen, // mark the garbage collected CAFs as dead #if defined(DEBUG) - if (major_gc) { gcCAFs(); } + if (major_gc && !RtsFlags.GcFlags.useNonmoving) { gcCAFs(); } #endif // Update the stable name hash table @@ -768,8 +802,9 @@ GarbageCollect (uint32_t collect_gen, // check sanity after GC // before resurrectThreads(), because that might overwrite some // closures, which will cause problems with THREADED where we don't - // fill slop. - IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc)); + // fill slop. If we are using the nonmoving collector then we can't claim to + // be *after* the major GC; it's now running concurrently. + IF_DEBUG(sanity, checkSanity(true /* after GC */, major_gc && !RtsFlags.GcFlags.useNonmoving)); // If a heap census is due, we need to do it before // resurrectThreads(), for the same reason as checkSanity above: @@ -942,6 +977,7 @@ new_gc_thread (uint32_t n, gc_thread *t) ws->todo_overflow = NULL; ws->n_todo_overflow = 0; ws->todo_large_objects = NULL; + ws->todo_seg = END_NONMOVING_TODO_LIST; ws->part_list = NULL; ws->n_part_blocks = 0; @@ -1321,6 +1357,18 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) #endif /* ---------------------------------------------------------------------------- + Save the mutable lists in saved_mut_lists where it will be scavenged + during GC + ------------------------------------------------------------------------- */ + +static void +stash_mut_list (Capability *cap, uint32_t gen_no) +{ + cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; + cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); +} + +/* ---------------------------------------------------------------------------- Initialise a generation that is to be collected ------------------------------------------------------------------------- */ @@ -1331,11 +1379,17 @@ prepare_collected_gen (generation *gen) gen_workspace *ws; bdescr *bd, *next; - // Throw away the current mutable list. Invariant: the mutable - // list always has at least one block; this means we can avoid a - // check for NULL in recordMutable(). g = gen->no; - if (g != 0) { + + if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) { + // Nonmoving heap's mutable list is always a root. + for (i = 0; i < n_capabilities; i++) { + stash_mut_list(capabilities[i], g); + } + } else if (g != 0) { + // Otherwise throw away the current mutable list. Invariant: the + // mutable list always has at least one block; this means we can avoid + // a check for NULL in recordMutable(). for (i = 0; i < n_capabilities; i++) { freeChain(capabilities[i]->mut_lists[g]); capabilities[i]->mut_lists[g] = @@ -1351,13 +1405,17 @@ prepare_collected_gen (generation *gen) gen->old_threads = gen->threads; gen->threads = END_TSO_QUEUE; - // deprecate the existing blocks - gen->old_blocks = gen->blocks; - gen->n_old_blocks = gen->n_blocks; - gen->blocks = NULL; - gen->n_blocks = 0; - gen->n_words = 0; - gen->live_estimate = 0; + // deprecate the existing blocks (except in the case of the nonmoving + // collector since these will be preserved in nonmovingCollect for the + // concurrent GC). + if (!(RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no)) { + gen->old_blocks = gen->blocks; + gen->n_old_blocks = gen->n_blocks; + gen->blocks = NULL; + gen->n_blocks = 0; + gen->n_words = 0; + gen->live_estimate = 0; + } // initialise the large object queues. ASSERT(gen->scavenged_large_objects == NULL); @@ -1451,18 +1509,6 @@ prepare_collected_gen (generation *gen) } } - -/* ---------------------------------------------------------------------------- - Save the mutable lists in saved_mut_lists - ------------------------------------------------------------------------- */ - -static void -stash_mut_list (Capability *cap, uint32_t gen_no) -{ - cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; - cap->mut_lists[gen_no] = allocBlockOnNode_sync(cap->node); -} - /* ---------------------------------------------------------------------------- Initialise a generation that is *not* to be collected ------------------------------------------------------------------------- */ @@ -1531,31 +1577,57 @@ collect_gct_blocks (void) } /* ----------------------------------------------------------------------------- - During mutation, any blocks that are filled by allocatePinned() are - stashed on the local pinned_object_blocks list, to avoid needing to - take a global lock. Here we collect those blocks from the - cap->pinned_object_blocks lists and put them on the - main g0->large_object list. + During mutation, any blocks that are filled by allocatePinned() are stashed + on the local pinned_object_blocks list, to avoid needing to take a global + lock. Here we collect those blocks from the cap->pinned_object_blocks lists + and put them on the g0->large_object or oldest_gen->large_objects. + + How to decide which list to put them on? + + - When non-moving heap is enabled and this is a major GC, we put them on + oldest_gen. This is because after preparation we really want no + old-to-young references, and we want to be able to reset mut_lists. For + this we need to promote every potentially live object to the oldest gen. + + - Otherwise we put them on g0. -------------------------------------------------------------------------- */ static void collect_pinned_object_blocks (void) { - uint32_t n; - bdescr *bd, *prev; + generation *gen; + const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving; + if (use_nonmoving && major_gc) { + gen = oldest_gen; + } else { + gen = g0; + } - for (n = 0; n < n_capabilities; n++) { - prev = NULL; - for (bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { - prev = bd; + for (uint32_t n = 0; n < n_capabilities; n++) { + bdescr *last = NULL; + if (use_nonmoving && gen == oldest_gen) { + // Mark objects as belonging to the nonmoving heap + for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { + bd->flags |= BF_NONMOVING; + bd->gen = oldest_gen; + bd->gen_no = oldest_gen->no; + oldest_gen->n_large_words += bd->free - bd->start; + oldest_gen->n_large_blocks += bd->blocks; + last = bd; + } + } else { + for (bdescr *bd = capabilities[n]->pinned_object_blocks; bd != NULL; bd = bd->link) { + last = bd; + } } - if (prev != NULL) { - prev->link = g0->large_objects; - if (g0->large_objects != NULL) { - g0->large_objects->u.back = prev; + + if (last != NULL) { + last->link = gen->large_objects; + if (gen->large_objects != NULL) { + gen->large_objects->u.back = last; } - g0->large_objects = capabilities[n]->pinned_object_blocks; - capabilities[n]->pinned_object_blocks = 0; + gen->large_objects = capabilities[n]->pinned_object_blocks; + capabilities[n]->pinned_object_blocks = NULL; } } } @@ -1614,98 +1686,100 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root) percentage of the maximum heap size available to allocate into. ------------------------------------------------------------------------- */ -static void -resize_generations (void) +void +resizeGenerations (void) { uint32_t g; + W_ live, size, min_alloc, words; + const W_ max = RtsFlags.GcFlags.maxHeapSize; + const W_ gens = RtsFlags.GcFlags.generations; - if (major_gc && RtsFlags.GcFlags.generations > 1) { - W_ live, size, min_alloc, words; - const W_ max = RtsFlags.GcFlags.maxHeapSize; - const W_ gens = RtsFlags.GcFlags.generations; - - // live in the oldest generations - if (oldest_gen->live_estimate != 0) { - words = oldest_gen->live_estimate; - } else { - words = oldest_gen->n_words; - } - live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + - oldest_gen->n_large_blocks + - oldest_gen->n_compact_blocks; + // live in the oldest generations + if (oldest_gen->live_estimate != 0) { + words = oldest_gen->live_estimate; + } else { + words = oldest_gen->n_words; + } + live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + + oldest_gen->n_large_blocks + + oldest_gen->n_compact_blocks; - // default max size for all generations except zero - size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, - RtsFlags.GcFlags.minOldGenSize); + // default max size for all generations except zero + size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, + RtsFlags.GcFlags.minOldGenSize); - if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { - if (max > 0) { - RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size); - } else { - RtsFlags.GcFlags.heapSizeSuggestion = size; - } + if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { + if (max > 0) { + RtsFlags.GcFlags.heapSizeSuggestion = stg_min(max, size); + } else { + RtsFlags.GcFlags.heapSizeSuggestion = size; } + } - // minimum size for generation zero - min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, - RtsFlags.GcFlags.minAllocAreaSize - * (W_)n_capabilities); - - // Auto-enable compaction when the residency reaches a - // certain percentage of the maximum heap size (default: 30%). - if (RtsFlags.GcFlags.compact || - (max > 0 && - oldest_gen->n_blocks > - (RtsFlags.GcFlags.compactThreshold * max) / 100)) { - oldest_gen->mark = 1; - oldest_gen->compact = 1; + // minimum size for generation zero + min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, + RtsFlags.GcFlags.minAllocAreaSize + * (W_)n_capabilities); + + // Auto-enable compaction when the residency reaches a + // certain percentage of the maximum heap size (default: 30%). + // Except when non-moving GC is enabled. + if (!RtsFlags.GcFlags.useNonmoving && + (RtsFlags.GcFlags.compact || + (max > 0 && + oldest_gen->n_blocks > + (RtsFlags.GcFlags.compactThreshold * max) / 100))) { + oldest_gen->mark = 1; + oldest_gen->compact = 1; // debugBelch("compaction: on\n", live); - } else { - oldest_gen->mark = 0; - oldest_gen->compact = 0; + } else { + oldest_gen->mark = 0; + oldest_gen->compact = 0; // debugBelch("compaction: off\n", live); - } + } - if (RtsFlags.GcFlags.sweep) { - oldest_gen->mark = 1; - } + if (RtsFlags.GcFlags.sweep) { + oldest_gen->mark = 1; + } - // if we're going to go over the maximum heap size, reduce the - // size of the generations accordingly. The calculation is - // different if compaction is turned on, because we don't need - // to double the space required to collect the old generation. - if (max != 0) { + // if we're going to go over the maximum heap size, reduce the + // size of the generations accordingly. The calculation is + // different if compaction is turned on, because we don't need + // to double the space required to collect the old generation. + if (max != 0) { + + // this test is necessary to ensure that the calculations + // below don't have any negative results - we're working + // with unsigned values here. + if (max < min_alloc) { + heapOverflow(); + } - // this test is necessary to ensure that the calculations - // below don't have any negative results - we're working - // with unsigned values here. - if (max < min_alloc) { - heapOverflow(); + if (oldest_gen->compact) { + if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { + size = (max - min_alloc) / ((gens - 1) * 2 - 1); } - - if (oldest_gen->compact) { - if ( (size + (size - 1) * (gens - 2) * 2) + min_alloc > max ) { - size = (max - min_alloc) / ((gens - 1) * 2 - 1); - } - } else { - if ( (size * (gens - 1) * 2) + min_alloc > max ) { - size = (max - min_alloc) / ((gens - 1) * 2); - } + } else { + if ( (size * (gens - 1) * 2) + min_alloc > max ) { + size = (max - min_alloc) / ((gens - 1) * 2); } + } - if (size < live) { - heapOverflow(); - } + if (size < live) { + heapOverflow(); } + } #if 0 - debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live, - min_alloc, size, max); + debugBelch("live: %d, min_alloc: %d, size : %d, max = %d\n", live, + min_alloc, size, max); + debugBelch("resize_gen: n_blocks: %lu, n_large_block: %lu, n_compact_blocks: %lu\n", + oldest_gen->n_blocks, oldest_gen->n_large_blocks, oldest_gen->n_compact_blocks); + debugBelch("resize_gen: max_blocks: %lu -> %lu\n", oldest_gen->max_blocks, oldest_gen->n_blocks); #endif - for (g = 0; g < gens; g++) { - generations[g].max_blocks = size; - } + for (g = 0; g < gens; g++) { + generations[g].max_blocks = size; } } |