From 13dd78ddb158f98b35ad2eda50d0a7af63920ece Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Thu, 18 Apr 2019 14:08:32 -0400 Subject: Nonmoving: Allow aging and refactor static objects logic This commit does two things: * Allow aging of objects during the preparatory minor GC * Refactor handling of static objects to avoid the use of a hashtable --- rts/sm/Evac.c | 50 ++++++++++++++++----- rts/sm/GC.c | 10 ++++- rts/sm/GCAux.c | 4 +- rts/sm/NonMoving.c | 40 ++++++++++++----- rts/sm/NonMovingMark.c | 115 ++++++++++++++++++++++++++++++++++-------------- rts/sm/NonMovingMark.h | 6 +-- rts/sm/NonMovingScav.c | 45 +++++++++++++------ rts/sm/NonMovingSweep.c | 5 ++- rts/sm/NonMovingSweep.h | 2 +- rts/sm/Storage.c | 70 +++++++++++++++++++++++++++-- 10 files changed, 266 insertions(+), 81 deletions(-) diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index c2aaaacffa..b4e35849ba 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -69,12 +69,6 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) { ASSERT(gen_no < RtsFlags.GcFlags.generations); - if (RtsFlags.GcFlags.useNonmoving && major_gc) { - // unconditionally promote to non-moving heap in major gc - gct->copied += size; - return nonmovingAllocate(gct->cap, size); - } - StgPtr to; gen_workspace *ws; @@ -93,7 +87,20 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) { gct->copied += size; - return nonmovingAllocate(gct->cap, size); + to = nonmovingAllocate(gct->cap, size); + + // Add segment to the todo list unless it's already there + // current->todo_link == NULL means not in todo list + struct NonmovingSegment *seg = nonmovingGetSegment(to); + if (!seg->todo_link) { + gen_workspace *ws = &gct->gens[oldest_gen->no]; + seg->todo_link = ws->todo_seg; + ws->todo_seg = seg; + } + + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to); + return to; } ws = &gct->gens[gen_no]; // zero memory references here @@ -312,9 +319,7 @@ evacuate_large(StgPtr p) */ new_gen_no = bd->dest_no; - if (RtsFlags.GcFlags.useNonmoving && major_gc) { - new_gen_no = oldest_gen->no; - } else if (new_gen_no < gct->evac_gen_no) { + if (new_gen_no < gct->evac_gen_no) { if (gct->eager_promotion) { new_gen_no = gct->evac_gen_no; } else { @@ -363,6 +368,13 @@ evacuate_large(StgPtr p) STATIC_INLINE void evacuate_static_object (StgClosure **link_field, StgClosure *q) { + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) { + // See Note [Static objects under the nonmoving collector] in Storage.c. + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + return; + } + StgWord link = (StgWord)*link_field; // See Note [STATIC_LINK fields] for how the link field bits work @@ -603,6 +615,8 @@ loop: // NOTE: large objects in nonmoving heap are also marked with // BF_NONMOVING. Those are moved to scavenged_large_objects list in // mark phase. + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); return; } @@ -629,6 +643,13 @@ loop: // they are not) if (bd->flags & BF_COMPACT) { evacuate_compact((P_)q); + + // We may have evacuated the block to the nonmoving generation. If so + // we need to make sure it is added to the mark queue since the only + // reference to it may be from the moving heap. + if (major_gc && bd->flags & BF_NONMOVING) { + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + } return; } @@ -636,6 +657,13 @@ loop: */ if (bd->flags & BF_LARGE) { evacuate_large((P_)q); + + // We may have evacuated the block to the nonmoving generation. 
If so
+    // we need to make sure it is added to the mark queue since the only
+    // reference to it may be from the moving heap.
+    if (major_gc && bd->flags & BF_NONMOVING) {
+        markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+    }
       return;
   }

@@ -937,6 +965,8 @@ evacuate_BLACKHOLE(StgClosure **p)
     ASSERT((bd->flags & BF_COMPACT) == 0);

     if (bd->flags & BF_NONMOVING) {
+        if (major_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
         return;
     }

diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 37f9811aaf..1df6ace800 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -263,7 +263,10 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);

-  if (major_gc) {
+  /* N.B. The nonmoving collector works a bit differently. See
+   * Note [Static objects under the nonmoving collector].
+   */
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
       prev_static_flag = static_flag;
       static_flag =
           static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
@@ -736,6 +739,11 @@ GarbageCollect (uint32_t collect_gen,
         // so we need to mark those too.
         // Note that in sequential case these lists will be appended with more
         // weaks and threads found to be dead in mark.
+#if !defined(THREADED_RTS)
+        // In the non-threaded runtime this is the only time we push to the
+        // upd_rem_set.
+        nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+#endif
         nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
         ACQUIRE_SM_LOCK;
     }
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 6076f61d6f..11080c1f22 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -148,14 +148,14 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;

     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index 24464a43f9..76f54c9302 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -68,6 +68,27 @@ Mutex concurrent_coll_finished_lock;
 * stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
 * until the final mark has concluded.
 *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. We do this by teaching the moving
+ * collector to "evacuate" objects it encounters in the non-moving heap by
+ * adding them to the mark queue.
Specifically, since each gc_thread holds a + * capability we push to the capability's update remembered set (implemented + * by markQueuePushClosureGC) + * * * Note [Live data accounting in nonmoving collector] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -150,6 +171,8 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void) * Request a fresh segment from the free segment list or allocate one of the * given node. * + * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock + * under the assumption that we are in a GC context). */ static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node) { @@ -221,7 +244,7 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al } } -/* sz is in words */ +/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */ GNUC_ATTR_HOT void *nonmovingAllocate(Capability *cap, StgWord sz) { @@ -239,14 +262,6 @@ void *nonmovingAllocate(Capability *cap, StgWord sz) void *ret = nonmovingSegmentGetBlock(current, current->next_free); ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment - // Add segment to the todo list unless it's already there - // current->todo_link == NULL means not in todo list - if (!current->todo_link) { - gen_workspace *ws = &gct->gens[oldest_gen->no]; - current->todo_link = ws->todo_seg; - ws->todo_seg = current; - } - // Advance the current segment's next_free or allocate a new segment if full bool full = advance_next_free(current); if (full) { @@ -405,6 +420,11 @@ static void nonmovingClearAllBitmaps(void) /* Prepare the heap bitmaps and snapshot metadata for a mark */ static void nonmovingPrepareMark(void) { + // See Note [Static objects under the nonmoving collector]. + prev_static_flag = static_flag; + static_flag = + static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A; + nonmovingClearAllBitmaps(); nonmovingBumpEpoch(); for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { @@ -688,7 +708,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * #if defined(DEBUG) // Zap CAFs that we will sweep - nonmovingGcCafs(mark_queue); + nonmovingGcCafs(); #endif ASSERT(mark_queue->top->head == 0); diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 72a3ca69cb..b1c585f72a 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -205,7 +205,7 @@ static void init_mark_queue_(MarkQueue *queue); * Really the argument type should be UpdRemSet* but this would be rather * inconvenient without polymorphism. */ -static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset) +void nonmovingAddUpdRemSetBlocks(MarkQueue *rset) { if (markQueueIsEmpty(rset)) return; @@ -374,6 +374,38 @@ push (MarkQueue *q, const MarkQueueEnt *ent) q->top->head++; } +/* A variant of push to be used by the minor GC when it encounters a reference + * to an object in the non-moving heap. In contrast to the other push + * operations this uses the gc_alloc_block_sync spinlock instead of the + * SM_LOCK to allocate new blocks in the event that the mark queue is full. + */ +void +markQueuePushClosureGC (MarkQueue *q, StgClosure *p) +{ + // Are we at the end of the block? + if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) { + // Yes, this block is full. + // allocate a fresh block. 
+ ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bdescr *bd = allocGroup(1); + bd->link = q->blocks; + q->blocks = bd; + q->top = (MarkQueueBlock *) bd->start; + q->top->head = 0; + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } + + MarkQueueEnt ent = { + .type = MARK_CLOSURE, + .mark_closure = { + .p = UNTAG_CLOSURE(p), + .origin = NULL, + } + }; + q->top->entries[q->top->head] = ent; + q->top->head++; +} + static inline void push_closure (MarkQueue *q, StgClosure *p, @@ -715,7 +747,6 @@ static void init_mark_queue_ (MarkQueue *queue) void initMarkQueue (MarkQueue *queue) { init_mark_queue_(queue); - queue->marked_objects = allocHashTable(); queue->is_upd_rem_set = false; } @@ -723,8 +754,6 @@ void initMarkQueue (MarkQueue *queue) void init_upd_rem_set (UpdRemSet *rset) { init_mark_queue_(&rset->queue); - // Update remembered sets don't have to worry about static objects - rset->queue.marked_objects = NULL; rset->queue.is_upd_rem_set = true; } @@ -739,7 +768,6 @@ void reset_upd_rem_set (UpdRemSet *rset) void freeMarkQueue (MarkQueue *queue) { freeChain_lock(queue->blocks); - freeHashTable(queue->marked_objects, NULL); } #if defined(THREADED_RTS) && defined(DEBUG) @@ -986,12 +1014,32 @@ mark_stack (MarkQueue *queue, StgStack *stack) mark_stack_(queue, stack->sp, stack->stack + stack->stack_size); } +/* See Note [Static objects under the nonmoving collector]. + * + * Returns true if the object needs to be marked. + */ +static bool +bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED) +{ + while (1) { + StgWord link = (StgWord) *link_field; + StgWord new = (link & ~STATIC_BITS) | static_flag; + if ((link & STATIC_BITS) == static_flag) + return false; + else if (cas((StgVolatilePtr) link_field, link, new) == link) { + return true; + } + } +} + static GNUC_ATTR_HOT void mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) { (void)origin; // TODO: should be used for selector/thunk optimisations try_again: + ; + bdescr *bd = NULL; p = UNTAG_CLOSURE(p); # define PUSH_FIELD(obj, field) \ @@ -1009,45 +1057,46 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) return; } - if (lookupHashTable(queue->marked_objects, (W_)p)) { - // already marked - return; - } - - insertHashTable(queue->marked_objects, (W_)p, (P_)1); - switch (type) { case THUNK_STATIC: if (info->srt != 0) { - markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above + if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) { + markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above + } } return; case FUN_STATIC: if (info->srt != 0 || info->layout.payload.ptrs != 0) { - markQueuePushFunSrt(queue, info); // TODO this function repeats the check above - - // a FUN_STATIC can also be an SRT, so it may have pointer - // fields. See Note [SRTs] in CmmBuildInfoTables, specifically - // the [FUN] optimisation. - // TODO (osa) I don't understand this comment - for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { - PUSH_FIELD(p, payload[i]); + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + markQueuePushFunSrt(queue, info); // TODO this function repeats the check above + + // a FUN_STATIC can also be an SRT, so it may have pointer + // fields. See Note [SRTs] in CmmBuildInfoTables, specifically + // the [FUN] optimisation. 
+ // TODO (osa) I don't understand this comment + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } } } return; case IND_STATIC: - PUSH_FIELD((StgInd *) p, indirectee); + if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) { + PUSH_FIELD((StgInd *) p, indirectee); + } return; case CONSTR: case CONSTR_1_0: case CONSTR_2_0: case CONSTR_1_1: - for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { - PUSH_FIELD(p, payload[i]); + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } } return; @@ -1061,19 +1110,17 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) } } - bdescr *bd = Bdescr((StgPtr) p); + bd = Bdescr((StgPtr) p); if (bd->gen != oldest_gen) { - // Here we have an object living outside of the non-moving heap. Since - // we moved everything to the non-moving heap before starting the major - // collection, we know that we don't need to trace it: it was allocated - // after we took our snapshot. -#if !defined(THREADED_RTS) - // This should never happen in the non-concurrent case - barf("Closure outside of non-moving heap: %p", p); -#else + // Here we have an object living outside of the non-moving heap. While + // we likely evacuated nearly everything to the nonmoving heap during + // preparation there are nevertheless a few ways in which we might trace + // a reference into younger generations: + // + // * a mutable object might have been updated + // * we might have aged an object return; -#endif } ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info); diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index 84b6642d6c..7e9bfed1f9 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -82,10 +82,6 @@ typedef struct MarkQueue_ { // Is this a mark queue or a capability-local update remembered set? bool is_upd_rem_set; - - // Marked objects outside of nonmoving heap, namely large and static - // objects. 
- HashTable *marked_objects; } MarkQueue; /* While it shares its representation with MarkQueue, UpdRemSet differs in @@ -145,8 +141,10 @@ void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_th bool nonmovingIsAlive(StgClosure *p); void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w); void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w); +void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset); void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent); +void markQueuePushClosureGC(MarkQueue *q, StgClosure *p); void markQueuePushClosure(MarkQueue *q, StgClosure *p, StgClosure **origin); diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c index 850750b43b..0efbde688c 100644 --- a/rts/sm/NonMovingScav.c +++ b/rts/sm/NonMovingScav.c @@ -16,6 +16,7 @@ nonmovingScavengeOne (StgClosure *q) ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); StgPtr p = (StgPtr)q; const StgInfoTable *info = get_itbl(q); + const bool saved_eager_promotion = gct->eager_promotion; switch (info->type) { @@ -23,9 +24,11 @@ nonmovingScavengeOne (StgClosure *q) case MVAR_DIRTY: { StgMVar *mvar = ((StgMVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&mvar->head); evacuate((StgClosure **)&mvar->tail); evacuate((StgClosure **)&mvar->value); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { mvar->header.info = &stg_MVAR_DIRTY_info; } else { @@ -37,8 +40,10 @@ nonmovingScavengeOne (StgClosure *q) case TVAR: { StgTVar *tvar = ((StgTVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&tvar->current_value); evacuate((StgClosure **)&tvar->first_watch_queue_entry); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { tvar->header.info = &stg_TVAR_DIRTY_info; } else { @@ -122,10 +127,21 @@ nonmovingScavengeOne (StgClosure *q) break; } + case WEAK: + { + // We must evacuate the key since it may refer to an object in the + // moving heap which may be long gone by the time we call + // nonmovingTidyWeaks. + StgWeak *weak = (StgWeak *) p; + gct->eager_promotion = true; + evacuate(&weak->key); + gct->eager_promotion = saved_eager_promotion; + goto gen_obj; + } + gen_obj: case CONSTR: case CONSTR_NOCAF: - case WEAK: case PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; @@ -145,7 +161,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_VAR_CLEAN: case MUT_VAR_DIRTY: + gct->eager_promotion = false; evacuate(&((StgMutVar *)p)->var); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; } else { @@ -157,10 +175,12 @@ nonmovingScavengeOne (StgClosure *q) { StgBlockingQueue *bq = (StgBlockingQueue *)p; + gct->eager_promotion = false; evacuate(&bq->bh); evacuate((StgClosure**)&bq->owner); evacuate((StgClosure**)&bq->queue); evacuate((StgClosure**)&bq->link); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; @@ -202,11 +222,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. 
+ gct->eager_promotion = false; scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; } else { @@ -234,14 +252,13 @@ nonmovingScavengeOne (StgClosure *q) case SMALL_MUT_ARR_PTRS_DIRTY: // follow everything { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p); + gct->eager_promotion = false; for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info; } else { @@ -278,21 +295,21 @@ nonmovingScavengeOne (StgClosure *q) { StgStack *stack = (StgStack*)p; + gct->eager_promotion = false; scavenge_stack(stack->sp, stack->stack + stack->stack_size); + gct->eager_promotion = saved_eager_promotion; stack->dirty = gct->failed_to_evac; - // TODO (osa): There may be something special about stacks that we're - // missing. All other mut objects are marked by using a different info - // table except stacks. - break; } case MUT_PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + gct->eager_promotion = false; for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } @@ -302,12 +319,14 @@ nonmovingScavengeOne (StgClosure *q) StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); + gct->eager_promotion = false; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index fa3e38cca4..b67bba0ce5 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -102,7 +102,7 @@ nonmovingSweepSegment(struct NonmovingSegment *seg) #if defined(DEBUG) -void nonmovingGcCafs(struct MarkQueue_ *queue) +void nonmovingGcCafs() { uint32_t i = 0; StgIndStatic *next; @@ -116,7 +116,8 @@ void nonmovingGcCafs(struct MarkQueue_ *queue) const StgInfoTable *info = get_itbl((StgClosure*)caf); ASSERT(info->type == IND_STATIC); - if (lookupHashTable(queue->marked_objects, (StgWord) caf) == NULL) { + StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS; + if (flag != 0 && flag != static_flag) { debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf); SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it } else { diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h index de2d52acb5..f21936004f 100644 --- a/rts/sm/NonMovingSweep.h +++ b/rts/sm/NonMovingSweep.h @@ -28,5 +28,5 @@ void nonmovingPrepareSweep(void); #if defined(DEBUG) // The non-moving equivalent of the moving collector's gcCAFs. 
-void nonmovingGcCafs(struct MarkQueue_ *queue);
+void nonmovingGcCafs(void);
 #endif
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 44163a353f..f04b3c5929 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -321,7 +321,8 @@ freeStorage (bool free_heap)
 }

 /* -----------------------------------------------------------------------------
-   Note [CAF management].
+   Note [CAF management]
+   ~~~~~~~~~~~~~~~~~~~~~

    The entry code for every CAF does the following:

@@ -356,6 +357,7 @@ freeStorage (bool free_heap)
    ------------------
    Note [atomic CAF entry]
+   ~~~~~~~~~~~~~~~~~~~~~~~

    With THREADED_RTS, newCAF() is required to be atomic (see #5558). This is
    because if two threads happened to enter the same
@@ -369,6 +371,7 @@ freeStorage (bool free_heap)
    ------------------
    Note [GHCi CAFs]
+   ~~~~~~~~~~~~~~~~

    For GHCI, we have additional requirements when dealing with CAFs:

@@ -388,6 +391,51 @@ freeStorage (bool free_heap)
       -- SDM 29/1/01

+   ------------------
+   Note [Static objects under the nonmoving collector]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Static object management is a bit tricky under the nonmoving collector as we
+   need to maintain a bit more state than in the moving collector. In
+   particular, the moving collector uses the low bits of the STATIC_LINK field
+   to determine whether the object has been moved to the scavenger's work list
+   (see Note [STATIC_LINK fields] in Storage.h).
+
+   However, the nonmoving collector also needs a place to keep its mark bit.
+   This is problematic as we therefore need at least three bits of state
+   but can assume only two bits are available in STATIC_LINK (due to 32-bit
+   systems).
+
+   To accommodate this we move handling of static objects entirely to the
+   oldest generation when the nonmoving collector is in use. To do this safely
+   and efficiently we allocate the blackhole created by lockCAF() directly in
+   the non-moving heap. This means that the moving collector can completely
+   ignore static objects in minor collections since they are guaranteed not to
+   have any references into the moving heap. Of course, the blackhole itself
+   likely will contain a reference into the moving heap but this is
+   significantly easier to handle, being a heap-allocated object (see Note
+   [Aging under the non-moving collector] in NonMoving.c for details).
+
+   During the moving phase of a major collection we treat static objects
+   as we do any other reference into the non-moving heap by pushing them
+   to the non-moving mark queue (see Note [Aging under the non-moving
+   collector]).
+
+   This allows the non-moving collector to have full control over the flags
+   in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields].
+   This is implemented by NonMovingMark.c:bump_static_flag.
+
+   In short, the plan is:
+
+   - lockCAF allocates its blackhole in the nonmoving heap. This is important
+     to ensure that we do not need to place the static object on the mut_list
+     lest we would need some way to ensure that it evacuates only once during
+     a moving collection.
+
+   - evacuate_static_object merely pushes objects to the mark queue.
+
+   - the nonmoving collector uses the flags in STATIC_LINK as its mark bit.
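
   (As a minimal sketch, assuming the static_flag and STATIC_BITS
   definitions from Storage.h: the sweep-side test that falls out of this
   plan, used by NonMovingSweep.c:nonmovingGcCafs above, is

       static bool static_obj_is_dead(StgIndStatic *caf)
       {
           // An object still carrying the previous cycle's flag was never
           // marked in this cycle; a zero flag means it was never entered
           // at all and is left alone.
           StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS;
           return flag != 0 && flag != static_flag;
       }

   where static_obj_is_dead is a name used here for illustration only.)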
+ -------------------------------------------------------------------------- */ STATIC_INLINE StgInd * @@ -441,7 +489,16 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf) caf->saved_info = orig_info; // Allocate the blackhole indirection closure - bh = (StgInd *)allocate(cap, sizeofW(*bh)); + if (RtsFlags.GcFlags.useNonmoving) { + // See Note [Static objects under the nonmoving collector]. + ACQUIRE_SM_LOCK; + bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh)); + RELEASE_SM_LOCK; + recordMutableCap((StgClosure*)bh, + regTableToCapability(reg), oldest_gen->no); + } else { + bh = (StgInd *)allocate(cap, sizeofW(*bh)); + } bh->indirectee = (StgClosure *)cap->r.rCurrentTSO; SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs); // Ensure that above writes are visible before we introduce reference as CAF indirectee. @@ -483,7 +540,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf) else { // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: see + // Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -560,7 +619,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) if (!bh) return NULL; // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: + // see Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -1520,6 +1581,7 @@ void flushExec (W_ len, AdjustorExecutable exec_addr) __clear_cache((void*)begin, (void*)end); # endif #elif defined(__GNUC__) + /* For all other platforms, fall back to a libgcc builtin. */ unsigned char* begin = (unsigned char*)exec_addr; unsigned char* end = begin + len; # if GCC_HAS_BUILTIN_CLEAR_CACHE -- cgit v1.2.1 From 7b79e8b49bb5fef06f0f7d86611bc8eb2be30c62 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Sun, 16 Jun 2019 12:22:56 -0400 Subject: Disable aging when doing deadlock detection GC --- rts/Schedule.c | 23 +++++++++++--------- rts/sm/Evac.c | 57 +++++++++++++++++++++++++++++++------------------- rts/sm/GC.c | 5 +++++ rts/sm/GC.h | 9 ++++++-- rts/sm/NonMovingMark.c | 5 +++++ 5 files changed, 66 insertions(+), 33 deletions(-) diff --git a/rts/Schedule.c b/rts/Schedule.c index f66a276602..5387bc615e 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -164,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t ); static bool scheduleHandleThreadFinished( Capability *cap, Task *task, StgTSO *t ); static bool scheduleNeedHeapProfile(bool ready_to_gc); -static void scheduleDoGC(Capability **pcap, Task *task, bool force_major); +static void scheduleDoGC( Capability **pcap, Task *task, + bool force_major, bool deadlock_detect ); static void deleteThread (StgTSO *tso); static void deleteAllThreads (void); @@ -264,7 +265,7 @@ schedule (Capability *initialCapability, Task *task) case SCHED_INTERRUPTING: debugTrace(DEBUG_sched, "SCHED_INTERRUPTING"); /* scheduleDoGC() deletes all the threads */ - scheduleDoGC(&cap,task,true); + scheduleDoGC(&cap,task,true,false); // after scheduleDoGC(), we must be shutting down. 
Either some // other Capability did the final GC, or we did it above, @@ -561,7 +562,7 @@ run_thread: } if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) { - scheduleDoGC(&cap,task,false); + scheduleDoGC(&cap,task,false,false); } } /* end of while() */ } @@ -935,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task) // they are unreachable and will therefore be sent an // exception. Any threads thus released will be immediately // runnable. - scheduleDoGC (pcap, task, true/*force major GC*/); + scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/); cap = *pcap; // when force_major == true. scheduleDoGC sets // recent_activity to ACTIVITY_DONE_GC and turns off the timer @@ -1005,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS) while (!emptyInbox(cap)) { // Executing messages might use heap, so we should check for GC. if (doYouWantToGC(cap)) { - scheduleDoGC(pcap, cap->running_task, false); + scheduleDoGC(pcap, cap->running_task, false, false); cap = *pcap; } @@ -1552,9 +1553,11 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task) * Perform a garbage collection if necessary * -------------------------------------------------------------------------- */ +// N.B. See Note [Deadlock detection under nonmoving collector] for rationale +// behind deadlock_detect argument. static void scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, - bool force_major) + bool force_major, bool deadlock_detect) { Capability *cap = *pcap; bool heap_census; @@ -1847,9 +1850,9 @@ delete_threads_and_gc: // emerge they don't immediately re-enter the GC. pending_sync = 0; signalCondition(&sync_finished_cond); - GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap); + GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap); #else - GarbageCollect(collect_gen, heap_census, 0, cap, NULL); + GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL); #endif // If we're shutting down, don't leave any idle GC work to do. @@ -2717,7 +2720,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS) nonmovingStop(); Capability *cap = task->cap; waitForCapability(&cap,task); - scheduleDoGC(&cap,task,true); + scheduleDoGC(&cap,task,true,false); ASSERT(task->incall->tso == NULL); releaseCapability(cap); } @@ -2785,7 +2788,7 @@ performGC_(bool force_major) // TODO: do we need to traceTask*() here? waitForCapability(&cap,task); - scheduleDoGC(&cap,task,force_major); + scheduleDoGC(&cap,task,force_major,false); releaseCapability(cap); boundTaskExiting(task); } diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index b4e35849ba..f8ee2630b6 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -85,22 +85,34 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) } } - if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) { - gct->copied += size; - to = nonmovingAllocate(gct->cap, size); - - // Add segment to the todo list unless it's already there - // current->todo_link == NULL means not in todo list - struct NonmovingSegment *seg = nonmovingGetSegment(to); - if (!seg->todo_link) { - gen_workspace *ws = &gct->gens[oldest_gen->no]; - seg->todo_link = ws->todo_seg; - ws->todo_seg = seg; - } + if (RtsFlags.GcFlags.useNonmoving) { + /* See Note [Deadlock detection under nonmoving collector]. 
*/ + if (deadlock_detect_gc) + gen_no = oldest_gen->no; + + if (gen_no == oldest_gen->no) { + gct->copied += size; + to = nonmovingAllocate(gct->cap, size); + + // Add segment to the todo list unless it's already there + // current->todo_link == NULL means not in todo list + struct NonmovingSegment *seg = nonmovingGetSegment(to); + if (!seg->todo_link) { + gen_workspace *ws = &gct->gens[oldest_gen->no]; + seg->todo_link = ws->todo_seg; + ws->todo_seg = seg; + } - if (major_gc) - markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to); - return to; + // The object which refers to this closure may have been aged (i.e. + // retained in a younger generation). Consequently, we must add the + // closure to the mark queue to ensure that it will be marked. + // + // However, if we are in a deadlock detection GC then we disable aging + // so there is no need. + if (major_gc && !deadlock_detect_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to); + return to; + } } ws = &gct->gens[gen_no]; // zero memory references here @@ -319,7 +331,10 @@ evacuate_large(StgPtr p) */ new_gen_no = bd->dest_no; - if (new_gen_no < gct->evac_gen_no) { + if (deadlock_detect_gc) { + /* See Note [Deadlock detection under nonmoving collector]. */ + new_gen_no = oldest_gen->no; + } else if (new_gen_no < gct->evac_gen_no) { if (gct->eager_promotion) { new_gen_no = gct->evac_gen_no; } else { @@ -370,7 +385,7 @@ evacuate_static_object (StgClosure **link_field, StgClosure *q) { if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) { // See Note [Static objects under the nonmoving collector] in Storage.c. - if (major_gc) + if (major_gc && !deadlock_detect_gc) markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); return; } @@ -615,7 +630,7 @@ loop: // NOTE: large objects in nonmoving heap are also marked with // BF_NONMOVING. Those are moved to scavenged_large_objects list in // mark phase. - if (major_gc) + if (major_gc && !deadlock_detect_gc) markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); return; } @@ -647,7 +662,7 @@ loop: // We may have evacuated the block to the nonmoving generation. If so // we need to make sure it is added to the mark queue since the only // reference to it may be from the moving heap. - if (major_gc && bd->flags & BF_NONMOVING) { + if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) { markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); } return; @@ -661,7 +676,7 @@ loop: // We may have evacuated the block to the nonmoving generation. If so // we need to make sure it is added to the mark queue since the only // reference to it may be from the moving heap. - if (major_gc && bd->flags & BF_NONMOVING) { + if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) { markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); } return; @@ -965,7 +980,7 @@ evacuate_BLACKHOLE(StgClosure **p) ASSERT((bd->flags & BF_COMPACT) == 0); if (bd->flags & BF_NONMOVING) { - if (major_gc) + if (major_gc && !deadlock_detect_gc) markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); return; } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 1df6ace800..a97042c718 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -104,6 +104,7 @@ */ uint32_t N; bool major_gc; +bool deadlock_detect_gc; /* Data used for allocation area sizing. 
*/ @@ -194,6 +195,7 @@ StgPtr mark_sp; // pointer to the next unallocated mark stack entry void GarbageCollect (uint32_t collect_gen, const bool do_heap_census, + const bool deadlock_detect, uint32_t gc_type USED_IF_THREADS, Capability *cap, bool idle_cap[]) @@ -263,6 +265,9 @@ GarbageCollect (uint32_t collect_gen, N = collect_gen; major_gc = (N == RtsFlags.GcFlags.generations-1); + /* See Note [Deadlock detection under nonmoving collector]. */ + deadlock_detect_gc = deadlock_detect; + /* N.B. The nonmoving collector works a bit differently. See * Note [Static objects under the nonmoving collector]. */ diff --git a/rts/sm/GC.h b/rts/sm/GC.h index ed19b8bddf..bde006913b 100644 --- a/rts/sm/GC.h +++ b/rts/sm/GC.h @@ -17,9 +17,12 @@ #include "HeapAlloc.h" -void GarbageCollect (uint32_t force_major_gc, +void GarbageCollect (uint32_t collect_gen, bool do_heap_census, - uint32_t gc_type, Capability *cap, bool idle_cap[]); + bool deadlock_detect, + uint32_t gc_type, + Capability *cap, + bool idle_cap[]); typedef void (*evac_fn)(void *user, StgClosure **root); @@ -30,6 +33,8 @@ bool doIdleGCWork(Capability *cap, bool all); extern uint32_t N; extern bool major_gc; +/* See Note [Deadlock detection under nonmoving collector]. */ +extern bool deadlock_detect_gc; extern bdescr *mark_stack_bd; extern bdescr *mark_stack_top_bd; diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index b1c585f72a..1b71f52345 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -382,6 +382,11 @@ push (MarkQueue *q, const MarkQueueEnt *ent) void markQueuePushClosureGC (MarkQueue *q, StgClosure *p) { + /* We should not make it here if we are doing a deadlock detect GC. + * See Note [Deadlock detection under nonmoving collector]. + */ + ASSERT(!deadlock_detect_gc); + // Are we at the end of the block? if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) { // Yes, this block is full. -- cgit v1.2.1 From 8fffe12bf61347aa9e7fc52d172c989d8d4ca31f Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 18 Jun 2019 17:22:06 -0400 Subject: More comments for aging --- rts/sm/NonMoving.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 5 deletions(-) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 76f54c9302..f9fb329154 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -83,11 +83,97 @@ Mutex concurrent_coll_finished_lock; * was (unsurprisingly) also found to result in significant amounts of * unnecessary copying. * - * Consequently, we now allow aging. We do this by teaching the moving - * collector to "evacuate" objects it encounters in the non-moving heap by - * adding them to the mark queue. Specifically, since each gc_thread holds a - * capability we push to the capability's update remembered set (implemented - * by markQueuePushClosureGC) + * Consequently, we now allow aging. Aging allows the preparatory GC leading up + * to a major collection to evacuate some objects into the young generation. + * However, this introduces the following tricky case that might arise after + * we have finished the preparatory GC: + * + * moving heap ┆ non-moving heap + * ───────────────┆────────────────── + * ┆ + * B ←────────────── A ←─────────────── root + * │ ┆ ↖─────────────── gen1 mut_list + * ╰───────────────→ C + * ┆ + * + * In this case C is clearly live, but the non-moving collector can only see + * this by walking through B, which lives in the moving heap. 
However, doing so + * would require that we synchronize with the mutator/minor GC to ensure that it + * isn't in the middle of moving B. What to do? + * + * The solution we use here is to teach the preparatory moving collector to + * "evacuate" objects it encounters in the non-moving heap by adding them to + * the mark queue. This is implemented by pushing the object to the update + * remembered set of the capability held by the evacuating gc_thread + * (implemented by markQueuePushClosureGC) + * + * Consequently collection of the case above would proceed as follows: + * + * 1. Initial state: + * * A lives in the non-moving heap and is reachable from the root set + * * A is on the oldest generation's mut_list, since it contains a pointer + * to B, which lives in a younger generation + * * B lives in the moving collector's from space + * * C lives in the non-moving heap + * + * 2. Preparatory GC: Scavenging mut_lists: + * + * The mut_list of the oldest generation is scavenged, resulting in B being + * evacuated (aged) into the moving collector's to-space. + * + * 3. Preparatory GC: Scavenge B + * + * B (now in to-space) is scavenged, resulting in evacuation of C. + * evacuate(C) pushes a reference to C to the mark queue. + * + * 4. Non-moving GC: C is marked + * + * The non-moving collector will come to C in the mark queue and mark it. + * + * + * Note [Deadlock detection under the non-moving collector] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * In GHC the garbage collector is responsible for identifying deadlocked + * programs. Providing for this responsibility is slightly tricky in the + * non-moving collector due to the existence of aging. In particular, the + * non-moving collector cannot traverse objects living in a young generation + * but reachable from the non-moving generation, as described in Note [Aging + * under the non-moving collector]. + * + * However, this can pose trouble for deadlock detection since it means that we + * may conservatively mark dead closures as live. Consider this case: + * + * moving heap ┆ non-moving heap + * ───────────────┆────────────────── + * ┆ + * MVAR_QUEUE ←───── TSO ←───────────── gen1 mut_list + * ↑ │ ╰────────↗ │ + * │ │ ┆ │ + * │ │ ┆ ↓ + * │ ╰──────────→ MVAR + * ╰─────────────────╯ + * ┆ + * + * In this case we have a TSO blocked on a dead MVar. Because the MVAR_TSO_QUEUE on + * which it is blocked lives in the moving heap, the TSO is necessarily on the + * oldest generation's mut_list. As in Note [Aging under the non-moving + * collector], the MVAR_TSO_QUEUE will be evacuated. If MVAR_TSO_QUEUE is aged + * (e.g. evacuated to the young generation) then the MVAR will be added to the + * mark queue. Consequently, we will falsely conclude that the MVAR is still + * alive and fail to spot the deadlock. + * + * To avoid this sort of situation we disable aging when we are starting a + * major GC specifically for deadlock detection (as done by + * scheduleDetectDeadlock). This condition is recorded by the + * deadlock_detect_gc global variable declared in GC.h. Setting this has a few + * effects on the preparatory GC: + * + * - Evac.c:alloc_for_copy forces evacuation to the non-moving generation. + * + * - The evacuation logic usually responsible for pushing objects living in + * the non-moving heap to the mark queue is disabled. This is safe because + * we know that all live objects will be in the non-moving heap by the end + * of the preparatory moving collection. 
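 *
 * As a condensed sketch (the actual guards are in the "Disable aging when
 * doing deadlock detection GC" patch above), the two decisions this flag
 * changes during the preparatory GC are roughly:
 *
 *     // Evac.c:alloc_for_copy: force everything into the snapshot
 *     if (RtsFlags.GcFlags.useNonmoving && deadlock_detect_gc)
 *         gen_no = oldest_gen->no;
 *
 *     // Evac.c:evacuate and friends: skip the aging push, since all live
 *     // objects are guaranteed to be in the non-moving heap already
 *     if (major_gc && !deadlock_detect_gc)
 *         markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);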
*
 *
 * Note [Live data accounting in nonmoving collector]
-- cgit v1.2.1

From 039d29068f0c0ae6ab133f5dac61948584335677 Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Tue, 16 Apr 2019 14:32:40 -0400
Subject: NonMoving: Eliminate integer division in nonmovingBlockCount

Perf showed that this single div was capturing up to 10% of samples in
nonmovingMark. However, the overwhelming majority of cases involve small
block sizes. These cases we can easily compute explicitly, allowing the
compiler to turn the division into a significantly more efficient
division-by-constant.

While the increase in source code looks scary, this all optimises down
to very nice-looking assembler. At this point the only remaining
hotspots in nonmovingBlockCount are due to memory access.
---
 rts/sm/NonMoving.h | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h
index 0f9cfa0e48..fdeee19d96 100644
--- a/rts/sm/NonMoving.h
+++ b/rts/sm/NonMoving.h
@@ -161,13 +161,29 @@ INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *se
     return 1 << seg->block_size;
 }

-// How many blocks does the given segment contain? Also the size of the bitmap.
-INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+// How many blocks does a segment with the given block size have?
+INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
 {
-  unsigned int sz = nonmovingSegmentBlockSize(seg);
   unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
   segment_data_size -= segment_data_size % SIZEOF_VOID_P;
-  return segment_data_size / (sz + 1);
+  unsigned int blk_size = 1 << log_block_size;
+  // N.B. +1 accounts for the byte in the mark bitmap.
+  return segment_data_size / (blk_size + 1);
+}
+
+// How many blocks does the given segment contain? Also the size of the bitmap.
+INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+{
+  // We compute the overwhelmingly common size cases directly to avoid a very
+  // expensive integer division.
+  switch (seg->block_size) {
+    case 3: return nonmovingBlockCount(3);
+    case 4: return nonmovingBlockCount(4);
+    case 5: return nonmovingBlockCount(5);
+    case 6: return nonmovingBlockCount(6);
+    case 7: return nonmovingBlockCount(7);
+    default: return nonmovingBlockCount(seg->block_size);
+  }
 }

 // Get a pointer to the given block index
-- cgit v1.2.1

From d15ac82dcaf1994a3027596e4c8c48fd300bb62d Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Sun, 14 Apr 2019 12:26:57 -0400
Subject: NonMoving: Allocate mark queues in larger block groups

---
 rts/sm/NonMovingMark.c | 6 +++---
 rts/sm/NonMovingMark.h | 5 ++++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 1b71f52345..16693a710f 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -361,7 +361,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     } else {
         // allocate a fresh block.
         ACQUIRE_SM_LOCK;
-        bdescr *bd = allocGroup(1);
+        bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
         bd->link = q->blocks;
         q->blocks = bd;
         q->top = (MarkQueueBlock *) bd->start;
@@ -392,7 +392,7 @@ markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
         // Yes, this block is full.
         // allocate a fresh block.
ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bdescr *bd = allocGroup(1); + bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS); bd->link = q->blocks; q->blocks = bd; q->top = (MarkQueueBlock *) bd->start; @@ -742,7 +742,7 @@ again: /* Must hold sm_mutex. */ static void init_mark_queue_ (MarkQueue *queue) { - bdescr *bd = allocGroup(1); + bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS); queue->blocks = bd; queue->top = (MarkQueueBlock *) bd->start; queue->top->head = 0; diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index 7e9bfed1f9..a3629a2e99 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -93,8 +93,11 @@ typedef struct { MarkQueue queue; } UpdRemSet; +// Number of blocks to allocate for a mark queue +#define MARK_QUEUE_BLOCKS 16 + // The length of MarkQueueBlock.entries -#define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) +#define MARK_QUEUE_BLOCK_ENTRIES ((MARK_QUEUE_BLOCKS * BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects; extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks; -- cgit v1.2.1 From 26d2d3316dcbf9a49fc7d3252d323d78c0f66e6a Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Fri, 3 May 2019 19:44:54 -0400 Subject: NonMovingMark: Optimize representation of mark queue This shortens MarkQueueEntry by 30% (one word) --- rts/sm/NonMovingMark.c | 26 ++++++++++++++++---------- rts/sm/NonMovingMark.h | 25 ++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 16693a710f..275cfcdd51 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -401,7 +401,6 @@ markQueuePushClosureGC (MarkQueue *q, StgClosure *p) } MarkQueueEnt ent = { - .type = MARK_CLOSURE, .mark_closure = { .p = UNTAG_CLOSURE(p), .origin = NULL, @@ -430,8 +429,12 @@ void push_closure (MarkQueue *q, // } #endif + // This must be true as origin points to a pointer and therefore must be + // word-aligned. However, we check this as otherwise we would confuse this + // with a mark_array entry + ASSERT(((uintptr_t) origin & 0x3) == 0); + MarkQueueEnt ent = { - .type = MARK_CLOSURE, .mark_closure = { .p = UNTAG_CLOSURE(p), .origin = origin, @@ -450,10 +453,9 @@ void push_array (MarkQueue *q, return; MarkQueueEnt ent = { - .type = MARK_ARRAY, .mark_array = { .array = array, - .start_index = start_index + .start_index = (start_index << 16) | 0x3, } }; push(q, &ent); @@ -716,7 +718,7 @@ again: // Is this the first block of the queue? if (q->blocks->link == NULL) { // Yes, therefore queue is empty... - MarkQueueEnt none = { .type = NULL_ENTRY }; + MarkQueueEnt none = { .null_entry = { .p = NULL } }; return none; } else { // No, unwind to the previous block and try popping again... 
@@ -1492,13 +1494,13 @@ nonmovingMark (MarkQueue *queue) count++; MarkQueueEnt ent = markQueuePop(queue); - switch (ent.type) { + switch (nonmovingMarkQueueEntryType(&ent)) { case MARK_CLOSURE: mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin); break; case MARK_ARRAY: { const StgMutArrPtrs *arr = ent.mark_array.array; - StgWord start = ent.mark_array.start_index; + StgWord start = ent.mark_array.start_index >> 16; StgWord end = start + MARK_ARRAY_CHUNK_LENGTH; if (end < arr->ptrs) { markQueuePushArray(queue, ent.mark_array.array, end); @@ -1743,13 +1745,17 @@ void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_t void printMarkQueueEntry (MarkQueueEnt *ent) { - if (ent->type == MARK_CLOSURE) { + switch(nonmovingMarkQueueEntryType(ent)) { + case MARK_CLOSURE: debugBelch("Closure: "); printClosure(ent->mark_closure.p); - } else if (ent->type == MARK_ARRAY) { + break; + case MARK_ARRAY: debugBelch("Array\n"); - } else { + break; + case NULL_ENTRY: debugBelch("End of mark\n"); + break; } } diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index a3629a2e99..bff89d9e92 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -43,9 +43,16 @@ enum EntryType { */ typedef struct { - enum EntryType type; - // All pointers should be untagged + // Which kind of mark queue entry we have is determined by the low bits of + // the second word: they must be zero in the case of a mark_closure entry + // (since the second word of a mark_closure entry points to a pointer and + // pointers must be word-aligned). In the case of a mark_array we set them + // to 0x3 (the value of start_index is shifted to the left to accomodate + // this). null_entry where p==NULL is used to indicate the end of the queue. union { + struct { + void *p; // must be NULL + } null_entry; struct { StgClosure *p; // the object to be marked StgClosure **origin; // field where this reference was found. @@ -53,11 +60,23 @@ typedef struct { } mark_closure; struct { const StgMutArrPtrs *array; - StgWord start_index; + StgWord start_index; // start index is shifted to the left by 16 bits } mark_array; }; } MarkQueueEnt; +INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent) +{ + if (ent->null_entry.p == NULL) { + return NULL_ENTRY; + } else if (((uintptr_t) ent->mark_closure.origin & TAG_BITS) == 0) { + return MARK_CLOSURE; + } else { + ASSERT((ent->mark_array.start_index & TAG_BITS) == 0x3); + return MARK_ARRAY; + } +} + typedef struct { // index of first *unused* queue entry uint32_t head; -- cgit v1.2.1 From e5eda61e768a74723f6e7955cc9247c14ecabc6d Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Sat, 11 May 2019 11:55:42 -0400 Subject: NonMoving: Optimize bitmap search during allocation Use memchr instead of a open-coded loop. This is nearly twice as fast in a synthetic benchmark. --- rts/sm/NonMoving.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index f9fb329154..8b27581386 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -303,8 +303,10 @@ static inline unsigned long log2_ceil(unsigned long x) // Advance a segment's next_free pointer. Returns true if segment if full. 
static bool advance_next_free(struct NonmovingSegment *seg)
 {
-    uint8_t *bitmap = seg->bitmap;
-    unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+    const uint8_t *bitmap = seg->bitmap;
+    const unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+#if defined(NAIVE_ADVANCE_FREE)
+    // reference implementation
     for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
         if (!bitmap[i]) {
             seg->next_free = i;
             return false;
         }
     }
     seg->next_free = blk_count;
     return true;
+#else
+    const uint8_t *c = memchr(&bitmap[seg->next_free+1], 0, blk_count - seg->next_free - 1);
+    if (c == NULL) {
+        seg->next_free = blk_count;
+        return true;
+    } else {
+        seg->next_free = c - bitmap;
+        return false;
+    }
+#endif
 }

 static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca)
-- cgit v1.2.1

From dacf4cae179e4f9ee67cd2a5e0c9de7900b5f274 Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Sat, 18 May 2019 11:02:03 -0400
Subject: rts: Add prefetch macros

---
 includes/Rts.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/includes/Rts.h b/includes/Rts.h
index def06de90d..7ad6988f3b 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -74,6 +74,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif

+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1
-- cgit v1.2.1

From 786c52d25e94e578a7d76772fbc18fac0ea1b458 Mon Sep 17 00:00:00 2001
From: Ben Gamari
Date: Sat, 11 May 2019 19:14:43 -0400
Subject: NonMoving: Prefetch when clearing bitmaps

Ensure that the bitmap of the segment that we will clear next is in
cache by the time we reach it.
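
A rough sketch of the access pattern, using the prefetchForRead/Write
macros added above (the sketch also guards seg->link before dereferencing
it, as any walk must for the final segment in the list):

    struct NonmovingSegment *seg = alloca->filled;
    for (; seg != NULL; seg = seg->link) {
        if (seg->link) {
            prefetchForRead(seg->link);           // next segment's header
            prefetchForWrite(seg->link->bitmap);  // bitmap we clear next
        }
        nonmovingClearBitmap(seg);                // work on the current one
    }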
--- rts/sm/NonMoving.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 8b27581386..b8c690d4a0 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -497,6 +497,8 @@ static void nonmovingClearBitmap(struct NonmovingSegment *seg) static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg) { while (seg) { + prefetchForRead(seg->link); + prefetchForWrite(seg->link->bitmap); nonmovingClearBitmap(seg); seg = seg->link; } -- cgit v1.2.1 From 0387df5bfd166f82ee11d4ce7d96eb52bf1ba9f7 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Sat, 11 May 2019 19:32:17 -0400 Subject: NonMoving: Inline nonmovingClearAllBitmaps --- rts/sm/NonMoving.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index b8c690d4a0..60416cb0e3 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -488,35 +488,12 @@ void nonmovingAddCapabilities(uint32_t new_n_caps) nonmovingHeap.n_caps = new_n_caps; } -static void nonmovingClearBitmap(struct NonmovingSegment *seg) +static inline void nonmovingClearBitmap(struct NonmovingSegment *seg) { unsigned int n = nonmovingSegmentBlockCount(seg); memset(seg->bitmap, 0, n); } -static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg) -{ - while (seg) { - prefetchForRead(seg->link); - prefetchForWrite(seg->link->bitmap); - nonmovingClearBitmap(seg); - seg = seg->link; - } -} - -static void nonmovingClearAllBitmaps(void) -{ - for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { - struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; - nonmovingClearSegmentBitmaps(alloca->filled); - } - - // Clear large object bits - for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) { - bd->flags &= ~BF_MARKED; - } -} - /* Prepare the heap bitmaps and snapshot metadata for a mark */ static void nonmovingPrepareMark(void) { @@ -525,7 +502,6 @@ static void nonmovingPrepareMark(void) static_flag = static_flag == STATIC_FLAG_A ? 
STATIC_FLAG_B : STATIC_FLAG_A; - nonmovingClearAllBitmaps(); nonmovingBumpEpoch(); for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; @@ -539,6 +515,9 @@ static void nonmovingPrepareMark(void) // Update filled segments' snapshot pointers struct NonmovingSegment *seg = alloca->filled; while (seg) { + prefetchForRead(seg->link); + prefetchForWrite(seg->link->bitmap); + nonmovingClearBitmap(seg); seg->next_free_snap = seg->next_free; seg = seg->link; } @@ -561,6 +540,12 @@ static void nonmovingPrepareMark(void) oldest_gen->n_large_blocks = 0; nonmoving_live_words = 0; + // Clear large object bits + for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) { + bd->flags &= ~BF_MARKED; + } + + #if defined(DEBUG) debug_caf_list_snapshot = debug_caf_list; debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST; -- cgit v1.2.1 From e893877e3964cd8c8b147c14b3a2b38410e7427b Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Sat, 11 May 2019 19:53:46 -0400 Subject: NonMoving: Fuse sweep preparation into mark prep --- rts/sm/NonMoving.c | 34 +++++++++++++++++++++++++--------- rts/sm/NonMovingSweep.c | 32 -------------------------------- rts/sm/NonMovingSweep.h | 4 ---- 3 files changed, 25 insertions(+), 45 deletions(-) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 60416cb0e3..bffc0c744e 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -502,6 +502,9 @@ static void nonmovingPrepareMark(void) static_flag = static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A; + // Should have been cleared by the last sweep + ASSERT(nonmovingHeap.sweep_list == NULL); + nonmovingBumpEpoch(); for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; @@ -512,14 +515,28 @@ static void nonmovingPrepareMark(void) seg->next_free_snap = seg->next_free; } - // Update filled segments' snapshot pointers - struct NonmovingSegment *seg = alloca->filled; - while (seg) { - prefetchForRead(seg->link); - prefetchForWrite(seg->link->bitmap); - nonmovingClearBitmap(seg); - seg->next_free_snap = seg->next_free; - seg = seg->link; + // Update filled segments' snapshot pointers and move to sweep_list + uint32_t n_filled = 0; + struct NonmovingSegment *const filled = alloca->filled; + alloca->filled = NULL; + if (filled) { + struct NonmovingSegment *seg = filled; + while (true) { + n_filled++; + prefetchForRead(seg->link); + // Clear bitmap + prefetchForWrite(seg->link->bitmap); + nonmovingClearBitmap(seg); + // Set snapshot + seg->next_free_snap = seg->next_free; + if (seg->link) + seg = seg->link; + else + break; + } + // add filled segments to sweep_list + seg->link = nonmovingHeap.sweep_list; + nonmovingHeap.sweep_list = filled; } // N.B. It's not necessary to update snapshot pointers of active segments; @@ -596,7 +613,6 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads) resizeGenerations(); nonmovingPrepareMark(); - nonmovingPrepareSweep(); // N.B. These should have been cleared at the end of the last sweep. 
ASSERT(nonmoving_marked_large_objects == NULL); diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index b67bba0ce5..0236ef82e9 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -17,38 +17,6 @@ #include "Trace.h" #include "StableName.h" -static struct NonmovingSegment *pop_all_filled_segments(struct NonmovingAllocator *alloc) -{ - while (true) { - struct NonmovingSegment *head = alloc->filled; - if (cas((StgVolatilePtr) &alloc->filled, (StgWord) head, (StgWord) NULL) == (StgWord) head) - return head; - } -} - -void nonmovingPrepareSweep() -{ - ASSERT(nonmovingHeap.sweep_list == NULL); - - // Move blocks in the allocators' filled lists into sweep_list - for (unsigned int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) - { - struct NonmovingAllocator *alloc = nonmovingHeap.allocators[alloc_idx]; - struct NonmovingSegment *filled = pop_all_filled_segments(alloc); - - // Link filled to sweep_list - if (filled) { - struct NonmovingSegment *filled_head = filled; - // Find end of filled list - while (filled->link) { - filled = filled->link; - } - filled->link = nonmovingHeap.sweep_list; - nonmovingHeap.sweep_list = filled_head; - } - } -} - // On which list should a particular segment be placed? enum SweepResult { SEGMENT_FREE, // segment is empty: place on free list diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h index f21936004f..5ae5b687e3 100644 --- a/rts/sm/NonMovingSweep.h +++ b/rts/sm/NonMovingSweep.h @@ -22,10 +22,6 @@ void nonmovingSweepLargeObjects(void); // Remove dead entries in the stable name table void nonmovingSweepStableNameTable(void); -// Collect the set of segments to be collected during a major GC into -// nonmovingHeap.sweep_list. -void nonmovingPrepareSweep(void); - #if defined(DEBUG) // The non-moving equivalent of the moving collector's gcCAFs. void nonmovingGcCafs(void); -- cgit v1.2.1 From e6f6823f1eb5ae43a7cd782a649f55c40a5d53fd Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 15 May 2019 16:49:40 -0400 Subject: NonMoving: Pre-fetch during mark This improved overall runtime on nofib's constraints test by nearly 10%. --- rts/sm/NonMovingMark.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- rts/sm/NonMovingMark.h | 10 ++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 275cfcdd51..ddc1a1234a 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -706,7 +706,7 @@ void markQueuePushArray (MarkQueue *q, *********************************************************/ // Returns invalid MarkQueueEnt if queue is empty. 
-static MarkQueueEnt markQueuePop (MarkQueue *q) +static MarkQueueEnt markQueuePop_ (MarkQueue *q) { MarkQueueBlock *top; @@ -737,6 +737,46 @@ again: return ent; } +static MarkQueueEnt markQueuePop (MarkQueue *q) +{ +#if MARK_PREFETCH_QUEUE_DEPTH == 0 + return markQueuePop_(q); +#else + unsigned int i = q->prefetch_head; + while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) { + MarkQueueEnt new = markQueuePop_(q); + if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) { + // Mark queue is empty; look for any valid entries in the prefetch + // queue + for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH; + j != i; + j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH) + { + if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) { + i = j; + goto done; + } + } + return new; + } + + // The entry may not be a MARK_CLOSURE but it doesn't matter, our + // MarkQueueEnt encoding always places the pointer to the object to be + // marked first. + prefetchForRead(&new.mark_closure.p->header.info); + q->prefetch_queue[i] = new; + i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH; + } + + done: + ; + MarkQueueEnt ret = q->prefetch_queue[i]; + q->prefetch_queue[i].null_entry.p = NULL; + q->prefetch_head = i; + return ret; +#endif +} + /********************************************************* * Creating and destroying MarkQueues and UpdRemSets *********************************************************/ @@ -748,6 +788,10 @@ static void init_mark_queue_ (MarkQueue *queue) queue->blocks = bd; queue->top = (MarkQueueBlock *) bd->start; queue->top->head = 0; +#if MARK_PREFETCH_QUEUE_DEPTH > 0 + memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue)); + queue->prefetch_head = 0; +#endif } /* Must hold sm_mutex. */ diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index bff89d9e92..806776cdc5 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -84,6 +84,9 @@ typedef struct { MarkQueueEnt entries[]; } MarkQueueBlock; +// How far ahead in mark queue to prefetch? +#define MARK_PREFETCH_QUEUE_DEPTH 5 + /* The mark queue is not capable of concurrent read or write. * * invariants: @@ -101,6 +104,13 @@ typedef struct MarkQueue_ { // Is this a mark queue or a capability-local update remembered set? bool is_upd_rem_set; + +#if MARK_PREFETCH_QUEUE_DEPTH > 0 + // A ring-buffer of entries which we will mark next + MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH]; + // The first free slot in prefetch_queue. + uint8_t prefetch_head; +#endif } MarkQueue; /* While it shares its representation with MarkQueue, UpdRemSet differs in -- cgit v1.2.1 From 56c5ebdc5d907313689ac08cbe15145f29fb83d5 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Thu, 16 May 2019 17:14:38 -0400 Subject: NonMoving: Prefetch segment header --- rts/sm/NonMoving.h | 9 +++++++-- rts/sm/NonMovingMark.c | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h index fdeee19d96..9ef00fdba0 100644 --- a/rts/sm/NonMoving.h +++ b/rts/sm/NonMoving.h @@ -203,13 +203,18 @@ INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmo // Get the segment which a closure resides in. Assumes that pointer points into // non-moving heap. 
-INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p) +INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p) { - ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING)); const uintptr_t mask = ~NONMOVING_SEGMENT_MASK; return (struct NonmovingSegment *) (((uintptr_t) p) & mask); } +INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p) +{ + ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING)); + return nonmovingGetSegment_unchecked(p); +} + INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p) { ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING)); diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index ddc1a1234a..27b2b679b7 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -764,6 +764,7 @@ static MarkQueueEnt markQueuePop (MarkQueue *q) // MarkQueueEnt encoding always places the pointer to the object to be // marked first. prefetchForRead(&new.mark_closure.p->header.info); + prefetchForRead(&nonmovingGetSegment_unchecked((StgPtr) new.mark_closure.p)->block_size); q->prefetch_queue[i] = new; i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH; } -- cgit v1.2.1 From 19bfe460cd70e4962be83862b86be89d1b7e0f14 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Sat, 11 May 2019 23:04:54 -0400 Subject: NonMoving: Optimise allocator cache behavior Previously we would look at the segment header to determine the block size even though we already had the block size at hand. --- rts/sm/NonMoving.c | 36 ++++++++++++++++++++++++++---------- rts/sm/NonMoving.h | 30 ++++++++++++++++-------------- 2 files changed, 42 insertions(+), 24 deletions(-) diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index bffc0c744e..9b50c2d7dd 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -253,6 +253,20 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void) } } +unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size) +{ + // We compute the overwhelmingly common size cases directly to avoid a very + // expensive integer division. + switch (log_block_size) { + case 3: return nonmovingBlockCount(3); + case 4: return nonmovingBlockCount(4); + case 5: return nonmovingBlockCount(5); + case 6: return nonmovingBlockCount(6); + case 7: return nonmovingBlockCount(7); + default: return nonmovingBlockCount(log_block_size); + } +} + /* * Request a fresh segment from the free segment list or allocate one on the * given node. */ @@ -301,10 +315,10 @@ static inline unsigned long log2_ceil(unsigned long x) } // Advance a segment's next_free pointer. Returns true if the segment is full.
-static bool advance_next_free(struct NonmovingSegment *seg) +static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count) { const uint8_t *bitmap = seg->bitmap; - const unsigned int blk_count = nonmovingSegmentBlockCount(seg); + ASSERT(blk_count == nonmovingSegmentBlockCount(seg)); #if defined(NAIVE_ADVANCE_FREE) // reference implementation for (unsigned int i = seg->next_free+1; i < blk_count; i++) { @@ -346,22 +360,23 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al GNUC_ATTR_HOT void *nonmovingAllocate(Capability *cap, StgWord sz) { - unsigned int allocator_idx = log2_ceil(sz * sizeof(StgWord)) - NONMOVING_ALLOCA0; + unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord)); + unsigned int block_count = nonmovingBlockCountFromSize(log_block_size); // The max we ever allocate is 3276 bytes (anything larger is a large // object and not moved) which is covered by allocator 9. - ASSERT(allocator_idx < NONMOVING_ALLOCA_CNT); + ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT); - struct NonmovingAllocator *alloca = nonmovingHeap.allocators[allocator_idx]; + struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0]; // Allocate into current segment struct NonmovingSegment *current = alloca->current[cap->no]; ASSERT(current); // current is never NULL - void *ret = nonmovingSegmentGetBlock(current, current->next_free); + void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free); ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment // Advance the current segment's next_free or allocate a new segment if full - bool full = advance_next_free(current); + bool full = advance_next_free(current, block_count); if (full) { // Current segment is full: update the live data estimate, link it to // filled, take an active segment if one exists, otherwise allocate a // new one. // Update live data estimate. // See Note [Live data accounting in nonmoving collector]. - unsigned int new_blocks = nonmovingSegmentBlockCount(current) - current->next_free_snap; - atomic_inc(&oldest_gen->live_estimate, new_blocks * nonmovingSegmentBlockSize(current) / sizeof(W_)); + unsigned int new_blocks = block_count - current->next_free_snap; + unsigned int block_size = 1 << log_block_size; + atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_)); // push the current segment to the filled list nonmovingPushFilledSegment(current); @@ -381,7 +397,7 @@ void *nonmovingAllocate(Capability *cap, StgWord sz) // there are no active segments, allocate new segment if (new_current == NULL) { new_current = nonmovingAllocSegment(cap->node); - nonmovingInitSegment(new_current, NONMOVING_ALLOCA0 + allocator_idx); + nonmovingInitSegment(new_current, log_block_size); } // make it current diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h index 9ef00fdba0..06894e98be 100644 --- a/rts/sm/NonMoving.h +++ b/rts/sm/NonMoving.h @@ -171,28 +171,24 @@ INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size) return segment_data_size / (blk_size + 1); } +unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size); + // How many blocks does the given segment contain? Also the size of the bitmap. INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg) { - // We compute the overwhelmingly common size cases directly to avoid a very - // expensive integer division.
- switch (seg->block_size) { - case 3: return nonmovingBlockCount(3); - case 4: return nonmovingBlockCount(4); - case 5: return nonmovingBlockCount(5); - case 6: return nonmovingBlockCount(6); - case 7: return nonmovingBlockCount(7); - default: return nonmovingBlockCount(seg->block_size); - } + return nonmovingBlockCountFromSize(seg->block_size); } -// Get a pointer to the given block index -INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i) +// Get a pointer to the given block index assuming that the block size is as +// given (avoiding a potential cache miss when this information is already +// available). The log_block_size argument must be equal to seg->block_size. +INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i) { + ASSERT(log_block_size == seg->block_size); // Block size in bytes - unsigned int blk_size = nonmovingSegmentBlockSize(seg); + unsigned int blk_size = 1 << log_block_size; // Bitmap size in bytes - W_ bitmap_size = nonmovingSegmentBlockCount(seg) * sizeof(uint8_t); + W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t); // Where the actual data starts (address of the first block). // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that // ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back @@ -201,6 +197,12 @@ INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmo return (void*)(data + i*blk_size); } +// Get a pointer to the given block index. +INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i) +{ + return nonmovingSegmentGetBlock_(seg, seg->block_size, i); +} + // Get the segment which a closure resides in. Assumes that pointer points into // non-moving heap. INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p) -- cgit v1.2.1 From 53a1a27e51f978f520b6084aff2e4b25014b5cf3 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Fri, 3 May 2019 20:06:47 -0400 Subject: NonMovingMark: Eliminate redundant check_in_nonmoving_heaps --- rts/sm/NonMovingMark.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 27b2b679b7..eeccd58428 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -415,11 +415,8 @@ void push_closure (MarkQueue *q, StgClosure *p, StgClosure **origin) { - // TODO: Push this into callers where they already have the Bdescr - if (HEAP_ALLOCED_GC(p) && (Bdescr((StgPtr) p)->gen != oldest_gen)) - return; - #if defined(DEBUG) + ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen)); ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); // Commenting out: too slow // if (RtsFlags.DebugFlags.sanity) { @@ -532,15 +529,11 @@ void updateRemembSetPushThunkEager(Capability *cap, MarkQueue *queue = &cap->upd_rem_set.queue; push_thunk_srt(queue, &info->i); - // Don't record the origin of objects living outside of the nonmoving - // heap; we can't perform the selector optimisation on them anyways. - bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk); - for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) { if (check_in_nonmoving_heap(thunk->payload[i])) { - push_closure(queue, - thunk->payload[i], - record_origin ? &thunk->payload[i] : NULL); + // Don't bother to push origin; it makes the barrier needlessly + // expensive with little benefit. 
+ push_closure(queue, thunk->payload[i], NULL); } } break; @@ -549,7 +542,9 @@ { MarkQueue *queue = &cap->upd_rem_set.queue; StgAP *ap = (StgAP *) thunk; - push_closure(queue, ap->fun, &ap->fun); + if (check_in_nonmoving_heap(ap->fun)) { + push_closure(queue, ap->fun, NULL); + } mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args); break; } @@ -570,9 +565,10 @@ void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p) inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p) { - if (!check_in_nonmoving_heap(p)) return; - MarkQueue *queue = &cap->upd_rem_set.queue; - push_closure(queue, p, NULL); + if (check_in_nonmoving_heap(p)) { + MarkQueue *queue = &cap->upd_rem_set.queue; + push_closure(queue, p, NULL); + } } void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p) @@ -669,7 +665,10 @@ void markQueuePushClosure (MarkQueue *q, StgClosure *p, StgClosure **origin) { - push_closure(q, p, origin); + // TODO: Push this into callers where they already have the Bdescr + if (check_in_nonmoving_heap(p)) { + push_closure(q, p, origin); + } } /* TODO: Do we really never want to specify the origin here? */ -- cgit v1.2.1 From b967e470d806656b0f751d40884ae7edfeaa534c Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Fri, 17 May 2019 19:18:42 -0400 Subject: NonMoving: Don't do major GC if one is already running Previously we would perform a preparatory moving collection, resulting in many things being added to the mark queue. When we finished with this, we would realize in nonmovingCollect that there was already a collection running, in which case we would simply not run the nonmoving collector. However, it was very easy to end up in a "treadmilling" situation: all subsequent GCs following the first failed major GC would be scheduled as major GCs. Consequently we would continuously feed the concurrent collector with more mark queue entries and it would never finish. This patch aborts the major collection far earlier, meaning that we avoid adding nonmoving objects to the mark queue and allow the concurrent collector to finish. --- rts/sm/GC.c | 12 ++++++++++++ rts/sm/NonMoving.h | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/rts/sm/GC.c b/rts/sm/GC.c index a97042c718..6be81e5ff0 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -268,6 +268,18 @@ GarbageCollect (uint32_t collect_gen, /* See Note [Deadlock detection under nonmoving collector]. */ deadlock_detect_gc = deadlock_detect; +#if defined(THREADED_RTS) + if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) { + /* If there is already a concurrent major collection running then + * there is no benefit to starting another. + * TODO: Catch heap-size runaway. + */ + N--; + collect_gen--; + major_gc = false; + } +#endif + /* N.B. The nonmoving collector works a bit differently. See * Note [Static objects under the nonmoving collector].
*/ diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h index 06894e98be..17adfd6666 100644 --- a/rts/sm/NonMoving.h +++ b/rts/sm/NonMoving.h @@ -93,6 +93,10 @@ extern struct NonmovingHeap nonmovingHeap; extern memcount nonmoving_live_words; +#if defined(THREADED_RTS) +extern bool concurrent_coll_running; +#endif + void nonmovingInit(void); void nonmovingStop(void); void nonmovingExit(void); -- cgit v1.2.1 From 3bc172a41c43ebe3d81caf4d75f10cfb48218006 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 21 May 2019 21:07:17 -0400 Subject: NonMoving: Clean mut_list --- rts/sm/NonMovingSweep.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 1 deletion(-) diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index 0236ef82e9..7af5508afc 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -153,6 +153,126 @@ GNUC_ATTR_HOT void nonmovingSweep(void) } } +/* Must a closure remain on the mutable list? + * + * A closure must remain if any of the following applies: + * + * 1. it contains references to a younger generation + * 2. it's a mutable closure (e.g. mutable array or MUT_PRIM) + */ +static bool is_closure_clean(StgClosure *p) +{ + const StgInfoTable *info = get_itbl(p); + +#define CLEAN(ptr) (!HEAP_ALLOCED((StgClosure*) ptr) || Bdescr((StgPtr) ptr)->gen == oldest_gen) + + switch (info->type) { + case MVAR_CLEAN: + case MVAR_DIRTY: + { + StgMVar *mvar = ((StgMVar *)p); + if (!CLEAN(mvar->head)) goto dirty_MVAR; + if (!CLEAN(mvar->tail)) goto dirty_MVAR; + if (!CLEAN(mvar->value)) goto dirty_MVAR; + mvar->header.info = &stg_MVAR_CLEAN_info; + return true; + +dirty_MVAR: + mvar->header.info = &stg_MVAR_DIRTY_info; + return false; + } + + case TVAR: + { + StgTVar *tvar = ((StgTVar *)p); + if (!CLEAN(tvar->current_value)) goto dirty_TVAR; + if (!CLEAN(tvar->first_watch_queue_entry)) goto dirty_TVAR; + tvar->header.info = &stg_TVAR_CLEAN_info; + return true; + +dirty_TVAR: + tvar->header.info = &stg_TVAR_DIRTY_info; + return false; + } + + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_1_1: + case THUNK_0_2: + case THUNK_2_0: + { + StgPtr end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case FUN: + case FUN_1_0: // hardly worth specialising these guys + case FUN_0_1: + case FUN_1_1: + case FUN_0_2: + case FUN_2_0: + case CONSTR: + case CONSTR_NOCAF: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_1_1: + case CONSTR_0_2: + case CONSTR_2_0: + case PRIM: + { + StgPtr end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case WEAK: + return false; // TODO + + case MUT_VAR_CLEAN: + case MUT_VAR_DIRTY: + if (!CLEAN(((StgMutVar *)p)->var)) { + p->header.info = &stg_MUT_VAR_DIRTY_info; + return false; + } else { + p->header.info = &stg_MUT_VAR_CLEAN_info; + return true; + } + + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + + if (!CLEAN(bq->bh)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->owner)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->queue)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->link)) goto dirty_BLOCKING_QUEUE; + bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; + return true; + +dirty_BLOCKING_QUEUE: + bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; + return false; + } + + case THUNK_SELECTOR: 
+ return CLEAN(((StgSelector *) p)->selectee); + + case ARR_WORDS: + return true; + + default: + // TODO: the rest + return false; + } +#undef CLEAN +} + /* N.B. This happens during the pause so we own all capabilities. */ void nonmovingSweepMutLists() { @@ -163,7 +283,7 @@ void nonmovingSweepMutLists() for (bdescr *bd = old_mut_list; bd; bd = bd->link) { for (StgPtr p = bd->start; p < bd->free; p++) { StgClosure **q = (StgClosure**)p; - if (nonmovingIsAlive(*q)) { + if (nonmovingIsAlive(*q) && !is_closure_clean(*q)) { recordMutableCap(*q, cap, oldest_gen->no); } } -- cgit v1.2.1 From 8e79e2a973c1e4730a1caf402898f6607f84af45 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 8 May 2019 21:28:35 -0400 Subject: Unconditionally flush update remembered set during minor GC Flush the update remembered set. The goal here is to flush periodically to ensure that we don't end up with a thread that marks its stack to its local update remembered set and doesn't flush it until the nonmoving sync period, as this would result in a large fraction of the heap being marked during the sync pause. --- rts/sm/GC.c | 8 ++++++++ rts/sm/NonMovingMark.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 6be81e5ff0..83e9c97bd9 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -730,6 +730,14 @@ GarbageCollect (uint32_t collect_gen, } } // for all generations + // Flush the update remembered set. See Note [Eager update remembered set + // flushing] in NonMovingMark.c + if (RtsFlags.GcFlags.useNonmoving) { + RELEASE_SM_LOCK; + nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue); + ACQUIRE_SM_LOCK; + } + // Mark and sweep the oldest generation. // N.B. This can only happen after we've moved // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects. diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index eeccd58428..2ab4572447 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -131,6 +131,49 @@ StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST; * The mark phase is responsible for freeing update remembered set block * allocations. + * + * Note that we eagerly flush update remembered sets during minor GCs as + * described in Note [Eager update remembered set flushing]. + * + * + * Note [Eager update remembered set flushing] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * We eagerly flush update remembered sets during minor GCs to avoid scenarios + * like the following which could result in long sync pauses: + * + * 1. We start a major GC; all thread stacks are added to the mark queue. + * 2. The concurrent mark thread starts. + * 3. The mutator is allowed to resume. One mutator thread T is scheduled and marks its + * stack to its local update remembered set. + * 4. The mark thread eventually encounters the mutator thread's stack but + * sees that it has already been marked; skips it. + * 5. Thread T continues running but does not push enough to its update + * remembered set to require a flush. + * 6. Eventually the mark thread finishes marking and requests a final sync. + * 7. Thread T flushes its update remembered set. + * 8. We find that a large fraction of the heap (namely the subset that is + * only reachable from thread T's stack) needs to be marked, incurring + * a large sync pause. + * + * We avoid this by periodically (during minor GC) forcing a flush of the + * update remembered set.
+ * + * A better (but more complex) approach that would be worthwhile trying in the + * future would be to do the following instead: + * + * 1. Concurrent mark phase is ongoing + * 2. Mark thread runs out of things to mark + * 3. Mark thread sends a signal to capabilities requesting that they send + * their update remembered sets without suspending their execution + * 4. The mark thread marks everything it was sent; runs out of things to mark + * 5. Mark thread initiates a sync + * 6. Capabilities send their final update remembered sets and suspend execution + * 7. Mark thread marks everything it was sent + * 8. Mark thread allows capabilities to resume. + * + * However, this is obviously a fair amount of complexity and so far the + * periodic eager flushing approach has been sufficient. + * * * Note [Concurrent read barrier on deRefWeak#] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- cgit v1.2.1 From b281e80be5169ff3a6aa8044a9996854d9f588fa Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Mon, 11 Feb 2019 22:03:33 -0500 Subject: testsuite: Add nonmoving_thr way --- testsuite/config/ghc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/testsuite/config/ghc b/testsuite/config/ghc index ec35ed0d8c..16c387c3cd 100644 --- a/testsuite/config/ghc +++ b/testsuite/config/ghc @@ -27,7 +27,8 @@ config.other_ways = ['prof', 'normal_h', 'debug', 'ghci-ext', 'ghci-ext-prof', 'ext-interp', - 'nonmoving'] + 'nonmoving', + 'nonmoving_thr'] if ghc_with_native_codegen: config.compile_ways.append('optasm') @@ -98,7 +99,8 @@ config.way_flags = { 'ghci-ext' : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '+RTS', '-I0.1', '-RTS'], 'ghci-ext-prof' : ['--interactive', '-v0', '-ignore-dot-ghci', '-fno-ghci-history', '-fexternal-interpreter', '-prof', '+RTS', '-I0.1', '-RTS'], 'ext-interp' : ['-fexternal-interpreter'], - 'nonmoving' : ['-debug'], + 'nonmoving' : [], + 'nonmoving_thr': ['-threaded'], } config.way_rts_flags = { @@ -137,7 +139,8 @@ 'ghci-ext' : [], 'ghci-ext-prof' : [], 'ext-interp' : [], - 'nonmoving' : ['-DS', '-xn'], + 'nonmoving' : ['-xn'], + 'nonmoving_thr' : ['-xn', '-N2'], } # Useful classes of ways that can be used with only_ways(), omit_ways() and -- cgit v1.2.1 From 079879570bd697f6e8c8259bcc63eaa17f6cffaf Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Mon, 11 Feb 2019 22:03:33 -0500 Subject: testsuite: Add nonmoving_thr_ghc way This uses the nonmoving collector when compiling the testcases.
--- testsuite/config/ghc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/testsuite/config/ghc b/testsuite/config/ghc index 16c387c3cd..9a3459ea96 100644 --- a/testsuite/config/ghc +++ b/testsuite/config/ghc @@ -28,7 +28,8 @@ config.other_ways = ['prof', 'normal_h', 'ghci-ext', 'ghci-ext-prof', 'ext-interp', 'nonmoving', - 'nonmoving_thr'] + 'nonmoving_thr', + 'nonmoving_thr_ghc'] if ghc_with_native_codegen: config.compile_ways.append('optasm') @@ -101,6 +102,7 @@ config.way_flags = { 'ext-interp' : ['-fexternal-interpreter'], 'nonmoving' : [], 'nonmoving_thr': ['-threaded'], + 'nonmoving_thr_ghc': ['+RTS', '-xn', '-N2', '-RTS', '-threaded'], } config.way_rts_flags = { @@ -141,6 +143,7 @@ config.way_rts_flags = { 'ext-interp' : [], 'nonmoving' : ['-xn'], 'nonmoving_thr' : ['-xn', '-N2'], + 'nonmoving_thr_ghc': ['-xn', '-N2'], } # Useful classes of ways that can be used with only_ways(), omit_ways() and -- cgit v1.2.1 From 01fd0242c9e8aeaa78f268f3f9b1db1933f91b6e Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 5 Feb 2019 17:06:25 -0500 Subject: testsuite: Don't run T15892 in nonmoving ways The nonmoving GC doesn't support `+RTS -G1`, which this test insists on. --- testsuite/tests/codeGen/should_run/all.T | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index 0882f2b605..f473ac523e 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -194,9 +194,11 @@ test('T15696_3', normal, compile_and_run, ['-O']) test('T15892', [ ignore_stdout, - # we want to do lots of major GC to make the bug more likely to - # happen, so -G1 -A32k: - extra_run_opts('+RTS -G1 -A32k -RTS') ], + # -G1 is unsupported by the nonmoving GC + omit_ways(['nonmoving', 'nonmoving_thr']), + # we want to do lots of major GC to make the bug more likely to + # happen, so -G1 -A32k: + extra_run_opts('+RTS -G1 -A32k -RTS') ], compile_and_run, ['-O']) test('T16617', normal, compile_and_run, ['']) test('T16449_2', exit_code(0), compile_and_run, ['']) -- cgit v1.2.1 From 097f4fd0e242031693d2a6c0384762683d3bee31 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 5 Feb 2019 17:15:57 -0500 Subject: testsuite: Nonmoving collector doesn't support -G1 --- testsuite/tests/codeGen/should_run/all.T | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index f473ac523e..002dc65a8b 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -1,5 +1,6 @@ # Test +RTS -G1 here (it isn't tested anywhere else) -setTestOpts(unless(fast(), extra_ways(['g1']))) +# N.B. 
Nonmoving collector doesn't support -G1 +setTestOpts(unless(fast(), [ extra_ways(['g1']), omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])])) test('cgrun001', normal, compile_and_run, ['']) test('cgrun002', normal, compile_and_run, ['']) -- cgit v1.2.1 From 4b91dd2566e4d26b7cf5107687f8cbb9b315d3f7 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 13 Feb 2019 13:49:04 -0500 Subject: testsuite: Ensure that threaded tests are run in nonmoving_thr --- testsuite/tests/concurrent/should_run/all.T | 14 +++++++------- testsuite/tests/rts/all.T | 8 ++++++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/testsuite/tests/concurrent/should_run/all.T b/testsuite/tests/concurrent/should_run/all.T index 467040223f..9297c5890e 100644 --- a/testsuite/tests/concurrent/should_run/all.T +++ b/testsuite/tests/concurrent/should_run/all.T @@ -7,7 +7,7 @@ test('conc027', normal, compile_and_run, ['']) test('conc051', normal, compile_and_run, ['']) if ('threaded1' in config.run_ways): - only_threaded_ways = only_ways(['ghci','threaded1','threaded2']) + only_threaded_ways = only_ways(['ghci','threaded1','threaded2', 'nonmoving_thr']) else: only_threaded_ways = skip @@ -203,8 +203,8 @@ test('foreignInterruptible', [when(fast(), skip), ], compile_and_run, ['']) -test('conc037', only_ways(['threaded1','threaded2']), compile_and_run, ['']) -test('conc038', only_ways(['threaded1','threaded2']), compile_and_run, ['']) +test('conc037', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, ['']) +test('conc038', only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), compile_and_run, ['']) # Omit for GHCi, uses foreign export # Omit for the threaded ways, because in this case the main thread is allowed to @@ -224,7 +224,7 @@ test('conc045', normal, compile_and_run, ['']) test('conc058', normal, compile_and_run, ['']) test('conc059', - [only_ways(['threaded1', 'threaded2']), + [only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), pre_cmd('$MAKE -s --no-print-directory conc059_setup')], compile_and_run, ['conc059_c.c -no-hs-main']) @@ -243,7 +243,7 @@ test('conc067', ignore_stdout, compile_and_run, ['']) test('conc068', [ omit_ways(concurrent_ways), exit_code(1) ], compile_and_run, ['']) test('setnumcapabilities001', - [ only_ways(['threaded1','threaded2']), + [ only_ways(['threaded1','threaded2', 'nonmoving_thr']), extra_run_opts('8 12 2000'), req_smp ], compile_and_run, ['']) @@ -254,7 +254,7 @@ test('compareAndSwap', [omit_ways(['ghci','hpc']), reqlib('primitive')], compile test('hs_try_putmvar001', [ when(opsys('mingw32'),skip), # uses pthread APIs in the C code - only_ways(['threaded1','threaded2']), + only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), extra_clean(['hs_try_putmvar001_c.o'])], compile_and_run, ['hs_try_putmvar001_c.c']) @@ -272,7 +272,7 @@ test('hs_try_putmvar003', [ when(opsys('mingw32'),skip), # uses pthread APIs in the C code pre_cmd('$MAKE -s --no-print-directory hs_try_putmvar003_setup'), - only_ways(['threaded1','threaded2']), + only_ways(['threaded1', 'threaded2', 'nonmoving_thr']), extra_clean(['hs_try_putmvar003_c.o']), extra_run_opts('1 16 32 100'), fragile_for(16361, ['threaded1']) diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index 9e20ba0b81..5e4e6991c8 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -67,8 +67,12 @@ test('outofmem', when(opsys('darwin'), skip), makefile_test, ['outofmem']) test('outofmem2', normal, makefile_test, ['outofmem2']) -test('T2047', [ignore_stdout, 
extra_run_opts('+RTS -c -RTS')], - compile_and_run, ['-package containers']) +test('T2047', + [ignore_stdout, + extra_run_opts('+RTS -c -RTS'), + # Non-moving collector doesn't support -c + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])], + compile_and_run, ['-package containers']) # Blackhole-detection test. # Skip GHCi due to #2786 -- cgit v1.2.1 From 78ce35b967749acb005bda4d72802d237d5f7dca Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 13 Feb 2019 14:05:37 -0500 Subject: testsuite: bug1010 requires -c, which isn't supported by nonmoving --- testsuite/tests/rts/all.T | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index 5e4e6991c8..6c27981998 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -12,7 +12,10 @@ test('testmblockalloc', # See bug #101, test requires +RTS -c (or equivalently +RTS -M) # only GHCi triggers the bug, but we run the test all ways for completeness. -test('bug1010', normal, compile_and_run, ['+RTS -c -RTS']) +test('bug1010', + # Non-moving GC doesn't support -c + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), + compile_and_run, ['+RTS -c -RTS']) def normalise_address(str): return re.sub('Access violation in generated code when reading [0]+', -- cgit v1.2.1 From 6e97cc47d17e3c52d63d5bc3e34da27712b7ce47 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 13 Feb 2019 15:33:31 -0500 Subject: testsuite: Skip T15892 in nonmoving_thr_ghc --- testsuite/tests/codeGen/should_run/all.T | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T index 002dc65a8b..f96820de81 100644 --- a/testsuite/tests/codeGen/should_run/all.T +++ b/testsuite/tests/codeGen/should_run/all.T @@ -196,7 +196,7 @@ test('T15696_3', normal, compile_and_run, ['-O']) test('T15892', [ ignore_stdout, # -G1 is unsupported by the nonmoving GC - omit_ways(['nonmoving', 'nonmoving_thr']), + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), # we want to do lots of major GC to make the bug more likely to # happen, so -G1 -A32k: extra_run_opts('+RTS -G1 -A32k -RTS') ], -- cgit v1.2.1 From 5ce853c8e4c104867e02234edddb48ab4d03c1b3 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Mon, 4 Mar 2019 19:45:36 -0500 Subject: ghc-heap: Skip heap_all test with debugged RTS The debugged RTS initializes the heap with 0xaa, which breaks the (admittedly rather fragile) assumption that uninitialized fields are set to 0x00: ``` Wrong exit code for heap_all(nonmoving)(expected 0 , actual 1 ) Stderr ( heap_all ): heap_all: user error (assertClosuresEq: Closures do not match Expected: FunClosure {info = StgInfoTable {entry = Nothing, ptrs = 0, nptrs = 1, tipe = FUN_0_1, srtlen = 0, code = Nothing}, ptrArgs = [], dataArgs = [0]} Actual: FunClosure {info = StgInfoTable {entry = Nothing, ptrs = 0, nptrs = 1, tipe = FUN_0_1, srtlen = 1032832, code = Nothing}, ptrArgs = [], dataArgs = [12297829382473034410]} CallStack (from HasCallStack): assertClosuresEq, called at heap_all.hs:230:9 in main:Main ) ``` --- libraries/ghc-heap/tests/all.T | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libraries/ghc-heap/tests/all.T b/libraries/ghc-heap/tests/all.T index afa224fde7..e1458f49a4 100644 --- a/libraries/ghc-heap/tests/all.T +++ b/libraries/ghc-heap/tests/all.T @@ -2,7 +2,10 @@ test('heap_all', [when(have_profiling(), extra_ways(['prof'])), # These ways produce slightly different heap 
representations. # Currently we don't test them. - omit_ways(['ghci', 'hpc']) + omit_ways(['ghci', 'hpc']), + # The debug RTS initializes some fields with 0xaa and so + # this test spuriously fails. + when(compiler_debugged(), skip) ], compile_and_run, ['']) -- cgit v1.2.1 From 6abefce77dbe37ea222e6224cc3500a435717957 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 2 Apr 2019 20:50:56 -0400 Subject: Skip ghc_heap_all test in nonmoving ways --- libraries/ghc-heap/tests/all.T | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libraries/ghc-heap/tests/all.T b/libraries/ghc-heap/tests/all.T index e1458f49a4..89e6f47ecb 100644 --- a/libraries/ghc-heap/tests/all.T +++ b/libraries/ghc-heap/tests/all.T @@ -2,7 +2,8 @@ test('heap_all', [when(have_profiling(), extra_ways(['prof'])), # These ways produce slightly different heap representations. # Currently we don't test them. - omit_ways(['ghci', 'hpc']), + omit_ways(['ghci', 'hpc', + 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']), # The debug RTS initializes some fields with 0xaa and so # this test spuriously fails. when(compiler_debugged(), skip) -- cgit v1.2.1 From 99baff8c7ba2f1ca07cfffb4348fc75d95673366 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 18 Jun 2019 23:03:27 -0400 Subject: testsuite: Don't run T9630 in nonmoving ways The nonmoving collector doesn't support -G1 --- testsuite/tests/perf/compiler/all.T | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/testsuite/tests/perf/compiler/all.T b/testsuite/tests/perf/compiler/all.T index cfc860d0b0..72e240a4c5 100644 --- a/testsuite/tests/perf/compiler/all.T +++ b/testsuite/tests/perf/compiler/all.T @@ -385,7 +385,9 @@ test ('T9630', extra_clean(['T9630a.hi', 'T9630a.o']), # Use `+RTS -G1` for more stable residency measurements. Note [residency]. - extra_hc_opts('+RTS -G1 -RTS') + extra_hc_opts('+RTS -G1 -RTS'), + # The nonmoving collector does not support -G1 + omit_ways(['nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc']) ], multimod_compile, ['T9630', '-v0 -O']) -- cgit v1.2.1 From 25ae8f7d3a2c8943c3995ab2c766f52a8adf8d1c Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 18 Jun 2019 23:04:06 -0400 Subject: testsuite: Don't run T7160 in nonmoving_thr ways The nonmoving way finalizes things in a different order. --- testsuite/tests/rts/all.T | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T index 6c27981998..36f63c571e 100644 --- a/testsuite/tests/rts/all.T +++ b/testsuite/tests/rts/all.T @@ -190,7 +190,7 @@ test('T6006', [ omit_ways(prof_ways + ['ghci']), test('T7037', [], makefile_test, ['T7037']) test('T7087', exit_code(1), compile_and_run, ['']) -test('T7160', normal, compile_and_run, ['']) +test('T7160', omit_ways(['nonmoving_thr', 'nonmoving_thr_ghc']), compile_and_run, ['']) test('T7040', [omit_ways(['ghci'])], compile_and_run, ['T7040_c.c']) -- cgit v1.2.1 From 8cab149bba91b18e4e92cdc13c616fe6465ac84a Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 2 Oct 2019 22:37:31 -0400 Subject: testsuite: Mark length001 as failing under nonmoving ways This is consistent with the other unoptimized ways. --- libraries/base/tests/all.T | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/base/tests/all.T b/libraries/base/tests/all.T index 0b45c9aff2..0242ece32e 100644 --- a/libraries/base/tests/all.T +++ b/libraries/base/tests/all.T @@ -74,7 +74,7 @@ test('length001', # excessive amounts of stack space. 
So we specifically set a low # stack limit and mark it as failing under a few conditions. [extra_run_opts('+RTS -K8m -RTS'), - expect_fail_for(['normal', 'threaded1', 'llvm'])], + expect_fail_for(['normal', 'threaded1', 'llvm', 'nonmoving', 'nonmoving_thr', 'nonmoving_thr_ghc'])], compile_and_run, ['']) test('ratio001', normal, compile_and_run, ['']) -- cgit v1.2.1