From 13dd78ddb158f98b35ad2eda50d0a7af63920ece Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Thu, 18 Apr 2019 14:08:32 -0400 Subject: Nonmoving: Allow aging and refactor static objects logic This commit does two things: * Allow aging of objects during the preparatory minor GC * Refactor handling of static objects to avoid the use of a hashtable --- rts/sm/Evac.c | 50 ++++++++++++++++----- rts/sm/GC.c | 10 ++++- rts/sm/GCAux.c | 4 +- rts/sm/NonMoving.c | 40 ++++++++++++----- rts/sm/NonMovingMark.c | 115 ++++++++++++++++++++++++++++++++++-------------- rts/sm/NonMovingMark.h | 6 +-- rts/sm/NonMovingScav.c | 45 +++++++++++++------ rts/sm/NonMovingSweep.c | 5 ++- rts/sm/NonMovingSweep.h | 2 +- rts/sm/Storage.c | 70 +++++++++++++++++++++++++++-- 10 files changed, 266 insertions(+), 81 deletions(-) diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index c2aaaacffa..b4e35849ba 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -69,12 +69,6 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) { ASSERT(gen_no < RtsFlags.GcFlags.generations); - if (RtsFlags.GcFlags.useNonmoving && major_gc) { - // unconditionally promote to non-moving heap in major gc - gct->copied += size; - return nonmovingAllocate(gct->cap, size); - } - StgPtr to; gen_workspace *ws; @@ -93,7 +87,20 @@ alloc_for_copy (uint32_t size, uint32_t gen_no) if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) { gct->copied += size; - return nonmovingAllocate(gct->cap, size); + to = nonmovingAllocate(gct->cap, size); + + // Add segment to the todo list unless it's already there + // current->todo_link == NULL means not in todo list + struct NonmovingSegment *seg = nonmovingGetSegment(to); + if (!seg->todo_link) { + gen_workspace *ws = &gct->gens[oldest_gen->no]; + seg->todo_link = ws->todo_seg; + ws->todo_seg = seg; + } + + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to); + return to; } ws = &gct->gens[gen_no]; // zero memory references here @@ -312,9 +319,7 @@ evacuate_large(StgPtr p) */ new_gen_no = bd->dest_no; - if (RtsFlags.GcFlags.useNonmoving && major_gc) { - new_gen_no = oldest_gen->no; - } else if (new_gen_no < gct->evac_gen_no) { + if (new_gen_no < gct->evac_gen_no) { if (gct->eager_promotion) { new_gen_no = gct->evac_gen_no; } else { @@ -363,6 +368,13 @@ evacuate_large(StgPtr p) STATIC_INLINE void evacuate_static_object (StgClosure **link_field, StgClosure *q) { + if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) { + // See Note [Static objects under the nonmoving collector] in Storage.c. + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + return; + } + StgWord link = (StgWord)*link_field; // See Note [STATIC_LINK fields] for how the link field bits work @@ -603,6 +615,8 @@ loop: // NOTE: large objects in nonmoving heap are also marked with // BF_NONMOVING. Those are moved to scavenged_large_objects list in // mark phase. + if (major_gc) + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); return; } @@ -629,6 +643,13 @@ loop: // they are not) if (bd->flags & BF_COMPACT) { evacuate_compact((P_)q); + + // We may have evacuated the block to the nonmoving generation. If so + // we need to make sure it is added to the mark queue since the only + // reference to it may be from the moving heap. + if (major_gc && bd->flags & BF_NONMOVING) { + markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q); + } return; } @@ -636,6 +657,13 @@ loop: */ if (bd->flags & BF_LARGE) { evacuate_large((P_)q); + + // We may have evacuated the block to the nonmoving generation. 
+        // If so we need to make sure it is added to the mark queue since
+        // the only reference to it may be from the moving heap.
+        if (major_gc && bd->flags & BF_NONMOVING) {
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+        }
         return;
     }
 
@@ -937,6 +965,8 @@ evacuate_BLACKHOLE(StgClosure **p)
     ASSERT((bd->flags & BF_COMPACT) == 0);
 
     if (bd->flags & BF_NONMOVING) {
+        if (major_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
         return;
     }
 
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 37f9811aaf..1df6ace800 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -263,7 +263,10 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);
 
-  if (major_gc) {
+  /* N.B. The nonmoving collector works a bit differently. See
+   * Note [Static objects under the nonmoving collector].
+   */
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
       prev_static_flag = static_flag;
       static_flag =
           static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
@@ -736,6 +739,11 @@ GarbageCollect (uint32_t collect_gen,
       // so we need to mark those too.
       // Note that in the sequential case these lists will be appended with
       // more weaks and threads found to be dead in mark.
+#if !defined(THREADED_RTS)
+      // In the non-threaded runtime this is the only time we push to the
+      // upd_rem_set.
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+#endif
       nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
       ACQUIRE_SM_LOCK;
   }
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 6076f61d6f..11080c1f22 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -148,14 +148,14 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;
 
     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
 
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index 24464a43f9..76f54c9302 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -68,6 +68,27 @@ Mutex concurrent_coll_finished_lock;
  * stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
  * until the final mark has concluded.
  *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. We do this by teaching the moving
+ * collector to "evacuate" objects it encounters in the non-moving heap by
+ * adding them to the mark queue.
+ * Specifically, since each gc_thread holds a capability, we push to the
+ * capability's update remembered set (implemented by markQueuePushClosureGC).
+ *
  *
  * Note [Live data accounting in nonmoving collector]
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -150,6 +171,8 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void)
  * Request a fresh segment from the free segment list or allocate one on the
  * given node.
  *
+ * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock
+ * under the assumption that we are in a GC context).
  */
 static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
 {
@@ -221,7 +244,7 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
     }
 }
 
-/* sz is in words */
+/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words. */
 GNUC_ATTR_HOT
 void *nonmovingAllocate(Capability *cap, StgWord sz)
 {
@@ -239,14 +262,6 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
     void *ret = nonmovingSegmentGetBlock(current, current->next_free);
     ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment
 
-    // Add segment to the todo list unless it's already there
-    // current->todo_link == NULL means not in todo list
-    if (!current->todo_link) {
-        gen_workspace *ws = &gct->gens[oldest_gen->no];
-        current->todo_link = ws->todo_seg;
-        ws->todo_seg = current;
-    }
-
     // Advance the current segment's next_free or allocate a new segment if full
     bool full = advance_next_free(current);
     if (full) {
@@ -405,6 +420,11 @@ static void nonmovingClearAllBitmaps(void)
 /* Prepare the heap bitmaps and snapshot metadata for a mark */
 static void nonmovingPrepareMark(void)
 {
+    // See Note [Static objects under the nonmoving collector].
+    prev_static_flag = static_flag;
+    static_flag =
+        static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
+
     nonmovingClearAllBitmaps();
     nonmovingBumpEpoch();
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
@@ -688,7 +708,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
 
 #if defined(DEBUG)
     // Zap CAFs that we will sweep
-    nonmovingGcCafs(mark_queue);
+    nonmovingGcCafs();
 #endif
 
     ASSERT(mark_queue->top->head == 0);
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 72a3ca69cb..b1c585f72a 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -205,7 +205,7 @@ static void init_mark_queue_(MarkQueue *queue);
  * Really the argument type should be UpdRemSet* but this would be rather
  * inconvenient without polymorphism.
  */
-static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
 {
     if (markQueueIsEmpty(rset)) return;
 
@@ -374,6 +374,38 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     q->top->head++;
 }
 
+/* A variant of push to be used by the minor GC when it encounters a reference
+ * to an object in the non-moving heap. In contrast to the other push
+ * operations this uses the gc_alloc_block_sync spinlock instead of the
+ * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ */
+void
+markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
+{
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        // Allocate a fresh block.
+ ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bdescr *bd = allocGroup(1); + bd->link = q->blocks; + q->blocks = bd; + q->top = (MarkQueueBlock *) bd->start; + q->top->head = 0; + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } + + MarkQueueEnt ent = { + .type = MARK_CLOSURE, + .mark_closure = { + .p = UNTAG_CLOSURE(p), + .origin = NULL, + } + }; + q->top->entries[q->top->head] = ent; + q->top->head++; +} + static inline void push_closure (MarkQueue *q, StgClosure *p, @@ -715,7 +747,6 @@ static void init_mark_queue_ (MarkQueue *queue) void initMarkQueue (MarkQueue *queue) { init_mark_queue_(queue); - queue->marked_objects = allocHashTable(); queue->is_upd_rem_set = false; } @@ -723,8 +754,6 @@ void initMarkQueue (MarkQueue *queue) void init_upd_rem_set (UpdRemSet *rset) { init_mark_queue_(&rset->queue); - // Update remembered sets don't have to worry about static objects - rset->queue.marked_objects = NULL; rset->queue.is_upd_rem_set = true; } @@ -739,7 +768,6 @@ void reset_upd_rem_set (UpdRemSet *rset) void freeMarkQueue (MarkQueue *queue) { freeChain_lock(queue->blocks); - freeHashTable(queue->marked_objects, NULL); } #if defined(THREADED_RTS) && defined(DEBUG) @@ -986,12 +1014,32 @@ mark_stack (MarkQueue *queue, StgStack *stack) mark_stack_(queue, stack->sp, stack->stack + stack->stack_size); } +/* See Note [Static objects under the nonmoving collector]. + * + * Returns true if the object needs to be marked. + */ +static bool +bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED) +{ + while (1) { + StgWord link = (StgWord) *link_field; + StgWord new = (link & ~STATIC_BITS) | static_flag; + if ((link & STATIC_BITS) == static_flag) + return false; + else if (cas((StgVolatilePtr) link_field, link, new) == link) { + return true; + } + } +} + static GNUC_ATTR_HOT void mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) { (void)origin; // TODO: should be used for selector/thunk optimisations try_again: + ; + bdescr *bd = NULL; p = UNTAG_CLOSURE(p); # define PUSH_FIELD(obj, field) \ @@ -1009,45 +1057,46 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) return; } - if (lookupHashTable(queue->marked_objects, (W_)p)) { - // already marked - return; - } - - insertHashTable(queue->marked_objects, (W_)p, (P_)1); - switch (type) { case THUNK_STATIC: if (info->srt != 0) { - markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above + if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) { + markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above + } } return; case FUN_STATIC: if (info->srt != 0 || info->layout.payload.ptrs != 0) { - markQueuePushFunSrt(queue, info); // TODO this function repeats the check above - - // a FUN_STATIC can also be an SRT, so it may have pointer - // fields. See Note [SRTs] in CmmBuildInfoTables, specifically - // the [FUN] optimisation. - // TODO (osa) I don't understand this comment - for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { - PUSH_FIELD(p, payload[i]); + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + markQueuePushFunSrt(queue, info); // TODO this function repeats the check above + + // a FUN_STATIC can also be an SRT, so it may have pointer + // fields. See Note [SRTs] in CmmBuildInfoTables, specifically + // the [FUN] optimisation. 
+ // TODO (osa) I don't understand this comment + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } } } return; case IND_STATIC: - PUSH_FIELD((StgInd *) p, indirectee); + if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) { + PUSH_FIELD((StgInd *) p, indirectee); + } return; case CONSTR: case CONSTR_1_0: case CONSTR_2_0: case CONSTR_1_1: - for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { - PUSH_FIELD(p, payload[i]); + if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) { + for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) { + PUSH_FIELD(p, payload[i]); + } } return; @@ -1061,19 +1110,17 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) } } - bdescr *bd = Bdescr((StgPtr) p); + bd = Bdescr((StgPtr) p); if (bd->gen != oldest_gen) { - // Here we have an object living outside of the non-moving heap. Since - // we moved everything to the non-moving heap before starting the major - // collection, we know that we don't need to trace it: it was allocated - // after we took our snapshot. -#if !defined(THREADED_RTS) - // This should never happen in the non-concurrent case - barf("Closure outside of non-moving heap: %p", p); -#else + // Here we have an object living outside of the non-moving heap. While + // we likely evacuated nearly everything to the nonmoving heap during + // preparation there are nevertheless a few ways in which we might trace + // a reference into younger generations: + // + // * a mutable object might have been updated + // * we might have aged an object return; -#endif } ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info); diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index 84b6642d6c..7e9bfed1f9 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -82,10 +82,6 @@ typedef struct MarkQueue_ { // Is this a mark queue or a capability-local update remembered set? bool is_upd_rem_set; - - // Marked objects outside of nonmoving heap, namely large and static - // objects. 
- HashTable *marked_objects; } MarkQueue; /* While it shares its representation with MarkQueue, UpdRemSet differs in @@ -145,8 +141,10 @@ void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_th bool nonmovingIsAlive(StgClosure *p); void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w); void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w); +void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset); void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent); +void markQueuePushClosureGC(MarkQueue *q, StgClosure *p); void markQueuePushClosure(MarkQueue *q, StgClosure *p, StgClosure **origin); diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c index 850750b43b..0efbde688c 100644 --- a/rts/sm/NonMovingScav.c +++ b/rts/sm/NonMovingScav.c @@ -16,6 +16,7 @@ nonmovingScavengeOne (StgClosure *q) ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); StgPtr p = (StgPtr)q; const StgInfoTable *info = get_itbl(q); + const bool saved_eager_promotion = gct->eager_promotion; switch (info->type) { @@ -23,9 +24,11 @@ nonmovingScavengeOne (StgClosure *q) case MVAR_DIRTY: { StgMVar *mvar = ((StgMVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&mvar->head); evacuate((StgClosure **)&mvar->tail); evacuate((StgClosure **)&mvar->value); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { mvar->header.info = &stg_MVAR_DIRTY_info; } else { @@ -37,8 +40,10 @@ nonmovingScavengeOne (StgClosure *q) case TVAR: { StgTVar *tvar = ((StgTVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&tvar->current_value); evacuate((StgClosure **)&tvar->first_watch_queue_entry); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { tvar->header.info = &stg_TVAR_DIRTY_info; } else { @@ -122,10 +127,21 @@ nonmovingScavengeOne (StgClosure *q) break; } + case WEAK: + { + // We must evacuate the key since it may refer to an object in the + // moving heap which may be long gone by the time we call + // nonmovingTidyWeaks. + StgWeak *weak = (StgWeak *) p; + gct->eager_promotion = true; + evacuate(&weak->key); + gct->eager_promotion = saved_eager_promotion; + goto gen_obj; + } + gen_obj: case CONSTR: case CONSTR_NOCAF: - case WEAK: case PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; @@ -145,7 +161,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_VAR_CLEAN: case MUT_VAR_DIRTY: + gct->eager_promotion = false; evacuate(&((StgMutVar *)p)->var); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; } else { @@ -157,10 +175,12 @@ nonmovingScavengeOne (StgClosure *q) { StgBlockingQueue *bq = (StgBlockingQueue *)p; + gct->eager_promotion = false; evacuate(&bq->bh); evacuate((StgClosure**)&bq->owner); evacuate((StgClosure**)&bq->queue); evacuate((StgClosure**)&bq->link); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; @@ -202,11 +222,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. 
+ gct->eager_promotion = false; scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; } else { @@ -234,14 +252,13 @@ nonmovingScavengeOne (StgClosure *q) case SMALL_MUT_ARR_PTRS_DIRTY: // follow everything { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p); + gct->eager_promotion = false; for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info; } else { @@ -278,21 +295,21 @@ nonmovingScavengeOne (StgClosure *q) { StgStack *stack = (StgStack*)p; + gct->eager_promotion = false; scavenge_stack(stack->sp, stack->stack + stack->stack_size); + gct->eager_promotion = saved_eager_promotion; stack->dirty = gct->failed_to_evac; - // TODO (osa): There may be something special about stacks that we're - // missing. All other mut objects are marked by using a different info - // table except stacks. - break; } case MUT_PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + gct->eager_promotion = false; for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } @@ -302,12 +319,14 @@ nonmovingScavengeOne (StgClosure *q) StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); + gct->eager_promotion = false; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index fa3e38cca4..b67bba0ce5 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -102,7 +102,7 @@ nonmovingSweepSegment(struct NonmovingSegment *seg) #if defined(DEBUG) -void nonmovingGcCafs(struct MarkQueue_ *queue) +void nonmovingGcCafs() { uint32_t i = 0; StgIndStatic *next; @@ -116,7 +116,8 @@ void nonmovingGcCafs(struct MarkQueue_ *queue) const StgInfoTable *info = get_itbl((StgClosure*)caf); ASSERT(info->type == IND_STATIC); - if (lookupHashTable(queue->marked_objects, (StgWord) caf) == NULL) { + StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS; + if (flag != 0 && flag != static_flag) { debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf); SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it } else { diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h index de2d52acb5..f21936004f 100644 --- a/rts/sm/NonMovingSweep.h +++ b/rts/sm/NonMovingSweep.h @@ -28,5 +28,5 @@ void nonmovingPrepareSweep(void); #if defined(DEBUG) // The non-moving equivalent of the moving collector's gcCAFs. 
-void nonmovingGcCafs(struct MarkQueue_ *queue);
+void nonmovingGcCafs(void);
 #endif
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 44163a353f..f04b3c5929 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -321,7 +321,8 @@ freeStorage (bool free_heap)
 }
 
 /* -----------------------------------------------------------------------------
-   Note [CAF management].
+   Note [CAF management]
+   ~~~~~~~~~~~~~~~~~~~~~
 
    The entry code for every CAF does the following:
 
@@ -356,6 +357,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [atomic CAF entry]
+   ~~~~~~~~~~~~~~~~~~~~~~~
 
    With THREADED_RTS, newCAF() is required to be atomic (see
   #5558). This is because if two threads happened to enter the same
@@ -369,6 +371,7 @@ freeStorage (bool free_heap)
 
    ------------------
    Note [GHCi CAFs]
+   ~~~~~~~~~~~~~~~~
 
    For GHCI, we have additional requirements when dealing with CAFs:
 
@@ -388,6 +391,51 @@ freeStorage (bool free_heap)
 
       -- SDM 29/1/01
 
+   ------------------
+   Note [Static objects under the nonmoving collector]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Static object management is a bit tricky under the nonmoving collector as we
+   need to maintain a bit more state than in the moving collector. In
+   particular, the moving collector uses the low bits of the STATIC_LINK field
+   to determine whether the object has been moved to the scavenger's work list
+   (see Note [STATIC_LINK fields] in Storage.h).
+
+   However, the nonmoving collector also needs a place to keep its mark bit.
+   This is problematic as we therefore need at least three bits of state
+   but can assume only two bits are available in STATIC_LINK (due to 32-bit
+   systems).
+
+   To accommodate this we move handling of static objects entirely to the
+   oldest generation when the nonmoving collector is in use. To do this safely
+   and efficiently we allocate the blackhole created by lockCAF() directly in
+   the non-moving heap. This means that the moving collector can completely
+   ignore static objects in minor collections since they are guaranteed not to
+   have any references into the moving heap. Of course, the blackhole itself
+   will likely contain a reference into the moving heap but this is
+   significantly easier to handle, being a heap-allocated object (see Note
+   [Aging under the non-moving collector] in NonMoving.c for details).
+
+   During the moving phase of a major collection we treat static objects
+   as we do any other reference into the non-moving heap by pushing them
+   to the non-moving mark queue (see Note [Aging under the non-moving
+   collector]).
+
+   This allows the non-moving collector to have full control over the flags
+   in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields].
+   This is implemented by NonMovingMark.c:bump_static_flag.
+
+   In short, the plan is:
+
+   - lockCAF allocates its blackhole in the nonmoving heap. This is important
+     to ensure that we do not need to place the static object on the mut_list,
+     as otherwise we would need some way to ensure that it is evacuated only
+     once during a moving collection.
+
+   - evacuate_static_object merely pushes objects to the mark queue.
+
+   - the nonmoving collector uses the flags in STATIC_LINK as its mark bit.
+ -------------------------------------------------------------------------- */ STATIC_INLINE StgInd * @@ -441,7 +489,16 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf) caf->saved_info = orig_info; // Allocate the blackhole indirection closure - bh = (StgInd *)allocate(cap, sizeofW(*bh)); + if (RtsFlags.GcFlags.useNonmoving) { + // See Note [Static objects under the nonmoving collector]. + ACQUIRE_SM_LOCK; + bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh)); + RELEASE_SM_LOCK; + recordMutableCap((StgClosure*)bh, + regTableToCapability(reg), oldest_gen->no); + } else { + bh = (StgInd *)allocate(cap, sizeofW(*bh)); + } bh->indirectee = (StgClosure *)cap->r.rCurrentTSO; SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs); // Ensure that above writes are visible before we introduce reference as CAF indirectee. @@ -483,7 +540,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf) else { // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: see + // Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -560,7 +619,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf) if (!bh) return NULL; // Put this CAF on the mutable list for the old generation. - if (oldest_gen->no != 0) { + // N.B. the nonmoving collector works a bit differently: + // see Note [Static objects under the nonmoving collector]. + if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) { recordMutableCap((StgClosure*)caf, regTableToCapability(reg), oldest_gen->no); } @@ -1520,6 +1581,7 @@ void flushExec (W_ len, AdjustorExecutable exec_addr) __clear_cache((void*)begin, (void*)end); # endif #elif defined(__GNUC__) + /* For all other platforms, fall back to a libgcc builtin. */ unsigned char* begin = (unsigned char*)exec_addr; unsigned char* end = begin + len; # if GCC_HAS_BUILTIN_CLEAR_CACHE -- cgit v1.2.1
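The following standalone sketch models the STATIC_LINK flag protocol that
bump_static_flag and the patched nonmovingGcCafs rely on. It is an
illustration, not part of the patch: the file name (flag_model.c) and types
are simplified stand-ins for the RTS's, and this single-threaded version
omits the CAS loop the real bump_static_flag needs because mutator and mark
threads can race on the link field.

/* flag_model.c: a minimal model of the two-bit STATIC_LINK flag protocol
 * described in Note [Static objects under the nonmoving collector]. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STATIC_FLAG_A 1
#define STATIC_FLAG_B 2
#define STATIC_BITS   3  /* the two low bits of a static link word */

static uintptr_t static_flag = STATIC_FLAG_A;

/* Flip the flag at the start of each major collection, as the patched
 * nonmovingPrepareMark does. */
static void prepare_mark(void)
{
    static_flag =
        static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
}

/* Set the object's flag bits to the current cycle's flag. Returns true only
 * on the first visit in a cycle, so the caller pushes each static object's
 * fields at most once (cf. bump_static_flag). */
static bool bump_static_flag(uintptr_t *link_field)
{
    uintptr_t link = *link_field;
    if ((link & STATIC_BITS) == static_flag)
        return false;  /* already marked in this cycle */
    *link_field = (link & ~(uintptr_t)STATIC_BITS) | static_flag;
    return true;
}

/* A stale, non-zero flag means the object was marked in some earlier cycle
 * but not this one, i.e. it is dead (cf. the patched nonmovingGcCafs). */
static bool swept_as_dead(uintptr_t link)
{
    uintptr_t flag = link & STATIC_BITS;
    return flag != 0 && flag != static_flag;
}

int main(void)
{
    uintptr_t caf_link = 0;                       /* never marked      */

    prepare_mark();                               /* cycle 1           */
    printf("%d\n", bump_static_flag(&caf_link));  /* 1: first visit    */
    printf("%d\n", bump_static_flag(&caf_link));  /* 0: already marked */
    printf("%d\n", swept_as_dead(caf_link));      /* 0: live this cycle */

    prepare_mark();                               /* cycle 2, no re-mark */
    printf("%d\n", swept_as_dead(caf_link));      /* 1: flag now stale  */
    return 0;
}

Two flag values suffice because a surviving object is re-marked every cycle:
at sweep time a flag can only be current (live), one cycle stale (dead), or
zero (never marked), which is exactly the three-way distinction the note
says must fit in two bits.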