Diffstat (limited to 'rts')
-rw-r--r--  rts/Apply.cmm            |   2
-rw-r--r--  rts/Capability.c         |  34
-rw-r--r--  rts/Capability.h         |   6
-rw-r--r--  rts/Exception.cmm        |   5
-rw-r--r--  rts/Messages.c           |  10
-rw-r--r--  rts/PrimOps.cmm          | 108
-rw-r--r--  rts/RaiseAsync.c         |   2
-rw-r--r--  rts/RtsStartup.c         |   6
-rw-r--r--  rts/RtsSymbols.c         |   5
-rw-r--r--  rts/STM.c                |  41
-rw-r--r--  rts/Schedule.c           |  11
-rw-r--r--  rts/StableName.c         |   4
-rw-r--r--  rts/ThreadPaused.c       |  11
-rw-r--r--  rts/Threads.c            |  22
-rw-r--r--  rts/Updates.h            |   8
-rw-r--r--  rts/rts.cabal.in         |   1
-rw-r--r--  rts/sm/NonMoving.c       | 186
-rw-r--r--  rts/sm/NonMoving.h       |   1
-rw-r--r--  rts/sm/NonMovingMark.c   | 563
-rw-r--r--  rts/sm/NonMovingMark.h   |  31
-rw-r--r--  rts/sm/Sanity.c          |  16
-rw-r--r--  rts/sm/Storage.c         | 111
-rw-r--r--  rts/sm/Storage.h         |   5
23 files changed, 1073 insertions, 116 deletions
diff --git a/rts/Apply.cmm b/rts/Apply.cmm
index 8d7fc3c012..eeb760c5ed 100644
--- a/rts/Apply.cmm
+++ b/rts/Apply.cmm
@@ -654,6 +654,8 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK")
/* someone else beat us to it */
jump ENTRY_LBL(stg_WHITEHOLE) (ap);
}
+ // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is
+ // not reachable.
StgInd_indirectee(ap) = CurrentTSO;
prim_write_barrier;
SET_INFO(ap, __stg_EAGER_BLACKHOLE_info);
diff --git a/rts/Capability.c b/rts/Capability.c
index 23e581359e..0baa4ef205 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -292,6 +292,11 @@ initCapability (Capability *cap, uint32_t i)
RtsFlags.GcFlags.generations,
"initCapability");
+
+ // At this point the storage manager is not yet initialized, so this will be
+ // initialized in initStorage().
+ cap->upd_rem_set.queue.blocks = NULL;
+
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
cap->mut_lists[g] = NULL;
}
@@ -861,16 +866,27 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed)
{
PendingSync *sync = pending_sync;
- if (sync && sync->type == SYNC_GC_PAR) {
- if (! sync->idle[cap->no]) {
- traceEventGcStart(cap);
- gcWorkerThread(cap);
- traceEventGcEnd(cap);
- traceSparkCounters(cap);
- // See Note [migrated bound threads 2]
- if (task->cap == cap) {
- return true;
+ if (sync) {
+ switch (sync->type) {
+ case SYNC_GC_PAR:
+ if (! sync->idle[cap->no]) {
+ traceEventGcStart(cap);
+ gcWorkerThread(cap);
+ traceEventGcEnd(cap);
+ traceSparkCounters(cap);
+ // See Note [migrated bound threads 2]
+ if (task->cap == cap) {
+ return true;
+ }
}
+ break;
+
+ case SYNC_FLUSH_UPD_REM_SET:
+ debugTrace(DEBUG_nonmoving_gc, "Flushing update remembered set blocks...");
+ break;
+
+ default:
+ break;
}
}
}
diff --git a/rts/Capability.h b/rts/Capability.h
index 0c41456261..e51769f887 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -85,6 +85,9 @@ struct Capability_ {
bdescr **mut_lists;
bdescr **saved_mut_lists; // tmp use during GC
+ // The update remembered set for the non-moving collector
+ UpdRemSet upd_rem_set;
+
// block for allocating pinned objects into
bdescr *pinned_object_block;
// full pinned object blocks allocated since the last GC
@@ -257,7 +260,8 @@ extern Capability **capabilities;
typedef enum {
SYNC_OTHER,
SYNC_GC_SEQ,
- SYNC_GC_PAR
+ SYNC_GC_PAR,
+ SYNC_FLUSH_UPD_REM_SET
} SyncType;
//
diff --git a/rts/Exception.cmm b/rts/Exception.cmm
index 8ea94b19f2..334d0ef823 100644
--- a/rts/Exception.cmm
+++ b/rts/Exception.cmm
@@ -318,6 +318,7 @@ stg_killThreadzh (P_ target, P_ exception)
return ();
} else {
StgTSO_why_blocked(CurrentTSO) = BlockedOnMsgThrowTo;
+ updateRemembSetPushPtr(StgTSO_block_info(CurrentTSO));
StgTSO_block_info(CurrentTSO) = msg;
// we must block, and unlock the message before returning
jump stg_block_throwto (target, exception);
@@ -489,6 +490,8 @@ retry_pop_stack:
ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
+ // No need to push `trec` to the update remembered set; it will no longer be
+ // reachable after we overwrite StgTSO.trec.
StgTSO_trec(CurrentTSO) = NO_TREC;
if (r != 0) {
// Transaction was valid: continue searching for a catch frame
@@ -607,6 +610,8 @@ retry_pop_stack:
outer = StgTRecHeader_enclosing_trec(trec);
ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr");
ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr");
+ // No need to push `trec` to the update remembered set since we just freed
+ // it; it is no longer reachable.
StgTSO_trec(CurrentTSO) = outer;
Sp = Sp + SIZEOF_StgCatchSTMFrame;
}
diff --git a/rts/Messages.c b/rts/Messages.c
index d878db5eda..b16ada53e1 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -244,8 +244,8 @@ loop:
// a barrier is necessary to ensure that all writes are visible.
// See Note [Heap memory barriers] in SMP.h.
write_barrier();
+ dirty_TSO(cap, owner); // we will modify owner->bq
owner->bq = bq;
- dirty_TSO(cap, owner); // we modified owner->bq
// If the owner of the blackhole is currently runnable, then
// bump it to the front of the run queue. This gives the
@@ -262,6 +262,9 @@ loop:
// point to the BLOCKING_QUEUE from the BLACKHOLE
write_barrier(); // make the BQ visible, see Note [Heap memory barriers].
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ updateRemembSetPushClosure(cap, (StgClosure*)p);
+ }
((StgInd*)bh)->indirectee = (StgClosure *)bq;
recordClosureMutated(cap,bh); // bh was mutated
@@ -290,6 +293,11 @@ loop:
}
#endif
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ // We are about to overwrite bq->queue; make sure its current value
+ // makes it into the update remembered set
+ updateRemembSetPushClosure(cap, (StgClosure*)bq->queue);
+ }
msg->link = bq->queue;
bq->queue = msg;
// No barrier is necessary here: we are only exposing the
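The two Messages.c hunks above are instances of the pre-write barrier this patch threads through the RTS: whenever a pointer field of a live object is about to be overwritten while the concurrent mark is running, the old value is pushed to the capability-local update remembered set first (see Note [Update remembered set] in rts/sm/NonMovingMark.c later in this diff). A minimal sketch of the pattern, using only entry points added by the patch and not itself part of the diff, is:

    #include "Rts.h"
    #include "Capability.h"
    #include "sm/NonMovingMark.h"

    // Overwrite a pointer field while preserving the snapshot invariant:
    // the old value must reach the mark queue before it becomes unreachable.
    static void overwrite_with_barrier(Capability *cap,
                                       StgClosure **field,
                                       StgClosure *new_value)
    {
        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
            updateRemembSetPushClosure(cap, *field);
        }
        *field = new_value;
    }

The Cmm hunks use the updateRemembSetPushPtr / IF_WRITE_BARRIER_ENABLED macros for the same purpose.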
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index a2ab3de586..6d3df0700c 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -349,8 +349,13 @@ stg_casArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new )
// Compare and Swap Succeeded:
SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS);
len = StgMutArrPtrs_ptrs(arr);
+
// The write barrier. We must write a byte into the mark table:
I8[arr + SIZEOF_StgMutArrPtrs + WDS(len) + (ind >> MUT_ARR_PTRS_CARD_BITS )] = 1;
+
+ // Concurrent GC write barrier
+ updateRemembSetPushPtr(old);
+
return (0,new);
}
}
@@ -462,16 +467,45 @@ stg_thawSmallArrayzh ( gcptr src, W_ offset, W_ n )
cloneSmallArray(stg_SMALL_MUT_ARR_PTRS_DIRTY_info, src, offset, n)
}
+// Concurrent GC write barrier for pointer array copies
+//
+// hdr_size in bytes. dst_off in words, n in words.
+stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n)
+{
+ W_ end, p;
+ ASSERT(n > 0); // Assumes n==0 is handled by caller
+ p = dst + hdr_size + WDS(dst_off);
+ end = p + WDS(n);
+
+again:
+ IF_WRITE_BARRIER_ENABLED {
+ ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr");
+ }
+ p = p + WDS(1);
+ if (p < end) {
+ goto again;
+ }
+
+ return ();
+}
+
stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n)
{
W_ dst_p, src_p, bytes;
- SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+ if (n > 0) {
+ IF_WRITE_BARRIER_ENABLED {
+ call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
+ dst, dst_off, n);
+ }
- dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
- src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
- bytes = WDS(n);
- prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+ SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
+ dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
+ src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
+ bytes = WDS(n);
+ prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+ }
return ();
}
@@ -480,15 +514,22 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n
{
W_ dst_p, src_p, bytes;
- SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+ if (n > 0) {
+ IF_WRITE_BARRIER_ENABLED {
+ call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs,
+ dst, dst_off, n);
+ }
- dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
- src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
- bytes = WDS(n);
- if (src == dst) {
- prim %memmove(dst_p, src_p, bytes, SIZEOF_W);
- } else {
- prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+ SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info);
+
+ dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off);
+ src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off);
+ bytes = WDS(n);
+ if (src == dst) {
+ prim %memmove(dst_p, src_p, bytes, SIZEOF_W);
+ } else {
+ prim %memcpy(dst_p, src_p, bytes, SIZEOF_W);
+ }
}
return ();
@@ -510,6 +551,10 @@ stg_casSmallArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new )
} else {
// Compare and Swap Succeeded:
SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS);
+
+ // Concurrent GC write barrier
+ updateRemembSetPushPtr(old);
+
return (0,new);
}
}
@@ -549,7 +594,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new )
return (1,h);
} else {
if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
- ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+ ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old);
}
return (0,new);
}
@@ -562,7 +607,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new )
} else {
StgMutVar_var(mv) = new;
if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
- ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+ ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old);
}
return (0,new);
}
@@ -629,11 +674,12 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f )
(h) = prim %cmpxchgW(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y);
if (h != x) { goto retry; }
#else
+ h = StgMutVar_var(mv);
StgMutVar_var(mv) = y;
#endif
if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) {
- ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr");
+ ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", h);
}
return (x,z);
@@ -755,6 +801,9 @@ stg_addCFinalizzerToWeakzh ( W_ fptr, // finalizer
return (0);
}
+ // Write barrier for concurrent non-moving collector
+ updateRemembSetPushPtr(StgWeak_cfinalizers(w))
+
StgCFinalizerList_link(c) = StgWeak_cfinalizers(w);
StgWeak_cfinalizers(w) = c;
@@ -835,6 +884,8 @@ stg_deRefWeakzh ( gcptr w )
if (info == stg_WEAK_info) {
code = 1;
val = StgWeak_value(w);
+ // See Note [Concurrent read barrier on deRefWeak#] in NonMovingMark.c
+ updateRemembSetPushPtr(val);
} else {
code = 0;
val = w;
@@ -1501,7 +1552,7 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ )
*/
if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
}
// We want to put the heap check down here in the slow path,
@@ -1547,6 +1598,9 @@ loop:
// If the MVar is not already dirty, then we don't need to make
// it dirty, as it is empty with nothing blocking on it.
unlockClosure(mvar, info);
+ // However, we do need to ensure that the nonmoving collector
+ // knows about the reference to the value that we just removed...
+ updateRemembSetPushPtr(val);
return (val);
}
qinfo = StgHeader_info(q);
@@ -1560,7 +1614,7 @@ loop:
// There are putMVar(s) waiting... wake up the first thread on the queue
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr");
}
tso = StgMVarTSOQueue_tso(q);
@@ -1629,7 +1683,7 @@ loop:
// There are putMVar(s) waiting... wake up the first thread on the queue
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr");
}
tso = StgMVarTSOQueue_tso(q);
@@ -1667,7 +1721,7 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */
if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) {
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
}
// We want to put the heap check down here in the slow path,
@@ -1701,14 +1755,20 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */
jump stg_block_putmvar(mvar,val);
}
+ // We are going to mutate the closure; make sure its current pointers
+ // are marked.
+ if (info == stg_MVAR_CLEAN_info) {
+ ccall update_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
+ }
+
q = StgMVar_head(mvar);
loop:
if (q == stg_END_TSO_QUEUE_closure) {
/* No further takes, the MVar is now full. */
+ StgMVar_value(mvar) = val;
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
}
- StgMVar_value(mvar) = val;
unlockClosure(mvar, stg_MVAR_DIRTY_info);
return ();
}
@@ -1790,7 +1850,7 @@ loop:
if (q == stg_END_TSO_QUEUE_closure) {
/* No further takes, the MVar is now full. */
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr");
}
StgMVar_value(mvar) = val;
@@ -1861,7 +1921,7 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ )
if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) {
if (info == stg_MVAR_CLEAN_info) {
- ccall dirty_MVAR(BaseReg "ptr", mvar "ptr");
+ ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar));
}
ALLOC_PRIM_WITH_CUSTOM_FAILURE
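Several hunks above change the arity of dirty_MUT_VAR and dirty_MVAR: the Cmm call sites now pass the pointer that is about to be overwritten. The matching rts/sm/Storage.c changes are counted in the diffstat but fall outside this excerpt; a hypothetical sketch of the extended dirty_MUT_VAR, assuming it keeps the pre-existing dirtying logic and merely adds the remembered-set push (the same internal RTS headers as in the earlier sketch are assumed), could look like:

    // Hypothetical sketch only; the real definition lives in rts/sm/Storage.c
    // and may differ in detail (e.g. exactly where the push happens).
    void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p, StgClosure *old)
    {
        Capability *cap = regTableToCapability(reg);
        if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
            p->header.info = &stg_MUT_VAR_DIRTY_info;
            recordClosureMutated(cap, p);
        }
        if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
            // 'old' is the value the MutVar held before the write
            updateRemembSetPushClosure(cap, old);
        }
    }

dirty_MVAR and dirty_TVAR gain an analogous extra argument, which is why unlock_tvar in rts/STM.c below now saves the old current_value before overwriting it.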
diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c
index 807c3e3d30..50cddff051 100644
--- a/rts/RaiseAsync.c
+++ b/rts/RaiseAsync.c
@@ -515,9 +515,9 @@ blockedThrowTo (Capability *cap, StgTSO *target, MessageThrowTo *msg)
ASSERT(target->cap == cap);
+ dirty_TSO(cap,target); // we will modify the blocked_exceptions queue
msg->link = target->blocked_exceptions;
target->blocked_exceptions = msg;
- dirty_TSO(cap,target); // we modified the blocked_exceptions queue
}
/* -----------------------------------------------------------------------------
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
index ce0fa2d519..d0d08a2495 100644
--- a/rts/RtsStartup.c
+++ b/rts/RtsStartup.c
@@ -392,7 +392,8 @@ hs_exit_(bool wait_foreign)
ioManagerDie();
#endif
- /* stop all running tasks */
+ /* stop all running tasks. This is also where we stop concurrent non-moving
+ * collection if it's running */
exitScheduler(wait_foreign);
/* run C finalizers for all active weak pointers */
@@ -436,9 +437,6 @@ hs_exit_(bool wait_foreign)
/* shutdown the hpc support (if needed) */
exitHpc();
- /* wait for any on-going concurrent GC to finish */
- nonmovingExit();
-
// clean up things from the storage manager's point of view.
// also outputs the stats (+RTS -s) info.
exitStorage();
diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c
index e34fcf03f5..e64c78fbcc 100644
--- a/rts/RtsSymbols.c
+++ b/rts/RtsSymbols.c
@@ -14,6 +14,7 @@
#include "HsFFI.h"
#include "sm/Storage.h"
+#include "sm/NonMovingMark.h"
#include <stdbool.h>
#if !defined(mingw32_HOST_OS)
@@ -682,6 +683,9 @@
SymI_HasProto(stg_shrinkMutableByteArrayzh) \
SymI_HasProto(stg_resizzeMutableByteArrayzh) \
SymI_HasProto(newSpark) \
+ SymI_HasProto(updateRemembSetPushThunk) \
+ SymI_HasProto(updateRemembSetPushThunk_) \
+ SymI_HasProto(updateRemembSetPushClosure_) \
SymI_HasProto(performGC) \
SymI_HasProto(performMajorGC) \
SymI_HasProto(prog_argc) \
@@ -1037,6 +1041,7 @@ RtsSymbolVal rtsSyms[] = {
RTS_OPENBSD_ONLY_SYMBOLS
RTS_LIBGCC_SYMBOLS
RTS_LIBFFI_SYMBOLS
+ SymI_HasDataProto(nonmoving_write_barrier_enabled)
#if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH)
// dyld stub code contains references to this,
// but it should never be called because we treat
diff --git a/rts/STM.c b/rts/STM.c
index dc0b0ebb78..c17f33aaa0 100644
--- a/rts/STM.c
+++ b/rts/STM.c
@@ -182,7 +182,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
TRACE("%p : unlock_stm()", trec);
}
-static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgClosure *lock_tvar(Capability *cap STG_UNUSED,
+ StgTRecHeader *trec STG_UNUSED,
StgTVar *s STG_UNUSED) {
StgClosure *result;
TRACE("%p : lock_tvar(%p)", trec, s);
@@ -197,12 +198,14 @@ static void unlock_tvar(Capability *cap,
StgBool force_update) {
TRACE("%p : unlock_tvar(%p)", trec, s);
if (force_update) {
+ StgClosure *old_value = s -> current_value;
s -> current_value = c;
- dirty_TVAR(cap,s);
+ dirty_TVAR(cap, s, old_value);
}
}
-static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgBool cond_lock_tvar(Capability *cap STG_UNUSED,
+ StgTRecHeader *trec STG_UNUSED,
StgTVar *s STG_UNUSED,
StgClosure *expected) {
StgClosure *result;
@@ -231,7 +234,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
smp_locked = 0;
}
-static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgClosure *lock_tvar(Capability *cap STG_UNUSED,
+ StgTRecHeader *trec STG_UNUSED,
StgTVar *s STG_UNUSED) {
StgClosure *result;
TRACE("%p : lock_tvar(%p)", trec, s);
@@ -248,12 +252,14 @@ static void *unlock_tvar(Capability *cap,
TRACE("%p : unlock_tvar(%p, %p)", trec, s, c);
ASSERT(smp_locked == trec);
if (force_update) {
+ StgClosure *old_value = s -> current_value;
s -> current_value = c;
- dirty_TVAR(cap,s);
+ dirty_TVAR(cap, s, old_value);
}
}
-static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED,
+static StgBool cond_lock_tvar(Capability *cap STG_UNUSED,
+ StgTRecHeader *trec STG_UNUSED,
StgTVar *s STG_UNUSED,
StgClosure *expected) {
StgClosure *result;
@@ -279,7 +285,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) {
TRACE("%p : unlock_stm()", trec);
}
-static StgClosure *lock_tvar(StgTRecHeader *trec,
+static StgClosure *lock_tvar(Capability *cap,
+ StgTRecHeader *trec,
StgTVar *s STG_UNUSED) {
StgClosure *result;
TRACE("%p : lock_tvar(%p)", trec, s);
@@ -289,6 +296,10 @@ static StgClosure *lock_tvar(StgTRecHeader *trec,
} while (GET_INFO(UNTAG_CLOSURE(result)) == &stg_TREC_HEADER_info);
} while (cas((void *)&(s -> current_value),
(StgWord)result, (StgWord)trec) != (StgWord)result);
+
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
+ updateRemembSetPushClosure(cap, result);
+ }
return result;
}
@@ -300,10 +311,11 @@ static void unlock_tvar(Capability *cap,
TRACE("%p : unlock_tvar(%p, %p)", trec, s, c);
ASSERT(s -> current_value == (StgClosure *)trec);
s -> current_value = c;
- dirty_TVAR(cap,s);
+ dirty_TVAR(cap, s, (StgClosure *) trec);
}
-static StgBool cond_lock_tvar(StgTRecHeader *trec,
+static StgBool cond_lock_tvar(Capability *cap,
+ StgTRecHeader *trec,
StgTVar *s,
StgClosure *expected) {
StgClosure *result;
@@ -311,6 +323,9 @@ static StgBool cond_lock_tvar(StgTRecHeader *trec,
TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected);
w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec);
result = (StgClosure *)w;
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) {
+ updateRemembSetPushClosure(cap, expected);
+ }
TRACE("%p : %s", trec, result ? "success" : "failure");
return (result == expected);
}
@@ -525,7 +540,7 @@ static void build_watch_queue_entries_for_trec(Capability *cap,
}
s -> first_watch_queue_entry = q;
e -> new_value = (StgClosure *) q;
- dirty_TVAR(cap,s); // we modified first_watch_queue_entry
+ dirty_TVAR(cap, s, (StgClosure *) fq); // we modified first_watch_queue_entry
});
}
@@ -545,7 +560,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap,
StgTVarWatchQueue *q;
StgClosure *saw;
s = e -> tvar;
- saw = lock_tvar(trec, s);
+ saw = lock_tvar(cap, trec, s);
q = (StgTVarWatchQueue *) (e -> new_value);
TRACE("%p : removing tso=%p from watch queue for tvar=%p",
trec,
@@ -562,7 +577,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap,
} else {
ASSERT(s -> first_watch_queue_entry == q);
s -> first_watch_queue_entry = nq;
- dirty_TVAR(cap,s); // we modified first_watch_queue_entry
+ dirty_TVAR(cap, s, (StgClosure *) q); // we modified first_watch_queue_entry
}
free_stg_tvar_watch_queue(cap, q);
unlock_tvar(cap, trec, s, saw, false);
@@ -773,7 +788,7 @@ static StgBool validate_and_acquire_ownership (Capability *cap,
s = e -> tvar;
if (acquire_all || entry_is_update(e)) {
TRACE("%p : trying to acquire %p", trec, s);
- if (!cond_lock_tvar(trec, s, e -> expected_value)) {
+ if (!cond_lock_tvar(cap, trec, s, e -> expected_value)) {
TRACE("%p : failed to acquire %p", trec, s);
result = false;
BREAK_FOR_EACH;
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 8d7acc963e..8d82daf381 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -44,6 +44,8 @@
#include "StablePtr.h"
#include "StableName.h"
#include "TopHandler.h"
+#include "sm/NonMoving.h"
+#include "sm/NonMovingMark.h"
#if defined(HAVE_SYS_TYPES_H)
#include <sys/types.h>
@@ -2497,7 +2499,11 @@ resumeThread (void *task_)
tso = incall->suspended_tso;
incall->suspended_tso = NULL;
incall->suspended_cap = NULL;
- tso->_link = END_TSO_QUEUE; // no write barrier reqd
+ // we will modify tso->_link
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ updateRemembSetPushClosure(cap, (StgClosure *)tso->_link);
+ }
+ tso->_link = END_TSO_QUEUE;
traceEventRunThread(cap, tso);
@@ -2671,6 +2677,8 @@ initScheduler(void)
/* Initialise the mutex and condition variables used by
* the scheduler. */
initMutex(&sched_mutex);
+ initMutex(&sync_finished_mutex);
+ initCondition(&sync_finished_cond);
#endif
ACQUIRE_LOCK(&sched_mutex);
@@ -2706,6 +2714,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
// If we haven't killed all the threads yet, do it now.
if (sched_state < SCHED_SHUTTING_DOWN) {
sched_state = SCHED_INTERRUPTING;
+ nonmovingExit();
Capability *cap = task->cap;
waitForCapability(&cap,task);
scheduleDoGC(&cap,task,true);
diff --git a/rts/StableName.c b/rts/StableName.c
index 383d87e3db..4b26fee396 100644
--- a/rts/StableName.c
+++ b/rts/StableName.c
@@ -263,6 +263,9 @@ threadStableNameTable( evac_fn evac, void *user )
void
gcStableNameTable( void )
{
+ // We must take the stable name lock lest we race with the nonmoving
+ // collector (namely nonmovingSweepStableNameTable).
+ stableNameLock();
FOR_EACH_STABLE_NAME(
p, {
// FOR_EACH_STABLE_NAME traverses free entries too, so
@@ -286,6 +289,7 @@ gcStableNameTable( void )
}
}
});
+ stableNameUnlock();
}
/* -----------------------------------------------------------------------------
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index cccc7ad0b0..5cbb3f5595 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -334,6 +334,17 @@ threadPaused(Capability *cap, StgTSO *tso)
}
#endif
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled
+ && ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) {
+ // We are about to replace a thunk with a blackhole.
+ // Add the free variables of the closure we are about to
+ // overwrite to the update remembered set.
+ // N.B. We caught the WHITEHOLE case above.
+ updateRemembSetPushThunkEager(cap,
+ THUNK_INFO_PTR_TO_STRUCT(bh_info),
+ (StgThunk *) bh);
+ }
+
// The payload of the BLACKHOLE points to the TSO
((StgInd *)bh)->indirectee = (StgClosure *)tso;
write_barrier();
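The comment added above says the free variables of the thunk must be pushed before it is blackholed. The function doing the work, updateRemembSetPushThunkEager, is added in rts/sm/NonMovingMark.c later in this diff; for the ordinary THUNK_* closure types it amounts to walking the pointer payload, roughly (simplified illustration, not part of the patch):

    // For a plain THUNK, "push the free variables" means pushing every
    // pointer in the payload; the real function also records the SRT and
    // handles the AP, THUNK_SELECTOR and BLACKHOLE cases.
    static void push_thunk_payload_sketch(Capability *cap, StgThunk *thunk)
    {
        const StgThunkInfoTable *info =
            THUNK_INFO_PTR_TO_STRUCT(thunk->header.info);
        for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
            updateRemembSetPushClosure(cap, thunk->payload[i]);
        }
    }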
diff --git a/rts/Threads.c b/rts/Threads.c
index 3d5b463051..2b11a1eb90 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -86,6 +86,7 @@ createThread(Capability *cap, W_ size)
stack->stack_size = stack_size - sizeofW(StgStack);
stack->sp = stack->stack + stack->stack_size;
stack->dirty = STACK_DIRTY;
+ stack->marking = 0;
tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
TICK_ALLOC_TSO();
@@ -611,6 +612,7 @@ threadStackOverflow (Capability *cap, StgTSO *tso)
TICK_ALLOC_STACK(chunk_size);
new_stack->dirty = 0; // begin clean, we'll mark it dirty below
+ new_stack->marking = 0;
new_stack->stack_size = chunk_size - sizeofW(StgStack);
new_stack->sp = new_stack->stack + new_stack->stack_size;
@@ -721,9 +723,17 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
barf("threadStackUnderflow: not enough space for return values");
}
- new_stack->sp -= retvals;
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ // ensure that values that we copy into the new stack are marked
+ // for the nonmoving collector. Note that these values won't
+ // necessarily form a full closure so we need to handle them
+ // specially.
+ for (unsigned int i = 0; i < retvals; i++) {
+ updateRemembSetPushClosure(cap, (StgClosure *) old_stack->sp[i]);
+ }
+ }
- memcpy(/* dest */ new_stack->sp,
+ memcpy(/* dest */ new_stack->sp - retvals,
/* src */ old_stack->sp,
/* size */ retvals * sizeof(W_));
}
@@ -735,8 +745,12 @@ threadStackUnderflow (Capability *cap, StgTSO *tso)
// restore the stack parameters, and update tot_stack_size
tso->tot_stack_size -= old_stack->stack_size;
- // we're about to run it, better mark it dirty
+ // we're about to run it, better mark it dirty.
+ //
+ // N.B. the nonmoving collector may mark the stack, meaning that sp must
+ // point at a valid stack frame.
dirty_STACK(cap, new_stack);
+ new_stack->sp -= retvals;
return retvals;
}
@@ -768,7 +782,7 @@ loop:
if (q == (StgMVarTSOQueue*)&stg_END_TSO_QUEUE_closure) {
/* No further takes, the MVar is now full. */
if (info == &stg_MVAR_CLEAN_info) {
- dirty_MVAR(&cap->r, (StgClosure*)mvar);
+ dirty_MVAR(&cap->r, (StgClosure*)mvar, mvar->value);
}
mvar->value = value;
diff --git a/rts/Updates.h b/rts/Updates.h
index 1bd3e065af..84d9162868 100644
--- a/rts/Updates.h
+++ b/rts/Updates.h
@@ -50,6 +50,9 @@
\
prim_write_barrier; \
OVERWRITING_CLOSURE(p1); \
+ IF_WRITE_BARRIER_ENABLED { \
+ ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr"); \
+ } \
StgInd_indirectee(p1) = p2; \
prim_write_barrier; \
SET_INFO(p1, stg_BLACKHOLE_info); \
@@ -62,7 +65,7 @@
} else { \
TICK_UPD_NEW_IND(); \
and_then; \
- }
+ }
#else /* !CMINUSMINUS */
@@ -78,6 +81,9 @@ INLINE_HEADER void updateWithIndirection (Capability *cap,
/* See Note [Heap memory barriers] in SMP.h */
write_barrier();
OVERWRITING_CLOSURE(p1);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ updateRemembSetPushThunk(cap, (StgThunk*)p1);
+ }
((StgInd *)p1)->indirectee = p2;
write_barrier();
SET_INFO(p1, &stg_BLACKHOLE_info);
diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in
index 7aad5e4385..2c28426d75 100644
--- a/rts/rts.cabal.in
+++ b/rts/rts.cabal.in
@@ -139,6 +139,7 @@ library
rts/Linker.h
rts/Main.h
rts/Messages.h
+ rts/NonMoving.h
rts/OSThreads.h
rts/Parallel.h
rts/PrimFloat.h
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index f383949ebf..6bccf7f100 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -33,6 +33,18 @@ static void nonmovingBumpEpoch(void) {
nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1;
}
+#if defined(THREADED_RTS)
+/*
+ * This mutex ensures that only one non-moving collection is active at a time.
+ */
+Mutex nonmoving_collection_mutex;
+
+OSThreadId mark_thread;
+bool concurrent_coll_running = false;
+Condition concurrent_coll_finished;
+Mutex concurrent_coll_finished_lock;
+#endif
+
/*
* Note [Non-moving garbage collector]
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -76,13 +88,12 @@ static void nonmovingBumpEpoch(void) {
memcount nonmoving_live_words = 0;
+#if defined(THREADED_RTS)
+static void* nonmovingConcurrentMark(void *mark_queue);
+#endif
static void nonmovingClearBitmap(struct NonmovingSegment *seg);
static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads);
-/* Signals to mutators that they should stop to synchronize with the nonmoving
- * collector so it can proceed to sweep phase. */
-bool nonmoving_syncing = false;
-
static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t block_size)
{
seg->link = NULL;
@@ -283,29 +294,39 @@ static struct NonmovingAllocator *alloc_nonmoving_allocator(uint32_t n_caps)
void nonmovingInit(void)
{
if (! RtsFlags.GcFlags.useNonmoving) return;
+#if defined(THREADED_RTS)
+ initMutex(&nonmoving_collection_mutex);
+ initCondition(&concurrent_coll_finished);
+ initMutex(&concurrent_coll_finished_lock);
+#endif
for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities);
}
+ nonmovingMarkInitUpdRemSet();
}
void nonmovingExit(void)
{
if (! RtsFlags.GcFlags.useNonmoving) return;
+#if defined(THREADED_RTS)
+ if (mark_thread) {
+ debugTrace(DEBUG_nonmoving_gc,
+ "waiting for nonmoving collector thread to terminate");
+ ACQUIRE_LOCK(&concurrent_coll_finished_lock);
+ waitCondition(&concurrent_coll_finished, &concurrent_coll_finished_lock);
+ }
+
+ closeMutex(&concurrent_coll_finished_lock);
+ closeCondition(&concurrent_coll_finished);
+ closeMutex(&nonmoving_collection_mutex);
+#endif
+
for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
stgFree(nonmovingHeap.allocators[i]);
}
}
/*
- * Wait for any concurrent collections to finish. Called during shutdown to
- * ensure we don't steal capabilities that the nonmoving collector still has yet
- * to synchronize with.
- */
-void nonmovingWaitUntilFinished(void)
-{
-}
-
-/*
* Assumes that no garbage collector or mutator threads are running to safely
* resize the nonmoving_allocators.
*
@@ -443,6 +464,14 @@ static void nonmovingMarkWeakPtrList(MarkQueue *mark_queue, StgWeak *dead_weak_p
void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
{
+#if defined(THREADED_RTS)
+ // We can't start a new collection until the old one has finished
+ // We also don't run in final GC
+ if (concurrent_coll_running || sched_state > SCHED_RUNNING) {
+ return;
+ }
+#endif
+
resizeGenerations();
nonmovingPrepareMark();
@@ -501,9 +530,26 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
// those lists to mark function in sequential case. In concurrent case we
// allocate fresh lists.
+#if defined(THREADED_RTS)
+ // If we're interrupting or shutting down, do not let this capability go and
+ // run a STW collection. Reason: we won't be able to acquire this capability
+ // again for the sync if we let it go, because it'll immediately start doing
+ // a major GC, because that's what we do when exiting the scheduler (see
+ // exitScheduler()).
+ if (sched_state == SCHED_RUNNING) {
+ concurrent_coll_running = true;
+ nonmoving_write_barrier_enabled = true;
+ debugTrace(DEBUG_nonmoving_gc, "Starting concurrent mark thread");
+ createOSThread(&mark_thread, "non-moving mark thread",
+ nonmovingConcurrentMark, mark_queue);
+ } else {
+ nonmovingConcurrentMark(mark_queue);
+ }
+#else
// Use the weak and thread lists from the preparation for any new weaks and
// threads found to be dead in mark.
nonmovingMark_(mark_queue, dead_weaks, resurrected_threads);
+#endif
}
/* Mark mark queue, threads, and weak pointers until no more weaks have been
@@ -523,13 +569,70 @@ static void nonmovingMarkThreadsWeaks(MarkQueue *mark_queue)
}
}
+#if defined(THREADED_RTS)
+static void* nonmovingConcurrentMark(void *data)
+{
+ MarkQueue *mark_queue = (MarkQueue*)data;
+ StgWeak *dead_weaks = NULL;
+ StgTSO *resurrected_threads = (StgTSO*)&stg_END_TSO_QUEUE_closure;
+ nonmovingMark_(mark_queue, &dead_weaks, &resurrected_threads);
+ return NULL;
+}
+
+// TODO: Not sure where to put this function.
+// Append w2 to the end of w1.
+static void appendWeakList( StgWeak **w1, StgWeak *w2 )
+{
+ while (*w1) {
+ w1 = &(*w1)->link;
+ }
+ *w1 = w2;
+}
+#endif
+
static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads)
{
+ ACQUIRE_LOCK(&nonmoving_collection_mutex);
debugTrace(DEBUG_nonmoving_gc, "Starting mark...");
// Do concurrent marking; most of the heap will get marked here.
nonmovingMarkThreadsWeaks(mark_queue);
+#if defined(THREADED_RTS)
+ Task *task = newBoundTask();
+
+ // If at this point we've decided to exit then just return
+ if (sched_state > SCHED_RUNNING) {
+ // Note that we break our invariants here and leave segments in
+ // nonmovingHeap.sweep_list, don't free nonmoving_large_objects etc.
+ // However because we won't be running mark-sweep in the final GC this
+ // is OK.
+
+ // This is an RTS shutdown, so we need to move our copy (snapshot) of
+ // weaks (nonmoving_old_weak_ptr_list and nonmoving_weak_ptr_list) to
+ // oldest_gen->weak_ptr_list so that C finalizers can be run in hs_exit_.
+ // Note that more weaks may have been added to oldest_gen->weak_ptr_list
+ // since we started the mark, so we append our list to its tail.
+ appendWeakList(&nonmoving_old_weak_ptr_list, nonmoving_weak_ptr_list);
+ appendWeakList(&oldest_gen->weak_ptr_list, nonmoving_old_weak_ptr_list);
+ // These lists won't be used again, so resetting them is not strictly necessary; we do it anyway.
+ nonmoving_old_weak_ptr_list = NULL;
+ nonmoving_weak_ptr_list = NULL;
+
+ goto finish;
+ }
+
+ // We're still running, request a sync
+ nonmovingBeginFlush(task);
+
+ bool all_caps_syncd;
+ do {
+ all_caps_syncd = nonmovingWaitForFlush();
+ nonmovingMarkThreadsWeaks(mark_queue);
+ } while (!all_caps_syncd);
+#endif
+
nonmovingResurrectThreads(mark_queue, resurrected_threads);
// No more resurrecting threads after this point
@@ -555,6 +658,18 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
debugTrace(DEBUG_nonmoving_gc,
"Done marking, resurrecting threads before releasing capabilities");
+
+ // Schedule finalizers and resurrect threads
+#if defined(THREADED_RTS)
+ // Just pick a random capability. Not sure if this is a good idea -- we use
+ // only one capability for all finalizers.
+ scheduleFinalizers(capabilities[0], *dead_weaks);
+ // Note that this mutates heap and causes running write barriers.
+ // See Note [Unintentional marking in resurrectThreads] in NonMovingMark.c
+ // for how we deal with this.
+ resurrectThreads(*resurrected_threads);
+#endif
+
#if defined(DEBUG)
// Zap CAFs that we will sweep
nonmovingGcCafs(mark_queue);
@@ -586,6 +701,12 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
nonmoving_old_weak_ptr_list = NULL;
}
+ // Everything has been marked; allow the mutators to proceed
+#if defined(THREADED_RTS)
+ nonmoving_write_barrier_enabled = false;
+ nonmovingFinishFlush(task);
+#endif
+
current_mark_queue = NULL;
freeMarkQueue(mark_queue);
stgFree(mark_queue);
@@ -609,6 +730,20 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
debugTrace(DEBUG_nonmoving_gc, "Finished sweeping.");
// TODO: Remainder of things done by GarbageCollect (update stats)
+
+#if defined(THREADED_RTS)
+finish:
+ boundTaskExiting(task);
+
+ // We are done...
+ mark_thread = 0;
+
+ // Signal that the concurrent collection is finished, allowing the next
+ // non-moving collection to proceed
+ concurrent_coll_running = false;
+ signalCondition(&concurrent_coll_finished);
+ RELEASE_LOCK(&nonmoving_collection_mutex);
+#endif
}
#if defined(DEBUG)
@@ -817,6 +952,31 @@ void locate_object(P_ obj)
return;
}
}
+
+ // Search workspaces. FIXME: this only works in the non-threaded runtime.
+#if !defined(THREADED_RTS)
+ for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++ g) {
+ gen_workspace *ws = &gct->gens[g];
+ for (bdescr *blk = ws->todo_bd; blk; blk = blk->link) {
+ if (obj >= blk->start && obj < blk->free) {
+ debugBelch("%p is in generation %" FMT_Word32 " todo bds\n", obj, g);
+ return;
+ }
+ }
+ for (bdescr *blk = ws->scavd_list; blk; blk = blk->link) {
+ if (obj >= blk->start && obj < blk->free) {
+ debugBelch("%p is in generation %" FMT_Word32 " scavd bds\n", obj, g);
+ return;
+ }
+ }
+ for (bdescr *blk = ws->todo_large_objects; blk; blk = blk->link) {
+ if (obj >= blk->start && obj < blk->free) {
+ debugBelch("%p is in generation %" FMT_Word32 " todo large bds\n", obj, g);
+ return;
+ }
+ }
+ }
+#endif
}
void nonmovingPrintSweepList()
diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h
index a031f3d223..21c69b1ca1 100644
--- a/rts/sm/NonMoving.h
+++ b/rts/sm/NonMoving.h
@@ -95,7 +95,6 @@ extern memcount nonmoving_live_words;
void nonmovingInit(void);
void nonmovingExit(void);
-void nonmovingWaitUntilFinished(void);
// dead_weaks and resurrected_threads lists are used for two things:
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index cf1950471e..b273b09b05 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -67,6 +67,14 @@ bdescr *nonmoving_large_objects = NULL;
bdescr *nonmoving_marked_large_objects = NULL;
memcount n_nonmoving_large_blocks = 0;
memcount n_nonmoving_marked_large_blocks = 0;
+#if defined(THREADED_RTS)
+/* Protects everything above. Furthermore, we only set the BF_MARKED bit of
+ * large object blocks when this is held. This ensures that the write barrier
+ * (e.g. finish_upd_rem_set_mark) and the collector (mark_closure) don't try to
+ * move the same large object to nonmoving_marked_large_objects more than once.
+ */
+static Mutex nonmoving_large_objects_mutex;
+#endif
/*
* Where we keep our threads during collection since we must have a snapshot of
@@ -87,11 +95,257 @@ StgWeak *nonmoving_weak_ptr_list = NULL;
StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
#endif
+/* Note [Update remembered set]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * The concurrent non-moving collector uses a remembered set to ensure
+ * that its marking is consistent with the snapshot invariant defined in
+ * the design. This remembered set, known as the update remembered set,
+ * records all pointers that have been overwritten since the beginning
+ * of the concurrent mark. This ensures that concurrent mutation cannot hide
+ * pointers to live objects from the nonmoving garbage collector.
+ *
+ * The update remembered set is maintained via a write barrier that
+ * is enabled whenever a concurrent mark is active. This write barrier
+ * can be found in a number of places:
+ *
+ * - In rts/Primops.cmm in primops responsible for modifying mutable closures
+ * (e.g. MVARs, MUT_VARs, etc.)
+ *
+ * - In rts/STM.c, where TVar contents are overwritten (e.g. in lock_tvar,
+ * unlock_tvar, and via dirty_TVAR)
+ *
+ * - In the dirty_* functions found in rts/Storage.c where we dirty MVARs,
+ * MUT_VARs, TSOs and STACKs. STACK is a somewhat special case, as described
+ * in Note [StgStack dirtiness flags and concurrent marking] in TSO.h.
+ *
+ * - In the code generated by the STG code generator for pointer array writes
+ *
+ * There is also a read barrier to handle weak references, as described in
+ * Note [Concurrent read barrier on deRefWeak#].
+ *
+ * The representation of the update remembered set is the same as that of
+ * the mark queue. For efficiency, each capability maintains its own local
+ * accumulator of remembered set entries. When a capability fills its
+ * accumulator it is linked in to the global remembered set
+ * (upd_rem_set_block_list), where it is consumed by the mark phase.
+ *
+ * The mark phase is responsible for freeing update remembered set block
+ * allocations.
+ *
+ *
+ * Note [Concurrent read barrier on deRefWeak#]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * In general the non-moving GC assumes that all pointers reachable from a
+ * marked object are themselves marked (or in the mark queue). However,
+ * weak pointers are an obvious exception to this rule. In particular,
+ * deRefWeak# allows the mutator to turn a weak reference into a strong
+ * reference. This interacts badly with concurrent collection. For
+ * instance, consider this program:
+ *
+ * f :: a -> b -> IO b
+ * f k v = do
+ * -- assume that k and v are the only references to the
+ * -- closures to which they refer.
+ * weak <- mkWeak k v Nothing
+ *
+ * -- N.B. k is now technically dead since the only reference to it is
+ * -- weak, but we've not yet had a chance to tombstone the WeakPtr
+ * -- (which will happen in the course of major GC).
+ * performMajorGC
+ * -- Now we are running concurrently with the mark...
+ *
+ * Just x <- deRefWeak weak
+ * -- We have now introduced a reference to `v`, which will not be
+ * -- marked: the only reference to `v` when the snapshot was taken
+ * -- was via a WeakPtr.
+ * return x
+ *
+ */
+static Mutex upd_rem_set_lock;
+bdescr *upd_rem_set_block_list = NULL;
+
+#if defined(THREADED_RTS)
+/* Used during the mark/sweep phase transition to track how many capabilities
+ * have pushed their update remembered sets. Protected by upd_rem_set_lock.
+ */
+static volatile StgWord upd_rem_set_flush_count = 0;
+#endif
+
+
+/* Signaled by each capability when it has flushed its update remembered set */
+static Condition upd_rem_set_flushed_cond;
+
+/* Indicates to mutators that the write barrier must be respected. Set while
+ * concurrent mark is running.
+ */
+StgWord nonmoving_write_barrier_enabled = false;
+
/* Used to provide the current mark queue to the young generation
* collector for scavenging.
*/
MarkQueue *current_mark_queue = NULL;
+/* Initialise update remembered set data structures */
+void nonmovingMarkInitUpdRemSet() {
+ initMutex(&upd_rem_set_lock);
+ initCondition(&upd_rem_set_flushed_cond);
+#if defined(THREADED_RTS)
+ initMutex(&nonmoving_large_objects_mutex);
+#endif
+}
+
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t markQueueLength(MarkQueue *q);
+#endif
+static void init_mark_queue_(MarkQueue *queue);
+
+/* Transfers the given capability's update-remembered set to the global
+ * remembered set.
+ *
+ * Really the argument type should be UpdRemSet* but this would be rather
+ * inconvenient without polymorphism.
+ */
+static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+{
+ if (markQueueIsEmpty(rset)) return;
+
+ // find the tail of the queue
+ bdescr *start = rset->blocks;
+ bdescr *end = start;
+ while (end->link != NULL)
+ end = end->link;
+
+ // add the blocks to the global remembered set
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ end->link = upd_rem_set_block_list;
+ upd_rem_set_block_list = start;
+ RELEASE_LOCK(&upd_rem_set_lock);
+
+ // Reset remembered set
+ ACQUIRE_SM_LOCK;
+ init_mark_queue_(rset);
+ rset->is_upd_rem_set = true;
+ RELEASE_SM_LOCK;
+}
+
+#if defined(THREADED_RTS)
+/* Called by capabilities to flush their update remembered sets when
+ * synchronising with the non-moving collector as it transitions from mark to
+ * sweep phase.
+ */
+void nonmovingFlushCapUpdRemSetBlocks(Capability *cap)
+{
+ debugTrace(DEBUG_nonmoving_gc,
+ "Capability %d flushing update remembered set: %d",
+ cap->no, markQueueLength(&cap->upd_rem_set.queue));
+ nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue);
+ atomic_inc(&upd_rem_set_flush_count, 1);
+ signalCondition(&upd_rem_set_flushed_cond);
+ // After this, mutation will remain suspended until nonmovingFinishFlush
+ // releases its capabilities.
+}
+
+/* Request that all capabilities flush their update remembered sets and suspend
+ * execution until further notice.
+ */
+void nonmovingBeginFlush(Task *task)
+{
+ debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush...");
+ upd_rem_set_flush_count = 0;
+ stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET);
+
+ // XXX: We may have been given a capability via releaseCapability (i.e. a
+ // task suspended due to a foreign call) in which case our requestSync
+ // logic won't have been hit. Make sure that everyone so far has flushed.
+ // Ideally we want to mark asynchronously with syncing.
+ for (uint32_t i = 0; i < n_capabilities; i++) {
+ nonmovingFlushCapUpdRemSetBlocks(capabilities[i]);
+ }
+}
+
+/* Wait until a capability has flushed its update remembered set. Returns true
+ * if all capabilities have flushed.
+ */
+bool nonmovingWaitForFlush()
+{
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count);
+ bool finished = upd_rem_set_flush_count == n_capabilities;
+ if (!finished) {
+ waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock);
+ }
+ RELEASE_LOCK(&upd_rem_set_lock);
+ return finished;
+}
+
+/* Note [Unintentional marking in resurrectThreads]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In both moving and non-moving collectors threads found to be unreachable are
+ * evacuated/marked and then resurrected with resurrectThreads. resurrectThreads
+ * raises an exception in the unreachable thread via raiseAsync, which does
+ * mutations on the heap. These mutations add entries to the UpdRemSet of the
+ * thread's capability. Here's an example backtrace where this happens:
+ *
+ * #0 updateRemembSetPushClosure
+ * #1 0x000000000072b363 in dirty_TVAR
+ * #2 0x00000000007162e5 in remove_watch_queue_entries_for_trec
+ * #3 0x0000000000717098 in stmAbortTransaction
+ * #4 0x000000000070c6eb in raiseAsync
+ * #5 0x000000000070b473 in throwToSingleThreaded__
+ * #6 0x000000000070b4ab in throwToSingleThreaded
+ * #7 0x00000000006fce82 in resurrectThreads
+ * #8 0x00000000007215db in nonmovingMark_
+ * #9 0x0000000000721438 in nonmovingConcurrentMark
+ * #10 0x00007f1ee81cd6db in start_thread
+ * #11 0x00007f1ee850688f in clone
+ *
+ * However we don't really want to run write barriers when calling
+ * resurrectThreads here, because we're in a GC pause, and overwritten values
+ * are definitely gone forever (as opposed to being inserted in a marked object
+ * or kept in registers and used later).
+ *
+ * When this happens, if we don't reset the UpdRemSets, then in the next mark
+ * we see the objects that the previous mark's resurrectThreads added to the
+ * UpdRemSets, and mark them too. This keeps unreachable objects alive and
+ * affects weak finalization and thread resurrection (which rely on things
+ * becoming unreachable). As an example, stm048 fails when
+ * we get this wrong, because when we do raiseAsync on a thread that was blocked
+ * on an STM transaction we mutate a TVAR_WATCH_QUEUE, which has a reference to
+ * the TSO that was running the STM transaction. If the TSO becomes unreachable
+ * again in the next GC we don't realize this, because it was added to an
+ * UpdRemSet in the previous GC's mark phase, because of raiseAsync.
+ *
+ * To fix this we clear all UpdRemSets in nonmovingFinishFlush, right before
+ * releasing capabilities. This is somewhat inefficient (we allow adding objects
+ * to UpdRemSets, only to later reset them), but the only case where we add to
+ * UpdRemSets during mark is resurrectThreads, and I don't think we do so many
+ * resurrections that we would fill the UpdRemSets and allocate new blocks. So
+ * pushing an UpdRemSet in this case is really fast, and resetting is even
+ * faster (we just update a pointer).
+ *
+ * TODO (osa): What if we actually marked UpdRemSets in this case, in the mark
+ * loop? Would that work? Or what would break?
+ */
+
+/* Notify capabilities that the synchronisation is finished; they may resume
+ * execution.
+ */
+void nonmovingFinishFlush(Task *task)
+{
+ // See Note [Unintentional marking in resurrectThreads]
+ for (uint32_t i = 0; i < n_capabilities; i++) {
+ reset_upd_rem_set(&capabilities[i]->upd_rem_set);
+ }
+ // Also reset upd_rem_set_block_list in case some of the UpdRemSets were
+ // filled and we flushed them.
+ freeChain_lock(upd_rem_set_block_list);
+ upd_rem_set_block_list = NULL;
+
+ debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush...");
+ releaseAllCapabilities(n_capabilities, NULL, task);
+}
+#endif
+
/*********************************************************
* Pushing to either the mark queue or remembered set
*********************************************************/
@@ -102,14 +356,18 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
// Are we at the end of the block?
if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
// Yes, this block is full.
- // allocate a fresh block.
- ACQUIRE_SM_LOCK;
- bdescr *bd = allocGroup(1);
- bd->link = q->blocks;
- q->blocks = bd;
- q->top = (MarkQueueBlock *) bd->start;
- q->top->head = 0;
- RELEASE_SM_LOCK;
+ if (q->is_upd_rem_set) {
+ nonmovingAddUpdRemSetBlocks(q);
+ } else {
+ // allocate a fresh block.
+ ACQUIRE_SM_LOCK;
+ bdescr *bd = allocGroup(1);
+ bd->link = q->blocks;
+ q->blocks = bd;
+ q->top = (MarkQueueBlock *) bd->start;
+ q->top->head = 0;
+ RELEASE_SM_LOCK;
+ }
}
q->top->entries[q->top->head] = *ent;
@@ -183,6 +441,183 @@ void push_fun_srt (MarkQueue *q, const StgInfoTable *info)
}
/*********************************************************
+ * Pushing to the update remembered set
+ *
+ * upd_rem_set_push_* functions are directly called by
+ * mutators and need to check whether the value is in
+ * non-moving heap.
+ *********************************************************/
+
+// Check if the object is traced by the non-moving collector. This holds in the
+// following cases:
+//
+// - Object is in the non-moving heap
+// - Object is large (BF_LARGE) and marked as BF_NONMOVING
+// - Object is static (HEAP_ALLOCED_GC(obj) == false)
+//
+static
+bool check_in_nonmoving_heap(StgClosure *p) {
+ if (HEAP_ALLOCED_GC(p)) {
+ // This works for both large and small objects:
+ return Bdescr((P_)p)->flags & BF_NONMOVING;
+ } else {
+ return true; // a static object
+ }
+}
+
+/* Push the free variables of a (now-evaluated) thunk to the
+ * update remembered set.
+ */
+inline void updateRemembSetPushThunk(Capability *cap, StgThunk *thunk)
+{
+ const StgInfoTable *info;
+ do {
+ info = get_volatile_itbl((StgClosure *) thunk);
+ } while (info->type == WHITEHOLE);
+ updateRemembSetPushThunkEager(cap, (StgThunkInfoTable *) info, thunk);
+}
+
+void updateRemembSetPushThunkEager(Capability *cap,
+ const StgThunkInfoTable *info,
+ StgThunk *thunk)
+{
+ /* N.B. info->i.type mustn't be WHITEHOLE */
+ switch (info->i.type) {
+ case THUNK:
+ case THUNK_1_0:
+ case THUNK_0_1:
+ case THUNK_2_0:
+ case THUNK_1_1:
+ case THUNK_0_2:
+ {
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ push_thunk_srt(queue, &info->i);
+
+ // Don't record the origin of objects living outside of the nonmoving
+ // heap; we can't perform the selector optimisation on them anyways.
+ bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk);
+
+ for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
+ if (check_in_nonmoving_heap(thunk->payload[i])) {
+ push_closure(queue,
+ thunk->payload[i],
+ record_origin ? &thunk->payload[i] : NULL);
+ }
+ }
+ break;
+ }
+ case AP:
+ {
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ StgAP *ap = (StgAP *) thunk;
+ push_closure(queue, ap->fun, &ap->fun);
+ mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
+ break;
+ }
+ case THUNK_SELECTOR:
+ case BLACKHOLE:
+ // TODO: This is right, right?
+ break;
+ default:
+ barf("updateRemembSetPushThunk: invalid thunk pushed: p=%p, type=%d",
+ thunk, info->i.type);
+ }
+}
+
+void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)
+{
+ updateRemembSetPushThunk(regTableToCapability(reg), p);
+}
+
+inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
+{
+ if (!check_in_nonmoving_heap(p)) return;
+ MarkQueue *queue = &cap->upd_rem_set.queue;
+ push_closure(queue, p, NULL);
+}
+
+void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p)
+{
+ updateRemembSetPushClosure(regTableToCapability(reg), p);
+}
+
+STATIC_INLINE bool needs_upd_rem_set_mark(StgClosure *p)
+{
+ // TODO: Deduplicate with mark_closure
+ bdescr *bd = Bdescr((StgPtr) p);
+ if (bd->gen != oldest_gen) {
+ return false;
+ } else if (bd->flags & BF_LARGE) {
+ if (! (bd->flags & BF_NONMOVING_SWEEPING)) {
+ return false;
+ } else {
+ return ! (bd->flags & BF_MARKED);
+ }
+ } else {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+ return nonmovingGetMark(seg, block_idx) != nonmovingMarkEpoch;
+ }
+}
+
+/* Set the mark bit; only to be called *after* we have fully marked the closure */
+STATIC_INLINE void finish_upd_rem_set_mark(StgClosure *p)
+{
+ bdescr *bd = Bdescr((StgPtr) p);
+ if (bd->flags & BF_LARGE) {
+ // Someone else may have already marked it.
+ ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
+ if (! (bd->flags & BF_MARKED)) {
+ bd->flags |= BF_MARKED;
+ dbl_link_remove(bd, &nonmoving_large_objects);
+ dbl_link_onto(bd, &nonmoving_marked_large_objects);
+ n_nonmoving_large_blocks -= bd->blocks;
+ n_nonmoving_marked_large_blocks += bd->blocks;
+ }
+ RELEASE_LOCK(&nonmoving_large_objects_mutex);
+ } else {
+ struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
+ nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p);
+ nonmovingSetMark(seg, block_idx);
+ }
+}
+
+void updateRemembSetPushTSO(Capability *cap, StgTSO *tso)
+{
+ if (needs_upd_rem_set_mark((StgClosure *) tso)) {
+ debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: TSO %p", tso);
+ mark_tso(&cap->upd_rem_set.queue, tso);
+ finish_upd_rem_set_mark((StgClosure *) tso);
+ }
+}
+
+void updateRemembSetPushStack(Capability *cap, StgStack *stack)
+{
+ // N.B. caller responsible for checking nonmoving_write_barrier_enabled
+ if (needs_upd_rem_set_mark((StgClosure *) stack)) {
+ StgWord marking = stack->marking;
+ // See Note [StgStack dirtiness flags and concurrent marking]
+ if (cas(&stack->marking, marking, nonmovingMarkEpoch)
+ != nonmovingMarkEpoch) {
+ // We have claimed the right to mark the stack.
+ debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: STACK %p", stack->sp);
+ mark_stack(&cap->upd_rem_set.queue, stack);
+ finish_upd_rem_set_mark((StgClosure *) stack);
+ return;
+ } else {
+ // The concurrent GC has claimed the right to mark the stack.
+ // Wait until it finishes marking before proceeding with
+ // mutation.
+ while (needs_upd_rem_set_mark((StgClosure *) stack));
+#if defined(PARALLEL_GC)
+ busy_wait_nop(); // TODO: Spinning here is unfortunate
+#endif
+ return;
+ }
+ }
+}
+
+/*********************************************************
* Pushing to the mark queue
*********************************************************/
@@ -192,8 +627,8 @@ void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent)
}
void markQueuePushClosure (MarkQueue *q,
- StgClosure *p,
- StgClosure **origin)
+ StgClosure *p,
+ StgClosure **origin)
{
push_closure(q, p, origin);
}
@@ -264,7 +699,7 @@ again:
}
/*********************************************************
- * Creating and destroying MarkQueues
+ * Creating and destroying MarkQueues and UpdRemSets
*********************************************************/
/* Must hold sm_mutex. */
@@ -281,22 +716,45 @@ void initMarkQueue (MarkQueue *queue)
{
init_mark_queue_(queue);
queue->marked_objects = allocHashTable();
+ queue->is_upd_rem_set = false;
+}
+
+/* Must hold sm_mutex. */
+void init_upd_rem_set (UpdRemSet *rset)
+{
+ init_mark_queue_(&rset->queue);
+ // Update remembered sets don't have to worry about static objects
+ rset->queue.marked_objects = NULL;
+ rset->queue.is_upd_rem_set = true;
+}
+
+void reset_upd_rem_set (UpdRemSet *rset)
+{
+ // UpdRemSets always have exactly one block in their queue. This assertion is
+ // here so that this code gets updated if that ever changes.
+ ASSERT(rset->queue.blocks->link == NULL);
+ rset->queue.top->head = 0;
}
void freeMarkQueue (MarkQueue *queue)
{
- bdescr* b = queue->blocks;
- ACQUIRE_SM_LOCK;
- while (b)
- {
- bdescr* b_ = b->link;
- freeGroup(b);
- b = b_;
- }
- RELEASE_SM_LOCK;
+ freeChain_lock(queue->blocks);
freeHashTable(queue->marked_objects, NULL);
}
+#if defined(THREADED_RTS) && defined(DEBUG)
+static uint32_t
+markQueueLength (MarkQueue *q)
+{
+ uint32_t n = 0;
+ for (bdescr *block = q->blocks; block; block = block->link) {
+ MarkQueueBlock *queue = (MarkQueueBlock*)block->start;
+ n += queue->head;
+ }
+ return n;
+}
+#endif
+
/*********************************************************
* Marking
@@ -307,7 +765,8 @@ void freeMarkQueue (MarkQueue *queue)
* barrier. Consequently it's quite important that we deeply mark
* any outstanding transactions.
*/
-static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
+static void
+mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
{
while (trec != NO_TREC) {
StgTRecChunk *chunk = trec->current_chunk;
@@ -326,7 +785,8 @@ static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec)
}
}
-static void mark_tso (MarkQueue *queue, StgTSO *tso)
+static void
+mark_tso (MarkQueue *queue, StgTSO *tso)
{
// TODO: Clear dirty if contains only old gen objects
@@ -535,7 +995,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
p = UNTAG_CLOSURE(p);
# define PUSH_FIELD(obj, field) \
- markQueuePushClosure(queue, \
+ markQueuePushClosure(queue, \
(StgClosure *) (obj)->field, \
(StgClosure **) &(obj)->field)
@@ -592,7 +1052,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
return;
case WHITEHOLE:
- while (get_itbl(p)->type == WHITEHOLE);
+ while (get_volatile_itbl(p)->type == WHITEHOLE);
// busy_wait_nop(); // FIXME
goto try_again;
@@ -608,9 +1068,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
// we moved everything to the non-moving heap before starting the major
// collection, we know that we don't need to trace it: it was allocated
// after we took our snapshot.
-
+#if !defined(THREADED_RTS)
// This should never happen in the non-concurrent case
barf("Closure outside of non-moving heap: %p", p);
+#else
+ return;
+#endif
}
ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
@@ -878,7 +1341,22 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
case STACK: {
// See Note [StgStack dirtiness flags and concurrent marking]
StgStack *stack = (StgStack *) p;
- mark_stack(queue, stack);
+ StgWord marking = stack->marking;
+
+ // N.B. stack->marking must be != nonmovingMarkEpoch unless
+ // someone has already marked it.
+ if (cas(&stack->marking, marking, nonmovingMarkEpoch)
+ != nonmovingMarkEpoch) {
+ // We have claimed the right to mark the stack.
+ mark_stack(queue, stack);
+ } else {
+ // A mutator has already started marking the stack; we just let it
+ // do its thing and move on. There's no reason to wait; we know that
+ // the stack will be fully marked before we sweep due to the final
+ // post-mark synchronization. Most importantly, we do not set its
+ // mark bit; the mutator is responsible for that.
+ return;
+ }
break;
}
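The claim protocol above can be restated as a small helper; the name below is hypothetical and merely distills the CAS logic of this case.

    // Hypothetical helper restating the claim above: whoever first swings
    // stack->marking to the current epoch is the unique party that marks this
    // stack during the current cycle; everyone else backs off.
    static bool claimStackForMarking(StgStack *stack)
    {
        StgWord old = stack->marking;
        return cas(&stack->marking, old, nonmovingMarkEpoch) != nonmovingMarkEpoch;
    }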
@@ -905,8 +1383,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
}
case WHITEHOLE:
- while (get_itbl(p)->type == WHITEHOLE);
- // busy_wait_nop(); // FIXME
+ while (get_volatile_itbl(p)->type == WHITEHOLE);
goto try_again;
default:
@@ -921,6 +1398,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
* mutator waiting for us to finish so it can start execution.
*/
if (bd->flags & BF_LARGE) {
+ /* Marking a large object isn't idempotent since we move it to
+ * nonmoving_marked_large_objects; to ensure that we don't repeatedly
+ * mark a large object, we only set BF_MARKED on large objects in the
+ * nonmoving heap while holding nonmoving_large_objects_mutex
+ */
+ ACQUIRE_LOCK(&nonmoving_large_objects_mutex);
if (! (bd->flags & BF_MARKED)) {
// Remove the object from nonmoving_large_objects and link it to
// nonmoving_marked_large_objects
@@ -930,6 +1413,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
n_nonmoving_marked_large_blocks += bd->blocks;
bd->flags |= BF_MARKED;
}
+ RELEASE_LOCK(&nonmoving_large_objects_mutex);
} else {
// TODO: Kill repetition
struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p);
@@ -947,7 +1431,8 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
* c. the mark queue has been seeded with a set of roots.
*
*/
-GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue)
+GNUC_ATTR_HOT void
+nonmovingMark (MarkQueue *queue)
{
debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
unsigned int count = 0;
@@ -974,9 +1459,23 @@ GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue)
break;
}
case NULL_ENTRY:
- // Nothing more to do
- debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
- return;
+ // Perhaps the update remembered set has more to mark...
+ if (upd_rem_set_block_list) {
+ ACQUIRE_LOCK(&upd_rem_set_lock);
+ bdescr *old = queue->blocks;
+ queue->blocks = upd_rem_set_block_list;
+ queue->top = (MarkQueueBlock *) queue->blocks->start;
+ upd_rem_set_block_list = NULL;
+ RELEASE_LOCK(&upd_rem_set_lock);
+
+ ACQUIRE_SM_LOCK;
+ freeGroup(old);
+ RELEASE_SM_LOCK;
+ } else {
+ // Nothing more to do
+ debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+ return;
+ }
}
}
}
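The producer side of this handoff is not visible in this hunk. Assuming nonmovingFlushCapUpdRemSetBlocks (declared in the header below) splices a capability's blocks onto the global list under upd_rem_set_lock, it would look roughly like the following sketch; this is illustrative, not the actual implementation.

    // Illustrative sketch, not the real implementation: splice a capability's
    // update-remembered-set blocks onto the global list that the NULL_ENTRY
    // case above adopts.
    static void flushCapBlocksSketch(Capability *cap)
    {
        bdescr *blocks = cap->upd_rem_set.queue.blocks;
        bdescr *last = blocks;
        while (last->link != NULL) {
            last = last->link;                   // find the tail of the chain
        }
        ACQUIRE_LOCK(&upd_rem_set_lock);
        last->link = upd_rem_set_block_list;     // prepend the whole chain
        upd_rem_set_block_list = blocks;
        RELEASE_LOCK(&upd_rem_set_lock);
        // the capability would then be handed a fresh block to keep filling
    }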
diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h
index 636f41890c..d7066e56d6 100644
--- a/rts/sm/NonMovingMark.h
+++ b/rts/sm/NonMovingMark.h
@@ -80,11 +80,23 @@ typedef struct MarkQueue_ {
// Cached value of blocks->start.
MarkQueueBlock *top;
+ // Is this a mark queue or a capability-local update remembered set?
+ bool is_upd_rem_set;
+
// Marked objects outside of nonmoving heap, namely large and static
// objects.
HashTable *marked_objects;
} MarkQueue;
+/* While it shares its representation with MarkQueue, UpdRemSet differs in
+ * behavior when pushing; namely full chunks are immediately pushed to the
+ * global update remembered set, not accumulated into a chain. We make this
+ * distinction apparent in the types.
+ */
+typedef struct {
+ MarkQueue queue;
+} UpdRemSet;
+
// The length of MarkQueueBlock.entries
#define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt))
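To make the wrapper's intent concrete, a hypothetical call site (not from the patch): barrier code reaches the embedded MarkQueue through the capability's UpdRemSet field, and is_upd_rem_set steers what happens when a block fills up.

    // Hypothetical example of pushing an overwritten pointer into the
    // capability-local update remembered set via the embedded queue.
    static void examplePushOldValue(Capability *cap, StgClosure *old)
    {
        // is_upd_rem_set is true for this queue, so a filled block is expected
        // to be published to the global list rather than chained locally.
        markQueuePushClosure_(&cap->upd_rem_set.queue, old);
    }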
@@ -101,6 +113,22 @@ extern StgIndStatic *debug_caf_list_snapshot;
#endif
extern MarkQueue *current_mark_queue;
+extern bdescr *upd_rem_set_block_list;
+
+void nonmovingMarkInitUpdRemSet(void);
+
+void init_upd_rem_set(UpdRemSet *rset);
+void reset_upd_rem_set(UpdRemSet *rset);
+void updateRemembSetPushThunk(Capability *cap, StgThunk *p);
+void updateRemembSetPushTSO(Capability *cap, StgTSO *tso);
+void updateRemembSetPushStack(Capability *cap, StgStack *stack);
+
+#if defined(THREADED_RTS)
+void nonmovingFlushCapUpdRemSetBlocks(Capability *cap);
+void nonmovingBeginFlush(Task *task);
+bool nonmovingWaitForFlush(void);
+void nonmovingFinishFlush(Task *task);
+#endif
void markQueueAddRoot(MarkQueue* q, StgClosure** root);
@@ -124,6 +152,9 @@ void markQueuePushClosure_(MarkQueue *q, StgClosure *p);
void markQueuePushThunkSrt(MarkQueue *q, const StgInfoTable *info);
void markQueuePushFunSrt(MarkQueue *q, const StgInfoTable *info);
void markQueuePushArray(MarkQueue *q, const StgMutArrPtrs *array, StgWord start_index);
+void updateRemembSetPushThunkEager(Capability *cap,
+ const StgThunkInfoTable *orig_info,
+ StgThunk *thunk);
INLINE_HEADER bool markQueueIsEmpty(MarkQueue *q)
{
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index 3e1748d5b6..0724a0059a 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -912,9 +912,11 @@ findMemoryLeak (void)
for (i = 0; i < n_capabilities; i++) {
markBlocks(gc_threads[i]->free_blocks);
markBlocks(capabilities[i]->pinned_object_block);
+ markBlocks(capabilities[i]->upd_rem_set.queue.blocks);
}
if (RtsFlags.GcFlags.useNonmoving) {
+ markBlocks(upd_rem_set_block_list);
markBlocks(nonmoving_large_objects);
markBlocks(nonmoving_marked_large_objects);
for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) {
@@ -1054,7 +1056,8 @@ memInventory (bool show)
uint32_t g, i;
W_ gen_blocks[RtsFlags.GcFlags.generations];
W_ nursery_blocks = 0, retainer_blocks = 0,
- arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0;
+ arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0,
+ upd_rem_set_blocks = 0;
W_ live_blocks = 0, free_blocks = 0;
bool leak;
@@ -1099,12 +1102,19 @@ memInventory (bool show)
/* count the blocks on the free list */
free_blocks = countFreeList();
+ // count UpdRemSet blocks
+ for (i = 0; i < n_capabilities; ++i) {
+ upd_rem_set_blocks += countBlocks(capabilities[i]->upd_rem_set.queue.blocks);
+ }
+ upd_rem_set_blocks += countBlocks(upd_rem_set_block_list);
+
live_blocks = 0;
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
live_blocks += gen_blocks[g];
}
live_blocks += nursery_blocks +
- + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks;
+ + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks
+ + upd_rem_set_blocks;
#define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_)))
@@ -1133,6 +1143,8 @@ memInventory (bool show)
gc_free_blocks, MB(gc_free_blocks));
debugBelch(" free : %5" FMT_Word " blocks (%6.1lf MB)\n",
free_blocks, MB(free_blocks));
+ debugBelch(" UpdRemSet : %5" FMT_Word " blocks (%6.1lf MB)\n",
+ upd_rem_set_blocks, MB(upd_rem_set_blocks));
debugBelch(" total : %5" FMT_Word " blocks (%6.1lf MB)\n",
live_blocks + free_blocks, MB(live_blocks+free_blocks));
if (leak) {
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 9fe68e98b7..0199bc58f3 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -281,6 +281,14 @@ void storageAddCapabilities (uint32_t from, uint32_t to)
}
}
+ // Initialize NonmovingAllocators and UpdRemSets
+ if (RtsFlags.GcFlags.useNonmoving) {
+ nonmovingAddCapabilities(to);
+ for (i = 0; i < to; ++i) {
+ init_upd_rem_set(&capabilities[i]->upd_rem_set);
+ }
+ }
+
#if defined(THREADED_RTS) && defined(CC_LLVM_BACKEND) && (CC_SUPPORTS_TLS == 0)
newThreadLocalKey(&gctKey);
#endif
@@ -412,6 +420,22 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
// successfully claimed by us; overwrite with IND_STATIC
#endif
+ // Push anything that will become unreachable after the update onto the
+ // UpdRemSet to maintain the snapshot invariant
+ const StgInfoTable *orig_info_tbl = INFO_PTR_TO_STRUCT(orig_info);
+ // OSA: Assertions to make sure my understanding of static thunks is correct
+ ASSERT(orig_info_tbl->type == THUNK_STATIC);
+ // Secondly I think static thunks can't have payload: anything that they
+ // reference should be in SRTs
+ ASSERT(orig_info_tbl->layout.payload.ptrs == 0);
+ // Becuase the payload is empty we just push the SRT
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl);
+ if (thunk_info->i.srt) {
+ updateRemembSetPushClosure(cap, GET_SRT(thunk_info));
+ }
+ }
+
// For the benefit of revertCAFs(), save the original info pointer
caf->saved_info = orig_info;
@@ -1083,6 +1107,27 @@ allocatePinned (Capability *cap, W_ n)
Write Barriers
-------------------------------------------------------------------------- */
+/* These write barriers on heavily mutated objects serve two purposes:
+ *
+ * - Efficient maintenance of the generational invariant: Record whether or not
+ * we have added a particular mutable object to mut_list as they may contain
+ * references to younger generations.
+ *
+ * - Maintenance of the nonmoving collector's snapshot invariant: Record objects
+ * which are about to become unreachable due to mutation.
+ *
+ * In each case we record whether the object has been added to the mutable list
+ * by way of either the info pointer or a dedicated "dirty" flag. The GC will
+ * clear this flag and remove the object from mut_list (or rather, not re-add it)
+ * if it finds that the object contains no references into any younger generation.
+ *
+ * Note that all dirty objects will be marked as clean during preparation for a
+ * concurrent collection. Consequently, we can use the dirtiness flag to determine
+ * whether or not we need to add overwritten pointers to the update remembered
+ * set (since we need only write the value prior to the first update to maintain
+ * the snapshot invariant).
+ */
+
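The barriers that follow all share one shape, summarized by the sketch below; dirty_FOO, stg_FOO_CLEAN_info, and stg_FOO_DIRTY_info are placeholder names, not real RTS symbols.

    // Placeholder sketch of the common barrier shape; FOO is not a real object type.
    void dirty_FOO(Capability *cap, StgClosure *p, StgClosure *old)
    {
        if (p->header.info == &stg_FOO_CLEAN_info) {     // first write since the last GC
            p->header.info = &stg_FOO_DIRTY_info;        // generational invariant:
            recordClosureMutated(cap, p);                //   add to mut_list exactly once
            if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
                // snapshot invariant: remember the value being overwritten
                updateRemembSetPushClosure(cap, old);
            }
        }
    }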
/*
This is the write barrier for MUT_VARs, a.k.a. IORefs. A
MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY
@@ -1090,25 +1135,39 @@ allocatePinned (Capability *cap, W_ n)
and is put on the mutable list.
*/
void
-dirty_MUT_VAR(StgRegTable *reg, StgClosure *p)
+dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old)
{
Capability *cap = regTableToCapability(reg);
// No barrier required here as no other heap object fields are read. See
// note [Heap memory barriers] in SMP.h.
- if (p->header.info == &stg_MUT_VAR_CLEAN_info) {
- p->header.info = &stg_MUT_VAR_DIRTY_info;
- recordClosureMutated(cap,p);
+ if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) {
+ mvar->header.info = &stg_MUT_VAR_DIRTY_info;
+ recordClosureMutated(cap, (StgClosure *) mvar);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+ updateRemembSetPushClosure_(reg, old);
+ }
}
}
+/*
+ * This is the write barrier for TVARs.
+ * old is the pointer that we overwrote, which is required by the concurrent
+ * garbage collector. Note that while StgTVars contain multiple pointers, we
+ * only overwrite one per dirty_TVAR call, so a single old pointer argument
+ * suffices.
+ */
void
-dirty_TVAR(Capability *cap, StgTVar *p)
+dirty_TVAR(Capability *cap, StgTVar *p,
+ StgClosure *old)
{
// No barrier required here as no other heap object fields are read. See
// note [Heap memory barriers] in SMP.h.
if (p->header.info == &stg_TVAR_CLEAN_info) {
p->header.info = &stg_TVAR_DIRTY_info;
recordClosureMutated(cap,(StgClosure*)p);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) {
+ updateRemembSetPushClosure(cap, old);
+ }
}
}
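For illustration only, a caller is expected to hand the barrier the value it is about to overwrite; the helper below is hypothetical and not the actual STM.c change.

    // Hypothetical call site: overwrite a TVar's current value while informing
    // both the generational and the nonmoving write barriers of the old one.
    static void exampleWriteTVar(Capability *cap, StgTVar *tvar, StgClosure *new_value)
    {
        StgClosure *old = tvar->current_value;
        tvar->current_value = new_value;
        dirty_TVAR(cap, tvar, old);
    }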
@@ -1123,6 +1182,8 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target)
if (tso->dirty == 0) {
tso->dirty = 1;
recordClosureMutated(cap,(StgClosure*)tso);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+ updateRemembSetPushClosure(cap, (StgClosure *) tso->_link);
}
tso->_link = target;
}
@@ -1133,6 +1194,8 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target)
if (tso->dirty == 0) {
tso->dirty = 1;
recordClosureMutated(cap,(StgClosure*)tso);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+ updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev);
}
tso->block_info.prev = target;
}
@@ -1144,15 +1207,47 @@ dirty_TSO (Capability *cap, StgTSO *tso)
tso->dirty = 1;
recordClosureMutated(cap,(StgClosure*)tso);
}
+
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+ updateRemembSetPushTSO(cap, tso);
}
void
dirty_STACK (Capability *cap, StgStack *stack)
{
+ // First push to upd_rem_set before we set stack->dirty since the
+ // nonmoving collector may already be marking the stack.
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled))
+ updateRemembSetPushStack(cap, stack);
+
if (! (stack->dirty & STACK_DIRTY)) {
stack->dirty = STACK_DIRTY;
recordClosureMutated(cap,(StgClosure*)stack);
}
+
+}
+
+/*
+ * This is the concurrent collector's write barrier for MVARs. In the other
+ * write barriers above this is folded into the dirty_* functions. However, in
+ * the case of MVars we need to separate the acts of adding the MVar to the
+ * mutable list and adding its fields to the update remembered set.
+ *
+ * Specifically, the wakeup loop in stg_putMVarzh wants to freely mutate the
+ * pointers of the MVar but needs to keep its lock, meaning we can't yet add it
+ * to the mutable list, since the assertion checking for clean MVars on the
+ * mutable list would otherwise fail.
+ */
+void
+update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
+{
+ Capability *cap = regTableToCapability(reg);
+ if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) {
+ StgMVar *mvar = (StgMVar *) p;
+ updateRemembSetPushClosure(cap, old_val);
+ updateRemembSetPushClosure(cap, (StgClosure *) mvar->head);
+ updateRemembSetPushClosure(cap, (StgClosure *) mvar->tail);
+ }
}
/*
@@ -1164,9 +1259,11 @@ dirty_STACK (Capability *cap, StgStack *stack)
such as Chaneneos and cheap-concurrency.
*/
void
-dirty_MVAR(StgRegTable *reg, StgClosure *p)
+dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val)
{
- recordClosureMutated(regTableToCapability(reg),p);
+ Capability *cap = regTableToCapability(reg);
+ update_MVAR(reg, p, old_val);
+ recordClosureMutated(cap, p);
}
/* -----------------------------------------------------------------------------
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index 08bdb37ba3..cdb9720650 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -47,8 +47,9 @@ extern Mutex sm_mutex;
The write barrier for MVARs and TVARs
-------------------------------------------------------------------------- */
-void dirty_MVAR(StgRegTable *reg, StgClosure *p);
-void dirty_TVAR(Capability *cap, StgTVar *p);
+void update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val);
+void dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old);
+void dirty_TVAR(Capability *cap, StgTVar *p, StgClosure *old);
/* -----------------------------------------------------------------------------
Nursery manipulation