From bd8e3ff43b64a72ed1c820e89691d0a83a1c6e96 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 5 Feb 2019 11:51:14 -0500 Subject: rts: Implement concurrent collection in the nonmoving collector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This extends the non-moving collector to allow concurrent collection. The full design of the collector implemented here is described in detail in a technical note B. Gamari. "A Concurrent Garbage Collector For the Glasgow Haskell Compiler" (2018) This extension involves the introduction of a capability-local remembered set, known as the /update remembered set/, which tracks objects which may no longer be visible to the collector due to mutation. To maintain this remembered set we introduce a write barrier on mutations which is enabled while a concurrent mark is underway. The update remembered set representation is similar to that of the nonmoving mark queue, being a chunked array of `MarkEntry`s. Each `Capability` maintains a single accumulator chunk, which it flushed when it (a) is filled, or (b) when the nonmoving collector enters its post-mark synchronization phase. While the write barrier touches a significant amount of code it is conceptually straightforward: the mutator must ensure that the referee of any pointer it overwrites is added to the update remembered set. However, there are a few details: * In the case of objects with a dirty flag (e.g. `MVar`s) we can exploit the fact that only the *first* mutation requires a write barrier. * Weak references, as usual, complicate things. In particular, we must ensure that the referee of a weak object is marked if dereferenced by the mutator. For this we (unfortunately) must introduce a read barrier, as described in Note [Concurrent read barrier on deRefWeak#] (in `NonMovingMark.c`). * Stable names are also a bit tricky as described in Note [Sweeping stable names in the concurrent collector] (`NonMovingSweep.c`). We take quite some pains to ensure that the high thread count often seen in parallel Haskell applications doesn't affect pause times. To this end we allow thread stacks to be marked either by the thread itself (when it is executed or stack-underflows) or the concurrent mark thread (if the thread owning the stack is never scheduled). There is a non-trivial handshake to ensure that this happens without racing which is described in Note [StgStack dirtiness flags and concurrent marking]. Co-Authored-by: Ömer Sinan Ağacan --- compiler/GHC/StgToCmm/Bind.hs | 1 + compiler/GHC/StgToCmm/Prim.hs | 61 +++- compiler/GHC/StgToCmm/Utils.hs | 42 +++ compiler/cmm/CLabel.hs | 7 +- includes/Cmm.h | 22 ++ includes/Rts.h | 1 + includes/rts/NonMoving.h | 24 ++ includes/rts/storage/ClosureMacros.h | 14 + includes/rts/storage/GC.h | 2 +- includes/rts/storage/TSO.h | 50 +++- includes/stg/MiscClosures.h | 1 + rts/Apply.cmm | 2 + rts/Capability.c | 34 ++- rts/Capability.h | 6 +- rts/Exception.cmm | 5 + rts/Messages.c | 10 +- rts/PrimOps.cmm | 108 +++++-- rts/RaiseAsync.c | 2 +- rts/RtsStartup.c | 6 +- rts/RtsSymbols.c | 5 + rts/STM.c | 41 ++- rts/Schedule.c | 11 +- rts/StableName.c | 4 + rts/ThreadPaused.c | 11 + rts/Threads.c | 22 +- rts/Updates.h | 8 +- rts/rts.cabal.in | 1 + rts/sm/NonMoving.c | 186 +++++++++++- rts/sm/NonMoving.h | 1 - rts/sm/NonMovingMark.c | 563 +++++++++++++++++++++++++++++++++-- rts/sm/NonMovingMark.h | 31 ++ rts/sm/Sanity.c | 16 +- rts/sm/Storage.c | 111 ++++++- rts/sm/Storage.h | 5 +- 34 files changed, 1290 insertions(+), 124 deletions(-) create mode 100644 includes/rts/NonMoving.h diff --git a/compiler/GHC/StgToCmm/Bind.hs b/compiler/GHC/StgToCmm/Bind.hs index 44316cacb0..9e192a0ac8 100644 --- a/compiler/GHC/StgToCmm/Bind.hs +++ b/compiler/GHC/StgToCmm/Bind.hs @@ -631,6 +631,7 @@ emitBlackHoleCode node = do -- work with profiling. when eager_blackholing $ do + whenUpdRemSetEnabled dflags $ emitUpdRemSetPushThunk node emitStore (cmmOffsetW dflags node (fixedHdrSizeW dflags)) currentTSOExpr -- See Note [Heap memory barriers] in SMP.h. emitPrimCall [] MO_WriteBarrier [] diff --git a/compiler/GHC/StgToCmm/Prim.hs b/compiler/GHC/StgToCmm/Prim.hs index cdbc8d9fd9..155cdcbf80 100644 --- a/compiler/GHC/StgToCmm/Prim.hs +++ b/compiler/GHC/StgToCmm/Prim.hs @@ -42,6 +42,7 @@ import BlockId import MkGraph import StgSyn import Cmm +import Module ( rtsUnitId ) import Type ( Type, tyConAppTyCon ) import TyCon import CLabel @@ -339,14 +340,20 @@ dispatchPrimop dflags = \case emitAssign (CmmLocal res) (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) WriteMutVarOp -> \[mutv, var] -> OpDest_AllDone $ \res@[] -> do + old_val <- CmmLocal <$> newTemp (cmmExprType dflags var) + emitAssign old_val (cmmLoadIndexW dflags mutv (fixedHdrSizeW dflags) (gcWord dflags)) + -- Without this write barrier, other CPUs may see this pointer before -- the writes for the closure it points to have occurred. + -- Note that this also must come after we read the old value to ensure + -- that the read of old_val comes before another core's write to the + -- MutVar's value. emitPrimCall res MO_WriteBarrier [] emitStore (cmmOffsetW dflags mutv (fixedHdrSizeW dflags)) var emitCCall [{-no results-}] (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) - [(baseExpr, AddrHint), (mutv,AddrHint)] + [(baseExpr, AddrHint), (mutv, AddrHint), (CmmReg old_val, AddrHint)] -- #define sizzeofByteArrayzh(r,a) \ -- r = ((StgArrBytes *)(a))->bytes @@ -1983,17 +1990,21 @@ doWritePtrArrayOp :: CmmExpr doWritePtrArrayOp addr idx val = do dflags <- getDynFlags let ty = cmmExprType dflags val + hdr_size = arrPtrsHdrSize dflags + -- Update remembered set for non-moving collector + whenUpdRemSetEnabled dflags + $ emitUpdRemSetPush (cmmLoadIndexOffExpr dflags hdr_size ty addr ty idx) -- This write barrier is to ensure that the heap writes to the object -- referred to by val have happened before we write val into the array. -- See #12469 for details. emitPrimCall [] MO_WriteBarrier [] - mkBasicIndexedWrite (arrPtrsHdrSize dflags) Nothing addr ty idx val + mkBasicIndexedWrite hdr_size Nothing addr ty idx val emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) - -- the write barrier. We must write a byte into the mark table: - -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] + -- the write barrier. We must write a byte into the mark table: + -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] emit $ mkStore ( cmmOffsetExpr dflags - (cmmOffsetExprW dflags (cmmOffsetB dflags addr (arrPtrsHdrSize dflags)) + (cmmOffsetExprW dflags (cmmOffsetB dflags addr hdr_size) (loadArrPtrsSize dflags addr)) (CmmMachOp (mo_wordUShr dflags) [idx, mkIntExpr dflags (mUT_ARR_PTRS_CARD_BITS dflags)]) @@ -2584,6 +2595,9 @@ emitCopyArray copy src0 src_off dst0 dst_off0 n = dst <- assignTempE dst0 dst_off <- assignTempE dst_off0 + -- Nonmoving collector write barrier + emitCopyUpdRemSetPush dflags (arrPtrsHdrSizeW dflags) dst dst_off n + -- Set the dirty bit in the header. emit (setInfo dst (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) @@ -2646,6 +2660,9 @@ emitCopySmallArray copy src0 src_off dst0 dst_off n = src <- assignTempE src0 dst <- assignTempE dst0 + -- Nonmoving collector write barrier + emitCopyUpdRemSetPush dflags (smallArrPtrsHdrSizeW dflags) dst dst_off n + -- Set the dirty bit in the header. emit (setInfo dst (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) @@ -2774,6 +2791,12 @@ doWriteSmallPtrArrayOp :: CmmExpr doWriteSmallPtrArrayOp addr idx val = do dflags <- getDynFlags let ty = cmmExprType dflags val + + -- Update remembered set for non-moving collector + tmp <- newTemp ty + mkBasicIndexedRead (smallArrPtrsHdrSize dflags) Nothing ty tmp addr ty idx + whenUpdRemSetEnabled dflags $ emitUpdRemSetPush (CmmReg (CmmLocal tmp)) + emitPrimCall [] MO_WriteBarrier [] -- #12469 mkBasicIndexedWrite (smallArrPtrsHdrSize dflags) Nothing addr ty idx val emit (setInfo addr (CmmLit (CmmLabel mkSMAP_DIRTY_infoLabel))) @@ -2953,3 +2976,31 @@ emitCtzCall res x width = do [ res ] (MO_Ctz width) [ x ] + +--------------------------------------------------------------------------- +-- Pushing to the update remembered set +--------------------------------------------------------------------------- + +-- | Push a range of pointer-array elements that are about to be copied over to +-- the update remembered set. +emitCopyUpdRemSetPush :: DynFlags + -> WordOff -- ^ array header size + -> CmmExpr -- ^ destination array + -> CmmExpr -- ^ offset in destination array (in words) + -> Int -- ^ number of elements to copy + -> FCode () +emitCopyUpdRemSetPush _dflags _hdr_size _dst _dst_off 0 = return () +emitCopyUpdRemSetPush dflags hdr_size dst dst_off n = + whenUpdRemSetEnabled dflags $ do + updfr_off <- getUpdFrameOff + graph <- mkCall lbl (NativeNodeCall,NativeReturn) [] args updfr_off [] + emit graph + where + lbl = mkLblExpr $ mkPrimCallLabel + $ PrimCall (fsLit "stg_copyArray_barrier") rtsUnitId + args = + [ mkIntExpr dflags hdr_size + , dst + , dst_off + , mkIntExpr dflags n + ] diff --git a/compiler/GHC/StgToCmm/Utils.hs b/compiler/GHC/StgToCmm/Utils.hs index 30e37bb930..0b3a8d8b08 100644 --- a/compiler/GHC/StgToCmm/Utils.hs +++ b/compiler/GHC/StgToCmm/Utils.hs @@ -39,6 +39,11 @@ module GHC.StgToCmm.Utils ( mkWordCLit, newStringCLit, newByteStringCLit, blankWord, + + -- * Update remembered set operations + whenUpdRemSetEnabled, + emitUpdRemSetPush, + emitUpdRemSetPushThunk, ) where #include "HsVersions.h" @@ -576,3 +581,40 @@ assignTemp' e let reg = CmmLocal lreg emitAssign reg e return (CmmReg reg) + + +--------------------------------------------------------------------------- +-- Pushing to the update remembered set +--------------------------------------------------------------------------- + +whenUpdRemSetEnabled :: DynFlags -> FCode a -> FCode () +whenUpdRemSetEnabled dflags code = do + do_it <- getCode code + the_if <- mkCmmIfThenElse' is_enabled do_it mkNop (Just False) + emit the_if + where + enabled = CmmLoad (CmmLit $ CmmLabel mkNonmovingWriteBarrierEnabledLabel) (bWord dflags) + zero = zeroExpr dflags + is_enabled = cmmNeWord dflags enabled zero + +-- | Emit code to add an entry to a now-overwritten pointer to the update +-- remembered set. +emitUpdRemSetPush :: CmmExpr -- ^ value of pointer which was overwritten + -> FCode () +emitUpdRemSetPush ptr = do + emitRtsCall + rtsUnitId + (fsLit "updateRemembSetPushClosure_") + [(CmmReg (CmmGlobal BaseReg), AddrHint), + (ptr, AddrHint)] + False + +emitUpdRemSetPushThunk :: CmmExpr -- ^ the thunk + -> FCode () +emitUpdRemSetPushThunk ptr = do + emitRtsCall + rtsUnitId + (fsLit "updateRemembSetPushThunk_") + [(CmmReg (CmmGlobal BaseReg), AddrHint), + (ptr, AddrHint)] + False diff --git a/compiler/cmm/CLabel.hs b/compiler/cmm/CLabel.hs index 0c3dae8001..66e39f0d69 100644 --- a/compiler/cmm/CLabel.hs +++ b/compiler/cmm/CLabel.hs @@ -40,6 +40,7 @@ module CLabel ( mkAsmTempDieLabel, mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, mkUpdInfoLabel, mkBHUpdInfoLabel, mkIndStaticInfoLabel, @@ -484,7 +485,9 @@ mkBlockInfoTableLabel name c = IdLabel name c BlockInfoTable -- See Note [Proc-point local block entry-point]. -- Constructing Cmm Labels -mkDirty_MUT_VAR_Label, mkUpdInfoLabel, +mkDirty_MUT_VAR_Label, + mkNonmovingWriteBarrierEnabledLabel, + mkUpdInfoLabel, mkBHUpdInfoLabel, mkIndStaticInfoLabel, mkMainCapabilityLabel, mkMAP_FROZEN_CLEAN_infoLabel, mkMAP_FROZEN_DIRTY_infoLabel, mkMAP_DIRTY_infoLabel, @@ -494,6 +497,8 @@ mkDirty_MUT_VAR_Label, mkUpdInfoLabel, mkSMAP_FROZEN_CLEAN_infoLabel, mkSMAP_FROZEN_DIRTY_infoLabel, mkSMAP_DIRTY_infoLabel, mkBadAlignmentLabel :: CLabel mkDirty_MUT_VAR_Label = mkForeignLabel (fsLit "dirty_MUT_VAR") Nothing ForeignLabelInExternalPackage IsFunction +mkNonmovingWriteBarrierEnabledLabel + = CmmLabel rtsUnitId (fsLit "nonmoving_write_barrier_enabled") CmmData mkUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_upd_frame") CmmInfo mkBHUpdInfoLabel = CmmLabel rtsUnitId (fsLit "stg_bh_upd_frame" ) CmmInfo mkIndStaticInfoLabel = CmmLabel rtsUnitId (fsLit "stg_IND_STATIC") CmmInfo diff --git a/includes/Cmm.h b/includes/Cmm.h index 21d5da310c..42aa0bf550 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -842,6 +842,10 @@ __gen = TO_W_(bdescr_gen_no(__bd)); \ if (__gen > 0) { recordMutableCap(__p, __gen); } +/* ----------------------------------------------------------------------------- + Update remembered set write barrier + -------------------------------------------------------------------------- */ + /* ----------------------------------------------------------------------------- Arrays -------------------------------------------------------------------------- */ @@ -944,3 +948,21 @@ prim %memcpy(dst_p, src_p, n * SIZEOF_W, SIZEOF_W); \ \ return (dst); + + +#if defined(THREADED_RTS) +#define IF_WRITE_BARRIER_ENABLED \ + if (W_[nonmoving_write_barrier_enabled] != 0) (likely: False) +#else +// A similar measure is also taken in rts/NonMoving.h, but that isn't visible from C-- +#define IF_WRITE_BARRIER_ENABLED \ + if (0) +#define nonmoving_write_barrier_enabled 0 +#endif + +// A useful helper for pushing a pointer to the update remembered set. +// See Note [Update remembered set] in NonMovingMark.c. +#define updateRemembSetPushPtr(p) \ + IF_WRITE_BARRIER_ENABLED { \ + ccall updateRemembSetPushClosure_(BaseReg "ptr", p "ptr"); \ + } diff --git a/includes/Rts.h b/includes/Rts.h index 56642e14c5..def06de90d 100644 --- a/includes/Rts.h +++ b/includes/Rts.h @@ -197,6 +197,7 @@ void _assertFail(const char *filename, unsigned int linenum) #include "rts/storage/ClosureMacros.h" #include "rts/storage/MBlock.h" #include "rts/storage/GC.h" +#include "rts/NonMoving.h" /* Other RTS external APIs */ #include "rts/Parallel.h" diff --git a/includes/rts/NonMoving.h b/includes/rts/NonMoving.h new file mode 100644 index 0000000000..6a6d96b2c8 --- /dev/null +++ b/includes/rts/NonMoving.h @@ -0,0 +1,24 @@ +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 2018-2019 + * + * Non-moving garbage collector + * + * Do not #include this file directly: #include "Rts.h" instead. + * + * To understand the structure of the RTS headers, see the wiki: + * http://ghc.haskell.org/trac/ghc/wiki/Commentary/SourceTree/Includes + * + * -------------------------------------------------------------------------- */ + +#pragma once + +/* This is called by the code generator */ +extern DLL_IMPORT_RTS +void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p); + +void updateRemembSetPushClosure(Capability *cap, StgClosure *p); + +void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p); + +extern StgWord DLL_IMPORT_DATA_VAR(nonmoving_write_barrier_enabled); diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h index a3873cc49d..2af50863d0 100644 --- a/includes/rts/storage/ClosureMacros.h +++ b/includes/rts/storage/ClosureMacros.h @@ -107,6 +107,20 @@ INLINE_HEADER const StgConInfoTable *get_con_itbl(const StgClosure *c) return CON_INFO_PTR_TO_STRUCT((c)->header.info); } +/* Used when we expect another thread to be mutating the info table pointer of + * a closure (e.g. when busy-waiting on a WHITEHOLE). + */ +INLINE_HEADER const StgInfoTable *get_volatile_itbl(StgClosure *c) { + // The volatile here is import to ensure that the compiler does not + // optimise away multiple loads, e.g. in a busy-wait loop. Note that + // we can't use VOLATILE_LOAD here as the casts result in strict aliasing + // rule violations and this header may be compiled outside of the RTS + // (where we use -fno-strict-aliasing). + StgInfoTable * *volatile p = (StgInfoTable * *volatile) &c->header.info; + return INFO_PTR_TO_STRUCT(*p); +} + + INLINE_HEADER StgHalfWord GET_TAG(const StgClosure *con) { return get_itbl(con)->srt; diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h index 77dbe60297..7931433019 100644 --- a/includes/rts/storage/GC.h +++ b/includes/rts/storage/GC.h @@ -234,7 +234,7 @@ void setKeepCAFs (void); and is put on the mutable list. -------------------------------------------------------------------------- */ -void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p); +void dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mv, StgClosure *old); /* set to disable CAF garbage collection in GHCi. */ /* (needed when dynamic libraries are used). */ diff --git a/includes/rts/storage/TSO.h b/includes/rts/storage/TSO.h index 63d2a11e8e..d56ae8ad27 100644 --- a/includes/rts/storage/TSO.h +++ b/includes/rts/storage/TSO.h @@ -185,6 +185,53 @@ typedef struct StgTSO_ { } *StgTSOPtr; // StgTSO defined in rts/Types.h +/* Note [StgStack dirtiness flags and concurrent marking] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Without concurrent collection by the nonmoving collector the stack dirtiness story + * is quite simple: The stack is either STACK_DIRTY (meaning it has been added to mut_list) + * or not. + * + * However, things are considerably more complicated with concurrent collection + * (namely, when nonmoving_write_barrier_enabled is set): In addition to adding + * the stack to mut_list and flagging it as STACK_DIRTY, we also must ensure + * that stacks are marked in accordance with the nonmoving collector's snapshot + * invariant. This is: every stack alive at the time the snapshot is taken must + * be marked at some point after the moment the snapshot is taken and before it + * is mutated or the commencement of the sweep phase. + * + * This marking may be done by the concurrent mark phase (in the case of a + * thread that never runs during the concurrent mark) or by the mutator when + * dirtying the stack. However, it is unsafe for the concurrent collector to + * traverse the stack while it is under mutation. Consequently, the following + * handshake is obeyed by the mutator's write barrier and the concurrent mark to + * ensure this doesn't happen: + * + * 1. The entity seeking to mark first checks that the stack lives in the nonmoving + * generation; if not then the stack was not alive at the time the snapshot + * was taken and therefore we need not mark it. + * + * 2. The entity seeking to mark checks the stack's mark bit. If it is set then + * no mark is necessary. + * + * 3. The entity seeking to mark tries to lock the stack for marking by + * atomically setting its `marking` field to the current non-moving mark + * epoch: + * + * a. If the mutator finds the concurrent collector has already locked the + * stack then it waits until it is finished (indicated by the mark bit + * being set) before proceeding with execution. + * + * b. If the concurrent collector finds that the mutator has locked the stack + * then it moves on, leaving the mutator to mark it. There is no need to wait; + * the mark is guaranteed to finish before sweep due to the post-mark + * synchronization with mutators. + * + * c. Whoever succeeds in locking the stack is responsible for marking it and + * setting the stack's mark bit (either the BF_MARKED bit for large objects + * or otherwise its bit in its segment's mark bitmap). + * + */ #define STACK_DIRTY 1 // used by sanity checker to verify that all dirty stacks are on the mutable list @@ -193,7 +240,8 @@ typedef struct StgTSO_ { typedef struct StgStack_ { StgHeader header; StgWord32 stack_size; // stack size in *words* - StgWord32 dirty; // non-zero => dirty + StgWord dirty; // non-zero => dirty + StgWord marking; // non-zero => someone is currently marking the stack StgPtr sp; // current stack pointer StgWord stack[]; } StgStack; diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h index 217b1bc89d..f4ae2245d2 100644 --- a/includes/stg/MiscClosures.h +++ b/includes/stg/MiscClosures.h @@ -542,5 +542,6 @@ void * pushCostCentre (void *ccs, void *cc); // Capability.c extern unsigned int n_capabilities; +extern void updateRemembSetPushThunk_(void *reg, void *p1); #endif diff --git a/rts/Apply.cmm b/rts/Apply.cmm index 8d7fc3c012..eeb760c5ed 100644 --- a/rts/Apply.cmm +++ b/rts/Apply.cmm @@ -654,6 +654,8 @@ INFO_TABLE(stg_AP_STACK,/*special layout*/0,0,AP_STACK,"AP_STACK","AP_STACK") /* someone else beat us to it */ jump ENTRY_LBL(stg_WHITEHOLE) (ap); } + // Can't add StgInd_indirectee(ap) to UpdRemSet here because the old value is + // not reachable. StgInd_indirectee(ap) = CurrentTSO; prim_write_barrier; SET_INFO(ap, __stg_EAGER_BLACKHOLE_info); diff --git a/rts/Capability.c b/rts/Capability.c index 23e581359e..0baa4ef205 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -292,6 +292,11 @@ initCapability (Capability *cap, uint32_t i) RtsFlags.GcFlags.generations, "initCapability"); + + // At this point storage manager is not initialized yet, so this will be + // initialized in initStorage(). + cap->upd_rem_set.queue.blocks = NULL; + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { cap->mut_lists[g] = NULL; } @@ -861,16 +866,27 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed) { PendingSync *sync = pending_sync; - if (sync && sync->type == SYNC_GC_PAR) { - if (! sync->idle[cap->no]) { - traceEventGcStart(cap); - gcWorkerThread(cap); - traceEventGcEnd(cap); - traceSparkCounters(cap); - // See Note [migrated bound threads 2] - if (task->cap == cap) { - return true; + if (sync) { + switch (sync->type) { + case SYNC_GC_PAR: + if (! sync->idle[cap->no]) { + traceEventGcStart(cap); + gcWorkerThread(cap); + traceEventGcEnd(cap); + traceSparkCounters(cap); + // See Note [migrated bound threads 2] + if (task->cap == cap) { + return true; + } } + break; + + case SYNC_FLUSH_UPD_REM_SET: + debugTrace(DEBUG_nonmoving_gc, "Flushing update remembered set blocks..."); + break; + + default: + break; } } } diff --git a/rts/Capability.h b/rts/Capability.h index 0c41456261..e51769f887 100644 --- a/rts/Capability.h +++ b/rts/Capability.h @@ -85,6 +85,9 @@ struct Capability_ { bdescr **mut_lists; bdescr **saved_mut_lists; // tmp use during GC + // The update remembered set for the non-moving collector + UpdRemSet upd_rem_set; + // block for allocating pinned objects into bdescr *pinned_object_block; // full pinned object blocks allocated since the last GC @@ -257,7 +260,8 @@ extern Capability **capabilities; typedef enum { SYNC_OTHER, SYNC_GC_SEQ, - SYNC_GC_PAR + SYNC_GC_PAR, + SYNC_FLUSH_UPD_REM_SET } SyncType; // diff --git a/rts/Exception.cmm b/rts/Exception.cmm index 8ea94b19f2..334d0ef823 100644 --- a/rts/Exception.cmm +++ b/rts/Exception.cmm @@ -318,6 +318,7 @@ stg_killThreadzh (P_ target, P_ exception) return (); } else { StgTSO_why_blocked(CurrentTSO) = BlockedOnMsgThrowTo; + updateRemembSetPushPtr(StgTSO_block_info(CurrentTSO)); StgTSO_block_info(CurrentTSO) = msg; // we must block, and unlock the message before returning jump stg_block_throwto (target, exception); @@ -489,6 +490,8 @@ retry_pop_stack: ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr"); ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr"); + // No need to push `trec` to update remembered set; it will be no longer + // reachable after we overwrite StgTSO.trec. StgTSO_trec(CurrentTSO) = NO_TREC; if (r != 0) { // Transaction was valid: continue searching for a catch frame @@ -607,6 +610,8 @@ retry_pop_stack: outer = StgTRecHeader_enclosing_trec(trec); ccall stmAbortTransaction(MyCapability() "ptr", trec "ptr"); ccall stmFreeAbortedTRec(MyCapability() "ptr", trec "ptr"); + // No need to push `trec` to update remembered set since we just freed + // it; it is no longer reachable. StgTSO_trec(CurrentTSO) = outer; Sp = Sp + SIZEOF_StgCatchSTMFrame; } diff --git a/rts/Messages.c b/rts/Messages.c index d878db5eda..b16ada53e1 100644 --- a/rts/Messages.c +++ b/rts/Messages.c @@ -244,8 +244,8 @@ loop: // a barrier is necessary to ensure that all writes are visible. // See Note [Heap memory barriers] in SMP.h. write_barrier(); + dirty_TSO(cap, owner); // we will modify owner->bq owner->bq = bq; - dirty_TSO(cap, owner); // we modified owner->bq // If the owner of the blackhole is currently runnable, then // bump it to the front of the run queue. This gives the @@ -262,6 +262,9 @@ loop: // point to the BLOCKING_QUEUE from the BLACKHOLE write_barrier(); // make the BQ visible, see Note [Heap memory barriers]. + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + updateRemembSetPushClosure(cap, (StgClosure*)p); + } ((StgInd*)bh)->indirectee = (StgClosure *)bq; recordClosureMutated(cap,bh); // bh was mutated @@ -290,6 +293,11 @@ loop: } #endif + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + // We are about to overwrite bq->queue; make sure its current value + // makes it into the update remembered set + updateRemembSetPushClosure(cap, (StgClosure*)bq->queue); + } msg->link = bq->queue; bq->queue = msg; // No barrier is necessary here: we are only exposing the diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index a2ab3de586..6d3df0700c 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -349,8 +349,13 @@ stg_casArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new ) // Compare and Swap Succeeded: SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, CCCS); len = StgMutArrPtrs_ptrs(arr); + // The write barrier. We must write a byte into the mark table: I8[arr + SIZEOF_StgMutArrPtrs + WDS(len) + (ind >> MUT_ARR_PTRS_CARD_BITS )] = 1; + + // Concurrent GC write barrier + updateRemembSetPushPtr(old); + return (0,new); } } @@ -462,16 +467,45 @@ stg_thawSmallArrayzh ( gcptr src, W_ offset, W_ n ) cloneSmallArray(stg_SMALL_MUT_ARR_PTRS_DIRTY_info, src, offset, n) } +// Concurrent GC write barrier for pointer array copies +// +// hdr_size in bytes. dst_off in words, n in words. +stg_copyArray_barrier ( W_ hdr_size, gcptr dst, W_ dst_off, W_ n) +{ + W_ end, p; + ASSERT(n > 0); // Assumes n==0 is handled by caller + p = dst + hdr_size + WDS(dst_off); + end = p + WDS(n); + +again: + IF_WRITE_BARRIER_ENABLED { + ccall updateRemembSetPushClosure_(BaseReg "ptr", W_[p] "ptr"); + } + p = p + WDS(1); + if (p < end) { + goto again; + } + + return (); +} + stg_copySmallArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n) { W_ dst_p, src_p, bytes; - SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + if (n > 0) { + IF_WRITE_BARRIER_ENABLED { + call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs, + dst, dst_off, n); + } - dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); - src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); - bytes = WDS(n); - prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + + dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); + src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); + bytes = WDS(n); + prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + } return (); } @@ -480,15 +514,22 @@ stg_copySmallMutableArrayzh ( gcptr src, W_ src_off, gcptr dst, W_ dst_off, W_ n { W_ dst_p, src_p, bytes; - SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + if (n > 0) { + IF_WRITE_BARRIER_ENABLED { + call stg_copyArray_barrier(SIZEOF_StgSmallMutArrPtrs, + dst, dst_off, n); + } - dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); - src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); - bytes = WDS(n); - if (src == dst) { - prim %memmove(dst_p, src_p, bytes, SIZEOF_W); - } else { - prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + SET_INFO(dst, stg_SMALL_MUT_ARR_PTRS_DIRTY_info); + + dst_p = dst + SIZEOF_StgSmallMutArrPtrs + WDS(dst_off); + src_p = src + SIZEOF_StgSmallMutArrPtrs + WDS(src_off); + bytes = WDS(n); + if (src == dst) { + prim %memmove(dst_p, src_p, bytes, SIZEOF_W); + } else { + prim %memcpy(dst_p, src_p, bytes, SIZEOF_W); + } } return (); @@ -510,6 +551,10 @@ stg_casSmallArrayzh ( gcptr arr, W_ ind, gcptr old, gcptr new ) } else { // Compare and Swap Succeeded: SET_HDR(arr, stg_SMALL_MUT_ARR_PTRS_DIRTY_info, CCCS); + + // Concurrent GC write barrier + updateRemembSetPushPtr(old); + return (0,new); } } @@ -549,7 +594,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new ) return (1,h); } else { if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old); } return (0,new); } @@ -562,7 +607,7 @@ stg_casMutVarzh ( gcptr mv, gcptr old, gcptr new ) } else { StgMutVar_var(mv) = new; if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", old); } return (0,new); } @@ -629,11 +674,12 @@ stg_atomicModifyMutVar2zh ( gcptr mv, gcptr f ) (h) = prim %cmpxchgW(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y); if (h != x) { goto retry; } #else + h = StgMutVar_var(mv); StgMutVar_var(mv) = y; #endif if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr"); + ccall dirty_MUT_VAR(BaseReg "ptr", mv "ptr", h); } return (x,z); @@ -755,6 +801,9 @@ stg_addCFinalizzerToWeakzh ( W_ fptr, // finalizer return (0); } + // Write barrier for concurrent non-moving collector + updateRemembSetPushPtr(StgWeak_cfinalizers(w)) + StgCFinalizerList_link(c) = StgWeak_cfinalizers(w); StgWeak_cfinalizers(w) = c; @@ -835,6 +884,8 @@ stg_deRefWeakzh ( gcptr w ) if (info == stg_WEAK_info) { code = 1; val = StgWeak_value(w); + // See Note [Concurrent read barrier on deRefWeak#] in NonMovingMark.c + updateRemembSetPushPtr(val); } else { code = 0; val = w; @@ -1501,7 +1552,7 @@ stg_takeMVarzh ( P_ mvar /* :: MVar a */ ) */ if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } // We want to put the heap check down here in the slow path, @@ -1547,6 +1598,9 @@ loop: // If the MVar is not already dirty, then we don't need to make // it dirty, as it is empty with nothing blocking on it. unlockClosure(mvar, info); + // However, we do need to ensure that the nonmoving collector + // knows about the reference to the value that we just removed... + updateRemembSetPushPtr(val); return (val); } qinfo = StgHeader_info(q); @@ -1560,7 +1614,7 @@ loop: // There are putMVar(s) waiting... wake up the first thread on the queue if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr"); } tso = StgMVarTSOQueue_tso(q); @@ -1629,7 +1683,7 @@ loop: // There are putMVar(s) waiting... wake up the first thread on the queue if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", val "ptr"); } tso = StgMVarTSOQueue_tso(q); @@ -1667,7 +1721,7 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */ if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } // We want to put the heap check down here in the slow path, @@ -1701,14 +1755,20 @@ stg_putMVarzh ( P_ mvar, /* :: MVar a */ jump stg_block_putmvar(mvar,val); } + // We are going to mutate the closure, make sure its current pointers + // are marked. + if (info == stg_MVAR_CLEAN_info) { + ccall update_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); + } + q = StgMVar_head(mvar); loop: if (q == stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ + StgMVar_value(mvar) = val; if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } - StgMVar_value(mvar) = val; unlockClosure(mvar, stg_MVAR_DIRTY_info); return (); } @@ -1790,7 +1850,7 @@ loop: if (q == stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar) "ptr"); } StgMVar_value(mvar) = val; @@ -1861,7 +1921,7 @@ stg_readMVarzh ( P_ mvar, /* :: MVar a */ ) if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) { if (info == stg_MVAR_CLEAN_info) { - ccall dirty_MVAR(BaseReg "ptr", mvar "ptr"); + ccall dirty_MVAR(BaseReg "ptr", mvar "ptr", StgMVar_value(mvar)); } ALLOC_PRIM_WITH_CUSTOM_FAILURE diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c index 807c3e3d30..50cddff051 100644 --- a/rts/RaiseAsync.c +++ b/rts/RaiseAsync.c @@ -515,9 +515,9 @@ blockedThrowTo (Capability *cap, StgTSO *target, MessageThrowTo *msg) ASSERT(target->cap == cap); + dirty_TSO(cap,target); // we will modify the blocked_exceptions queue msg->link = target->blocked_exceptions; target->blocked_exceptions = msg; - dirty_TSO(cap,target); // we modified the blocked_exceptions queue } /* ----------------------------------------------------------------------------- diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c index ce0fa2d519..d0d08a2495 100644 --- a/rts/RtsStartup.c +++ b/rts/RtsStartup.c @@ -392,7 +392,8 @@ hs_exit_(bool wait_foreign) ioManagerDie(); #endif - /* stop all running tasks */ + /* stop all running tasks. This is also where we stop concurrent non-moving + * collection if it's running */ exitScheduler(wait_foreign); /* run C finalizers for all active weak pointers */ @@ -436,9 +437,6 @@ hs_exit_(bool wait_foreign) /* shutdown the hpc support (if needed) */ exitHpc(); - /* wait for any on-going concurrent GC to finish */ - nonmovingExit(); - // clean up things from the storage manager's point of view. // also outputs the stats (+RTS -s) info. exitStorage(); diff --git a/rts/RtsSymbols.c b/rts/RtsSymbols.c index e34fcf03f5..e64c78fbcc 100644 --- a/rts/RtsSymbols.c +++ b/rts/RtsSymbols.c @@ -14,6 +14,7 @@ #include "HsFFI.h" #include "sm/Storage.h" +#include "sm/NonMovingMark.h" #include #if !defined(mingw32_HOST_OS) @@ -682,6 +683,9 @@ SymI_HasProto(stg_shrinkMutableByteArrayzh) \ SymI_HasProto(stg_resizzeMutableByteArrayzh) \ SymI_HasProto(newSpark) \ + SymI_HasProto(updateRemembSetPushThunk) \ + SymI_HasProto(updateRemembSetPushThunk_) \ + SymI_HasProto(updateRemembSetPushClosure_) \ SymI_HasProto(performGC) \ SymI_HasProto(performMajorGC) \ SymI_HasProto(prog_argc) \ @@ -1037,6 +1041,7 @@ RtsSymbolVal rtsSyms[] = { RTS_OPENBSD_ONLY_SYMBOLS RTS_LIBGCC_SYMBOLS RTS_LIBFFI_SYMBOLS + SymI_HasDataProto(nonmoving_write_barrier_enabled) #if defined(darwin_HOST_OS) && defined(i386_HOST_ARCH) // dyld stub code contains references to this, // but it should never be called because we treat diff --git a/rts/STM.c b/rts/STM.c index dc0b0ebb78..c17f33aaa0 100644 --- a/rts/STM.c +++ b/rts/STM.c @@ -182,7 +182,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { TRACE("%p : unlock_stm()", trec); } -static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgClosure *lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -197,12 +198,14 @@ static void unlock_tvar(Capability *cap, StgBool force_update) { TRACE("%p : unlock_tvar(%p)", trec, s); if (force_update) { + StgClosure *old_value = s -> current_value; s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, old_value); } } -static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgBool cond_lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED, StgClosure *expected) { StgClosure *result; @@ -231,7 +234,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { smp_locked = 0; } -static StgClosure *lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgClosure *lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -248,12 +252,14 @@ static void *unlock_tvar(Capability *cap, TRACE("%p : unlock_tvar(%p, %p)", trec, s, c); ASSERT(smp_locked == trec); if (force_update) { + StgClosure *old_value = s -> current_value; s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, old_value); } } -static StgBool cond_lock_tvar(StgTRecHeader *trec STG_UNUSED, +static StgBool cond_lock_tvar(Capability *cap STG_UNUSED, + StgTRecHeader *trec STG_UNUSED, StgTVar *s STG_UNUSED, StgClosure *expected) { StgClosure *result; @@ -279,7 +285,8 @@ static void unlock_stm(StgTRecHeader *trec STG_UNUSED) { TRACE("%p : unlock_stm()", trec); } -static StgClosure *lock_tvar(StgTRecHeader *trec, +static StgClosure *lock_tvar(Capability *cap, + StgTRecHeader *trec, StgTVar *s STG_UNUSED) { StgClosure *result; TRACE("%p : lock_tvar(%p)", trec, s); @@ -289,6 +296,10 @@ static StgClosure *lock_tvar(StgTRecHeader *trec, } while (GET_INFO(UNTAG_CLOSURE(result)) == &stg_TREC_HEADER_info); } while (cas((void *)&(s -> current_value), (StgWord)result, (StgWord)trec) != (StgWord)result); + + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) { + updateRemembSetPushClosure(cap, result); + } return result; } @@ -300,10 +311,11 @@ static void unlock_tvar(Capability *cap, TRACE("%p : unlock_tvar(%p, %p)", trec, s, c); ASSERT(s -> current_value == (StgClosure *)trec); s -> current_value = c; - dirty_TVAR(cap,s); + dirty_TVAR(cap, s, (StgClosure *) trec); } -static StgBool cond_lock_tvar(StgTRecHeader *trec, +static StgBool cond_lock_tvar(Capability *cap, + StgTRecHeader *trec, StgTVar *s, StgClosure *expected) { StgClosure *result; @@ -311,6 +323,9 @@ static StgBool cond_lock_tvar(StgTRecHeader *trec, TRACE("%p : cond_lock_tvar(%p, %p)", trec, s, expected); w = cas((void *)&(s -> current_value), (StgWord)expected, (StgWord)trec); result = (StgClosure *)w; + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled && result)) { + updateRemembSetPushClosure(cap, expected); + } TRACE("%p : %s", trec, result ? "success" : "failure"); return (result == expected); } @@ -525,7 +540,7 @@ static void build_watch_queue_entries_for_trec(Capability *cap, } s -> first_watch_queue_entry = q; e -> new_value = (StgClosure *) q; - dirty_TVAR(cap,s); // we modified first_watch_queue_entry + dirty_TVAR(cap, s, (StgClosure *) fq); // we modified first_watch_queue_entry }); } @@ -545,7 +560,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap, StgTVarWatchQueue *q; StgClosure *saw; s = e -> tvar; - saw = lock_tvar(trec, s); + saw = lock_tvar(cap, trec, s); q = (StgTVarWatchQueue *) (e -> new_value); TRACE("%p : removing tso=%p from watch queue for tvar=%p", trec, @@ -562,7 +577,7 @@ static void remove_watch_queue_entries_for_trec(Capability *cap, } else { ASSERT(s -> first_watch_queue_entry == q); s -> first_watch_queue_entry = nq; - dirty_TVAR(cap,s); // we modified first_watch_queue_entry + dirty_TVAR(cap, s, (StgClosure *) q); // we modified first_watch_queue_entry } free_stg_tvar_watch_queue(cap, q); unlock_tvar(cap, trec, s, saw, false); @@ -773,7 +788,7 @@ static StgBool validate_and_acquire_ownership (Capability *cap, s = e -> tvar; if (acquire_all || entry_is_update(e)) { TRACE("%p : trying to acquire %p", trec, s); - if (!cond_lock_tvar(trec, s, e -> expected_value)) { + if (!cond_lock_tvar(cap, trec, s, e -> expected_value)) { TRACE("%p : failed to acquire %p", trec, s); result = false; BREAK_FOR_EACH; diff --git a/rts/Schedule.c b/rts/Schedule.c index 8d7acc963e..8d82daf381 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -44,6 +44,8 @@ #include "StablePtr.h" #include "StableName.h" #include "TopHandler.h" +#include "sm/NonMoving.h" +#include "sm/NonMovingMark.h" #if defined(HAVE_SYS_TYPES_H) #include @@ -2497,7 +2499,11 @@ resumeThread (void *task_) tso = incall->suspended_tso; incall->suspended_tso = NULL; incall->suspended_cap = NULL; - tso->_link = END_TSO_QUEUE; // no write barrier reqd + // we will modify tso->_link + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + updateRemembSetPushClosure(cap, (StgClosure *)tso->_link); + } + tso->_link = END_TSO_QUEUE; traceEventRunThread(cap, tso); @@ -2671,6 +2677,8 @@ initScheduler(void) /* Initialise the mutex and condition variables used by * the scheduler. */ initMutex(&sched_mutex); + initMutex(&sync_finished_mutex); + initCondition(&sync_finished_cond); #endif ACQUIRE_LOCK(&sched_mutex); @@ -2706,6 +2714,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS) // If we haven't killed all the threads yet, do it now. if (sched_state < SCHED_SHUTTING_DOWN) { sched_state = SCHED_INTERRUPTING; + nonmovingExit(); Capability *cap = task->cap; waitForCapability(&cap,task); scheduleDoGC(&cap,task,true); diff --git a/rts/StableName.c b/rts/StableName.c index 383d87e3db..4b26fee396 100644 --- a/rts/StableName.c +++ b/rts/StableName.c @@ -263,6 +263,9 @@ threadStableNameTable( evac_fn evac, void *user ) void gcStableNameTable( void ) { + // We must take the stable name lock lest we race with the nonmoving + // collector (namely nonmovingSweepStableNameTable). + stableNameLock(); FOR_EACH_STABLE_NAME( p, { // FOR_EACH_STABLE_NAME traverses free entries too, so @@ -286,6 +289,7 @@ gcStableNameTable( void ) } } }); + stableNameUnlock(); } /* ----------------------------------------------------------------------------- diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c index cccc7ad0b0..5cbb3f5595 100644 --- a/rts/ThreadPaused.c +++ b/rts/ThreadPaused.c @@ -334,6 +334,17 @@ threadPaused(Capability *cap, StgTSO *tso) } #endif + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled + && ip_THUNK(INFO_PTR_TO_STRUCT(bh_info)))) { + // We are about to replace a thunk with a blackhole. + // Add the free variables of the closure we are about to + // overwrite to the update remembered set. + // N.B. We caught the WHITEHOLE case above. + updateRemembSetPushThunkEager(cap, + THUNK_INFO_PTR_TO_STRUCT(bh_info), + (StgThunk *) bh); + } + // The payload of the BLACKHOLE points to the TSO ((StgInd *)bh)->indirectee = (StgClosure *)tso; write_barrier(); diff --git a/rts/Threads.c b/rts/Threads.c index 3d5b463051..2b11a1eb90 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -86,6 +86,7 @@ createThread(Capability *cap, W_ size) stack->stack_size = stack_size - sizeofW(StgStack); stack->sp = stack->stack + stack->stack_size; stack->dirty = STACK_DIRTY; + stack->marking = 0; tso = (StgTSO *)allocate(cap, sizeofW(StgTSO)); TICK_ALLOC_TSO(); @@ -611,6 +612,7 @@ threadStackOverflow (Capability *cap, StgTSO *tso) TICK_ALLOC_STACK(chunk_size); new_stack->dirty = 0; // begin clean, we'll mark it dirty below + new_stack->marking = 0; new_stack->stack_size = chunk_size - sizeofW(StgStack); new_stack->sp = new_stack->stack + new_stack->stack_size; @@ -721,9 +723,17 @@ threadStackUnderflow (Capability *cap, StgTSO *tso) barf("threadStackUnderflow: not enough space for return values"); } - new_stack->sp -= retvals; + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + // ensure that values that we copy into the new stack are marked + // for the nonmoving collector. Note that these values won't + // necessarily form a full closure so we need to handle them + // specially. + for (unsigned int i = 0; i < retvals; i++) { + updateRemembSetPushClosure(cap, (StgClosure *) old_stack->sp[i]); + } + } - memcpy(/* dest */ new_stack->sp, + memcpy(/* dest */ new_stack->sp - retvals, /* src */ old_stack->sp, /* size */ retvals * sizeof(W_)); } @@ -735,8 +745,12 @@ threadStackUnderflow (Capability *cap, StgTSO *tso) // restore the stack parameters, and update tot_stack_size tso->tot_stack_size -= old_stack->stack_size; - // we're about to run it, better mark it dirty + // we're about to run it, better mark it dirty. + // + // N.B. the nonmoving collector may mark the stack, meaning that sp must + // point at a valid stack frame. dirty_STACK(cap, new_stack); + new_stack->sp -= retvals; return retvals; } @@ -768,7 +782,7 @@ loop: if (q == (StgMVarTSOQueue*)&stg_END_TSO_QUEUE_closure) { /* No further takes, the MVar is now full. */ if (info == &stg_MVAR_CLEAN_info) { - dirty_MVAR(&cap->r, (StgClosure*)mvar); + dirty_MVAR(&cap->r, (StgClosure*)mvar, mvar->value); } mvar->value = value; diff --git a/rts/Updates.h b/rts/Updates.h index 1bd3e065af..84d9162868 100644 --- a/rts/Updates.h +++ b/rts/Updates.h @@ -50,6 +50,9 @@ \ prim_write_barrier; \ OVERWRITING_CLOSURE(p1); \ + IF_WRITE_BARRIER_ENABLED { \ + ccall updateRemembSetPushThunk_(BaseReg, p1 "ptr"); \ + } \ StgInd_indirectee(p1) = p2; \ prim_write_barrier; \ SET_INFO(p1, stg_BLACKHOLE_info); \ @@ -62,7 +65,7 @@ } else { \ TICK_UPD_NEW_IND(); \ and_then; \ - } + } #else /* !CMINUSMINUS */ @@ -78,6 +81,9 @@ INLINE_HEADER void updateWithIndirection (Capability *cap, /* See Note [Heap memory barriers] in SMP.h */ write_barrier(); OVERWRITING_CLOSURE(p1); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + updateRemembSetPushThunk(cap, (StgThunk*)p1); + } ((StgInd *)p1)->indirectee = p2; write_barrier(); SET_INFO(p1, &stg_BLACKHOLE_info); diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in index 7aad5e4385..2c28426d75 100644 --- a/rts/rts.cabal.in +++ b/rts/rts.cabal.in @@ -139,6 +139,7 @@ library rts/Linker.h rts/Main.h rts/Messages.h + rts/NonMoving.h rts/OSThreads.h rts/Parallel.h rts/PrimFloat.h diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index f383949ebf..6bccf7f100 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -33,6 +33,18 @@ static void nonmovingBumpEpoch(void) { nonmovingMarkEpoch = nonmovingMarkEpoch == 1 ? 2 : 1; } +#if defined(THREADED_RTS) +/* + * This mutex ensures that only one non-moving collection is active at a time. + */ +Mutex nonmoving_collection_mutex; + +OSThreadId mark_thread; +bool concurrent_coll_running = false; +Condition concurrent_coll_finished; +Mutex concurrent_coll_finished_lock; +#endif + /* * Note [Non-moving garbage collector] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -76,13 +88,12 @@ static void nonmovingBumpEpoch(void) { memcount nonmoving_live_words = 0; +#if defined(THREADED_RTS) +static void* nonmovingConcurrentMark(void *mark_queue); +#endif static void nonmovingClearBitmap(struct NonmovingSegment *seg); static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads); -/* Signals to mutators that they should stop to synchronize with the nonmoving - * collector so it can proceed to sweep phase. */ -bool nonmoving_syncing = false; - static void nonmovingInitSegment(struct NonmovingSegment *seg, uint8_t block_size) { seg->link = NULL; @@ -283,28 +294,38 @@ static struct NonmovingAllocator *alloc_nonmoving_allocator(uint32_t n_caps) void nonmovingInit(void) { if (! RtsFlags.GcFlags.useNonmoving) return; +#if defined(THREADED_RTS) + initMutex(&nonmoving_collection_mutex); + initCondition(&concurrent_coll_finished); + initMutex(&concurrent_coll_finished_lock); +#endif for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities); } + nonmovingMarkInitUpdRemSet(); } void nonmovingExit(void) { if (! RtsFlags.GcFlags.useNonmoving) return; +#if defined(THREADED_RTS) + if (mark_thread) { + debugTrace(DEBUG_nonmoving_gc, + "waiting for nonmoving collector thread to terminate"); + ACQUIRE_LOCK(&concurrent_coll_finished_lock); + waitCondition(&concurrent_coll_finished, &concurrent_coll_finished_lock); + } + + closeMutex(&concurrent_coll_finished_lock); + closeCondition(&concurrent_coll_finished); + closeMutex(&nonmoving_collection_mutex); +#endif + for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { stgFree(nonmovingHeap.allocators[i]); } } -/* - * Wait for any concurrent collections to finish. Called during shutdown to - * ensure we don't steal capabilities that the nonmoving collector still has yet - * to synchronize with. - */ -void nonmovingWaitUntilFinished(void) -{ -} - /* * Assumes that no garbage collector or mutator threads are running to safely * resize the nonmoving_allocators. @@ -443,6 +464,14 @@ static void nonmovingMarkWeakPtrList(MarkQueue *mark_queue, StgWeak *dead_weak_p void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads) { +#if defined(THREADED_RTS) + // We can't start a new collection until the old one has finished + // We also don't run in final GC + if (concurrent_coll_running || sched_state > SCHED_RUNNING) { + return; + } +#endif + resizeGenerations(); nonmovingPrepareMark(); @@ -501,9 +530,26 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads) // those lists to mark function in sequential case. In concurrent case we // allocate fresh lists. +#if defined(THREADED_RTS) + // If we're interrupting or shutting down, do not let this capability go and + // run a STW collection. Reason: we won't be able to acquire this capability + // again for the sync if we let it go, because it'll immediately start doing + // a major GC, becuase that's what we do when exiting scheduler (see + // exitScheduler()). + if (sched_state == SCHED_RUNNING) { + concurrent_coll_running = true; + nonmoving_write_barrier_enabled = true; + debugTrace(DEBUG_nonmoving_gc, "Starting concurrent mark thread"); + createOSThread(&mark_thread, "non-moving mark thread", + nonmovingConcurrentMark, mark_queue); + } else { + nonmovingConcurrentMark(mark_queue); + } +#else // Use the weak and thread lists from the preparation for any new weaks and // threads found to be dead in mark. nonmovingMark_(mark_queue, dead_weaks, resurrected_threads); +#endif } /* Mark mark queue, threads, and weak pointers until no more weaks have been @@ -523,13 +569,70 @@ static void nonmovingMarkThreadsWeaks(MarkQueue *mark_queue) } } +#if defined(THREADED_RTS) +static void* nonmovingConcurrentMark(void *data) +{ + MarkQueue *mark_queue = (MarkQueue*)data; + StgWeak *dead_weaks = NULL; + StgTSO *resurrected_threads = (StgTSO*)&stg_END_TSO_QUEUE_closure; + nonmovingMark_(mark_queue, &dead_weaks, &resurrected_threads); + return NULL; +} + +// TODO: Not sure where to put this function. +// Append w2 to the end of w1. +static void appendWeakList( StgWeak **w1, StgWeak *w2 ) +{ + while (*w1) { + w1 = &(*w1)->link; + } + *w1 = w2; +} +#endif + static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO **resurrected_threads) { + ACQUIRE_LOCK(&nonmoving_collection_mutex); debugTrace(DEBUG_nonmoving_gc, "Starting mark..."); // Do concurrent marking; most of the heap will get marked here. nonmovingMarkThreadsWeaks(mark_queue); +#if defined(THREADED_RTS) + Task *task = newBoundTask(); + + // If at this point if we've decided to exit then just return + if (sched_state > SCHED_RUNNING) { + // Note that we break our invariants here and leave segments in + // nonmovingHeap.sweep_list, don't free nonmoving_large_objects etc. + // However because we won't be running mark-sweep in the final GC this + // is OK. + + // This is a RTS shutdown so we need to move our copy (snapshot) of + // weaks (nonmoving_old_weak_ptr_list and nonmoving_weak_ptr_list) to + // oldest_gen->threads to be able to run C finalizers in hs_exit_. Note + // that there may be more weaks added to oldest_gen->threads since we + // started mark, so we need to append our list to the tail of + // oldest_gen->threads. + appendWeakList(&nonmoving_old_weak_ptr_list, nonmoving_weak_ptr_list); + appendWeakList(&oldest_gen->weak_ptr_list, nonmoving_old_weak_ptr_list); + // These lists won't be used again so this is not necessary, but still + nonmoving_old_weak_ptr_list = NULL; + nonmoving_weak_ptr_list = NULL; + + goto finish; + } + + // We're still running, request a sync + nonmovingBeginFlush(task); + + bool all_caps_syncd; + do { + all_caps_syncd = nonmovingWaitForFlush(); + nonmovingMarkThreadsWeaks(mark_queue); + } while (!all_caps_syncd); +#endif + nonmovingResurrectThreads(mark_queue, resurrected_threads); // No more resurrecting threads after this point @@ -555,6 +658,18 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * debugTrace(DEBUG_nonmoving_gc, "Done marking, resurrecting threads before releasing capabilities"); + + // Schedule finalizers and resurrect threads +#if defined(THREADED_RTS) + // Just pick a random capability. Not sure if this is a good idea -- we use + // only one capability for all finalizers. + scheduleFinalizers(capabilities[0], *dead_weaks); + // Note that this mutates heap and causes running write barriers. + // See Note [Unintentional marking in resurrectThreads] in NonMovingMark.c + // for how we deal with this. + resurrectThreads(*resurrected_threads); +#endif + #if defined(DEBUG) // Zap CAFs that we will sweep nonmovingGcCafs(mark_queue); @@ -586,6 +701,12 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * nonmoving_old_weak_ptr_list = NULL; } + // Everything has been marked; allow the mutators to proceed +#if defined(THREADED_RTS) + nonmoving_write_barrier_enabled = false; + nonmovingFinishFlush(task); +#endif + current_mark_queue = NULL; freeMarkQueue(mark_queue); stgFree(mark_queue); @@ -609,6 +730,20 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * debugTrace(DEBUG_nonmoving_gc, "Finished sweeping."); // TODO: Remainder of things done by GarbageCollect (update stats) + +#if defined(THREADED_RTS) +finish: + boundTaskExiting(task); + + // We are done... + mark_thread = 0; + + // Signal that the concurrent collection is finished, allowing the next + // non-moving collection to proceed + concurrent_coll_running = false; + signalCondition(&concurrent_coll_finished); + RELEASE_LOCK(&nonmoving_collection_mutex); +#endif } #if defined(DEBUG) @@ -817,6 +952,31 @@ void locate_object(P_ obj) return; } } + + // Search workspaces FIXME only works in non-threaded runtime +#if !defined(THREADED_RTS) + for (uint32_t g = 0; g < RtsFlags.GcFlags.generations - 1; ++ g) { + gen_workspace *ws = &gct->gens[g]; + for (bdescr *blk = ws->todo_bd; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " todo bds\n", obj, g); + return; + } + } + for (bdescr *blk = ws->scavd_list; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " scavd bds\n", obj, g); + return; + } + } + for (bdescr *blk = ws->todo_large_objects; blk; blk = blk->link) { + if (obj >= blk->start && obj < blk->free) { + debugBelch("%p is in generation %" FMT_Word32 " todo large bds\n", obj, g); + return; + } + } + } +#endif } void nonmovingPrintSweepList() diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h index a031f3d223..21c69b1ca1 100644 --- a/rts/sm/NonMoving.h +++ b/rts/sm/NonMoving.h @@ -95,7 +95,6 @@ extern memcount nonmoving_live_words; void nonmovingInit(void); void nonmovingExit(void); -void nonmovingWaitUntilFinished(void); // dead_weaks and resurrected_threads lists are used for two things: diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index cf1950471e..b273b09b05 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -67,6 +67,14 @@ bdescr *nonmoving_large_objects = NULL; bdescr *nonmoving_marked_large_objects = NULL; memcount n_nonmoving_large_blocks = 0; memcount n_nonmoving_marked_large_blocks = 0; +#if defined(THREADED_RTS) +/* Protects everything above. Furthermore, we only set the BF_MARKED bit of + * large object blocks when this is held. This ensures that the write barrier + * (e.g. finish_upd_rem_set_mark) and the collector (mark_closure) don't try to + * move the same large object to nonmoving_marked_large_objects more than once. + */ +static Mutex nonmoving_large_objects_mutex; +#endif /* * Where we keep our threads during collection since we must have a snapshot of @@ -87,11 +95,257 @@ StgWeak *nonmoving_weak_ptr_list = NULL; StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST; #endif +/* Note [Update remembered set] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The concurrent non-moving collector uses a remembered set to ensure + * that its marking is consistent with the snapshot invariant defined in + * the design. This remembered set, known as the update remembered set, + * records all pointers that have been overwritten since the beginning + * of the concurrent mark. This ensures that concurrent mutation cannot hide + * pointers to live objects from the nonmoving garbage collector. + * + * The update remembered set is maintained via a write barrier that + * is enabled whenever a concurrent mark is active. This write barrier + * can be found in a number of places: + * + * - In rts/Primops.cmm in primops responsible for modifying mutable closures + * (e.g. MVARs, MUT_VARs, etc.) + * + * - In rts/STM.c, where + * + * - In the dirty_* functions found in rts/Storage.c where we dirty MVARs, + * MUT_VARs, TSOs and STACKs. STACK is a somewhat special case, as described + * in Note [StgStack dirtiness flags and concurrent marking] in TSO.h. + * + * - In the code generated by the STG code generator for pointer array writes + * + * There is also a read barrier to handle weak references, as described in + * Note [Concurrent read barrier on deRefWeak#]. + * + * The representation of the update remembered set is the same as that of + * the mark queue. For efficiency, each capability maintains its own local + * accumulator of remembered set entries. When a capability fills its + * accumulator it is linked in to the global remembered set + * (upd_rem_set_block_list), where it is consumed by the mark phase. + * + * The mark phase is responsible for freeing update remembered set block + * allocations. + * + * + * Note [Concurrent read barrier on deRefWeak#] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * In general the non-moving GC assumes that all pointers reachable from a + * marked object are themselves marked (or in the mark queue). However, + * weak pointers are an obvious exception to this rule. In particular, + * deRefWeakPtr# allows the mutator to turn a weak reference into a strong + * reference. This interacts badly with concurrent collection. For + * instance, consider this program: + * + * f :: a -> b -> IO b + * f k v = do + * -- assume that k and v are the only references to the + * -- closures to which they refer. + * weak <- mkWeakPtr k v Nothing + * + * -- N.B. k is now technically dead since the only reference to it is + * -- weak, but we've not yet had a chance to tombstone the WeakPtr + * -- (which will happen in the course of major GC). + * performMajorGC + * -- Now we are running concurrently with the mark... + + * Just x <- deRefWeak weak + * -- We have now introduced a reference to `v`, which will + * -- not be marked as the only reference to `v` when the snapshot was + * -- taken is via a WeakPtr. + * return x + * + */ +static Mutex upd_rem_set_lock; +bdescr *upd_rem_set_block_list = NULL; + +#if defined(THREADED_RTS) +/* Used during the mark/sweep phase transition to track how many capabilities + * have pushed their update remembered sets. Protected by upd_rem_set_lock. + */ +static volatile StgWord upd_rem_set_flush_count = 0; +#endif + + +/* Signaled by each capability when it has flushed its update remembered set */ +static Condition upd_rem_set_flushed_cond; + +/* Indicates to mutators that the write barrier must be respected. Set while + * concurrent mark is running. + */ +StgWord nonmoving_write_barrier_enabled = false; + /* Used to provide the current mark queue to the young generation * collector for scavenging. */ MarkQueue *current_mark_queue = NULL; +/* Initialise update remembered set data structures */ +void nonmovingMarkInitUpdRemSet() { + initMutex(&upd_rem_set_lock); + initCondition(&upd_rem_set_flushed_cond); +#if defined(THREADED_RTS) + initMutex(&nonmoving_large_objects_mutex); +#endif +} + +#if defined(THREADED_RTS) && defined(DEBUG) +static uint32_t markQueueLength(MarkQueue *q); +#endif +static void init_mark_queue_(MarkQueue *queue); + +/* Transfers the given capability's update-remembered set to the global + * remembered set. + * + * Really the argument type should be UpdRemSet* but this would be rather + * inconvenient without polymorphism. + */ +static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset) +{ + if (markQueueIsEmpty(rset)) return; + + // find the tail of the queue + bdescr *start = rset->blocks; + bdescr *end = start; + while (end->link != NULL) + end = end->link; + + // add the blocks to the global remembered set + ACQUIRE_LOCK(&upd_rem_set_lock); + end->link = upd_rem_set_block_list; + upd_rem_set_block_list = start; + RELEASE_LOCK(&upd_rem_set_lock); + + // Reset remembered set + ACQUIRE_SM_LOCK; + init_mark_queue_(rset); + rset->is_upd_rem_set = true; + RELEASE_SM_LOCK; +} + +#if defined(THREADED_RTS) +/* Called by capabilities to flush their update remembered sets when + * synchronising with the non-moving collector as it transitions from mark to + * sweep phase. + */ +void nonmovingFlushCapUpdRemSetBlocks(Capability *cap) +{ + debugTrace(DEBUG_nonmoving_gc, + "Capability %d flushing update remembered set: %d", + cap->no, markQueueLength(&cap->upd_rem_set.queue)); + nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue); + atomic_inc(&upd_rem_set_flush_count, 1); + signalCondition(&upd_rem_set_flushed_cond); + // After this mutation will remain suspended until nonmovingFinishFlush + // releases its capabilities. +} + +/* Request that all capabilities flush their update remembered sets and suspend + * execution until the further notice. + */ +void nonmovingBeginFlush(Task *task) +{ + debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush..."); + upd_rem_set_flush_count = 0; + stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET); + + // XXX: We may have been given a capability via releaseCapability (i.e. a + // task suspended due to a foreign call) in which case our requestSync + // logic won't have been hit. Make sure that everyone so far has flushed. + // Ideally we want to mark asynchronously with syncing. + for (uint32_t i = 0; i < n_capabilities; i++) { + nonmovingFlushCapUpdRemSetBlocks(capabilities[i]); + } +} + +/* Wait until a capability has flushed its update remembered set. Returns true + * if all capabilities have flushed. + */ +bool nonmovingWaitForFlush() +{ + ACQUIRE_LOCK(&upd_rem_set_lock); + debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count); + bool finished = upd_rem_set_flush_count == n_capabilities; + if (!finished) { + waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock); + } + RELEASE_LOCK(&upd_rem_set_lock); + return finished; +} + +/* Note [Unintentional marking in resurrectThreads] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * In both moving and non-moving collectors threads found to be unreachable are + * evacuated/marked and then resurrected with resurrectThreads. resurrectThreads + * raises an exception in the unreachable thread via raiseAsync, which does + * mutations on the heap. These mutations cause adding stuff to UpdRemSet of the + * thread's capability. Here's an example backtrace where this happens: + * + * #0 updateRemembSetPushClosure + * #1 0x000000000072b363 in dirty_TVAR + * #2 0x00000000007162e5 in remove_watch_queue_entries_for_trec + * #3 0x0000000000717098 in stmAbortTransaction + * #4 0x000000000070c6eb in raiseAsync + * #5 0x000000000070b473 in throwToSingleThreaded__ + * #6 0x000000000070b4ab in throwToSingleThreaded + * #7 0x00000000006fce82 in resurrectThreads + * #8 0x00000000007215db in nonmovingMark_ + * #9 0x0000000000721438 in nonmovingConcurrentMark + * #10 0x00007f1ee81cd6db in start_thread + * #11 0x00007f1ee850688f in clone + * + * However we don't really want to run write barriers when calling + * resurrectThreads here, because we're in a GC pause, and overwritten values + * are definitely gone forever (as opposed to being inserted in a marked object + * or kept in registers and used later). + * + * When this happens, if we don't reset the UpdRemSets, what happens is in the + * next mark we see these objects that were added in previous mark's + * resurrectThreads in UpdRemSets, and mark those. This causes keeping + * unreachable objects alive, and effects weak finalization and thread resurrect + * (which rely on things become unreachable). As an example, stm048 fails when + * we get this wrong, because when we do raiseAsync on a thread that was blocked + * on an STM transaction we mutate a TVAR_WATCH_QUEUE, which has a reference to + * the TSO that was running the STM transaction. If the TSO becomes unreachable + * again in the next GC we don't realize this, because it was added to an + * UpdRemSet in the previous GC's mark phase, because of raiseAsync. + * + * To fix this we clear all UpdRemSets in nonmovingFinishFlush, right before + * releasing capabilities. This is somewhat inefficient (we allow adding objects + * to UpdRemSets, only to later reset them), but the only case where we add to + * UpdRemSets during mark is resurrectThreads, and I don't think we do so many + * resurrection in a thread that we fill UpdRemSets and allocate new blocks. So + * pushing an UpdRemSet in this case is really fast, and resetting is even + * faster (we just update a pointer). + * + * TODO (osa): What if we actually marked UpdRemSets in this case, in the mark + * loop? Would that work? Or what would break? + */ + +/* Notify capabilities that the synchronisation is finished; they may resume + * execution. + */ +void nonmovingFinishFlush(Task *task) +{ + // See Note [Unintentional marking in resurrectThreads] + for (uint32_t i = 0; i < n_capabilities; i++) { + reset_upd_rem_set(&capabilities[i]->upd_rem_set); + } + // Also reset upd_rem_set_block_list in case some of the UpdRemSets were + // filled and we flushed them. + freeChain_lock(upd_rem_set_block_list); + upd_rem_set_block_list = NULL; + + debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush..."); + releaseAllCapabilities(n_capabilities, NULL, task); +} +#endif + /********************************************************* * Pushing to either the mark queue or remembered set *********************************************************/ @@ -102,14 +356,18 @@ push (MarkQueue *q, const MarkQueueEnt *ent) // Are we at the end of the block? if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) { // Yes, this block is full. - // allocate a fresh block. - ACQUIRE_SM_LOCK; - bdescr *bd = allocGroup(1); - bd->link = q->blocks; - q->blocks = bd; - q->top = (MarkQueueBlock *) bd->start; - q->top->head = 0; - RELEASE_SM_LOCK; + if (q->is_upd_rem_set) { + nonmovingAddUpdRemSetBlocks(q); + } else { + // allocate a fresh block. + ACQUIRE_SM_LOCK; + bdescr *bd = allocGroup(1); + bd->link = q->blocks; + q->blocks = bd; + q->top = (MarkQueueBlock *) bd->start; + q->top->head = 0; + RELEASE_SM_LOCK; + } } q->top->entries[q->top->head] = *ent; @@ -182,6 +440,183 @@ void push_fun_srt (MarkQueue *q, const StgInfoTable *info) } } +/********************************************************* + * Pushing to the update remembered set + * + * upd_rem_set_push_* functions are directly called by + * mutators and need to check whether the value is in + * non-moving heap. + *********************************************************/ + +// Check if the object is traced by the non-moving collector. This holds in two +// conditions: +// +// - Object is in non-moving heap +// - Object is a large (BF_LARGE) and marked as BF_NONMOVING +// - Object is static (HEAP_ALLOCED_GC(obj) == false) +// +static +bool check_in_nonmoving_heap(StgClosure *p) { + if (HEAP_ALLOCED_GC(p)) { + // This works for both large and small objects: + return Bdescr((P_)p)->flags & BF_NONMOVING; + } else { + return true; // a static object + } +} + +/* Push the free variables of a (now-evaluated) thunk to the + * update remembered set. + */ +inline void updateRemembSetPushThunk(Capability *cap, StgThunk *thunk) +{ + const StgInfoTable *info; + do { + info = get_volatile_itbl((StgClosure *) thunk); + } while (info->type == WHITEHOLE); + updateRemembSetPushThunkEager(cap, (StgThunkInfoTable *) info, thunk); +} + +void updateRemembSetPushThunkEager(Capability *cap, + const StgThunkInfoTable *info, + StgThunk *thunk) +{ + /* N.B. info->i.type mustn't be WHITEHOLE */ + switch (info->i.type) { + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_2_0: + case THUNK_1_1: + case THUNK_0_2: + { + MarkQueue *queue = &cap->upd_rem_set.queue; + push_thunk_srt(queue, &info->i); + + // Don't record the origin of objects living outside of the nonmoving + // heap; we can't perform the selector optimisation on them anyways. + bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk); + + for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) { + if (check_in_nonmoving_heap(thunk->payload[i])) { + push_closure(queue, + thunk->payload[i], + record_origin ? &thunk->payload[i] : NULL); + } + } + break; + } + case AP: + { + MarkQueue *queue = &cap->upd_rem_set.queue; + StgAP *ap = (StgAP *) thunk; + push_closure(queue, ap->fun, &ap->fun); + mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args); + break; + } + case THUNK_SELECTOR: + case BLACKHOLE: + // TODO: This is right, right? + break; + default: + barf("updateRemembSetPushThunk: invalid thunk pushed: p=%p, type=%d", + thunk, info->i.type); + } +} + +void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p) +{ + updateRemembSetPushThunk(regTableToCapability(reg), p); +} + +inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p) +{ + if (!check_in_nonmoving_heap(p)) return; + MarkQueue *queue = &cap->upd_rem_set.queue; + push_closure(queue, p, NULL); +} + +void updateRemembSetPushClosure_(StgRegTable *reg, StgClosure *p) +{ + updateRemembSetPushClosure(regTableToCapability(reg), p); +} + +STATIC_INLINE bool needs_upd_rem_set_mark(StgClosure *p) +{ + // TODO: Deduplicate with mark_closure + bdescr *bd = Bdescr((StgPtr) p); + if (bd->gen != oldest_gen) { + return false; + } else if (bd->flags & BF_LARGE) { + if (! (bd->flags & BF_NONMOVING_SWEEPING)) { + return false; + } else { + return ! (bd->flags & BF_MARKED); + } + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + return nonmovingGetMark(seg, block_idx) != nonmovingMarkEpoch; + } +} + +/* Set the mark bit; only to be called *after* we have fully marked the closure */ +STATIC_INLINE void finish_upd_rem_set_mark(StgClosure *p) +{ + bdescr *bd = Bdescr((StgPtr) p); + if (bd->flags & BF_LARGE) { + // Someone else may have already marked it. + ACQUIRE_LOCK(&nonmoving_large_objects_mutex); + if (! (bd->flags & BF_MARKED)) { + bd->flags |= BF_MARKED; + dbl_link_remove(bd, &nonmoving_large_objects); + dbl_link_onto(bd, &nonmoving_marked_large_objects); + n_nonmoving_large_blocks -= bd->blocks; + n_nonmoving_marked_large_blocks += bd->blocks; + } + RELEASE_LOCK(&nonmoving_large_objects_mutex); + } else { + struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); + nonmoving_block_idx block_idx = nonmovingGetBlockIdx((StgPtr) p); + nonmovingSetMark(seg, block_idx); + } +} + +void updateRemembSetPushTSO(Capability *cap, StgTSO *tso) +{ + if (needs_upd_rem_set_mark((StgClosure *) tso)) { + debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: TSO %p", tso); + mark_tso(&cap->upd_rem_set.queue, tso); + finish_upd_rem_set_mark((StgClosure *) tso); + } +} + +void updateRemembSetPushStack(Capability *cap, StgStack *stack) +{ + // N.B. caller responsible for checking nonmoving_write_barrier_enabled + if (needs_upd_rem_set_mark((StgClosure *) stack)) { + StgWord marking = stack->marking; + // See Note [StgStack dirtiness flags and concurrent marking] + if (cas(&stack->marking, marking, nonmovingMarkEpoch) + != nonmovingMarkEpoch) { + // We have claimed the right to mark the stack. + debugTrace(DEBUG_nonmoving_gc, "upd_rem_set: STACK %p", stack->sp); + mark_stack(&cap->upd_rem_set.queue, stack); + finish_upd_rem_set_mark((StgClosure *) stack); + return; + } else { + // The concurrent GC has claimed the right to mark the stack. + // Wait until it finishes marking before proceeding with + // mutation. + while (needs_upd_rem_set_mark((StgClosure *) stack)); +#if defined(PARALLEL_GC) + busy_wait_nop(); // TODO: Spinning here is unfortunate +#endif + return; + } + } +} + /********************************************************* * Pushing to the mark queue *********************************************************/ @@ -192,8 +627,8 @@ void markQueuePush (MarkQueue *q, const MarkQueueEnt *ent) } void markQueuePushClosure (MarkQueue *q, - StgClosure *p, - StgClosure **origin) + StgClosure *p, + StgClosure **origin) { push_closure(q, p, origin); } @@ -264,7 +699,7 @@ again: } /********************************************************* - * Creating and destroying MarkQueues + * Creating and destroying MarkQueues and UpdRemSets *********************************************************/ /* Must hold sm_mutex. */ @@ -281,22 +716,45 @@ void initMarkQueue (MarkQueue *queue) { init_mark_queue_(queue); queue->marked_objects = allocHashTable(); + queue->is_upd_rem_set = false; +} + +/* Must hold sm_mutex. */ +void init_upd_rem_set (UpdRemSet *rset) +{ + init_mark_queue_(&rset->queue); + // Update remembered sets don't have to worry about static objects + rset->queue.marked_objects = NULL; + rset->queue.is_upd_rem_set = true; +} + +void reset_upd_rem_set (UpdRemSet *rset) +{ + // UpdRemSets always have one block for the mark queue. This assertion is to + // update this code if we change that. + ASSERT(rset->queue.blocks->link == NULL); + rset->queue.top->head = 0; } void freeMarkQueue (MarkQueue *queue) { - bdescr* b = queue->blocks; - ACQUIRE_SM_LOCK; - while (b) - { - bdescr* b_ = b->link; - freeGroup(b); - b = b_; - } - RELEASE_SM_LOCK; + freeChain_lock(queue->blocks); freeHashTable(queue->marked_objects, NULL); } +#if defined(THREADED_RTS) && defined(DEBUG) +static uint32_t +markQueueLength (MarkQueue *q) +{ + uint32_t n = 0; + for (bdescr *block = q->blocks; block; block = block->link) { + MarkQueueBlock *queue = (MarkQueueBlock*)block->start; + n += queue->head; + } + return n; +} +#endif + /********************************************************* * Marking @@ -307,7 +765,8 @@ void freeMarkQueue (MarkQueue *queue) * barrier. Consequently it's quite important that we deeply mark * any outstanding transactions. */ -static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec) +static void +mark_trec_header (MarkQueue *queue, StgTRecHeader *trec) { while (trec != NO_TREC) { StgTRecChunk *chunk = trec->current_chunk; @@ -326,7 +785,8 @@ static void mark_trec_header (MarkQueue *queue, StgTRecHeader *trec) } } -static void mark_tso (MarkQueue *queue, StgTSO *tso) +static void +mark_tso (MarkQueue *queue, StgTSO *tso) { // TODO: Clear dirty if contains only old gen objects @@ -535,7 +995,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) p = UNTAG_CLOSURE(p); # define PUSH_FIELD(obj, field) \ - markQueuePushClosure(queue, \ + markQueuePushClosure(queue, \ (StgClosure *) (obj)->field, \ (StgClosure **) &(obj)->field) @@ -592,7 +1052,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) return; case WHITEHOLE: - while (get_itbl(p)->type == WHITEHOLE); + while (get_volatile_itbl(p)->type == WHITEHOLE); // busy_wait_nop(); // FIXME goto try_again; @@ -608,9 +1068,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) // we moved everything to the non-moving heap before starting the major // collection, we know that we don't need to trace it: it was allocated // after we took our snapshot. - +#if !defined(THREADED_RTS) // This should never happen in the non-concurrent case barf("Closure outside of non-moving heap: %p", p); +#else + return; +#endif } ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info); @@ -878,7 +1341,22 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) case STACK: { // See Note [StgStack dirtiness flags and concurrent marking] StgStack *stack = (StgStack *) p; - mark_stack(queue, stack); + StgWord marking = stack->marking; + + // N.B. stack->marking must be != nonmovingMarkEpoch unless + // someone has already marked it. + if (cas(&stack->marking, marking, nonmovingMarkEpoch) + != nonmovingMarkEpoch) { + // We have claimed the right to mark the stack. + mark_stack(queue, stack); + } else { + // A mutator has already started marking the stack; we just let it + // do its thing and move on. There's no reason to wait; we know that + // the stack will be fully marked before we sweep due to the final + // post-mark synchronization. Most importantly, we do not set its + // mark bit, the mutator is responsible for this. + return; + } break; } @@ -905,8 +1383,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) } case WHITEHOLE: - while (get_itbl(p)->type == WHITEHOLE); - // busy_wait_nop(); // FIXME + while (get_volatile_itbl(p)->type == WHITEHOLE); goto try_again; default: @@ -921,6 +1398,12 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) * mutator waiting for us to finish so it can start execution. */ if (bd->flags & BF_LARGE) { + /* Marking a large object isn't idempotent since we move it to + * nonmoving_marked_large_objects; to ensure that we don't repeatedly + * mark a large object, we only set BF_MARKED on large objects in the + * nonmoving heap while holding nonmoving_large_objects_mutex + */ + ACQUIRE_LOCK(&nonmoving_large_objects_mutex); if (! (bd->flags & BF_MARKED)) { // Remove the object from nonmoving_large_objects and link it to // nonmoving_marked_large_objects @@ -930,6 +1413,7 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) n_nonmoving_marked_large_blocks += bd->blocks; bd->flags |= BF_MARKED; } + RELEASE_LOCK(&nonmoving_large_objects_mutex); } else { // TODO: Kill repetition struct NonmovingSegment *seg = nonmovingGetSegment((StgPtr) p); @@ -947,7 +1431,8 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin) * c. the mark queue has been seeded with a set of roots. * */ -GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue) +GNUC_ATTR_HOT void +nonmovingMark (MarkQueue *queue) { debugTrace(DEBUG_nonmoving_gc, "Starting mark pass"); unsigned int count = 0; @@ -974,9 +1459,23 @@ GNUC_ATTR_HOT void nonmovingMark (MarkQueue *queue) break; } case NULL_ENTRY: - // Nothing more to do - debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count); - return; + // Perhaps the update remembered set has more to mark... + if (upd_rem_set_block_list) { + ACQUIRE_LOCK(&upd_rem_set_lock); + bdescr *old = queue->blocks; + queue->blocks = upd_rem_set_block_list; + queue->top = (MarkQueueBlock *) queue->blocks->start; + upd_rem_set_block_list = NULL; + RELEASE_LOCK(&upd_rem_set_lock); + + ACQUIRE_SM_LOCK; + freeGroup(old); + RELEASE_SM_LOCK; + } else { + // Nothing more to do + debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count); + return; + } } } } diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h index 636f41890c..d7066e56d6 100644 --- a/rts/sm/NonMovingMark.h +++ b/rts/sm/NonMovingMark.h @@ -80,11 +80,23 @@ typedef struct MarkQueue_ { // Cached value of blocks->start. MarkQueueBlock *top; + // Is this a mark queue or a capability-local update remembered set? + bool is_upd_rem_set; + // Marked objects outside of nonmoving heap, namely large and static // objects. HashTable *marked_objects; } MarkQueue; +/* While it shares its representation with MarkQueue, UpdRemSet differs in + * behavior when pushing; namely full chunks are immediately pushed to the + * global update remembered set, not accumulated into a chain. We make this + * distinction apparent in the types. + */ +typedef struct { + MarkQueue queue; +} UpdRemSet; + // The length of MarkQueueBlock.entries #define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) @@ -101,6 +113,22 @@ extern StgIndStatic *debug_caf_list_snapshot; #endif extern MarkQueue *current_mark_queue; +extern bdescr *upd_rem_set_block_list; + +void nonmovingMarkInitUpdRemSet(void); + +void init_upd_rem_set(UpdRemSet *rset); +void reset_upd_rem_set(UpdRemSet *rset); +void updateRemembSetPushThunk(Capability *cap, StgThunk *p); +void updateRemembSetPushTSO(Capability *cap, StgTSO *tso); +void updateRemembSetPushStack(Capability *cap, StgStack *stack); + +#if defined(THREADED_RTS) +void nonmovingFlushCapUpdRemSetBlocks(Capability *cap); +void nonmovingBeginFlush(Task *task); +bool nonmovingWaitForFlush(void); +void nonmovingFinishFlush(Task *task); +#endif void markQueueAddRoot(MarkQueue* q, StgClosure** root); @@ -124,6 +152,9 @@ void markQueuePushClosure_(MarkQueue *q, StgClosure *p); void markQueuePushThunkSrt(MarkQueue *q, const StgInfoTable *info); void markQueuePushFunSrt(MarkQueue *q, const StgInfoTable *info); void markQueuePushArray(MarkQueue *q, const StgMutArrPtrs *array, StgWord start_index); +void updateRemembSetPushThunkEager(Capability *cap, + const StgThunkInfoTable *orig_info, + StgThunk *thunk); INLINE_HEADER bool markQueueIsEmpty(MarkQueue *q) { diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 3e1748d5b6..0724a0059a 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -912,9 +912,11 @@ findMemoryLeak (void) for (i = 0; i < n_capabilities; i++) { markBlocks(gc_threads[i]->free_blocks); markBlocks(capabilities[i]->pinned_object_block); + markBlocks(capabilities[i]->upd_rem_set.queue.blocks); } if (RtsFlags.GcFlags.useNonmoving) { + markBlocks(upd_rem_set_block_list); markBlocks(nonmoving_large_objects); markBlocks(nonmoving_marked_large_objects); for (i = 0; i < NONMOVING_ALLOCA_CNT; i++) { @@ -1054,7 +1056,8 @@ memInventory (bool show) uint32_t g, i; W_ gen_blocks[RtsFlags.GcFlags.generations]; W_ nursery_blocks = 0, retainer_blocks = 0, - arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0; + arena_blocks = 0, exec_blocks = 0, gc_free_blocks = 0, + upd_rem_set_blocks = 0; W_ live_blocks = 0, free_blocks = 0; bool leak; @@ -1099,12 +1102,19 @@ memInventory (bool show) /* count the blocks on the free list */ free_blocks = countFreeList(); + // count UpdRemSet blocks + for (i = 0; i < n_capabilities; ++i) { + upd_rem_set_blocks += countBlocks(capabilities[i]->upd_rem_set.queue.blocks); + } + upd_rem_set_blocks += countBlocks(upd_rem_set_block_list); + live_blocks = 0; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { live_blocks += gen_blocks[g]; } live_blocks += nursery_blocks + - + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks; + + retainer_blocks + arena_blocks + exec_blocks + gc_free_blocks + + upd_rem_set_blocks; #define MB(n) (((double)(n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_))) @@ -1133,6 +1143,8 @@ memInventory (bool show) gc_free_blocks, MB(gc_free_blocks)); debugBelch(" free : %5" FMT_Word " blocks (%6.1lf MB)\n", free_blocks, MB(free_blocks)); + debugBelch(" UpdRemSet : %5" FMT_Word " blocks (%6.1lf MB)\n", + upd_rem_set_blocks, MB(upd_rem_set_blocks)); debugBelch(" total : %5" FMT_Word " blocks (%6.1lf MB)\n", live_blocks + free_blocks, MB(live_blocks+free_blocks)); if (leak) { diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 9fe68e98b7..0199bc58f3 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -281,6 +281,14 @@ void storageAddCapabilities (uint32_t from, uint32_t to) } } + // Initialize NonmovingAllocators and UpdRemSets + if (RtsFlags.GcFlags.useNonmoving) { + nonmovingAddCapabilities(to); + for (i = 0; i < to; ++i) { + init_upd_rem_set(&capabilities[i]->upd_rem_set); + } + } + #if defined(THREADED_RTS) && defined(CC_LLVM_BACKEND) && (CC_SUPPORTS_TLS == 0) newThreadLocalKey(&gctKey); #endif @@ -412,6 +420,22 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf) // successfully claimed by us; overwrite with IND_STATIC #endif + // Push stuff that will become unreachable after updating to UpdRemSet to + // maintain snapshot invariant + const StgInfoTable *orig_info_tbl = INFO_PTR_TO_STRUCT(orig_info); + // OSA: Assertions to make sure my understanding of static thunks is correct + ASSERT(orig_info_tbl->type == THUNK_STATIC); + // Secondly I think static thunks can't have payload: anything that they + // reference should be in SRTs + ASSERT(orig_info_tbl->layout.payload.ptrs == 0); + // Becuase the payload is empty we just push the SRT + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + StgThunkInfoTable *thunk_info = itbl_to_thunk_itbl(orig_info_tbl); + if (thunk_info->i.srt) { + updateRemembSetPushClosure(cap, GET_SRT(thunk_info)); + } + } + // For the benefit of revertCAFs(), save the original info pointer caf->saved_info = orig_info; @@ -1083,6 +1107,27 @@ allocatePinned (Capability *cap, W_ n) Write Barriers -------------------------------------------------------------------------- */ +/* These write barriers on heavily mutated objects serve two purposes: + * + * - Efficient maintenance of the generational invariant: Record whether or not + * we have added a particular mutable object to mut_list as they may contain + * references to younger generations. + * + * - Maintenance of the nonmoving collector's snapshot invariant: Record objects + * which are about to no longer be reachable due to mutation. + * + * In each case we record whether the object has been added to the mutable list + * by way of either the info pointer or a dedicated "dirty" flag. The GC will + * clear this flag and remove the object from mut_list (or rather, not re-add it) + * to if it finds the object contains no references into any younger generation. + * + * Note that all dirty objects will be marked as clean during preparation for a + * concurrent collection. Consequently, we can use the dirtiness flag to determine + * whether or not we need to add overwritten pointers to the update remembered + * set (since we need only write the value prior to the first update to maintain + * the snapshot invariant). + */ + /* This is the write barrier for MUT_VARs, a.k.a. IORefs. A MUT_VAR_CLEAN object is not on the mutable list; a MUT_VAR_DIRTY @@ -1090,25 +1135,39 @@ allocatePinned (Capability *cap, W_ n) and is put on the mutable list. */ void -dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) +dirty_MUT_VAR(StgRegTable *reg, StgMutVar *mvar, StgClosure *old) { Capability *cap = regTableToCapability(reg); // No barrier required here as no other heap object fields are read. See // note [Heap memory barriers] in SMP.h. - if (p->header.info == &stg_MUT_VAR_CLEAN_info) { - p->header.info = &stg_MUT_VAR_DIRTY_info; - recordClosureMutated(cap,p); + if (mvar->header.info == &stg_MUT_VAR_CLEAN_info) { + mvar->header.info = &stg_MUT_VAR_DIRTY_info; + recordClosureMutated(cap, (StgClosure *) mvar); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) { + updateRemembSetPushClosure_(reg, old); + } } } +/* + * This is the write barrier for TVARs. + * old is the pointer that we overwrote, which is required by the concurrent + * garbage collector. Note that we, while StgTVars contain multiple pointers, + * only overwrite one per dirty_TVAR call so we only need to take one old + * pointer argument. + */ void -dirty_TVAR(Capability *cap, StgTVar *p) +dirty_TVAR(Capability *cap, StgTVar *p, + StgClosure *old) { // No barrier required here as no other heap object fields are read. See // note [Heap memory barriers] in SMP.h. if (p->header.info == &stg_TVAR_CLEAN_info) { p->header.info = &stg_TVAR_DIRTY_info; recordClosureMutated(cap,(StgClosure*)p); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled != 0)) { + updateRemembSetPushClosure(cap, old); + } } } @@ -1123,6 +1182,8 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target) if (tso->dirty == 0) { tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) + updateRemembSetPushClosure(cap, (StgClosure *) tso->_link); } tso->_link = target; } @@ -1133,6 +1194,8 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target) if (tso->dirty == 0) { tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) + updateRemembSetPushClosure(cap, (StgClosure *) tso->block_info.prev); } tso->block_info.prev = target; } @@ -1144,15 +1207,47 @@ dirty_TSO (Capability *cap, StgTSO *tso) tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); } + + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) + updateRemembSetPushTSO(cap, tso); } void dirty_STACK (Capability *cap, StgStack *stack) { + // First push to upd_rem_set before we set stack->dirty since we + // the nonmoving collector may already be marking the stack. + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) + updateRemembSetPushStack(cap, stack); + if (! (stack->dirty & STACK_DIRTY)) { stack->dirty = STACK_DIRTY; recordClosureMutated(cap,(StgClosure*)stack); } + +} + +/* + * This is the concurrent collector's write barrier for MVARs. In the other + * write barriers above this is folded into the dirty_* functions. However, in + * the case of MVars we need to separate the acts of adding the MVar to the + * mutable list and adding its fields to the update remembered set. + * + * Specifically, the wakeup loop in stg_putMVarzh wants to freely mutate the + * pointers of the MVar but needs to keep its lock, meaning we can't yet add it + * to the mutable list lest the assertion checking for clean MVars on the + * mutable list would fail. + */ +void +update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val) +{ + Capability *cap = regTableToCapability(reg); + if (RTS_UNLIKELY(nonmoving_write_barrier_enabled)) { + StgMVar *mvar = (StgMVar *) p; + updateRemembSetPushClosure(cap, old_val); + updateRemembSetPushClosure(cap, (StgClosure *) mvar->head); + updateRemembSetPushClosure(cap, (StgClosure *) mvar->tail); + } } /* @@ -1164,9 +1259,11 @@ dirty_STACK (Capability *cap, StgStack *stack) such as Chaneneos and cheap-concurrency. */ void -dirty_MVAR(StgRegTable *reg, StgClosure *p) +dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val) { - recordClosureMutated(regTableToCapability(reg),p); + Capability *cap = regTableToCapability(reg); + update_MVAR(reg, p, old_val); + recordClosureMutated(cap, p); } /* ----------------------------------------------------------------------------- diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h index 08bdb37ba3..cdb9720650 100644 --- a/rts/sm/Storage.h +++ b/rts/sm/Storage.h @@ -47,8 +47,9 @@ extern Mutex sm_mutex; The write barrier for MVARs and TVARs -------------------------------------------------------------------------- */ -void dirty_MVAR(StgRegTable *reg, StgClosure *p); -void dirty_TVAR(Capability *cap, StgTVar *p); +void update_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old_val); +void dirty_MVAR(StgRegTable *reg, StgClosure *p, StgClosure *old); +void dirty_TVAR(Capability *cap, StgTVar *p, StgClosure *old); /* ----------------------------------------------------------------------------- Nursery manipulation -- cgit v1.2.1