 includes/Rts.h                   |   4
 includes/rts/EventLogFormat.h    |  11
 includes/rts/Flags.h             |   1
 libraries/base/GHC/RTS/Flags.hsc |   4
 rts/RtsFlags.c                   |   5
 rts/Schedule.c                   |  23
 rts/Trace.c                      |  53
 rts/Trace.h                      |  21
 rts/eventlog/EventLog.c          |  75
 rts/eventlog/EventLog.h          |  10
 rts/rts.cabal.in                 |   1
 rts/sm/Evac.c                    |  67
 rts/sm/GC.c                      |  35
 rts/sm/GC.h                      |   9
 rts/sm/GCAux.c                   |   4
 rts/sm/NonMoving.c               | 245
 rts/sm/NonMoving.h               |  47
 rts/sm/NonMovingCensus.c         | 129
 rts/sm/NonMovingCensus.h         |  28
 rts/sm/NonMovingMark.c           | 274
 rts/sm/NonMovingMark.h           |  44
 rts/sm/NonMovingScav.c           |  45
 rts/sm/NonMovingSweep.c          | 159
 rts/sm/NonMovingSweep.h          |   6
 rts/sm/Storage.c                 |  70
 25 files changed, 1154 insertions(+), 216 deletions(-)
diff --git a/includes/Rts.h b/includes/Rts.h
index def06de90d..7ad6988f3b 100644
--- a/includes/Rts.h
+++ b/includes/Rts.h
@@ -74,6 +74,10 @@ extern "C" {
 #define RTS_UNREACHABLE abort()
 #endif

+/* Prefetch primitives */
+#define prefetchForRead(ptr) __builtin_prefetch(ptr, 0)
+#define prefetchForWrite(ptr) __builtin_prefetch(ptr, 1)
+
 /* Fix for mingw stat problem (done here so it's early enough) */
 #if defined(mingw32_HOST_OS)
 #define __MSVCRT__ 1
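These macros wrap GCC/Clang's __builtin_prefetch, whose second argument selects a read (0) or write (1) prefetch; later hunks in this patch use them to hide cache misses while walking segment lists. A minimal usage sketch (the list type and loop below are illustrative, not from the patch):

    /* Illustrative only: hint the next node into cache while the current
     * one is processed. __builtin_prefetch never faults, so prefetching a
     * NULL link is harmless. */
    struct node { struct node *link; long payload; };

    static long sum_list(const struct node *n)
    {
        long acc = 0;
        while (n != NULL) {
            prefetchForRead(n->link); /* __builtin_prefetch(n->link, 0) */
            acc += n->payload;
            n = n->link;
        }
        return acc;
    }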
diff --git a/includes/rts/EventLogFormat.h b/includes/rts/EventLogFormat.h
index ad983e70b3..0ffa77a2df 100644
--- a/includes/rts/EventLogFormat.h
+++ b/includes/rts/EventLogFormat.h
@@ -183,12 +183,21 @@
 #define EVENT_USER_BINARY_MSG         181

+#define EVENT_CONC_MARK_BEGIN         200
+#define EVENT_CONC_MARK_END           201
+#define EVENT_CONC_SYNC_BEGIN         202
+#define EVENT_CONC_SYNC_END           203
+#define EVENT_CONC_SWEEP_BEGIN        204
+#define EVENT_CONC_SWEEP_END          205
+#define EVENT_CONC_UPD_REM_SET_FLUSH  206
+#define EVENT_NONMOVING_HEAP_CENSUS   207
+
 /*
  * The highest event code +1 that ghc itself emits. Note that some event
  * ranges higher than this are reserved but not currently emitted by ghc.
  * This must match the size of the EventDesc[] array in EventLog.c
  */
-#define NUM_GHC_EVENT_TAGS        182
+#define NUM_GHC_EVENT_TAGS        208

 #if 0  /* DEPRECATED EVENTS: */
 /* we don't actually need to record the thread, it's implicit */
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index 4499af9da6..9a039fd95c 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -170,6 +170,7 @@ typedef struct _TRACE_FLAGS {
     bool timestamp;      /* show timestamp in stderr output */
     bool scheduler;      /* trace scheduler events */
     bool gc;             /* trace GC events */
+    bool nonmoving_gc;   /* trace nonmoving GC events */
     bool sparks_sampled; /* trace spark events by a sampled method */
     bool sparks_full;    /* trace spark events 100% accurately */
     bool user;           /* trace user events (emitted from Haskell code) */
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index 4778eac397..913344c166 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -292,6 +292,8 @@ data TraceFlags = TraceFlags
     , timestamp      :: Bool -- ^ show timestamp in stderr output
     , traceScheduler :: Bool -- ^ trace scheduler events
     , traceGc        :: Bool -- ^ trace GC events
+    , traceNonmovingGc
+        :: Bool -- ^ trace nonmoving GC heap census samples
     , sparksSampled  :: Bool -- ^ trace spark events by a sampled method
     , sparksFull     :: Bool -- ^ trace spark events 100% accurately
     , user           :: Bool -- ^ trace user events (emitted from Haskell code)
@@ -526,6 +528,8 @@ getTraceFlags = do
             <*> (toBool <$>
                   (#{peek TRACE_FLAGS, gc} ptr :: IO CBool))
             <*> (toBool <$>
+                  (#{peek TRACE_FLAGS, nonmoving_gc} ptr :: IO CBool))
+            <*> (toBool <$>
                   (#{peek TRACE_FLAGS, sparks_sampled} ptr :: IO CBool))
             <*> (toBool <$>
                   (#{peek TRACE_FLAGS, sparks_full} ptr :: IO CBool))
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 7d486824ab..c606d86418 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -222,6 +222,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.TraceFlags.timestamp     = false;
     RtsFlags.TraceFlags.scheduler     = false;
     RtsFlags.TraceFlags.gc            = false;
+    RtsFlags.TraceFlags.nonmoving_gc  = false;
     RtsFlags.TraceFlags.sparks_sampled= false;
     RtsFlags.TraceFlags.sparks_full   = false;
     RtsFlags.TraceFlags.user          = false;
@@ -2131,6 +2132,10 @@ static void read_trace_flags(const char *arg)
             RtsFlags.TraceFlags.gc        = enabled;
             enabled = true;
             break;
+        case 'n':
+            RtsFlags.TraceFlags.nonmoving_gc = enabled;
+            enabled = true;
+            break;
         case 'u':
             RtsFlags.TraceFlags.user      = enabled;
             enabled = true;
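With the parsing case above in place, the new event class becomes user-selectable at runtime. Assuming the usual RTS trace-flag syntax and the -xn flag that selects the nonmoving collector (neither is shown in this patch, so treat the exact spelling as an assumption), an eventlog run with the census events enabled would look something like:

    ./prog +RTS -xn -ln -RTS    # nonmoving collector + 'n' (nonmoving GC) trace class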
diff --git a/rts/Schedule.c b/rts/Schedule.c
index f66a276602..5387bc615e 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -164,7 +164,8 @@ static void scheduleHandleThreadBlocked( StgTSO *t );
 static bool scheduleHandleThreadFinished( Capability *cap, Task *task,
                                           StgTSO *t );
 static bool scheduleNeedHeapProfile(bool ready_to_gc);
-static void scheduleDoGC(Capability **pcap, Task *task, bool force_major);
+static void scheduleDoGC( Capability **pcap, Task *task,
+                          bool force_major, bool deadlock_detect );
 static void deleteThread (StgTSO *tso);
 static void deleteAllThreads (void);
@@ -264,7 +265,7 @@ schedule (Capability *initialCapability, Task *task)
     case SCHED_INTERRUPTING:
         debugTrace(DEBUG_sched, "SCHED_INTERRUPTING");
         /* scheduleDoGC() deletes all the threads */
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);

         // after scheduleDoGC(), we must be shutting down.  Either some
         // other Capability did the final GC, or we did it above,
@@ -561,7 +562,7 @@ run_thread:
     }

     if (ready_to_gc || scheduleNeedHeapProfile(ready_to_gc)) {
-      scheduleDoGC(&cap,task,false);
+      scheduleDoGC(&cap,task,false,false);
     }
   } /* end of while() */
 }
@@ -935,7 +936,7 @@ scheduleDetectDeadlock (Capability **pcap, Task *task)
         // they are unreachable and will therefore be sent an
         // exception.  Any threads thus released will be immediately
         // runnable.
-        scheduleDoGC (pcap, task, true/*force major GC*/);
+        scheduleDoGC (pcap, task, true/*force major GC*/, true/*deadlock detection*/);
         cap = *pcap;
         // when force_major == true. scheduleDoGC sets
         // recent_activity to ACTIVITY_DONE_GC and turns off the timer
@@ -1005,7 +1006,7 @@ scheduleProcessInbox (Capability **pcap USED_IF_THREADS)
     while (!emptyInbox(cap)) {
         // Executing messages might use heap, so we should check for GC.
         if (doYouWantToGC(cap)) {
-            scheduleDoGC(pcap, cap->running_task, false);
+            scheduleDoGC(pcap, cap->running_task, false, false);
             cap = *pcap;
         }
@@ -1552,9 +1553,11 @@ void releaseAllCapabilities(uint32_t n, Capability *keep_cap, Task *task)
  * Perform a garbage collection if necessary
  * -------------------------------------------------------------------------- */

+// N.B. See Note [Deadlock detection under nonmoving collector] for the
+// rationale behind the deadlock_detect argument.
 static void
 scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS,
-              bool force_major)
+              bool force_major, bool deadlock_detect)
 {
     Capability *cap = *pcap;
     bool heap_census;
@@ -1847,9 +1850,9 @@ delete_threads_and_gc:
     // emerge they don't immediately re-enter the GC.
     pending_sync = 0;
     signalCondition(&sync_finished_cond);
-    GarbageCollect(collect_gen, heap_census, gc_type, cap, idle_cap);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, gc_type, cap, idle_cap);
 #else
-    GarbageCollect(collect_gen, heap_census, 0, cap, NULL);
+    GarbageCollect(collect_gen, heap_census, deadlock_detect, 0, cap, NULL);
 #endif

     // If we're shutting down, don't leave any idle GC work to do.
@@ -2717,7 +2720,7 @@ exitScheduler (bool wait_foreign USED_IF_THREADS)
         nonmovingStop();
         Capability *cap = task->cap;
         waitForCapability(&cap,task);
-        scheduleDoGC(&cap,task,true);
+        scheduleDoGC(&cap,task,true,false);
         ASSERT(task->incall->tso == NULL);
         releaseCapability(cap);
     }
@@ -2785,7 +2788,7 @@ performGC_(bool force_major)
     // TODO: do we need to traceTask*() here?
     waitForCapability(&cap,task);
-    scheduleDoGC(&cap,task,force_major);
+    scheduleDoGC(&cap,task,force_major,false);
     releaseCapability(cap);
     boundTaskExiting(task);
 }
diff --git a/rts/Trace.c b/rts/Trace.c
index de647f762b..ecc28d8fec 100644
--- a/rts/Trace.c
+++ b/rts/Trace.c
@@ -30,6 +30,7 @@
 // events
 int TRACE_sched;
 int TRACE_gc;
+int TRACE_nonmoving_gc;
 int TRACE_spark_sampled;
 int TRACE_spark_full;
 int TRACE_user;
@@ -72,6 +73,9 @@ void initTracing (void)
         RtsFlags.GcFlags.giveStats = COLLECT_GC_STATS;
     }

+    TRACE_nonmoving_gc =
+        RtsFlags.TraceFlags.nonmoving_gc;
+
     TRACE_spark_sampled =
         RtsFlags.TraceFlags.sparks_sampled;

@@ -802,6 +806,55 @@ void traceThreadLabel_(Capability *cap,
     }
 }

+void traceConcMarkBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_MARK_BEGIN);
+}
+
+void traceConcMarkEnd(StgWord32 marked_obj_count)
+{
+    if (eventlog_enabled)
+        postConcMarkEnd(marked_obj_count);
+}
+
+void traceConcSyncBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SYNC_BEGIN);
+}
+
+void traceConcSyncEnd()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SYNC_END);
+}
+
+void traceConcSweepBegin()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SWEEP_BEGIN);
+}
+
+void traceConcSweepEnd()
+{
+    if (eventlog_enabled)
+        postEventNoCap(EVENT_CONC_SWEEP_END);
+}
+
+void traceConcUpdRemSetFlush(Capability *cap)
+{
+    if (eventlog_enabled)
+        postConcUpdRemSetFlush(cap);
+}
+
+void traceNonmovingHeapCensus(uint32_t log_blk_size,
+                              const struct NonmovingAllocCensus *census)
+{
+    if (eventlog_enabled && TRACE_nonmoving_gc)
+        postNonmovingHeapCensus(log_blk_size, census);
+}
+
 void traceThreadStatus_ (StgTSO *tso USED_IF_DEBUG)
 {
 #if defined(DEBUG)
diff --git a/rts/Trace.h b/rts/Trace.h
index be4d844a6b..7f72fd8093 100644
--- a/rts/Trace.h
+++ b/rts/Trace.h
@@ -9,6 +9,7 @@
 #pragma once

 #include "rts/EventLogFormat.h"
+#include "sm/NonMovingCensus.h"
 #include "Capability.h"

 #if defined(DTRACE)
@@ -72,6 +73,7 @@ extern int TRACE_spark_sampled;
 extern int TRACE_spark_full;
 /* extern int TRACE_user; */  // only used in Trace.c
 extern int TRACE_cap;
+extern int TRACE_nonmoving_gc;

 // -----------------------------------------------------------------------------
 // Posting events
@@ -304,6 +306,16 @@ void traceHeapProfSampleCostCentre(StgWord8 profile_id,
                                    CostCentreStack *stack, StgWord residency);
 #endif /* PROFILING */

+void traceConcMarkBegin(void);
+void traceConcMarkEnd(StgWord32 marked_obj_count);
+void traceConcSyncBegin(void);
+void traceConcSyncEnd(void);
+void traceConcSweepBegin(void);
+void traceConcSweepEnd(void);
+void traceConcUpdRemSetFlush(Capability *cap);
+void traceNonmovingHeapCensus(uint32_t log_blk_size,
+                              const struct NonmovingAllocCensus *census);
+
 void flushTrace(void);

 #else /* !TRACING */
@@ -344,6 +356,15 @@ void flushTrace(void);
 #define traceHeapProfSampleCostCentre(profile_id, stack, residency) /* nothing */
 #define traceHeapProfSampleString(profile_id, label, residency) /* nothing */

+#define traceConcMarkBegin() /* nothing */
+#define traceConcMarkEnd(marked_obj_count) /* nothing */
+#define traceConcSyncBegin() /* nothing */
+#define traceConcSyncEnd() /* nothing */
+#define traceConcSweepBegin() /* nothing */
+#define traceConcSweepEnd() /* nothing */
+#define traceConcUpdRemSetFlush(cap) /* nothing */
+#define traceNonmovingHeapCensus(blk_size, census) /* nothing */
+
 #define flushTrace() /* nothing */

 #endif /* TRACING */
diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c
index 5c6a1ca48a..8683ad9972 100644
--- a/rts/eventlog/EventLog.c
+++ b/rts/eventlog/EventLog.c
@@ -107,7 +107,15 @@ char *EventDesc[] = {
   [EVENT_HEAP_PROF_SAMPLE_END]  = "End of heap profile sample",
   [EVENT_HEAP_PROF_SAMPLE_STRING] = "Heap profile string sample",
   [EVENT_HEAP_PROF_SAMPLE_COST_CENTRE] = "Heap profile cost-centre sample",
-  [EVENT_USER_BINARY_MSG]       = "User binary message"
+  [EVENT_USER_BINARY_MSG]       = "User binary message",
+  [EVENT_CONC_MARK_BEGIN]       = "Begin concurrent mark phase",
+  [EVENT_CONC_MARK_END]         = "End concurrent mark phase",
+  [EVENT_CONC_SYNC_BEGIN]       = "Begin concurrent GC synchronisation",
+  [EVENT_CONC_SYNC_END]         = "End concurrent GC synchronisation",
+  [EVENT_CONC_SWEEP_BEGIN]      = "Begin concurrent sweep",
+  [EVENT_CONC_SWEEP_END]        = "End concurrent sweep",
+  [EVENT_CONC_UPD_REM_SET_FLUSH] = "Update remembered set flushed",
+  [EVENT_NONMOVING_HEAP_CENSUS] = "Nonmoving heap census"
 };

 // Event type.
@@ -446,6 +454,27 @@ init_event_types(void)
             eventTypes[t].size = EVENT_SIZE_DYNAMIC;
             break;

+        case EVENT_CONC_MARK_BEGIN:
+        case EVENT_CONC_SYNC_BEGIN:
+        case EVENT_CONC_SYNC_END:
+        case EVENT_CONC_SWEEP_BEGIN:
+        case EVENT_CONC_SWEEP_END:
+            eventTypes[t].size = 0;
+            break;
+
+        case EVENT_CONC_MARK_END:
+            eventTypes[t].size = 4;
+            break;
+
+        case EVENT_CONC_UPD_REM_SET_FLUSH: // (cap)
+            eventTypes[t].size =
+                sizeof(EventCapNo);
+            break;
+
+        case EVENT_NONMOVING_HEAP_CENSUS: // (blk_size, active_segs, filled_segs, live_blks)
+            eventTypes[t].size = 13;
+            break;
+
         default:
             continue; /* ignore deprecated events */
         }
@@ -487,8 +516,10 @@ initEventLogging(const EventLogWriter *ev_writer)
     event_log_writer = ev_writer;
     initEventLogWriter();

-    if (sizeof(EventDesc) / sizeof(char*) != NUM_GHC_EVENT_TAGS) {
-        barf("EventDesc array has the wrong number of elements");
+    int num_descs = sizeof(EventDesc) / sizeof(char*);
+    if (num_descs != NUM_GHC_EVENT_TAGS) {
+        barf("EventDesc array has the wrong number of elements (%d, NUM_GHC_EVENT_TAGS=%d)",
+             num_descs, NUM_GHC_EVENT_TAGS);
     }

     /*
@@ -1005,6 +1036,15 @@ void postTaskDeleteEvent (EventTaskId taskId)
 }

 void
+postEventNoCap (EventTypeNum tag)
+{
+    ACQUIRE_LOCK(&eventBufMutex);
+    ensureRoomForEvent(&eventBuf, tag);
+    postEventHeader(&eventBuf, tag);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
+void
 postEvent (Capability *cap, EventTypeNum tag)
 {
     EventsBuf *eb = &capEventBuf[cap->no];
@@ -1130,6 +1170,35 @@ void postThreadLabel(Capability *cap,
     postBuf(eb, (StgWord8*) label, strsize);
 }

+void postConcUpdRemSetFlush(Capability *cap)
+{
+    EventsBuf *eb = &capEventBuf[cap->no];
+    ensureRoomForEvent(eb, EVENT_CONC_UPD_REM_SET_FLUSH);
+    postEventHeader(eb, EVENT_CONC_UPD_REM_SET_FLUSH);
+    postCapNo(eb, cap->no);
+}
+
+void postConcMarkEnd(StgWord32 marked_obj_count)
+{
+    ACQUIRE_LOCK(&eventBufMutex);
+    ensureRoomForEvent(&eventBuf, EVENT_CONC_MARK_END);
+    postEventHeader(&eventBuf, EVENT_CONC_MARK_END);
+    postWord32(&eventBuf, marked_obj_count);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
+void postNonmovingHeapCensus(int log_blk_size,
+                             const struct NonmovingAllocCensus *census)
+{
+    ACQUIRE_LOCK(&eventBufMutex);
+    postEventHeader(&eventBuf, EVENT_NONMOVING_HEAP_CENSUS);
+    postWord8(&eventBuf, log_blk_size);
+    postWord32(&eventBuf, census->n_active_segs);
+    postWord32(&eventBuf, census->n_filled_segs);
+    postWord32(&eventBuf, census->n_live_blocks);
+    RELEASE_LOCK(&eventBufMutex);
+}
+
 void closeBlockMarker (EventsBuf *ebuf)
 {
     if (ebuf->marker)
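The fixed 13-byte size assigned to EVENT_NONMOVING_HEAP_CENSUS above is just the sum of the payload fields that postNonmovingHeapCensus writes; spelled out for reference (layout summary added here, field names from the patch):

    /* EVENT_NONMOVING_HEAP_CENSUS payload (event header and timestamp
     * excluded):
     *
     *   StgWord8   log_blk_size    1 byte
     *   StgWord32  n_active_segs   4 bytes
     *   StgWord32  n_filled_segs   4 bytes
     *   StgWord32  n_live_blocks   4 bytes
     *                             ---------
     *                             13 bytes
     */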
diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h
index d8a614b45c..0d439b836a 100644
--- a/rts/eventlog/EventLog.h
+++ b/rts/eventlog/EventLog.h
@@ -11,6 +11,7 @@
 #include "rts/EventLogFormat.h"
 #include "rts/EventLogWriter.h"
 #include "Capability.h"
+#include "sm/NonMovingCensus.h"

 #include "BeginPrivate.h"

@@ -39,6 +40,7 @@ void postSchedEvent(Capability *cap, EventTypeNum tag,
  * Post a nullary event.
  */
 void postEvent(Capability *cap, EventTypeNum tag);
+void postEventNoCap(EventTypeNum tag);

 void postEventAtTimestamp (Capability *cap, EventTimestamp ts,
                            EventTypeNum tag);
@@ -159,6 +161,11 @@ void postHeapProfSampleCostCentre(StgWord8 profile_id,
                                   StgWord64 residency);
 #endif /* PROFILING */

+void postConcUpdRemSetFlush(Capability *cap);
+void postConcMarkEnd(StgWord32 marked_obj_count);
+void postNonmovingHeapCensus(int log_blk_size,
+                             const struct NonmovingAllocCensus *census);
+
 #else /* !TRACING */

 INLINE_HEADER void postSchedEvent (Capability *cap  STG_UNUSED,
@@ -172,6 +179,9 @@ INLINE_HEADER void postEvent (Capability *cap  STG_UNUSED,
                               EventTypeNum tag STG_UNUSED)
 { /* nothing */ }

+INLINE_HEADER void postEventNoCap (EventTypeNum tag STG_UNUSED)
+{ /* nothing */ }
+
 INLINE_HEADER void postMsg (char *msg STG_UNUSED,
                             va_list ap STG_UNUSED)
 { /* nothing */ }
diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in
index 2c28426d75..a53c166849 100644
--- a/rts/rts.cabal.in
+++ b/rts/rts.cabal.in
@@ -467,6 +467,7 @@ library
                sm/MBlock.c
                sm/MarkWeak.c
                sm/NonMoving.c
+               sm/NonMovingCensus.c
                sm/NonMovingMark.c
                sm/NonMovingScav.c
                sm/NonMovingSweep.c
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index c2aaaacffa..f8ee2630b6 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -69,12 +69,6 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
 {
     ASSERT(gen_no < RtsFlags.GcFlags.generations);

-    if (RtsFlags.GcFlags.useNonmoving && major_gc) {
-        // unconditionally promote to non-moving heap in major gc
-        gct->copied += size;
-        return nonmovingAllocate(gct->cap, size);
-    }
-
     StgPtr to;
     gen_workspace *ws;

@@ -91,9 +85,34 @@ alloc_for_copy (uint32_t size, uint32_t gen_no)
         }
     }

-    if (RtsFlags.GcFlags.useNonmoving && gen_no == oldest_gen->no) {
-        gct->copied += size;
-        return nonmovingAllocate(gct->cap, size);
+    if (RtsFlags.GcFlags.useNonmoving) {
+        /* See Note [Deadlock detection under nonmoving collector]. */
+        if (deadlock_detect_gc)
+            gen_no = oldest_gen->no;
+
+        if (gen_no == oldest_gen->no) {
+            gct->copied += size;
+            to = nonmovingAllocate(gct->cap, size);
+
+            // Add segment to the todo list unless it's already there
+            // current->todo_link == NULL means not in todo list
+            struct NonmovingSegment *seg = nonmovingGetSegment(to);
+            if (!seg->todo_link) {
+                gen_workspace *ws = &gct->gens[oldest_gen->no];
+                seg->todo_link = ws->todo_seg;
+                ws->todo_seg = seg;
+            }
+
+            // The object which refers to this closure may have been aged (i.e.
+            // retained in a younger generation). Consequently, we must add the
+            // closure to the mark queue to ensure that it will be marked.
+            //
+            // However, if we are in a deadlock detection GC then we disable aging
+            // so there is no need.
+            if (major_gc && !deadlock_detect_gc)
+                markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, (StgClosure *) to);
+            return to;
+        }
     }

     ws = &gct->gens[gen_no];  // zero memory references here
@@ -312,9 +331,10 @@ evacuate_large(StgPtr p)
    */
   new_gen_no = bd->dest_no;

-  if (RtsFlags.GcFlags.useNonmoving && major_gc) {
+  if (deadlock_detect_gc) {
+      /* See Note [Deadlock detection under nonmoving collector]. */
       new_gen_no = oldest_gen->no;
-  } else if (new_gen_no < gct->evac_gen_no) {
+  } else if (new_gen_no < gct->evac_gen_no) {
       if (gct->eager_promotion) {
           new_gen_no = gct->evac_gen_no;
       } else {
@@ -363,6 +383,13 @@ evacuate_large(StgPtr p)
 STATIC_INLINE void
 evacuate_static_object (StgClosure **link_field, StgClosure *q)
 {
+    if (RTS_UNLIKELY(RtsFlags.GcFlags.useNonmoving)) {
+        // See Note [Static objects under the nonmoving collector] in Storage.c.
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+        return;
+    }
+
     StgWord link = (StgWord)*link_field;

     // See Note [STATIC_LINK fields] for how the link field bits work
@@ -603,6 +630,8 @@ loop:
       // NOTE: large objects in nonmoving heap are also marked with
       // BF_NONMOVING. Those are moved to scavenged_large_objects list in
       // mark phase.
+      if (major_gc && !deadlock_detect_gc)
+          markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
       return;
   }

@@ -629,6 +658,13 @@ loop:
   // they are not)
   if (bd->flags & BF_COMPACT) {
       evacuate_compact((P_)q);
+
+      // We may have evacuated the block to the nonmoving generation. If so
+      // we need to make sure it is added to the mark queue since the only
+      // reference to it may be from the moving heap.
+      if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) {
+          markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+      }
       return;
   }

@@ -636,6 +672,13 @@ loop:
    */
   if (bd->flags & BF_LARGE) {
       evacuate_large((P_)q);
+
+      // We may have evacuated the block to the nonmoving generation. If so
+      // we need to make sure it is added to the mark queue since the only
+      // reference to it may be from the moving heap.
+      if (major_gc && bd->flags & BF_NONMOVING && !deadlock_detect_gc) {
+          markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
+      }
       return;
   }

@@ -937,6 +980,8 @@ evacuate_BLACKHOLE(StgClosure **p)
     ASSERT((bd->flags & BF_COMPACT) == 0);

     if (bd->flags & BF_NONMOVING) {
+        if (major_gc && !deadlock_detect_gc)
+            markQueuePushClosureGC(&gct->cap->upd_rem_set.queue, q);
         return;
     }
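The new alloc_for_copy control flow can be distilled into a small decision function; the sketch below is an editorial restatement of the hunks above, not code from the patch (the names mirror the RTS globals it references):

    /* Sketch: where does an evacuated object go when the nonmoving
     * collector is enabled? (RtsFlags, oldest_gen and deadlock_detect_gc
     * are the RTS globals used in the patch.) */
    static bool promote_to_nonmoving(uint32_t *gen_no)
    {
        if (!RtsFlags.GcFlags.useNonmoving)
            return false;              /* ordinary copying-GC path */
        if (deadlock_detect_gc)
            *gen_no = oldest_gen->no;  /* aging disabled: force oldest gen */
        /* Only oldest-generation allocations land in the nonmoving heap;
         * anything younger may still be aged within the moving heap. */
        return *gen_no == oldest_gen->no;
    }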
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 37f9811aaf..83e9c97bd9 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -104,6 +104,7 @@
  */
 uint32_t N;
 bool major_gc;
+bool deadlock_detect_gc;

 /* Data used for allocation area sizing.
  */
@@ -194,6 +195,7 @@ StgPtr mark_sp;            // pointer to the next unallocated mark stack entry
 void
 GarbageCollect (uint32_t collect_gen,
                 const bool do_heap_census,
+                const bool deadlock_detect,
                 uint32_t gc_type USED_IF_THREADS,
                 Capability *cap,
                 bool idle_cap[])
@@ -263,7 +265,25 @@ GarbageCollect (uint32_t collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);

-  if (major_gc) {
+  /* See Note [Deadlock detection under nonmoving collector]. */
+  deadlock_detect_gc = deadlock_detect;
+
+#if defined(THREADED_RTS)
+  if (major_gc && RtsFlags.GcFlags.useNonmoving && concurrent_coll_running) {
+      /* If there is already a concurrent major collection running then
+       * there is no benefit to starting another.
+       * TODO: Catch heap-size runaway.
+       */
+      N--;
+      collect_gen--;
+      major_gc = false;
+  }
+#endif
+
+  /* N.B. The nonmoving collector works a bit differently. See
+   * Note [Static objects under the nonmoving collector].
+   */
+  if (major_gc && !RtsFlags.GcFlags.useNonmoving) {
       prev_static_flag = static_flag;
       static_flag =
           static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
@@ -710,6 +730,14 @@ GarbageCollect (uint32_t collect_gen,
     }
   } // for all generations

+  // Flush the update remembered set. See Note [Eager update remembered set
+  // flushing] in NonMovingMark.c
+  if (RtsFlags.GcFlags.useNonmoving) {
+      RELEASE_SM_LOCK;
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+      ACQUIRE_SM_LOCK;
+  }
+
   // Mark and sweep the oldest generation.
   // N.B. This can only happen after we've moved
   // oldest_gen->scavenged_large_objects back to oldest_gen->large_objects.
@@ -736,6 +764,11 @@ GarbageCollect (uint32_t collect_gen,
       // so we need to mark those too.
       // Note that in sequential case these lists will be appended with more
       // weaks and threads found to be dead in mark.
+#if !defined(THREADED_RTS)
+      // In the non-threaded runtime this is the only time we push to the
+      // upd_rem_set
+      nonmovingAddUpdRemSetBlocks(&gct->cap->upd_rem_set.queue);
+#endif
       nonmovingCollect(&dead_weak_ptr_list, &resurrected_threads);
       ACQUIRE_SM_LOCK;
   }
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index ed19b8bddf..bde006913b 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -17,9 +17,12 @@

 #include "HeapAlloc.h"

-void GarbageCollect (uint32_t force_major_gc,
+void GarbageCollect (uint32_t collect_gen,
                      bool do_heap_census,
-                     uint32_t gc_type, Capability *cap, bool idle_cap[]);
+                     bool deadlock_detect,
+                     uint32_t gc_type,
+                     Capability *cap,
+                     bool idle_cap[]);

 typedef void (*evac_fn)(void *user, StgClosure **root);

@@ -30,6 +33,8 @@ bool doIdleGCWork(Capability *cap, bool all);

 extern uint32_t N;
 extern bool major_gc;
+/* See Note [Deadlock detection under nonmoving collector]. */
+extern bool deadlock_detect_gc;

 extern bdescr *mark_stack_bd;
 extern bdescr *mark_stack_top_bd;
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 6076f61d6f..11080c1f22 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -148,14 +148,14 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;

     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_CAF_LIST;
+         ((StgWord) c | STATIC_FLAG_LIST) != (StgWord)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
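The markCAFs change is subtle: under the nonmoving collector a CAF list can end in a sentinel whose low static-flag bits may or may not be set (bump_static_flag, later in this patch, mutates those bits concurrently), so a plain pointer comparison can run past the end of the list. OR-ing STATIC_FLAG_LIST onto the cursor makes the termination test insensitive to the flag epoch. A standalone demonstration, with made-up flag values:

    #include <assert.h>
    #include <stdint.h>

    #define STATIC_FLAG_LIST 4                        /* illustrative value */
    #define END_OF_CAF_LIST  ((uintptr_t)0 | STATIC_FLAG_LIST)

    int main(void)
    {
        uintptr_t end_with_bit    = END_OF_CAF_LIST;  /* sentinel as defined */
        uintptr_t end_without_bit = 0;                /* sentinel, flag bits stripped */

        /* Plain equality recognises only one spelling of the sentinel... */
        assert(end_without_bit != END_OF_CAF_LIST);
        /* ...whereas OR-ing the list bit in accepts both. */
        assert((end_with_bit    | STATIC_FLAG_LIST) == END_OF_CAF_LIST);
        assert((end_without_bit | STATIC_FLAG_LIST) == END_OF_CAF_LIST);
        return 0;
    }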
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index 24464a43f9..6dc1010d43 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -21,6 +21,7 @@
 #include "NonMoving.h"
 #include "NonMovingMark.h"
 #include "NonMovingSweep.h"
+#include "NonMovingCensus.h"
 #include "StablePtr.h" // markStablePtrTable
 #include "Schedule.h" // markScheduler
 #include "Weak.h" // dead_weak_ptr_list
@@ -68,6 +69,113 @@ Mutex concurrent_coll_finished_lock;
  *    stopAllCapabilitiesWith(SYNC_FLUSH_UPD_REM_SET). Capabilities are held
  *    until the final mark has concluded.
  *
+ * Note that the possibility of concurrent minor and non-moving collections
+ * requires that we handle static objects a bit specially. See
+ * Note [Static objects under the nonmoving collector] in Storage.c
+ * for details.
+ *
+ *
+ * Note [Aging under the non-moving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The initial design of the non-moving collector mandated that all live data
+ * be evacuated to the non-moving heap prior to a major collection. This
+ * simplified certain bits of implementation and eased reasoning. However, it
+ * was (unsurprisingly) also found to result in significant amounts of
+ * unnecessary copying.
+ *
+ * Consequently, we now allow aging. Aging allows the preparatory GC leading up
+ * to a major collection to evacuate some objects into the young generation.
+ * However, this introduces the following tricky case that might arise after
+ * we have finished the preparatory GC:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      B ←────────────── A ←─────────────── root
+ *      │             ┆   ↖─────────────── gen1 mut_list
+ *      ╰───────────────→ C
+ *                    ┆
+ *
+ * In this case C is clearly live, but the non-moving collector can only see
+ * this by walking through B, which lives in the moving heap. However, doing so
+ * would require that we synchronize with the mutator/minor GC to ensure that it
+ * isn't in the middle of moving B. What to do?
+ *
+ * The solution we use here is to teach the preparatory moving collector to
+ * "evacuate" objects it encounters in the non-moving heap by adding them to
+ * the mark queue. This is implemented by pushing the object to the update
+ * remembered set of the capability held by the evacuating gc_thread
+ * (implemented by markQueuePushClosureGC).
+ *
+ * Consequently collection of the case above would proceed as follows:
+ *
+ *  1. Initial state:
+ *      * A lives in the non-moving heap and is reachable from the root set
+ *      * A is on the oldest generation's mut_list, since it contains a pointer
+ *        to B, which lives in a younger generation
+ *      * B lives in the moving collector's from space
+ *      * C lives in the non-moving heap
+ *
+ *  2. Preparatory GC: Scavenging mut_lists:
+ *
+ *     The mut_list of the oldest generation is scavenged, resulting in B being
+ *     evacuated (aged) into the moving collector's to-space.
+ *
+ *  3. Preparatory GC: Scavenge B
+ *
+ *     B (now in to-space) is scavenged, resulting in evacuation of C.
+ *     evacuate(C) pushes a reference to C to the mark queue.
+ *
+ *  4. Non-moving GC: C is marked
+ *
+ *     The non-moving collector will come to C in the mark queue and mark it.
+ *
+ *
+ * Note [Deadlock detection under nonmoving collector]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * In GHC the garbage collector is responsible for identifying deadlocked
+ * programs. Providing for this responsibility is slightly tricky in the
+ * non-moving collector due to the existence of aging. In particular, the
+ * non-moving collector cannot traverse objects living in a young generation
+ * but reachable from the non-moving generation, as described in Note [Aging
+ * under the non-moving collector].
+ *
+ * However, this can pose trouble for deadlock detection since it means that we
+ * may conservatively mark dead closures as live. Consider this case:
+ *
+ *       moving heap  ┆  non-moving heap
+ *     ───────────────┆──────────────────
+ *                    ┆
+ *      MVAR_QUEUE ←───── TSO ←───────────── gen1 mut_list
+ *        ↑  │  ╰────────↗  │
+ *        │  │        ┆     │
+ *        │  │        ┆     ↓
+ *        │  ╰──────────→ MVAR
+ *        ╰─────────────────╯
+ *                    ┆
+ *
+ * In this case we have a TSO blocked on a dead MVar. Because the MVAR_TSO_QUEUE on
+ * which it is blocked lives in the moving heap, the TSO is necessarily on the
+ * oldest generation's mut_list. As in Note [Aging under the non-moving
+ * collector], the MVAR_TSO_QUEUE will be evacuated. If MVAR_TSO_QUEUE is aged
+ * (e.g. evacuated to the young generation) then the MVAR will be added to the
+ * mark queue. Consequently, we will falsely conclude that the MVAR is still
+ * alive and fail to spot the deadlock.
+ *
+ * To avoid this sort of situation we disable aging when we are starting a
+ * major GC specifically for deadlock detection (as done by
+ * scheduleDetectDeadlock). This condition is recorded by the
+ * deadlock_detect_gc global variable declared in GC.h. Setting this has a few
+ * effects on the preparatory GC:
+ *
+ *  - Evac.c:alloc_for_copy forces evacuation to the non-moving generation.
+ *
+ *  - The evacuation logic usually responsible for pushing objects living in
+ *    the non-moving heap to the mark queue is disabled. This is safe because
+ *    we know that all live objects will be in the non-moving heap by the end
+ *    of the preparatory moving collection.
+ *
 *
 * Note [Live data accounting in nonmoving collector]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -146,10 +254,26 @@ static struct NonmovingSegment *nonmovingPopFreeSegment(void)
     }
 }

+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size)
+{
+  // We compute the overwhelmingly common size cases directly to avoid a very
+  // expensive integer division.
+  switch (log_block_size) {
+    case 3:  return nonmovingBlockCount(3);
+    case 4:  return nonmovingBlockCount(4);
+    case 5:  return nonmovingBlockCount(5);
+    case 6:  return nonmovingBlockCount(6);
+    case 7:  return nonmovingBlockCount(7);
+    default: return nonmovingBlockCount(log_block_size);
+  }
+}
+
 /*
  * Request a fresh segment from the free segment list or allocate one on the
  * given node.
  *
+ * Caller must hold SM_MUTEX (although we take the gc_alloc_block_sync spinlock
+ * under the assumption that we are in a GC context).
  */
 static struct NonmovingSegment *nonmovingAllocSegment(uint32_t node)
 {
@@ -192,10 +316,12 @@ static inline unsigned long log2_ceil(unsigned long x)
 }

 // Advance a segment's next_free pointer. Returns true if the segment is full.
-static bool advance_next_free(struct NonmovingSegment *seg)
+static bool advance_next_free(struct NonmovingSegment *seg, const unsigned int blk_count)
 {
-    uint8_t *bitmap = seg->bitmap;
-    unsigned int blk_count = nonmovingSegmentBlockCount(seg);
+    const uint8_t *bitmap = seg->bitmap;
+    ASSERT(blk_count == nonmovingSegmentBlockCount(seg));
+#if defined(NAIVE_ADVANCE_FREE)
+    // reference implementation
     for (unsigned int i = seg->next_free+1; i < blk_count; i++) {
         if (!bitmap[i]) {
             seg->next_free = i;
@@ -204,6 +330,16 @@ static bool advance_next_free(struct NonmovingSegment *seg)
             return false;
         }
     }
     seg->next_free = blk_count;
     return true;
+#else
+    const uint8_t *c = memchr(&bitmap[seg->next_free+1], 0, blk_count - seg->next_free - 1);
+    if (c == NULL) {
+        seg->next_free = blk_count;
+        return true;
+    } else {
+        seg->next_free = c - bitmap;
+        return false;
+    }
+#endif
 }

 static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *alloca)
 {
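For concreteness, nonmovingBlockCount is pure arithmetic over the segment geometry, which is why the common cases can be precomputed as above. A worked example, assuming a 32 kB segment and a 64-byte struct NonmovingSegment header (both values are illustrative; the real ones come from NonMoving.h):

    /* segment_data_size = 32768 - 64 = 32704   (already word-aligned)
     *
     * log_block_size = 3 (8-byte blocks):
     *     32704 / (8 + 1)   = 3633 blocks, plus a 3633-byte mark bitmap
     * log_block_size = 7 (128-byte blocks):
     *     32704 / (128 + 1) = 253 blocks, plus a 253-byte mark bitmap
     *
     * The "+ 1" in the divisor is the one bitmap byte per block. */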
@@ -221,34 +357,27 @@ static struct NonmovingSegment *pop_active_segment(struct NonmovingAllocator *al
     }
 }

-/* sz is in words */
+/* Allocate a block in the nonmoving heap. Caller must hold SM_MUTEX. sz is in words */
 GNUC_ATTR_HOT
 void *nonmovingAllocate(Capability *cap, StgWord sz)
 {
-    unsigned int allocator_idx = log2_ceil(sz * sizeof(StgWord)) - NONMOVING_ALLOCA0;
+    unsigned int log_block_size = log2_ceil(sz * sizeof(StgWord));
+    unsigned int block_count = nonmovingBlockCountFromSize(log_block_size);

     // The max we ever allocate is 3276 bytes (anything larger is a large
     // object and not moved) which is covered by allocator 9.
-    ASSERT(allocator_idx < NONMOVING_ALLOCA_CNT);
+    ASSERT(log_block_size < NONMOVING_ALLOCA0 + NONMOVING_ALLOCA_CNT);

-    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[allocator_idx];
+    struct NonmovingAllocator *alloca = nonmovingHeap.allocators[log_block_size - NONMOVING_ALLOCA0];

     // Allocate into current segment
     struct NonmovingSegment *current = alloca->current[cap->no];
     ASSERT(current); // current is never NULL
-    void *ret = nonmovingSegmentGetBlock(current, current->next_free);
+    void *ret = nonmovingSegmentGetBlock_(current, log_block_size, current->next_free);
     ASSERT(GET_CLOSURE_TAG(ret) == 0); // check alignment

-    // Add segment to the todo list unless it's already there
-    // current->todo_link == NULL means not in todo list
-    if (!current->todo_link) {
-        gen_workspace *ws = &gct->gens[oldest_gen->no];
-        current->todo_link = ws->todo_seg;
-        ws->todo_seg = current;
-    }
-
     // Advance the current segment's next_free or allocate a new segment if full
-    bool full = advance_next_free(current);
+    bool full = advance_next_free(current, block_count);
     if (full) {
         // Current segment is full: update live data estimate, link it to
         // filled, take an active segment if one exists, otherwise allocate a
@@ -256,8 +385,9 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)

         // Update live data estimate.
         // See Note [Live data accounting in nonmoving collector].
-        unsigned int new_blocks = nonmovingSegmentBlockCount(current) - current->next_free_snap;
-        atomic_inc(&oldest_gen->live_estimate, new_blocks * nonmovingSegmentBlockSize(current) / sizeof(W_));
+        unsigned int new_blocks = block_count - current->next_free_snap;
+        unsigned int block_size = 1 << log_block_size;
+        atomic_inc(&oldest_gen->live_estimate, new_blocks * block_size / sizeof(W_));

         // push the current segment to the filled list
         nonmovingPushFilledSegment(current);
@@ -268,7 +398,7 @@ void *nonmovingAllocate(Capability *cap, StgWord sz)
         // there are no active segments, allocate new segment
         if (new_current == NULL) {
             new_current = nonmovingAllocSegment(cap->node);
-            nonmovingInitSegment(new_current, NONMOVING_ALLOCA0 + allocator_idx);
+            nonmovingInitSegment(new_current, log_block_size);
         }

         // make it current
@@ -375,37 +505,23 @@ void nonmovingAddCapabilities(uint32_t new_n_caps)
     nonmovingHeap.n_caps = new_n_caps;
 }

-static void nonmovingClearBitmap(struct NonmovingSegment *seg)
+static inline void nonmovingClearBitmap(struct NonmovingSegment *seg)
 {
     unsigned int n = nonmovingSegmentBlockCount(seg);
     memset(seg->bitmap, 0, n);
 }

-static void nonmovingClearSegmentBitmaps(struct NonmovingSegment *seg)
-{
-    while (seg) {
-        nonmovingClearBitmap(seg);
-        seg = seg->link;
-    }
-}
-
-static void nonmovingClearAllBitmaps(void)
-{
-    for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
-        struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
-        nonmovingClearSegmentBitmaps(alloca->filled);
-    }
-
-    // Clear large object bits
-    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
-        bd->flags &= ~BF_MARKED;
-    }
-}
-
 /* Prepare the heap bitmaps and snapshot metadata for a mark */
 static void nonmovingPrepareMark(void)
 {
-    nonmovingClearAllBitmaps();
+    // See Note [Static objects under the nonmoving collector].
+    prev_static_flag = static_flag;
+    static_flag =
+        static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
+
+    // Should have been cleared by the last sweep
+    ASSERT(nonmovingHeap.sweep_list == NULL);
+
     nonmovingBumpEpoch();
     for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) {
         struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx];
@@ -416,11 +532,28 @@ static void nonmovingPrepareMark(void)
             seg->next_free_snap = seg->next_free;
         }

-        // Update filled segments' snapshot pointers
-        struct NonmovingSegment *seg = alloca->filled;
-        while (seg) {
-            seg->next_free_snap = seg->next_free;
-            seg = seg->link;
+        // Update filled segments' snapshot pointers and move to sweep_list
+        uint32_t n_filled = 0;
+        struct NonmovingSegment *const filled = alloca->filled;
+        alloca->filled = NULL;
+        if (filled) {
+            struct NonmovingSegment *seg = filled;
+            while (true) {
+                n_filled++;
+                prefetchForRead(seg->link);
+                // Clear bitmap
+                prefetchForWrite(seg->link->bitmap);
+                nonmovingClearBitmap(seg);
+                // Set snapshot
+                seg->next_free_snap = seg->next_free;
+                if (seg->link)
+                    seg = seg->link;
+                else
+                    break;
+            }
+            // add filled segments to sweep_list
+            seg->link = nonmovingHeap.sweep_list;
+            nonmovingHeap.sweep_list = filled;
         }

         // N.B. It's not necessary to update snapshot pointers of active segments;
@@ -441,6 +574,12 @@ static void nonmovingPrepareMark(void)
     oldest_gen->n_large_blocks = 0;
     nonmoving_live_words = 0;

+    // Clear large object bits
+    for (bdescr *bd = nonmoving_large_objects; bd; bd = bd->link) {
+        bd->flags &= ~BF_MARKED;
+    }
+
 #if defined(DEBUG)
     debug_caf_list_snapshot = debug_caf_list;
     debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
@@ -491,7 +630,6 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO **resurrected_threads)
     resizeGenerations();

     nonmovingPrepareMark();
-    nonmovingPrepareSweep();

     // N.B. These should have been cleared at the end of the last sweep.
     ASSERT(nonmoving_marked_large_objects == NULL);
@@ -688,7 +826,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
 #if defined(DEBUG)
     // Zap CAFs that we will sweep
-    nonmovingGcCafs(mark_queue);
+    nonmovingGcCafs();
 #endif

     ASSERT(mark_queue->top->head == 0);
@@ -735,6 +873,8 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
      * Sweep
      ****************************************************/

+    traceConcSweepBegin();
+
     // Because we can't mark large object blocks (no room for mark bit) we
     // collect them in a map in mark_queue and we pass it here to sweep large
     // objects
@@ -744,6 +884,11 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
     nonmovingSweep();
     ASSERT(nonmovingHeap.sweep_list == NULL);
     debugTrace(DEBUG_nonmoving_gc, "Finished sweeping.");
+    traceConcSweepEnd();
+#if defined(DEBUG)
+    if (RtsFlags.DebugFlags.nonmoving_gc)
+        nonmovingPrintAllocatorCensus();
+#endif

     // TODO: Remainder of things done by GarbageCollect (update stats)
diff --git a/rts/sm/NonMoving.h b/rts/sm/NonMoving.h
index 0f9cfa0e48..17adfd6666 100644
--- a/rts/sm/NonMoving.h
+++ b/rts/sm/NonMoving.h
@@ -93,6 +93,10 @@ extern struct NonmovingHeap nonmovingHeap;

 extern memcount nonmoving_live_words;

+#if defined(THREADED_RTS)
+extern bool concurrent_coll_running;
+#endif
+
 void nonmovingInit(void);
 void nonmovingStop(void);
 void nonmovingExit(void);
@@ -161,22 +165,34 @@ INLINE_HEADER unsigned int nonmovingSegmentBlockSize(struct NonmovingSegment *se
     return 1 << seg->block_size;
 }

-// How many blocks does the given segment contain? Also the size of the bitmap.
-INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+// How many blocks does a segment with the given block size have?
+INLINE_HEADER unsigned int nonmovingBlockCount(uint8_t log_block_size)
 {
-  unsigned int sz = nonmovingSegmentBlockSize(seg);
   unsigned int segment_data_size = NONMOVING_SEGMENT_SIZE - sizeof(struct NonmovingSegment);
   segment_data_size -= segment_data_size % SIZEOF_VOID_P;
-  return segment_data_size / (sz + 1);
+  unsigned int blk_size = 1 << log_block_size;
+  // N.B. +1 accounts for the byte in the mark bitmap.
+  return segment_data_size / (blk_size + 1);
 }

-// Get a pointer to the given block index
-INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+unsigned int nonmovingBlockCountFromSize(uint8_t log_block_size);
+
+// How many blocks does the given segment contain? Also the size of the bitmap.
+INLINE_HEADER unsigned int nonmovingSegmentBlockCount(struct NonmovingSegment *seg)
+{
+  return nonmovingBlockCountFromSize(seg->block_size);
+}
+
+// Get a pointer to the given block index assuming that the block size is as
+// given (avoiding a potential cache miss when this information is already
+// available). The log_block_size argument must be equal to seg->block_size.
+INLINE_HEADER void *nonmovingSegmentGetBlock_(struct NonmovingSegment *seg, uint8_t log_block_size, nonmoving_block_idx i)
 {
+  ASSERT(log_block_size == seg->block_size);
   // Block size in bytes
-  unsigned int blk_size = nonmovingSegmentBlockSize(seg);
+  unsigned int blk_size = 1 << log_block_size;
   // Bitmap size in bytes
-  W_ bitmap_size = nonmovingSegmentBlockCount(seg) * sizeof(uint8_t);
+  W_ bitmap_size = nonmovingBlockCountFromSize(log_block_size) * sizeof(uint8_t);
   // Where the actual data starts (address of the first block).
   // Use ROUNDUP_BYTES_TO_WDS to align to word size. Note that
   // ROUNDUP_BYTES_TO_WDS returns in _words_, not in _bytes_, so convert it back
@@ -185,15 +201,26 @@ INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmo
   return (void*)(data + i*blk_size);
 }

+// Get a pointer to the given block index.
+INLINE_HEADER void *nonmovingSegmentGetBlock(struct NonmovingSegment *seg, nonmoving_block_idx i)
+{
+  return nonmovingSegmentGetBlock_(seg, seg->block_size, i);
+}
+
 // Get the segment which a closure resides in. Assumes that pointer points into
 // non-moving heap.
-INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment_unchecked(StgPtr p)
 {
-    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
     const uintptr_t mask = ~NONMOVING_SEGMENT_MASK;
     return (struct NonmovingSegment *) (((uintptr_t) p) & mask);
 }

+INLINE_HEADER struct NonmovingSegment *nonmovingGetSegment(StgPtr p)
+{
+    ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
+    return nonmovingGetSegment_unchecked(p);
+}
+
 INLINE_HEADER nonmoving_block_idx nonmovingGetBlockIdx(StgPtr p)
 {
     ASSERT(HEAP_ALLOCED_GC(p) && (Bdescr(p)->flags & BF_NONMOVING));
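nonmovingGetSegment_unchecked works because segments are NONMOVING_SEGMENT_SIZE-aligned: the owning segment of any interior pointer falls out of a single mask, with no block descriptor lookup. A self-contained sketch, assuming a power-of-two segment size of 32 kB for illustration:

    #include <assert.h>
    #include <stdint.h>

    #define SEGMENT_SIZE ((uintptr_t)32768)   /* illustrative */
    #define SEGMENT_MASK (SEGMENT_SIZE - 1)

    /* Round an interior pointer down to its segment base. */
    static uintptr_t segment_of(uintptr_t p)
    {
        return p & ~SEGMENT_MASK;
    }

    int main(void)
    {
        assert(segment_of(0x7f008123) == 0x7f008000);
        assert(segment_of(0x7f008000) == 0x7f008000);
        return 0;
    }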
diff --git a/rts/sm/NonMovingCensus.c b/rts/sm/NonMovingCensus.c
new file mode 100644
index 0000000000..670d51263c
--- /dev/null
+++ b/rts/sm/NonMovingCensus.c
@@ -0,0 +1,129 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Accounting census
+ *
+ * This is a simple space accounting census useful for characterising
+ * fragmentation in the nonmoving heap.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#include "Rts.h"
+#include "NonMoving.h"
+#include "Trace.h"
+#include "NonMovingCensus.h"
+
+// N.B. This may miss segments in the event of concurrent mutation (e.g. if a
+// mutator retires its current segment to the filled list).
+//
+// collect_live_words is passed when we can guarantee that all mutators and
+// minor GCs are stopped. In this case it is safe to look at active and current
+// segments, so we can also collect statistics on live words.
+static inline struct NonmovingAllocCensus
+nonmovingAllocatorCensus_(struct NonmovingAllocator *alloc, bool collect_live_words)
+{
+    struct NonmovingAllocCensus census = {0, 0, 0, 0};
+
+    for (struct NonmovingSegment *seg = alloc->filled;
+         seg != NULL;
+         seg = seg->link)
+    {
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        census.n_filled_segs++;
+        census.n_live_blocks += n;
+        if (collect_live_words) {
+            for (unsigned int i=0; i < n; i++) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                census.n_live_words += closure_sizeW(c);
+            }
+        }
+    }
+
+    for (struct NonmovingSegment *seg = alloc->active;
+         seg != NULL;
+         seg = seg->link)
+    {
+        census.n_active_segs++;
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        for (unsigned int i=0; i < n; i++) {
+            if (nonmovingGetMark(seg, i)) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                if (collect_live_words)
+                    census.n_live_words += closure_sizeW(c);
+                census.n_live_blocks++;
+            }
+        }
+    }
+
+    for (unsigned int cap=0; cap < n_capabilities; cap++)
+    {
+        struct NonmovingSegment *seg = alloc->current[cap];
+        unsigned int n = nonmovingSegmentBlockCount(seg);
+        for (unsigned int i=0; i < n; i++) {
+            if (nonmovingGetMark(seg, i)) {
+                StgClosure *c = (StgClosure *) nonmovingSegmentGetBlock(seg, i);
+                if (collect_live_words)
+                    census.n_live_words += closure_sizeW(c);
+                census.n_live_blocks++;
+            }
+        }
+    }
+    return census;
+}
+
+/* This must not be used when mutators are active since it assumes that
+ * all blocks in the nonmoving heap are valid closures.
+ */
+struct NonmovingAllocCensus
+nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc)
+{
+    return nonmovingAllocatorCensus_(alloc, true);
+}
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensus(struct NonmovingAllocator *alloc)
+{
+    return nonmovingAllocatorCensus_(alloc, false);
+}
+
+
+void nonmovingPrintAllocatorCensus()
+{
+    if (!RtsFlags.GcFlags.useNonmoving)
+        return;
+
+    for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        struct NonmovingAllocCensus census =
+            nonmovingAllocatorCensus(nonmovingHeap.allocators[i]);
+
+        uint32_t blk_size = 1 << (i + NONMOVING_ALLOCA0);
+        // We define occupancy as the fraction of space that is used for useful
+        // data (that is, live and not slop).
+        double occupancy = 100.0 * census.n_live_words * sizeof(W_)
+            / (census.n_live_blocks * blk_size);
+        if (census.n_live_blocks == 0) occupancy = 100;
+        (void) occupancy; // silence warning if !DEBUG
+        debugTrace(DEBUG_nonmoving_gc, "Allocator %d (%d bytes - %d bytes): "
+                   "%d active segs, %d filled segs, %d live blocks, %d live words "
+                   "(%2.1f%% occupancy)",
+                   i, 1 << (i + NONMOVING_ALLOCA0 - 1), 1 << (i + NONMOVING_ALLOCA0),
+                   census.n_active_segs, census.n_filled_segs, census.n_live_blocks, census.n_live_words,
+                   occupancy);
+    }
+}
+
+void nonmovingTraceAllocatorCensus()
+{
+#if defined(TRACING)
+    if (!RtsFlags.GcFlags.useNonmoving || !TRACE_nonmoving_gc)
+        return;
+
+    for (int i=0; i < NONMOVING_ALLOCA_CNT; i++) {
+        const struct NonmovingAllocCensus census =
+            nonmovingAllocatorCensus(nonmovingHeap.allocators[i]);
+        const uint32_t log_blk_size = i + NONMOVING_ALLOCA0;
+        traceNonmovingHeapCensus(log_blk_size, &census);
+    }
+#endif
+}
diff --git a/rts/sm/NonMovingCensus.h b/rts/sm/NonMovingCensus.h
new file mode 100644
index 0000000000..7a66dc9b69
--- /dev/null
+++ b/rts/sm/NonMovingCensus.h
@@ -0,0 +1,28 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 1998-2018
+ *
+ * Non-moving garbage collector and allocator: Accounting census
+ *
+ * ---------------------------------------------------------------------------*/
+
+#pragma once
+
+#include "NonMoving.h"
+
+struct NonmovingAllocCensus {
+    uint32_t n_active_segs;
+    uint32_t n_filled_segs;
+    uint32_t n_live_blocks;
+    uint32_t n_live_words;
+};
+
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensusWithWords(struct NonmovingAllocator *alloc);
+
+struct NonmovingAllocCensus
+nonmovingAllocatorCensus(struct NonmovingAllocator *alloc);
+
+void nonmovingPrintAllocatorCensus(void);
+void nonmovingTraceAllocatorCensus(void);
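The occupancy figure printed by nonmovingPrintAllocatorCensus is simply live bytes over the capacity of the live blocks in that size class; a worked example with invented numbers:

    /* blk_size = 64, n_live_blocks = 1000, n_live_words = 5000,
     * sizeof(W_) = 8 on a 64-bit machine:
     *
     *   occupancy = 100.0 * 5000 * 8 / (1000 * 64) = 62.5%
     *
     * i.e. 37.5% of the bytes in live 64-byte blocks are slop. */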
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 72a3ca69cb..bb5d72bbf1 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -131,6 +131,49 @@ StgIndStatic *debug_caf_list_snapshot = (StgIndStatic*)END_OF_CAF_LIST;
 * The mark phase is responsible for freeing update remembered set block
 * allocations.
 *
+ * Note that we eagerly flush update remembered sets during minor GCs as
+ * described in Note [Eager update remembered set flushing].
+ *
+ *
+ * Note [Eager update remembered set flushing]
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * We eagerly flush update remembered sets during minor GCs to avoid scenarios
+ * like the following which could result in long sync pauses:
+ *
+ *  1. We start a major GC, all thread stacks are added to the mark queue.
+ *  2. The concurrent mark thread starts.
+ *  3. The mutator is allowed to resume. One mutator thread T is scheduled and
+ *     marks its stack to its local update remembered set.
+ *  4. The mark thread eventually encounters the mutator thread's stack but
+ *     sees that it has already been marked; skips it.
+ *  5. Thread T continues running but does not push enough to its update
+ *     remembered set to require a flush.
+ *  6. Eventually the mark thread finishes marking and requests a final sync.
+ *  7. The thread T flushes its update remembered set.
+ *  8. We find that a large fraction of the heap (namely the subset that is
+ *     only reachable from the thread T's stack) needs to be marked, incurring
+ *     a large sync pause.
+ *
+ * We avoid this by periodically (during minor GC) forcing a flush of the
+ * update remembered set.
+ *
+ * A better (but more complex) approach that would be worthwhile trying in the
+ * future would be to rather do the following:
+ *
+ *  1. Concurrent mark phase is on-going
+ *  2. Mark thread runs out of things to mark
+ *  3. Mark thread sends a signal to capabilities requesting that they send
+ *     their update remembered sets without suspending their execution
+ *  4. The mark thread marks everything it was sent; runs out of things to mark
+ *  5. Mark thread initiates a sync
+ *  6. Capabilities send their final update remembered sets and suspend execution
+ *  7. Mark thread marks everything it was sent
+ *  8. Mark thread allows capabilities to resume.
+ *
+ * However, this is obviously a fair amount of complexity and so far the
+ * periodic eager flushing approach has been sufficient.
+ *
 *
 * Note [Concurrent read barrier on deRefWeak#]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -205,7 +248,7 @@ static void init_mark_queue_(MarkQueue *queue);
 * Really the argument type should be UpdRemSet* but this would be rather
 * inconvenient without polymorphism.
 */
-static void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
+void nonmovingAddUpdRemSetBlocks(MarkQueue *rset)
 {
     if (markQueueIsEmpty(rset)) return;

@@ -238,6 +281,7 @@ void nonmovingFlushCapUpdRemSetBlocks(Capability *cap)
     debugTrace(DEBUG_nonmoving_gc,
                "Capability %d flushing update remembered set: %d",
                cap->no, markQueueLength(&cap->upd_rem_set.queue));
+    traceConcUpdRemSetFlush(cap);
     nonmovingAddUpdRemSetBlocks(&cap->upd_rem_set.queue);
     atomic_inc(&upd_rem_set_flush_count, 1);
     signalCondition(&upd_rem_set_flushed_cond);
@@ -251,6 +295,7 @@ void nonmovingFlushCapUpdRemSetBlocks(Capability *cap)
 void nonmovingBeginFlush(Task *task)
 {
     debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush...");
+    traceConcSyncBegin();
     upd_rem_set_flush_count = 0;
     stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET);

@@ -342,6 +387,7 @@ void nonmovingFinishFlush(Task *task)
     upd_rem_set_block_list = NULL;

     debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush...");
+    traceConcSyncEnd();
     releaseAllCapabilities(n_capabilities, NULL, task);
 }
 #endif
@@ -361,7 +407,7 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     } else {
         // allocate a fresh block.
         ACQUIRE_SM_LOCK;
-        bdescr *bd = allocGroup(1);
+        bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
         bd->link = q->blocks;
         q->blocks = bd;
         q->top = (MarkQueueBlock *) bd->start;
@@ -374,16 +420,49 @@ push (MarkQueue *q, const MarkQueueEnt *ent)
     q->top->head++;
 }

+/* A variant of push to be used by the minor GC when it encounters a reference
+ * to an object in the non-moving heap. In contrast to the other push
+ * operations this uses the gc_alloc_block_sync spinlock instead of the
+ * SM_LOCK to allocate new blocks in the event that the mark queue is full.
+ */
+void
+markQueuePushClosureGC (MarkQueue *q, StgClosure *p)
+{
+    /* We should not make it here if we are doing a deadlock detect GC.
+     * See Note [Deadlock detection under nonmoving collector].
+     */
+    ASSERT(!deadlock_detect_gc);
+
+    // Are we at the end of the block?
+    if (q->top->head == MARK_QUEUE_BLOCK_ENTRIES) {
+        // Yes, this block is full.
+        // allocate a fresh block.
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
+        bd->link = q->blocks;
+        q->blocks = bd;
+        q->top = (MarkQueueBlock *) bd->start;
+        q->top->head = 0;
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+    }
+
+    MarkQueueEnt ent = {
+        .mark_closure = {
+            .p = UNTAG_CLOSURE(p),
+            .origin = NULL,
+        }
+    };
+    q->top->entries[q->top->head] = ent;
+    q->top->head++;
+}
+
 static inline
 void push_closure (MarkQueue *q,
                    StgClosure *p,
                    StgClosure **origin)
 {
-    // TODO: Push this into callers where they already have the Bdescr
-    if (HEAP_ALLOCED_GC(p) && (Bdescr((StgPtr) p)->gen != oldest_gen))
-        return;
-
 #if defined(DEBUG)
+    ASSERT(!HEAP_ALLOCED_GC(p) || (Bdescr((StgPtr) p)->gen == oldest_gen));
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
     // Commenting out: too slow
     // if (RtsFlags.DebugFlags.sanity) {
@@ -393,8 +472,12 @@ void push_closure (MarkQueue *q,
     // }
 #endif

+    // This must be true as origin points to a pointer and therefore must be
+    // word-aligned. However, we check this as otherwise we would confuse this
+    // with a mark_array entry
+    ASSERT(((uintptr_t) origin & 0x3) == 0);
+
     MarkQueueEnt ent = {
-        .type = MARK_CLOSURE,
         .mark_closure = {
            .p = UNTAG_CLOSURE(p),
            .origin = origin,
@@ -413,10 +496,9 @@ void push_array (MarkQueue *q,
         return;

     MarkQueueEnt ent = {
-        .type = MARK_ARRAY,
         .mark_array = {
             .array = array,
-            .start_index = start_index
+            .start_index = (start_index << 16) | 0x3,
         }
     };
     push(q, &ent);
@@ -493,15 +575,11 @@ void updateRemembSetPushThunkEager(Capability *cap,
         MarkQueue *queue = &cap->upd_rem_set.queue;
         push_thunk_srt(queue, &info->i);

-        // Don't record the origin of objects living outside of the nonmoving
-        // heap; we can't perform the selector optimisation on them anyways.
-        bool record_origin = check_in_nonmoving_heap((StgClosure*)thunk);
-
         for (StgWord i = 0; i < info->i.layout.payload.ptrs; i++) {
             if (check_in_nonmoving_heap(thunk->payload[i])) {
-                push_closure(queue,
-                             thunk->payload[i],
-                             record_origin ? &thunk->payload[i] : NULL);
+                // Don't bother to push origin; it makes the barrier needlessly
+                // expensive with little benefit.
+                push_closure(queue, thunk->payload[i], NULL);
             }
         }
         break;
@@ -510,7 +588,9 @@ void updateRemembSetPushThunkEager(Capability *cap,
     {
         MarkQueue *queue = &cap->upd_rem_set.queue;
         StgAP *ap = (StgAP *) thunk;
-        push_closure(queue, ap->fun, &ap->fun);
+        if (check_in_nonmoving_heap(ap->fun)) {
+            push_closure(queue, ap->fun, NULL);
+        }
         mark_PAP_payload(queue, ap->fun, ap->payload, ap->n_args);
         break;
     }
@@ -531,9 +611,10 @@ void updateRemembSetPushThunk_(StgRegTable *reg, StgThunk *p)

 inline void updateRemembSetPushClosure(Capability *cap, StgClosure *p)
 {
-    if (!check_in_nonmoving_heap(p)) return;
-    MarkQueue *queue = &cap->upd_rem_set.queue;
-    push_closure(queue, p, NULL);
+    if (check_in_nonmoving_heap(p)) {
+        MarkQueue *queue = &cap->upd_rem_set.queue;
+        push_closure(queue, p, NULL);
+    }
 }

 void updateRemembSetPushClosure_(StgRegTable *reg, struct StgClosure_ *p)
@@ -630,7 +711,10 @@ void markQueuePushClosure (MarkQueue *q,
                            StgClosure *p,
                            StgClosure **origin)
 {
-    push_closure(q, p, origin);
+    // TODO: Push this into callers where they already have the Bdescr
+    if (check_in_nonmoving_heap(p)) {
+        push_closure(q, p, origin);
+    }
 }

 /* TODO: Do we really never want to specify the origin here? */
@@ -667,7 +751,7 @@ void markQueuePushArray (MarkQueue *q,
 *********************************************************/

 // Returns invalid MarkQueueEnt if queue is empty.
-static MarkQueueEnt markQueuePop (MarkQueue *q)
+static MarkQueueEnt markQueuePop_ (MarkQueue *q)
 {
     MarkQueueBlock *top;

@@ -679,7 +763,7 @@ again:
     // Is this the first block of the queue?
     if (q->blocks->link == NULL) {
         // Yes, therefore queue is empty...
-        MarkQueueEnt none = { .type = NULL_ENTRY };
+        MarkQueueEnt none = { .null_entry = { .p = NULL } };
         return none;
     } else {
         // No, unwind to the previous block and try popping again...
@@ -698,6 +782,47 @@ again:
     return ent;
 }

+static MarkQueueEnt markQueuePop (MarkQueue *q)
+{
+#if MARK_PREFETCH_QUEUE_DEPTH == 0
+    return markQueuePop_(q);
+#else
+    unsigned int i = q->prefetch_head;
+    while (nonmovingMarkQueueEntryType(&q->prefetch_queue[i]) == NULL_ENTRY) {
+        MarkQueueEnt new = markQueuePop_(q);
+        if (nonmovingMarkQueueEntryType(&new) == NULL_ENTRY) {
+            // Mark queue is empty; look for any valid entries in the prefetch
+            // queue
+            for (unsigned int j = (i+1) % MARK_PREFETCH_QUEUE_DEPTH;
+                 j != i;
+                 j = (j+1) % MARK_PREFETCH_QUEUE_DEPTH)
+            {
+                if (nonmovingMarkQueueEntryType(&q->prefetch_queue[j]) != NULL_ENTRY) {
+                    i = j;
+                    goto done;
+                }
+            }
+            return new;
+        }
+
+        // The entry may not be a MARK_CLOSURE but it doesn't matter, our
+        // MarkQueueEnt encoding always places the pointer to the object to be
+        // marked first.
+        prefetchForRead(&new.mark_closure.p->header.info);
+        prefetchForRead(&nonmovingGetSegment_unchecked((StgPtr) new.mark_closure.p)->block_size);
+        q->prefetch_queue[i] = new;
+        i = (i + 1) % MARK_PREFETCH_QUEUE_DEPTH;
+    }
+
+  done:
+    ;
+    MarkQueueEnt ret = q->prefetch_queue[i];
+    q->prefetch_queue[i].null_entry.p = NULL;
+    q->prefetch_head = i;
+    return ret;
+#endif
+}
+
 /*********************************************************
 * Creating and destroying MarkQueues and UpdRemSets
 *********************************************************/
@@ -705,17 +830,20 @@ again:
 /* Must hold sm_mutex. */
 static void init_mark_queue_ (MarkQueue *queue)
 {
-    bdescr *bd = allocGroup(1);
+    bdescr *bd = allocGroup(MARK_QUEUE_BLOCKS);
     queue->blocks = bd;
     queue->top = (MarkQueueBlock *) bd->start;
     queue->top->head = 0;
+#if MARK_PREFETCH_QUEUE_DEPTH > 0
+    memset(&queue->prefetch_queue, 0, sizeof(queue->prefetch_queue));
+    queue->prefetch_head = 0;
+#endif
 }

 /* Must hold sm_mutex. */
 void initMarkQueue (MarkQueue *queue)
 {
     init_mark_queue_(queue);
-    queue->marked_objects = allocHashTable();
     queue->is_upd_rem_set = false;
 }

@@ -723,8 +851,6 @@ void initMarkQueue (MarkQueue *queue)
 void init_upd_rem_set (UpdRemSet *rset)
 {
     init_mark_queue_(&rset->queue);
-    // Update remembered sets don't have to worry about static objects
-    rset->queue.marked_objects = NULL;
     rset->queue.is_upd_rem_set = true;
 }

@@ -739,7 +865,6 @@ void reset_upd_rem_set (UpdRemSet *rset)
 void freeMarkQueue (MarkQueue *queue)
 {
     freeChain_lock(queue->blocks);
-    freeHashTable(queue->marked_objects, NULL);
 }

 #if defined(THREADED_RTS) && defined(DEBUG)
@@ -986,12 +1111,32 @@ mark_stack (MarkQueue *queue, StgStack *stack)
     mark_stack_(queue, stack->sp, stack->stack + stack->stack_size);
 }

+/* See Note [Static objects under the nonmoving collector].
+ *
+ * Returns true if the object needs to be marked.
+ */
+static bool
+bump_static_flag(StgClosure **link_field, StgClosure *q STG_UNUSED)
+{
+    while (1) {
+        StgWord link = (StgWord) *link_field;
+        StgWord new = (link & ~STATIC_BITS) | static_flag;
+        if ((link & STATIC_BITS) == static_flag)
+            return false;
+        else if (cas((StgVolatilePtr) link_field, link, new) == link) {
+            return true;
+        }
+    }
+}
+
 static GNUC_ATTR_HOT void
 mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
 {
     (void)origin; // TODO: should be used for selector/thunk optimisations

 try_again:
+    ;
+    bdescr *bd = NULL;
     p = UNTAG_CLOSURE(p);

 #   define PUSH_FIELD(obj, field)                                \
@@ -1009,45 +1154,46 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
         return;
     }

-    if (lookupHashTable(queue->marked_objects, (W_)p)) {
-        // already marked
-        return;
-    }
-
-    insertHashTable(queue->marked_objects, (W_)p, (P_)1);
-
     switch (type) {

     case THUNK_STATIC:
         if (info->srt != 0) {
-            markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+            if (bump_static_flag(THUNK_STATIC_LINK((StgClosure *)p), p)) {
+                markQueuePushThunkSrt(queue, info); // TODO this function repeats the check above
+            }
         }
         return;

    case FUN_STATIC:
        if (info->srt != 0 || info->layout.payload.ptrs != 0) {
-            markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
-
-            // a FUN_STATIC can also be an SRT, so it may have pointer
-            // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
-            // the [FUN] optimisation.
-            // TODO (osa) I don't understand this comment
-            for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
-                PUSH_FIELD(p, payload[i]);
+            if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+                markQueuePushFunSrt(queue, info); // TODO this function repeats the check above
+
+                // a FUN_STATIC can also be an SRT, so it may have pointer
+                // fields.  See Note [SRTs] in CmmBuildInfoTables, specifically
+                // the [FUN] optimisation.
+                // TODO (osa) I don't understand this comment
+                for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                    PUSH_FIELD(p, payload[i]);
+                }
             }
        }
        return;

    case IND_STATIC:
-        PUSH_FIELD((StgInd *) p, indirectee);
+        if (bump_static_flag(IND_STATIC_LINK((StgClosure *)p), p)) {
+            PUSH_FIELD((StgInd *) p, indirectee);
+        }
        return;

    case CONSTR:
    case CONSTR_1_0:
    case CONSTR_2_0:
    case CONSTR_1_1:
-        for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
-            PUSH_FIELD(p, payload[i]);
+        if (bump_static_flag(STATIC_LINK(info, (StgClosure *)p), p)) {
+            for (StgHalfWord i = 0; i < info->layout.payload.ptrs; ++i) {
+                PUSH_FIELD(p, payload[i]);
+            }
        }
        return;
@@ -1061,19 +1207,17 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
        }
    }

-    bdescr *bd = Bdescr((StgPtr) p);
+    bd = Bdescr((StgPtr) p);

    if (bd->gen != oldest_gen) {
-        // Here we have an object living outside of the non-moving heap. Since
-        // we moved everything to the non-moving heap before starting the major
-        // collection, we know that we don't need to trace it: it was allocated
-        // after we took our snapshot.
-#if !defined(THREADED_RTS)
-        // This should never happen in the non-concurrent case
-        barf("Closure outside of non-moving heap: %p", p);
-#else
+        // Here we have an object living outside of the non-moving heap. While
+        // we likely evacuated nearly everything to the nonmoving heap during
+        // preparation there are nevertheless a few ways in which we might trace
+        // a reference into younger generations:
+        //
+        //  * a mutable object might have been updated
+        //  * we might have aged an object
        return;
-#endif
    }

    ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
@@ -1434,19 +1578,20 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
 GNUC_ATTR_HOT void
 nonmovingMark (MarkQueue *queue)
 {
+    traceConcMarkBegin();
     debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
     unsigned int count = 0;
     while (true) {
         count++;
         MarkQueueEnt ent = markQueuePop(queue);

-        switch (ent.type) {
+        switch (nonmovingMarkQueueEntryType(&ent)) {
         case MARK_CLOSURE:
             mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin);
             break;
         case MARK_ARRAY: {
             const StgMutArrPtrs *arr = ent.mark_array.array;
-            StgWord start = ent.mark_array.start_index;
+            StgWord start = ent.mark_array.start_index >> 16;
             StgWord end = start + MARK_ARRAY_CHUNK_LENGTH;
             if (end < arr->ptrs) {
                 markQueuePushArray(queue, ent.mark_array.array, end);
@@ -1474,6 +1619,7 @@ nonmovingMark (MarkQueue *queue)
             } else {
                 // Nothing more to do
                 debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+                traceConcMarkEnd(count);
                 return;
             }
         }
@@ -1691,13 +1837,17 @@ void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_t

 void printMarkQueueEntry (MarkQueueEnt *ent)
 {
-    if (ent->type == MARK_CLOSURE) {
+    switch(nonmovingMarkQueueEntryType(ent)) {
+    case MARK_CLOSURE:
         debugBelch("Closure: ");
         printClosure(ent->mark_closure.p);
-    } else if (ent->type == MARK_ARRAY) {
+        break;
+    case MARK_ARRAY:
         debugBelch("Array\n");
-    } else {
+        break;
+    case NULL_ENTRY:
         debugBelch("End of mark\n");
+        break;
     }
 }
While
+        // we likely evacuated nearly everything to the nonmoving heap during
+        // preparation, there are nevertheless a few ways in which we might
+        // trace a reference into younger generations:
+        //
+        //  * a mutable object might have been updated
+        //  * we might have aged an object
         return;
-#endif
     }
 
     ASSERTM(LOOKS_LIKE_CLOSURE_PTR(p), "invalid closure, info=%p", p->header.info);
@@ -1434,19 +1578,20 @@ mark_closure (MarkQueue *queue, StgClosure *p, StgClosure **origin)
 GNUC_ATTR_HOT void
 nonmovingMark (MarkQueue *queue)
 {
+    traceConcMarkBegin();
     debugTrace(DEBUG_nonmoving_gc, "Starting mark pass");
     unsigned int count = 0;
     while (true) {
         count++;
         MarkQueueEnt ent = markQueuePop(queue);
 
-        switch (ent.type) {
+        switch (nonmovingMarkQueueEntryType(&ent)) {
         case MARK_CLOSURE:
             mark_closure(queue, ent.mark_closure.p, ent.mark_closure.origin);
             break;
         case MARK_ARRAY: {
             const StgMutArrPtrs *arr = ent.mark_array.array;
-            StgWord start = ent.mark_array.start_index;
+            StgWord start = ent.mark_array.start_index >> 16;
             StgWord end = start + MARK_ARRAY_CHUNK_LENGTH;
             if (end < arr->ptrs) {
                 markQueuePushArray(queue, ent.mark_array.array, end);
@@ -1474,6 +1619,7 @@ nonmovingMark (MarkQueue *queue)
         } else {
             // Nothing more to do
             debugTrace(DEBUG_nonmoving_gc, "Finished mark pass: %d", count);
+            traceConcMarkEnd(count);
             return;
         }
     }
@@ -1691,13 +1837,17 @@ void nonmovingResurrectThreads (struct MarkQueue_ *queue, StgTSO **resurrected_t
 
 void printMarkQueueEntry (MarkQueueEnt *ent)
 {
-    if (ent->type == MARK_CLOSURE) {
+    switch (nonmovingMarkQueueEntryType(ent)) {
+    case MARK_CLOSURE:
         debugBelch("Closure: ");
         printClosure(ent->mark_closure.p);
-    } else if (ent->type == MARK_ARRAY) {
+        break;
+    case MARK_ARRAY:
         debugBelch("Array\n");
-    } else {
+        break;
+    case NULL_ENTRY:
         debugBelch("End of mark\n");
+        break;
     }
 }
 
diff --git a/rts/sm/NonMovingMark.h b/rts/sm/NonMovingMark.h
index 84b6642d6c..806776cdc5 100644
--- a/rts/sm/NonMovingMark.h
+++ b/rts/sm/NonMovingMark.h
@@ -43,21 +43,40 @@ enum EntryType {
  */
 
 typedef struct {
-    enum EntryType type;
-    // All pointers should be untagged
+    // Which kind of mark queue entry we have is determined by the low bits
+    // of the second word: they must be zero for a mark_closure entry, since
+    // its second word, origin, points to a field and is therefore
+    // word-aligned. For a mark_array we set them to 0x3, shifting
+    // start_index to the left to accommodate the tag. A null_entry, with
+    // p == NULL, indicates the end of the queue.
     union {
         struct {
+            void *p;              // must be NULL
+        } null_entry;
+        struct {
             StgClosure *p;        // the object to be marked
             StgClosure **origin;  // field where this reference was found.
                                   // See Note [Origin references in the nonmoving collector]
         } mark_closure;
         struct {
             const StgMutArrPtrs *array;
-            StgWord start_index;
+            StgWord start_index;  // start index, shifted left by 16 bits;
+                                  // the low bits hold the 0x3 tag
         } mark_array;
     };
 } MarkQueueEnt;
 
+INLINE_HEADER enum EntryType nonmovingMarkQueueEntryType(MarkQueueEnt *ent)
+{
+    if (ent->null_entry.p == NULL) {
+        return NULL_ENTRY;
+    } else if (((uintptr_t) ent->mark_closure.origin & TAG_BITS) == 0) {
+        return MARK_CLOSURE;
+    } else {
+        ASSERT((ent->mark_array.start_index & TAG_BITS) == 0x3);
+        return MARK_ARRAY;
+    }
+}
+
 typedef struct {
     // index of first *unused* queue entry
     uint32_t head;
@@ -65,6 +84,9 @@ typedef struct {
     MarkQueueEnt entries[];
 } MarkQueueBlock;
 
+// How far ahead in mark queue to prefetch?
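+// Rather than being marked immediately, popped entries sit in a small ring
+// buffer while the cache lines holding their info pointer and segment header
+// are prefetched (see markQueuePop), so they are hopefully warm by the time
+// the entry is actually marked, MARK_PREFETCH_QUEUE_DEPTH pops later.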
+#define MARK_PREFETCH_QUEUE_DEPTH 5 + /* The mark queue is not capable of concurrent read or write. * * invariants: @@ -83,9 +105,12 @@ typedef struct MarkQueue_ { // Is this a mark queue or a capability-local update remembered set? bool is_upd_rem_set; - // Marked objects outside of nonmoving heap, namely large and static - // objects. - HashTable *marked_objects; +#if MARK_PREFETCH_QUEUE_DEPTH > 0 + // A ring-buffer of entries which we will mark next + MarkQueueEnt prefetch_queue[MARK_PREFETCH_QUEUE_DEPTH]; + // The first free slot in prefetch_queue. + uint8_t prefetch_head; +#endif } MarkQueue; /* While it shares its representation with MarkQueue, UpdRemSet differs in @@ -97,8 +122,11 @@ typedef struct { MarkQueue queue; } UpdRemSet; +// Number of blocks to allocate for a mark queue +#define MARK_QUEUE_BLOCKS 16 + // The length of MarkQueueBlock.entries -#define MARK_QUEUE_BLOCK_ENTRIES ((BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) +#define MARK_QUEUE_BLOCK_ENTRIES ((MARK_QUEUE_BLOCKS * BLOCK_SIZE - sizeof(MarkQueueBlock)) / sizeof(MarkQueueEnt)) extern bdescr *nonmoving_large_objects, *nonmoving_marked_large_objects; extern memcount n_nonmoving_large_blocks, n_nonmoving_marked_large_blocks; @@ -145,8 +173,10 @@ void nonmovingResurrectThreads(struct MarkQueue_ *queue, StgTSO **resurrected_th bool nonmovingIsAlive(StgClosure *p); void nonmovingMarkDeadWeak(struct MarkQueue_ *queue, StgWeak *w); void nonmovingMarkLiveWeak(struct MarkQueue_ *queue, StgWeak *w); +void nonmovingAddUpdRemSetBlocks(struct MarkQueue_ *rset); void markQueuePush(MarkQueue *q, const MarkQueueEnt *ent); +void markQueuePushClosureGC(MarkQueue *q, StgClosure *p); void markQueuePushClosure(MarkQueue *q, StgClosure *p, StgClosure **origin); diff --git a/rts/sm/NonMovingScav.c b/rts/sm/NonMovingScav.c index 850750b43b..0efbde688c 100644 --- a/rts/sm/NonMovingScav.c +++ b/rts/sm/NonMovingScav.c @@ -16,6 +16,7 @@ nonmovingScavengeOne (StgClosure *q) ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); StgPtr p = (StgPtr)q; const StgInfoTable *info = get_itbl(q); + const bool saved_eager_promotion = gct->eager_promotion; switch (info->type) { @@ -23,9 +24,11 @@ nonmovingScavengeOne (StgClosure *q) case MVAR_DIRTY: { StgMVar *mvar = ((StgMVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&mvar->head); evacuate((StgClosure **)&mvar->tail); evacuate((StgClosure **)&mvar->value); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { mvar->header.info = &stg_MVAR_DIRTY_info; } else { @@ -37,8 +40,10 @@ nonmovingScavengeOne (StgClosure *q) case TVAR: { StgTVar *tvar = ((StgTVar *)p); + gct->eager_promotion = false; evacuate((StgClosure **)&tvar->current_value); evacuate((StgClosure **)&tvar->first_watch_queue_entry); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { tvar->header.info = &stg_TVAR_DIRTY_info; } else { @@ -122,10 +127,21 @@ nonmovingScavengeOne (StgClosure *q) break; } + case WEAK: + { + // We must evacuate the key since it may refer to an object in the + // moving heap which may be long gone by the time we call + // nonmovingTidyWeaks. 
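+            // Hence the key is evacuated with eager promotion enabled here,
+            // in contrast to the mutable fields below, for which eager
+            // promotion is deliberately switched off.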
+ StgWeak *weak = (StgWeak *) p; + gct->eager_promotion = true; + evacuate(&weak->key); + gct->eager_promotion = saved_eager_promotion; + goto gen_obj; + } + gen_obj: case CONSTR: case CONSTR_NOCAF: - case WEAK: case PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; @@ -145,7 +161,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_VAR_CLEAN: case MUT_VAR_DIRTY: + gct->eager_promotion = false; evacuate(&((StgMutVar *)p)->var); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; } else { @@ -157,10 +175,12 @@ nonmovingScavengeOne (StgClosure *q) { StgBlockingQueue *bq = (StgBlockingQueue *)p; + gct->eager_promotion = false; evacuate(&bq->bh); evacuate((StgClosure**)&bq->owner); evacuate((StgClosure**)&bq->queue); evacuate((StgClosure**)&bq->link); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; @@ -202,11 +222,9 @@ nonmovingScavengeOne (StgClosure *q) case MUT_ARR_PTRS_CLEAN: case MUT_ARR_PTRS_DIRTY: { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. + gct->eager_promotion = false; scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); + gct->eager_promotion = saved_eager_promotion; if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; } else { @@ -234,14 +252,13 @@ nonmovingScavengeOne (StgClosure *q) case SMALL_MUT_ARR_PTRS_DIRTY: // follow everything { - // We don't eagerly promote objects pointed to by a mutable - // array, but if we find the array only points to objects in - // the same or an older generation, we mark it "clean" and - // avoid traversing it during minor GCs. StgPtr next = p + small_mut_arr_ptrs_sizeW((StgSmallMutArrPtrs*)p); + gct->eager_promotion = false; for (p = (P_)((StgSmallMutArrPtrs *)p)->payload; p < next; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; + if (gct->failed_to_evac) { ((StgClosure *)q)->header.info = &stg_SMALL_MUT_ARR_PTRS_DIRTY_info; } else { @@ -278,21 +295,21 @@ nonmovingScavengeOne (StgClosure *q) { StgStack *stack = (StgStack*)p; + gct->eager_promotion = false; scavenge_stack(stack->sp, stack->stack + stack->stack_size); + gct->eager_promotion = saved_eager_promotion; stack->dirty = gct->failed_to_evac; - // TODO (osa): There may be something special about stacks that we're - // missing. All other mut objects are marked by using a different info - // table except stacks. 
- break; } case MUT_PRIM: { StgPtr end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + gct->eager_promotion = false; for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { evacuate((StgClosure **)p); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } @@ -302,12 +319,14 @@ nonmovingScavengeOne (StgClosure *q) StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); + gct->eager_promotion = false; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } + gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = true; // mutable break; } diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index fa3e38cca4..7af5508afc 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -17,38 +17,6 @@ #include "Trace.h" #include "StableName.h" -static struct NonmovingSegment *pop_all_filled_segments(struct NonmovingAllocator *alloc) -{ - while (true) { - struct NonmovingSegment *head = alloc->filled; - if (cas((StgVolatilePtr) &alloc->filled, (StgWord) head, (StgWord) NULL) == (StgWord) head) - return head; - } -} - -void nonmovingPrepareSweep() -{ - ASSERT(nonmovingHeap.sweep_list == NULL); - - // Move blocks in the allocators' filled lists into sweep_list - for (unsigned int alloc_idx = 0; alloc_idx < NONMOVING_ALLOCA_CNT; alloc_idx++) - { - struct NonmovingAllocator *alloc = nonmovingHeap.allocators[alloc_idx]; - struct NonmovingSegment *filled = pop_all_filled_segments(alloc); - - // Link filled to sweep_list - if (filled) { - struct NonmovingSegment *filled_head = filled; - // Find end of filled list - while (filled->link) { - filled = filled->link; - } - filled->link = nonmovingHeap.sweep_list; - nonmovingHeap.sweep_list = filled_head; - } - } -} - // On which list should a particular segment be placed? enum SweepResult { SEGMENT_FREE, // segment is empty: place on free list @@ -102,7 +70,7 @@ nonmovingSweepSegment(struct NonmovingSegment *seg) #if defined(DEBUG) -void nonmovingGcCafs(struct MarkQueue_ *queue) +void nonmovingGcCafs() { uint32_t i = 0; StgIndStatic *next; @@ -116,7 +84,8 @@ void nonmovingGcCafs(struct MarkQueue_ *queue) const StgInfoTable *info = get_itbl((StgClosure*)caf); ASSERT(info->type == IND_STATIC); - if (lookupHashTable(queue->marked_objects, (StgWord) caf) == NULL) { + StgWord flag = ((StgWord) caf->static_link) & STATIC_BITS; + if (flag != 0 && flag != static_flag) { debugTrace(DEBUG_gccafs, "CAF gc'd at 0x%p", caf); SET_INFO((StgClosure*)caf, &stg_GCD_CAF_info); // stub it } else { @@ -184,6 +153,126 @@ GNUC_ATTR_HOT void nonmovingSweep(void) } } +/* Must a closure remain on the mutable list? + * + * A closure must remain if any of the following applies: + * + * 1. it contains references to a younger generation + * 2. it's a mutable closure (e.g. 
mutable array or MUT_PRIM) + */ +static bool is_closure_clean(StgClosure *p) +{ + const StgInfoTable *info = get_itbl(p); + +#define CLEAN(ptr) (!HEAP_ALLOCED((StgClosure*) ptr) || Bdescr((StgPtr) ptr)->gen == oldest_gen) + + switch (info->type) { + case MVAR_CLEAN: + case MVAR_DIRTY: + { + StgMVar *mvar = ((StgMVar *)p); + if (!CLEAN(mvar->head)) goto dirty_MVAR; + if (!CLEAN(mvar->tail)) goto dirty_MVAR; + if (!CLEAN(mvar->value)) goto dirty_MVAR; + mvar->header.info = &stg_MVAR_CLEAN_info; + return true; + +dirty_MVAR: + mvar->header.info = &stg_MVAR_DIRTY_info; + return false; + } + + case TVAR: + { + StgTVar *tvar = ((StgTVar *)p); + if (!CLEAN(tvar->current_value)) goto dirty_TVAR; + if (!CLEAN(tvar->first_watch_queue_entry)) goto dirty_TVAR; + tvar->header.info = &stg_TVAR_CLEAN_info; + return true; + +dirty_TVAR: + tvar->header.info = &stg_TVAR_DIRTY_info; + return false; + } + + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_1_1: + case THUNK_0_2: + case THUNK_2_0: + { + StgPtr end = (StgPtr)((StgThunk *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgThunk *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case FUN: + case FUN_1_0: // hardly worth specialising these guys + case FUN_0_1: + case FUN_1_1: + case FUN_0_2: + case FUN_2_0: + case CONSTR: + case CONSTR_NOCAF: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_1_1: + case CONSTR_0_2: + case CONSTR_2_0: + case PRIM: + { + StgPtr end = (StgPtr)((StgClosure *)p)->payload + info->layout.payload.ptrs; + for (StgPtr q = (StgPtr)((StgClosure *)p)->payload; q < end; q++) { + if (!CLEAN(*q)) return false; + } + return true; + } + + case WEAK: + return false; // TODO + + case MUT_VAR_CLEAN: + case MUT_VAR_DIRTY: + if (!CLEAN(((StgMutVar *)p)->var)) { + p->header.info = &stg_MUT_VAR_DIRTY_info; + return false; + } else { + p->header.info = &stg_MUT_VAR_CLEAN_info; + return true; + } + + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + + if (!CLEAN(bq->bh)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->owner)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->queue)) goto dirty_BLOCKING_QUEUE; + if (!CLEAN(bq->link)) goto dirty_BLOCKING_QUEUE; + bq->header.info = &stg_BLOCKING_QUEUE_CLEAN_info; + return true; + +dirty_BLOCKING_QUEUE: + bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; + return false; + } + + case THUNK_SELECTOR: + return CLEAN(((StgSelector *) p)->selectee); + + case ARR_WORDS: + return true; + + default: + // TODO: the rest + return false; + } +#undef CLEAN +} + /* N.B. This happens during the pause so we own all capabilities. */ void nonmovingSweepMutLists() { @@ -194,7 +283,7 @@ void nonmovingSweepMutLists() for (bdescr *bd = old_mut_list; bd; bd = bd->link) { for (StgPtr p = bd->start; p < bd->free; p++) { StgClosure **q = (StgClosure**)p; - if (nonmovingIsAlive(*q)) { + if (nonmovingIsAlive(*q) && !is_closure_clean(*q)) { recordMutableCap(*q, cap, oldest_gen->no); } } diff --git a/rts/sm/NonMovingSweep.h b/rts/sm/NonMovingSweep.h index de2d52acb5..5ae5b687e3 100644 --- a/rts/sm/NonMovingSweep.h +++ b/rts/sm/NonMovingSweep.h @@ -22,11 +22,7 @@ void nonmovingSweepLargeObjects(void); // Remove dead entries in the stable name table void nonmovingSweepStableNameTable(void); -// Collect the set of segments to be collected during a major GC into -// nonmovingHeap.sweep_list. -void nonmovingPrepareSweep(void); - #if defined(DEBUG) // The non-moving equivalent of the moving collector's gcCAFs. 
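// It identifies unmarked CAFs by their STATIC_LINK flag bits (see
// bump_static_flag in NonMovingMark.c) and stubs them with stg_GCD_CAF_info.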
-void nonmovingGcCafs(struct MarkQueue_ *queue);
+void nonmovingGcCafs(void);
 #endif
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 44163a353f..f04b3c5929 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -321,7 +321,8 @@ freeStorage (bool free_heap)
 }
 
 /* -----------------------------------------------------------------------------
-   Note [CAF management].
+   Note [CAF management]
+   ~~~~~~~~~~~~~~~~~~~~~
 
    The entry code for every CAF does the following:
@@ -356,6 +357,7 @@ freeStorage (bool free_heap)
    ------------------
    Note [atomic CAF entry]
+   ~~~~~~~~~~~~~~~~~~~~~~~
 
    With THREADED_RTS, newCAF() is required to be atomic (see #5558). This is
    because if two threads happened to enter the same
@@ -369,6 +371,7 @@ freeStorage (bool free_heap)
    ------------------
    Note [GHCi CAFs]
+   ~~~~~~~~~~~~~~~~
 
    For GHCI, we have additional requirements when dealing with CAFs:
@@ -388,6 +391,51 @@ freeStorage (bool free_heap)
       -- SDM 29/1/01
 
+   ------------------
+   Note [Static objects under the nonmoving collector]
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Static object management is a bit tricky under the nonmoving collector as
+   we need to maintain a bit more state than in the moving collector. In
+   particular, the moving collector uses the low bits of the STATIC_LINK field
+   to determine whether the object has been moved to the scavenger's work list
+   (see Note [STATIC_LINK fields] in Storage.h).
+
+   However, the nonmoving collector also needs a place to keep its mark bit.
+   This is problematic as we therefore need at least three bits of state
+   but can assume only two bits are available in STATIC_LINK (due to 32-bit
+   systems).
+
+   To accommodate this we move handling of static objects entirely to the
+   oldest generation when the nonmoving collector is in use. To do this safely
+   and efficiently we allocate the blackhole created by lockCAF() directly in
+   the non-moving heap. This means that the moving collector can completely
+   ignore static objects in minor collections since they are guaranteed not to
+   have any references into the moving heap. Of course, the blackhole itself
+   will likely contain a reference into the moving heap, but this is
+   significantly easier to handle, being a heap-allocated object (see Note
+   [Aging under the non-moving collector] in NonMoving.c for details).
+
+   During the moving phase of a major collection we treat static objects
+   as we do any other reference into the non-moving heap by pushing them
+   to the non-moving mark queue (see Note [Aging under the non-moving
+   collector]).
+
+   This allows the non-moving collector to have full control over the flags
+   in STATIC_LINK, which it uses as described in Note [STATIC_LINK fields].
+   This is implemented by NonMovingMark.c:bump_static_flag.
+
+   In short, the plan is:
+
+   - lockCAF allocates its blackhole in the nonmoving heap. This is important
+     to ensure that we do not need to place the static object on the mut_list,
+     lest we would need some way to ensure that it is evacuated only once
+     during a moving collection.
+
+   - evacuate_static_object merely pushes objects to the mark queue.
+
+   - the nonmoving collector uses the flags in STATIC_LINK as its mark bit.
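+
+   To illustrate the mark-side effect of this plan (a sketch, simplified
+   from mark_closure in NonMovingMark.c), every static-object case there
+   now reduces to
+
+       if (bump_static_flag(STATIC_LINK(info, p), p)) {
+           // first visit this cycle: push the SRT and pointer fields
+       }
+
+   so each static object's fields are traced at most once per major GC.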
+
   -------------------------------------------------------------------------- */
 
 STATIC_INLINE StgInd *
@@ -441,7 +489,16 @@ lockCAF (StgRegTable *reg, StgIndStatic *caf)
     caf->saved_info = orig_info;
 
     // Allocate the blackhole indirection closure
-    bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    if (RtsFlags.GcFlags.useNonmoving) {
+        // See Note [Static objects under the nonmoving collector].
+        ACQUIRE_SM_LOCK;
+        bh = (StgInd *)nonmovingAllocate(cap, sizeofW(*bh));
+        RELEASE_SM_LOCK;
+        recordMutableCap((StgClosure*)bh,
+                         regTableToCapability(reg), oldest_gen->no);
+    } else {
+        bh = (StgInd *)allocate(cap, sizeofW(*bh));
+    }
     bh->indirectee = (StgClosure *)cap->r.rCurrentTSO;
     SET_HDR(bh, &stg_CAF_BLACKHOLE_info, caf->header.prof.ccs);
     // Ensure that above writes are visible before we introduce reference as CAF indirectee.
@@ -483,7 +540,9 @@ newCAF(StgRegTable *reg, StgIndStatic *caf)
     else
     {
         // Put this CAF on the mutable list for the old generation.
-        if (oldest_gen->no != 0) {
+        // N.B. the nonmoving collector works a bit differently: see
+        // Note [Static objects under the nonmoving collector].
+        if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
             recordMutableCap((StgClosure*)caf,
                              regTableToCapability(reg), oldest_gen->no);
         }
@@ -560,7 +619,9 @@ StgInd* newGCdCAF (StgRegTable *reg, StgIndStatic *caf)
     if (!bh) return NULL;
 
     // Put this CAF on the mutable list for the old generation.
-    if (oldest_gen->no != 0) {
+    // N.B. the nonmoving collector works a bit differently:
+    // see Note [Static objects under the nonmoving collector].
+    if (oldest_gen->no != 0 && !RtsFlags.GcFlags.useNonmoving) {
        recordMutableCap((StgClosure*)caf,
                         regTableToCapability(reg), oldest_gen->no);
     }
@@ -1520,6 +1581,7 @@ void flushExec (W_ len, AdjustorExecutable exec_addr)
   __clear_cache((void*)begin, (void*)end);
 # endif
 #elif defined(__GNUC__)
+  /* For all other platforms, fall back to a libgcc builtin. */
   unsigned char* begin = (unsigned char*)exec_addr;
   unsigned char* end = begin + len;
 # if GCC_HAS_BUILTIN_CLEAR_CACHE
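For reference, the __GNUC__ fallback above amounts to the following minimal
standalone sketch; the function name is invented here and this is not part
of the patch:

#include <stddef.h>

/* After writing machine code into an executable buffer, flush the
 * instruction cache over the written range. This is effectively a no-op on
 * x86 but is required for correctness on e.g. ARM. */
static void flush_exec_sketch(void *exec_addr, size_t len)
{
    char *begin = (char *) exec_addr;
    char *end   = begin + len;
#if defined(__GNUC__)
    __builtin___clear_cache(begin, end);
#else
    (void) begin; (void) end;  /* no portable fallback */
#endif
}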