diff options
author | Ben Gamari <ben@smart-cactus.org> | 2020-02-20 20:57:48 -0500 |
---|---|---|
committer | Marge Bot <ben+marge-bot@smart-cactus.org> | 2020-03-05 14:53:12 -0500 |
commit | ace618cd2294989e783bd453cee88e0e1c0dad77 (patch) | |
tree | 97d34db5521cf1781e57e786eb41f14087808cf9 /rts | |
parent | cedd6f3041de6abe64dfa3257bec7730a9dced9f (diff) | |
download | haskell-ace618cd2294989e783bd453cee88e0e1c0dad77.tar.gz |
nonmoving-gc: Track time usage of nonmoving marking
Diffstat (limited to 'rts')
-rw-r--r-- | rts/ProfHeap.c | 4 | ||||
-rw-r--r-- | rts/Stats.c | 190 | ||||
-rw-r--r-- | rts/Stats.h | 12 | ||||
-rw-r--r-- | rts/sm/GC.c | 16 | ||||
-rw-r--r-- | rts/sm/GCThread.h | 8 | ||||
-rw-r--r-- | rts/sm/NonMoving.c | 3 | ||||
-rw-r--r-- | rts/sm/NonMovingMark.c | 3 |
7 files changed, 212 insertions, 24 deletions
diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index bc439c1678..efb312e899 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -1212,6 +1212,8 @@ heapCensusChain( Census *census, bdescr *bd ) } } +// Time is process CPU time of beginning of current GC and is used as +// the mutator CPU time reported as the census timestamp. void heapCensus (Time t) { uint32_t g, n; @@ -1219,7 +1221,7 @@ void heapCensus (Time t) gen_workspace *ws; census = &censuses[era]; - census->time = mut_user_time_until(t); + census->time = TimeToSecondsDbl(t); census->rtime = TimeToNS(stat_getElapsedTime()); diff --git a/rts/Stats.c b/rts/Stats.c index 889f6d92f3..0e6cb09b27 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -26,14 +26,14 @@ #include <string.h> // for memset -#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION) - static Time start_init_cpu, start_init_elapsed, end_init_cpu, end_init_elapsed, start_exit_cpu, start_exit_elapsed, start_exit_gc_elapsed, start_exit_gc_cpu, - end_exit_cpu, end_exit_elapsed; + end_exit_cpu, end_exit_elapsed, + start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed, + start_nonmoving_gc_sync_elapsed; #if defined(PROFILING) static Time RP_start_time = 0, RP_tot_time = 0; // retainer prof user time @@ -84,7 +84,7 @@ Time stat_getElapsedTime(void) double mut_user_time_until( Time t ) { - return TimeToSecondsDbl(t - stats.gc_cpu_ns); + return TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns); // heapCensus() time is included in GC_tot_cpu, so we don't need // to subtract it here. @@ -125,6 +125,10 @@ initStats0(void) end_init_cpu = 0; end_init_elapsed = 0; + start_nonmoving_gc_cpu = 0; + start_nonmoving_gc_elapsed = 0; + start_nonmoving_gc_sync_elapsed = 0; + start_exit_cpu = 0; start_exit_elapsed = 0; start_exit_gc_cpu = 0; @@ -175,6 +179,11 @@ initStats0(void) .gc_elapsed_ns = 0, .cpu_ns = 0, .elapsed_ns = 0, + .nonmoving_gc_cpu_ns = 0, + .nonmoving_gc_elapsed_ns = 0, + .nonmoving_gc_max_elapsed_ns = 0, + .nonmoving_gc_sync_elapsed_ns = 0, + .nonmoving_gc_sync_max_elapsed_ns = 0, .gc = { .gen = 0, .threads = 0, @@ -189,7 +198,10 @@ initStats0(void) .par_balanced_copied_bytes = 0, .sync_elapsed_ns = 0, .cpu_ns = 0, - .elapsed_ns = 0 + .elapsed_ns = 0, + .nonmoving_gc_cpu_ns = 0, + .nonmoving_gc_elapsed_ns = 0, + .nonmoving_gc_sync_elapsed_ns = 0, } }; } @@ -274,6 +286,11 @@ stat_startExit(void) start_exit_gc_cpu = stats.gc_cpu_ns; } +/* ----------------------------------------------------------------------------- + Nonmoving (concurrent) collector statistics + + These two measure the time taken in the concurrent mark & sweep collector. + -------------------------------------------------------------------------- */ void stat_endExit(void) { @@ -286,10 +303,115 @@ stat_startGCSync (gc_thread *gct) gct->gc_sync_start_elapsed = getProcessElapsedTime(); } +void +stat_startNonmovingGc () +{ + start_nonmoving_gc_cpu = getCurrentThreadCPUTime(); + start_nonmoving_gc_elapsed = getProcessCPUTime(); +} + +void +stat_endNonmovingGc () +{ + Time cpu = getCurrentThreadCPUTime(); + Time elapsed = getProcessCPUTime(); + stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed; + stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns; + + stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu; + stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns; + + stats.nonmoving_gc_max_elapsed_ns = + stg_max(stats.gc.nonmoving_gc_elapsed_ns, + stats.nonmoving_gc_max_elapsed_ns); +} + +void +stat_startNonmovingGcSync () +{ + start_nonmoving_gc_sync_elapsed = getProcessElapsedTime(); + traceConcSyncBegin(); +} + +void +stat_endNonmovingGcSync () +{ + Time end_elapsed = getProcessElapsedTime(); + stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed; + stats.nonmoving_gc_sync_elapsed_ns += stats.gc.nonmoving_gc_sync_elapsed_ns; + stats.nonmoving_gc_sync_max_elapsed_ns = + stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns, + stats.nonmoving_gc_sync_max_elapsed_ns); + traceConcSyncEnd(); +} + /* ----------------------------------------------------------------------------- Called at the beginning of each GC -------------------------------------------------------------------------- */ +/* + * Note [Time accounting] + * ~~~~~~~~~~~~~~~~~~~~~~ + * In the "vanilla" configuration (using the standard copying GC) GHC keeps + * track of a two different sinks of elapsed and CPU time: + * + * - time spent synchronising to initiate garbage collection + * - garbage collection (per generation) + * - mutation + * + * When using the (concurrent) non-moving garbage collector (see Note + * [Non-moving garbage collector]) we also track a few more sinks: + * + * - minor GC + * - major GC (namly time spent in the preparatory phase) + * - concurrent mark + * - final synchronization (elapsed only) + * - mutation + * + * To keep track of these CPU times we rely on the system's per-thread CPU time + * clock (exposed via the runtime's getCurrentThreadCPUTime utility). + * + * CPU time spent in the copying garbage collector is tracked in each GC + * worker's gc_thread struct. At the beginning of scavenging each worker + * records its OS thread's CPU time its gc_thread (by stat_startGCWorker). At + * the end of scavenging we again record the CPU time (in stat_endGCworker). + * The differences of these are then summed over by the thread leading the GC + * at the end of collection in stat_endGC. By contrast, the elapsed time is + * recorded only by the leader. + * + * Mutator time is derived from the process's CPU time, subtracting out + * contributions from stop-the-world and concurrent GCs. + * + * Time spent in concurrent marking is recorded by stat_{start,end}NonmovingGc. + * Likewise, elapsed time spent in the final synchronization is recorded by + * stat_{start,end}NonmovingGcSync. + */ + +void +stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct) +{ + bool stats_enabled = + RtsFlags.GcFlags.giveStats != NO_GC_STATS || + rtsConfig.gcDoneHook != NULL; + + if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) { + gct->gc_start_cpu = getCurrentThreadCPUTime(); + } +} + +void +stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct) +{ + bool stats_enabled = + RtsFlags.GcFlags.giveStats != NO_GC_STATS || + rtsConfig.gcDoneHook != NULL; + + if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) { + gct->gc_end_cpu = getCurrentThreadCPUTime(); + ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu); + } +} + void stat_startGC (Capability *cap, gc_thread *gct) { @@ -297,7 +419,15 @@ stat_startGC (Capability *cap, gc_thread *gct) debugBelch("\007"); } - getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed); + bool stats_enabled = + RtsFlags.GcFlags.giveStats != NO_GC_STATS || + rtsConfig.gcDoneHook != NULL; + + if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) { + gct->gc_start_cpu = getCurrentThreadCPUTime(); + } + + gct->gc_start_elapsed = getProcessElapsedTime(); // Post EVENT_GC_START with the same timestamp as used for stats // (though converted from Time=StgInt64 to EventTimestamp=StgWord64). @@ -320,9 +450,9 @@ stat_startGC (Capability *cap, gc_thread *gct) -------------------------------------------------------------------------- */ void -stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, - uint32_t gen, uint32_t par_n_threads, W_ par_max_copied, - W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield, +stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop, + uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads, + W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work, W_ scav_find_work) { @@ -364,9 +494,14 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, stats.elapsed_ns = current_elapsed - start_init_elapsed; stats.gc.sync_elapsed_ns = - gct->gc_start_elapsed - gct->gc_sync_start_elapsed; - stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed; - stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu; + initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed; + stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed; + stats.gc.cpu_ns = 0; + for (unsigned int i=0; i < par_n_threads; i++) { + gc_thread *gct = gc_threads[i]; + ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu); + stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu; + } } // ------------------------------------------------- // Update the cumulative stats @@ -473,8 +608,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, TimeToSecondsDbl(stats.gc.elapsed_ns), TimeToSecondsDbl(stats.cpu_ns), TimeToSecondsDbl(stats.elapsed_ns), - faults - gct->gc_start_faults, - gct->gc_start_faults - GC_end_faults, + faults - initiating_gct->gc_start_faults, + initiating_gct->gc_start_faults - GC_end_faults, gen); GC_end_faults = faults; @@ -700,6 +835,21 @@ static void report_summary(const RTSSummaryStats* sum) TimeToSecondsDbl(gen_stats->avg_pause_ns), TimeToSecondsDbl(gen_stats->max_pause_ns)); } + if (RtsFlags.GcFlags.useNonmoving) { + const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections; + statsPrintf(" Gen 1 %5d syncs" + ", %6.3fs %3.4fs %3.4fs\n", + n_major_colls, + TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns), + TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls, + TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns)); + statsPrintf(" Gen 1 concurrent" + ", %6.3fs %6.3fs %3.4fs %3.4fs\n", + TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns), + TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns), + TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls, + TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns)); + } statsPrintf("\n"); @@ -736,6 +886,12 @@ static void report_summary(const RTSSummaryStats* sum) statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n", TimeToSecondsDbl(stats.gc_cpu_ns), TimeToSecondsDbl(stats.gc_elapsed_ns)); + if (RtsFlags.GcFlags.useNonmoving) { + statsPrintf( + " CONC GC time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns), + TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns)); + } #if defined(PROFILING) statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n", @@ -1094,7 +1250,8 @@ stat_exit (void) stats.mutator_cpu_ns = start_exit_cpu - end_init_cpu - - (stats.gc_cpu_ns - exit_gc_cpu); + - (stats.gc_cpu_ns - exit_gc_cpu) + - stats.nonmoving_gc_cpu_ns; stats.mutator_elapsed_ns = start_exit_elapsed - end_init_elapsed - (stats.gc_elapsed_ns - exit_gc_elapsed); @@ -1504,7 +1661,8 @@ void getRTSStats( RTSStats *s ) s->cpu_ns = current_cpu - end_init_cpu; s->elapsed_ns = current_elapsed - end_init_elapsed; - s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns; + s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns - + stats.nonmoving_gc_cpu_ns; s->mutator_elapsed_ns = current_elapsed - end_init_elapsed - stats.gc_elapsed_ns; } diff --git a/rts/Stats.h b/rts/Stats.h index 1f498aea23..f5b8ce9991 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -30,13 +30,21 @@ void stat_endInit(void); void stat_startGCSync(struct gc_thread_ *_gct); void stat_startGC(Capability *cap, struct gc_thread_ *_gct); -void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live, - W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads, +void stat_startGCWorker (Capability *cap, struct gc_thread_ *_gct); +void stat_endGCWorker (Capability *cap, struct gc_thread_ *_gct); +void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ live, + W_ copied, W_ slop, uint32_t gen, + uint32_t n_gc_threads, struct gc_thread_ **gc_threads, W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work, W_ scav_find_work); +void stat_startNonmovingGcSync(void); +void stat_endNonmovingGcSync(void); +void stat_startNonmovingGc (void); +void stat_endNonmovingGc (void); + #if defined(PROFILING) void stat_startRP(void); void stat_endRP(uint32_t, int, double); diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 526c84f411..16d8c77453 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -209,6 +209,14 @@ GarbageCollect (uint32_t collect_gen, gc_thread *saved_gct; #endif uint32_t g, n; + // The time we should report our heap census as occurring at, if necessary. + Time mut_time = 0; + + if (do_heap_census) { + RTSStats stats; + getRTSStats(&stats); + mut_time = stats.mutator_cpu_ns; + } // necessary if we stole a callee-saves register for gct: #if defined(THREADED_RTS) @@ -846,7 +854,7 @@ GarbageCollect (uint32_t collect_gen, if (do_heap_census) { debugTrace(DEBUG_sched, "performing heap census"); RELEASE_SM_LOCK; - heapCensus(gct->gc_start_cpu); + heapCensus(mut_time); ACQUIRE_SM_LOCK; } @@ -927,9 +935,11 @@ GarbageCollect (uint32_t collect_gen, #endif // ok, GC over: tell the stats department what happened. + stat_endGCWorker(cap, gct); stat_endGC(cap, gct, live_words, copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */, - N, n_gc_threads, par_max_copied, par_balanced_copied, + N, n_gc_threads, gc_threads, + par_max_copied, par_balanced_copied, gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield, any_work, no_work, scav_find_work); @@ -1209,6 +1219,7 @@ gcWorkerThread (Capability *cap) SET_GCT(gc_threads[cap->no]); gct->id = osThreadId(); + stat_startGCWorker (cap, gct); // Wait until we're told to wake up RELEASE_SPIN_LOCK(&gct->mut_spin); @@ -1247,6 +1258,7 @@ gcWorkerThread (Capability *cap) gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE; debugTrace(DEBUG_gc, "GC thread %d waiting to continue...", gct->thread_index); + stat_endGCWorker (cap, gct); ACQUIRE_SPIN_LOCK(&gct->mut_spin); debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h index 3012f52f28..723bb00bf1 100644 --- a/rts/sm/GCThread.h +++ b/rts/sm/GCThread.h @@ -185,9 +185,11 @@ typedef struct gc_thread_ { W_ no_work; W_ scav_find_work; - Time gc_start_cpu; // process CPU time - Time gc_sync_start_elapsed; // start of GC sync - Time gc_start_elapsed; // process elapsed time + Time gc_start_cpu; // thread CPU time + Time gc_end_cpu; // thread CPU time + Time gc_sync_start_elapsed; // start of GC sync + Time gc_start_elapsed; // process elapsed time + Time gc_end_elapsed; // process elapsed time W_ gc_start_faults; // ------------------- diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 0bd96d1800..2dd201f0f9 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -17,6 +17,7 @@ #include "GCThread.h" #include "GCTDecl.h" #include "Schedule.h" +#include "Stats.h" #include "NonMoving.h" #include "NonMovingMark.h" @@ -947,6 +948,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * { ACQUIRE_LOCK(&nonmoving_collection_mutex); debugTrace(DEBUG_nonmoving_gc, "Starting mark..."); + stat_startNonmovingGc(); // Do concurrent marking; most of the heap will get marked here. nonmovingMarkThreadsWeaks(mark_queue); @@ -1098,6 +1100,7 @@ finish: // We are done... mark_thread = 0; + stat_endNonmovingGc(); // Signal that the concurrent collection is finished, allowing the next // non-moving collection to proceed diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 6d70ca21dd..3113e7f013 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ -21,6 +21,7 @@ #include "Printer.h" #include "Schedule.h" #include "Weak.h" +#include "Stats.h" #include "STM.h" #include "MarkWeak.h" #include "sm/Storage.h" @@ -316,6 +317,7 @@ void nonmovingBeginFlush(Task *task) debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush..."); traceConcSyncBegin(); upd_rem_set_flush_count = 0; + stat_startNonmovingGcSync(); stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET); // XXX: We may have been given a capability via releaseCapability (i.e. a @@ -407,6 +409,7 @@ void nonmovingFinishFlush(Task *task) debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush..."); traceConcSyncEnd(); + stat_endNonmovingGcSync(); releaseAllCapabilities(n_capabilities, NULL, task); } #endif |