diff options
author | Douglas Wilson <douglas.wilson@gmail.com> | 2017-07-11 11:54:09 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2017-07-11 13:41:54 -0400 |
commit | 7c9e356de1114ab3e31f2d6d03e83672076dd533 (patch) | |
tree | 5a89a142a31a53486e41c75673024548ce4c39ce /rts | |
parent | 905dc8bc74bebf5370eb9237cc8756cd9fe871ae (diff) | |
download | haskell-7c9e356de1114ab3e31f2d6d03e83672076dd533.tar.gz |
Fix Work Balance computation in RTS stats
An additional stat is tracked per gc: par_balanced_copied This is the
the number of bytes copied by each gc thread under the balanced lmit,
which is simply (copied_bytes / num_gc_threads). The stat is added to
all the appropriate GC structures, so is visible in the eventlog and in
GHC.Stats.
A note is added explaining how work balance is computed.
Remove some end of line whitespace
Test Plan:
./validate
experiment with the program attached to the ticket
examine code changes carefully
Reviewers: simonmar, austin, hvr, bgamari, erikd
Reviewed By: simonmar
Subscribers: Phyx, rwbarton, thomie
GHC Trac Issues: #13830
Differential Revision: https://phabricator.haskell.org/D3658
Diffstat (limited to 'rts')
-rw-r--r-- | rts/RtsProbes.d | 2 | ||||
-rw-r--r-- | rts/Stats.c | 97 | ||||
-rw-r--r-- | rts/Stats.h | 2 | ||||
-rw-r--r-- | rts/Trace.c | 6 | ||||
-rw-r--r-- | rts/Trace.h | 22 | ||||
-rw-r--r-- | rts/eventlog/EventLog.c | 13 | ||||
-rw-r--r-- | rts/eventlog/EventLog.h | 3 | ||||
-rw-r--r-- | rts/sm/GC.c | 21 |
8 files changed, 141 insertions, 25 deletions
diff --git a/rts/RtsProbes.d b/rts/RtsProbes.d index e5d8378845..277a494632 100644 --- a/rts/RtsProbes.d +++ b/rts/RtsProbes.d @@ -62,7 +62,7 @@ provider HaskellEvent { probe gc__work (EventCapNo); probe gc__done (EventCapNo); probe gc__global__sync (EventCapNo); - probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord); + probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord); probe heap__info (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord); probe heap__allocated (EventCapNo, EventCapsetID, StgWord64); probe heap__size (EventCapsetID, StgWord); diff --git a/rts/Stats.c b/rts/Stats.c index b0c1be0582..6a5f80130e 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -149,6 +149,7 @@ initStats0(void) .copied_bytes = 0, .par_copied_bytes = 0, .cumulative_par_max_copied_bytes = 0, + .cumulative_par_balanced_copied_bytes = 0, .mutator_cpu_ns = 0, .mutator_elapsed_ns = 0, .gc_cpu_ns = 0, @@ -166,6 +167,7 @@ initStats0(void) .mem_in_use_bytes = 0, .copied_bytes = 0, .par_max_copied_bytes = 0, + .par_balanced_copied_bytes = 0, .sync_elapsed_ns = 0, .cpu_ns = 0, .elapsed_ns = 0 @@ -283,7 +285,8 @@ stat_startGC (Capability *cap, gc_thread *gct) void stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, uint32_t gen, - uint32_t par_n_threads, W_ par_max_copied) + uint32_t par_n_threads, W_ par_max_copied, + W_ par_balanced_copied) { // ------------------------------------------------- // Collect all the stats about this GC in stats.gc. We always do this since @@ -305,6 +308,7 @@ stat_endGC (Capability *cap, gc_thread *gct, stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE; stats.gc.copied_bytes = copied * sizeof(W_); stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_); + stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_); // ------------------------------------------------- // Update the cumulative stats @@ -324,6 +328,8 @@ stat_endGC (Capability *cap, gc_thread *gct, stats.par_copied_bytes += stats.gc.copied_bytes; stats.cumulative_par_max_copied_bytes += stats.gc.par_max_copied_bytes; + stats.cumulative_par_balanced_copied_bytes += + stats.gc.par_balanced_copied_bytes; } stats.gc_cpu_ns += stats.gc.cpu_ns; stats.gc_elapsed_ns += stats.gc.elapsed_ns; @@ -385,7 +391,8 @@ stat_endGC (Capability *cap, gc_thread *gct, * BLOCK_SIZE, par_n_threads, stats.gc.par_max_copied_bytes, - stats.gc.copied_bytes); + stats.gc.copied_bytes, + stats.gc.par_balanced_copied_bytes); // Post EVENT_GC_END with the same timestamp as used for stats // (though converted from Time=StgInt64 to EventTimestamp=StgWord64). @@ -672,11 +679,11 @@ stat_exit (void) } #if defined(THREADED_RTS) - if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) { + if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) { + // See Note [Work Balance] statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n", - 100 * (((double)stats.par_copied_bytes / (double)stats.cumulative_par_max_copied_bytes) - 1) - / (n_capabilities - 1) - ); + 100 * (double)stats.cumulative_par_balanced_copied_bytes / + (double)stats.par_copied_bytes); } #endif statsPrintf("\n"); @@ -825,6 +832,84 @@ stat_exit (void) } } +/* Note [Work Balance] +---------------------- +Work balance is a measure of how evenly the work done during parallel garbage +collection is spread across threads. To compute work balance we must take care +to account for the number of GC threads changing between GCs. The statistics we +track must have the number of GC threads "integrated out". + +We accumulate two values from each garbage collection: +* par_copied: is a measure of the total work done during parallel garbage + collection +* par_balanced_copied: is a measure of the balanced work done + during parallel garbage collection. + +par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more +complicated: + +For a given garbage collection: +Let gc_copied := total copies during the gc + gc_copied_i := copies by the ith thread during the gc + num_gc_threads := the number of threads participating in the gc + balance_limit := (gc_copied / num_gc_threads) + +If we were to graph gc_copied_i, sorted from largest to smallest we would see +something like: + + |X + ^ |X X + | |X X X X: unbalanced copies +copies |----------- Y: balanced copies by the busiest GC thread + |Y Z Z Z: other balanced copies + |Y Z Z Z + |Y Z Z Z Z + |Y Z Z Z Z Z + |=========== + |1 2 3 4 5 6 + i -> + +where the --- line is at balance_limit. Balanced copies are those under the --- +line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will +always equal balance_limit. Completely balanced gc has every thread copying +balance_limit and a completely unbalanced gc has a single thread copying +gc_copied. + +One could define par_balance_copied as the areas of the Ys and Zs in the graph +above, however we would like the ratio of (par_balance_copied / gc_copied) to +range from 0 to 1, so that work_balance will be a nice percentage, also ranging +from 0 to 1. We therefore define par_balanced_copied as: + + ( num_gc_threads ) +{Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------) + i (num_gc_threads - 1) + vvv vvv + S T + +Where the S and T terms serve to remove the area of the Ys, and +to normalize the result to lie between 0 and gc_copied. + +Note that the implementation orders these operations differently to minimize +error due to integer rounding. + +Then cumulative work balance is computed as +(cumulative_par_balanced_copied_bytes / par_copied_byes) + +Previously, cumulative work balance was computed as: + +(cumulative_par_max_copied_bytes) +(-------------------------------) - 1 +( par_copied_bytes ) +------------------------------------- + (n_capabilities - 1) + +This was less accurate than the current method, and invalid whenever a garbage +collection had occurred with num_gc_threads /= n_capabilities; which can happen +when setNumCapabilities is called, when -qn is passed as an RTS option, or when +the number of gc threads is limited to the number of cores. +See #13830 +*/ + /* ----------------------------------------------------------------------------- stat_describe_gens diff --git a/rts/Stats.h b/rts/Stats.h index ff018716c4..5d9cf04fa7 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -30,7 +30,7 @@ void stat_startGCSync(struct gc_thread_ *_gct); void stat_startGC(Capability *cap, struct gc_thread_ *_gct); void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live, W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads, - W_ par_max_copied); + W_ par_max_copied, W_ par_balanced_copied); #if defined(PROFILING) void stat_startRP(void); diff --git a/rts/Trace.c b/rts/Trace.c index 8978e2722b..71403f8a57 100644 --- a/rts/Trace.c +++ b/rts/Trace.c @@ -347,7 +347,8 @@ void traceEventGcStats_ (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied) + W_ par_tot_copied, + W_ par_balanced_copied) { #if defined(DEBUG) if (RtsFlags.TraceFlags.tracing == TRACE_STDERR) { @@ -357,7 +358,8 @@ void traceEventGcStats_ (Capability *cap, { postEventGcStats(cap, heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } } diff --git a/rts/Trace.h b/rts/Trace.h index 2058ac2e05..624d459420 100644 --- a/rts/Trace.h +++ b/rts/Trace.h @@ -151,7 +151,8 @@ void traceEventGcStats_ (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied); + W_ par_tot_copied, + W_ par_balanced_copied); /* * Record a spark event @@ -302,7 +303,8 @@ void traceHeapProfSampleCostCentre(StgWord8 profile_id, #define traceGcEventAtT(cap, ts, tag) /* nothing */ #define traceEventGcStats_(cap, heap_capset, gen, \ copied, slop, fragmentation, \ - par_n_threads, par_max_copied, par_tot_copied) /* nothing */ + par_n_threads, par_max_copied, \ + par_tot_copied, par_balanced_copied) /* nothing */ #define traceHeapEvent(cap, tag, heap_capset, info1) /* nothing */ #define traceEventHeapInfo_(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ @@ -395,11 +397,13 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg); copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ - par_tot_copied) \ + par_tot_copied, \ + par_balanced_copied) HASKELLEVENT_GC_STATS(heap_capset, gens, \ copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ + par_balanced_copied, \ par_tot_copied) #define dtraceHeapInfo(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ @@ -470,7 +474,8 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg); copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ - par_tot_copied) /* nothing */ + par_tot_copied, \ + par_balanced_copied) /* nothing */ #define dtraceHeapInfo(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ mblockSize, blockSize) /* nothing */ @@ -663,16 +668,19 @@ INLINE_HEADER void traceEventGcStats(Capability *cap STG_UNUSED, W_ fragmentation STG_UNUSED, uint32_t par_n_threads STG_UNUSED, W_ par_max_copied STG_UNUSED, - W_ par_tot_copied STG_UNUSED) + W_ par_tot_copied STG_UNUSED, + W_ par_balanced_copied STG_UNUSED) { if (RTS_UNLIKELY(TRACE_gc)) { traceEventGcStats_(cap, heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } dtraceEventGcStats(heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } INLINE_HEADER void traceEventHeapInfo(CapsetID heap_capset STG_UNUSED, diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index c175aecd3e..ed297b879e 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -412,12 +412,14 @@ initEventLogging(const EventLogWriter *ev_writer) case EVENT_GC_STATS_GHC: // (heap_capset, generation, // copied_bytes, slop_bytes, frag_bytes, // par_n_threads, - // par_max_copied, par_tot_copied) + // par_max_copied, par_tot_copied, + // par_balanced_copied + // ) eventTypes[t].size = sizeof(EventCapsetID) + sizeof(StgWord16) + sizeof(StgWord64) * 3 + sizeof(StgWord32) - + sizeof(StgWord64) * 2; + + sizeof(StgWord64) * 3; break; case EVENT_TASK_CREATE: // (taskId, cap, tid) @@ -903,7 +905,8 @@ void postEventGcStats (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied) + W_ par_tot_copied, + W_ par_balanced_copied) { EventsBuf *eb = &capEventBuf[cap->no]; ensureRoomForEvent(eb, EVENT_GC_STATS_GHC); @@ -911,7 +914,8 @@ void postEventGcStats (Capability *cap, postEventHeader(eb, EVENT_GC_STATS_GHC); /* EVENT_GC_STATS_GHC (heap_capset, generation, copied_bytes, slop_bytes, frag_bytes, - par_n_threads, par_max_copied, par_tot_copied) */ + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied) */ postCapsetID(eb, heap_capset); postWord16(eb, gen); postWord64(eb, copied); @@ -920,6 +924,7 @@ void postEventGcStats (Capability *cap, postWord32(eb, par_n_threads); postWord64(eb, par_max_copied); postWord64(eb, par_tot_copied); + postWord64(eb, par_balanced_copied); } void postTaskCreateEvent (EventTaskId taskId, diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h index 11c2577745..eae11ede45 100644 --- a/rts/eventlog/EventLog.h +++ b/rts/eventlog/EventLog.h @@ -121,7 +121,8 @@ void postEventGcStats (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied); + W_ par_tot_copied, + W_ par_balanced_copied); void postTaskCreateEvent (EventTaskId taskId, EventCapNo cap, diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 515a7fe6d8..aa804a8b76 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -188,7 +188,7 @@ GarbageCollect (uint32_t collect_gen, { bdescr *bd; generation *gen; - StgWord live_blocks, live_words, par_max_copied; + StgWord live_blocks, live_words, par_max_copied, par_balanced_copied; #if defined(THREADED_RTS) gc_thread *saved_gct; #endif @@ -460,8 +460,14 @@ GarbageCollect (uint32_t collect_gen, copied = 0; par_max_copied = 0; + par_balanced_copied = 0; { uint32_t i; + uint64_t par_balanced_copied_acc = 0; + + for (i=0; i < n_gc_threads; i++) { + copied += gc_threads[i]->copied; + } for (i=0; i < n_gc_threads; i++) { if (n_gc_threads > 1) { debugTrace(DEBUG_gc,"thread %d:", i); @@ -471,11 +477,20 @@ GarbageCollect (uint32_t collect_gen, debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); } - copied += gc_threads[i]->copied; par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); + par_balanced_copied_acc += + stg_min(n_gc_threads * gc_threads[i]->copied, copied); } if (n_gc_threads == 1) { par_max_copied = 0; + par_balanced_copied = 0; + } + else + { + // See Note [Work Balance] for an explanation of this computation + par_balanced_copied = + (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) / + (n_gc_threads - 1); } } @@ -782,7 +797,7 @@ GarbageCollect (uint32_t collect_gen, // ok, GC over: tell the stats department what happened. stat_endGC(cap, gct, live_words, copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */, - N, n_gc_threads, par_max_copied); + N, n_gc_threads, par_max_copied, par_balanced_copied); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { |