diff options
-rw-r--r-- | includes/RtsAPI.h | 4 | ||||
-rw-r--r-- | includes/rts/EventLogFormat.h | 11 | ||||
-rw-r--r-- | libraries/base/GHC/Stats.hsc | 28 | ||||
-rw-r--r-- | rts/RtsProbes.d | 2 | ||||
-rw-r--r-- | rts/Stats.c | 97 | ||||
-rw-r--r-- | rts/Stats.h | 2 | ||||
-rw-r--r-- | rts/Trace.c | 6 | ||||
-rw-r--r-- | rts/Trace.h | 22 | ||||
-rw-r--r-- | rts/eventlog/EventLog.c | 13 | ||||
-rw-r--r-- | rts/eventlog/EventLog.h | 3 | ||||
-rw-r--r-- | rts/sm/GC.c | 21 |
11 files changed, 176 insertions, 33 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h index f1c04fa360..1ed5fb06f0 100644 --- a/includes/RtsAPI.h +++ b/includes/RtsAPI.h @@ -135,6 +135,8 @@ typedef struct GCDetails_ { uint64_t copied_bytes; // In parallel GC, the max amount of data copied by any one thread uint64_t par_max_copied_bytes; + // In parallel GC, the amount of balanced data copied by all threads + uint64_t par_balanced_copied_bytes; // The time elapsed during synchronisation before GC Time sync_elapsed_ns; // The CPU time used during GC itself @@ -176,6 +178,8 @@ typedef struct _RTSStats { uint64_t par_copied_bytes; // Sum of par_max_copied_bytes across all parallel GCs uint64_t cumulative_par_max_copied_bytes; + // Sum of par_balanced_copied_byes across all parallel GCs. + uint64_t cumulative_par_balanced_copied_bytes; // ----------------------------------- // Cumulative stats about time use diff --git a/includes/rts/EventLogFormat.h b/includes/rts/EventLogFormat.h index b63f842008..f839be04a1 100644 --- a/includes/rts/EventLogFormat.h +++ b/includes/rts/EventLogFormat.h @@ -3,16 +3,16 @@ * (c) The GHC Team, 2008-2009 * * Event log format - * + * * The log format is designed to be extensible: old tools should be * able to parse (but not necessarily understand all of) new versions * of the format, and new tools will be able to understand old log * files. - * + * * Each event has a specific format. If you add new events, give them * new numbers: we never re-use old event numbers. * - * - The format is endian-independent: all values are represented in + * - The format is endian-independent: all values are represented in * bigendian order. * * - The format is extensible: @@ -51,7 +51,7 @@ * Word8* -- extra info (for future extensions) * EVENT_ET_END * - * Event : + * Event : * Word16 -- event_type * Word64 -- time (nanosecs) * [Word16] -- length of the rest (for variable-sized events only) @@ -154,7 +154,8 @@ #define EVENT_GC_STATS_GHC 53 /* (heap_capset, generation, copied_bytes, slop_bytes, frag_bytes, par_n_threads, - par_max_copied, par_tot_copied) */ + par_max_copied, + par_tot_copied, par_balanced_copied) */ #define EVENT_GC_GLOBAL_SYNC 54 /* () */ #define EVENT_TASK_CREATE 55 /* (taskID, cap, tid) */ #define EVENT_TASK_MIGRATE 56 /* (taskID, cap, new_cap) */ diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc index c4e2e809f8..58fb12592f 100644 --- a/libraries/base/GHC/Stats.hsc +++ b/libraries/base/GHC/Stats.hsc @@ -81,8 +81,10 @@ data RTSStats = RTSStats { , copied_bytes :: Word64 -- | Sum of copied_bytes across all parallel GCs , par_copied_bytes :: Word64 - -- | Sum of par_max_copied_bytes across all parallel GCs + -- | Sum of par_max_copied_bytes across all parallel GCs. Deprecated. , cumulative_par_max_copied_bytes :: Word64 + -- | Sum of par_balanced_copied bytes across all parallel GCs + , cumulative_par_balanced_copied_bytes :: Word64 -- ----------------------------------- -- Cumulative stats about time use @@ -130,8 +132,11 @@ data GCDetails = GCDetails { , gcdetails_mem_in_use_bytes :: Word64 -- | Total amount of data copied during this GC , gcdetails_copied_bytes :: Word64 - -- | In parallel GC, the max amount of data copied by any one thread + -- | In parallel GC, the max amount of data copied by any one thread. + -- Deprecated. , gcdetails_par_max_copied_bytes :: Word64 + -- | In parallel GC, the amount of balanced data copied by all threads + , gcdetails_par_balanced_copied_bytes :: Word64 -- | The time elapsed during synchronisation before GC , gcdetails_sync_elapsed_ns :: RtsTime -- | The CPU time used during GC itself @@ -170,6 +175,8 @@ getRTSStats = do par_copied_bytes <- (# peek RTSStats, par_copied_bytes) p cumulative_par_max_copied_bytes <- (# peek RTSStats, cumulative_par_max_copied_bytes) p + cumulative_par_balanced_copied_bytes <- + (# peek RTSStats, cumulative_par_balanced_copied_bytes) p mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p @@ -190,6 +197,8 @@ getRTSStats = do gcdetails_copied_bytes <- (# peek GCDetails, copied_bytes) pgc gcdetails_par_max_copied_bytes <- (# peek GCDetails, par_max_copied_bytes) pgc + gcdetails_par_balanced_copied_bytes <- + (# peek GCDetails, par_balanced_copied_bytes) pgc gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc @@ -259,8 +268,19 @@ data GCStats = GCStats -- thread each GC. The ratio of 'parTotBytesCopied' divided by -- 'parMaxBytesCopied' approaches 1 for a maximally sequential -- run and approaches the number of threads (set by the RTS flag - -- @-N@) for a maximally parallel run. + -- @-N@) for a maximally parallel run. This is included for + -- backwards compatibility; to compute work balance use + -- `parBalancedBytesCopied`. , parMaxBytesCopied :: !Int64 + + -- | Sum of number of balanced bytes copied on each thread of each GC. + -- Balanced bytes are those up to a + -- limit = (parTotBytesCopied / num_gc_threads). + -- This number is normalized so that when balance is perfect + -- @parBalancedBytesCopied = parTotBytesCopied@ and when all + -- gc is done by a single thread @parBalancedBytesCopied = 0@. + , parBalancedBytesCopied :: !Int64 + } deriving (Show, Read) -- | Retrieves garbage collection and memory statistics as of the last @@ -306,6 +326,8 @@ getGCStats = do wallSeconds <- nsToSecs <$> (# peek RTSStats, elapsed_ns) p parTotBytesCopied <- (# peek RTSStats, par_copied_bytes) p parMaxBytesCopied <- (# peek RTSStats, cumulative_par_max_copied_bytes) p + parBalancedBytesCopied <- + (# peek RTSStats, cumulative_par_balanced_copied_bytes) p return GCStats { .. } nsToSecs :: Int64 -> Double diff --git a/rts/RtsProbes.d b/rts/RtsProbes.d index e5d8378845..277a494632 100644 --- a/rts/RtsProbes.d +++ b/rts/RtsProbes.d @@ -62,7 +62,7 @@ provider HaskellEvent { probe gc__work (EventCapNo); probe gc__done (EventCapNo); probe gc__global__sync (EventCapNo); - probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord); + probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord); probe heap__info (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord); probe heap__allocated (EventCapNo, EventCapsetID, StgWord64); probe heap__size (EventCapsetID, StgWord); diff --git a/rts/Stats.c b/rts/Stats.c index b0c1be0582..6a5f80130e 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -149,6 +149,7 @@ initStats0(void) .copied_bytes = 0, .par_copied_bytes = 0, .cumulative_par_max_copied_bytes = 0, + .cumulative_par_balanced_copied_bytes = 0, .mutator_cpu_ns = 0, .mutator_elapsed_ns = 0, .gc_cpu_ns = 0, @@ -166,6 +167,7 @@ initStats0(void) .mem_in_use_bytes = 0, .copied_bytes = 0, .par_max_copied_bytes = 0, + .par_balanced_copied_bytes = 0, .sync_elapsed_ns = 0, .cpu_ns = 0, .elapsed_ns = 0 @@ -283,7 +285,8 @@ stat_startGC (Capability *cap, gc_thread *gct) void stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, uint32_t gen, - uint32_t par_n_threads, W_ par_max_copied) + uint32_t par_n_threads, W_ par_max_copied, + W_ par_balanced_copied) { // ------------------------------------------------- // Collect all the stats about this GC in stats.gc. We always do this since @@ -305,6 +308,7 @@ stat_endGC (Capability *cap, gc_thread *gct, stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE; stats.gc.copied_bytes = copied * sizeof(W_); stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_); + stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_); // ------------------------------------------------- // Update the cumulative stats @@ -324,6 +328,8 @@ stat_endGC (Capability *cap, gc_thread *gct, stats.par_copied_bytes += stats.gc.copied_bytes; stats.cumulative_par_max_copied_bytes += stats.gc.par_max_copied_bytes; + stats.cumulative_par_balanced_copied_bytes += + stats.gc.par_balanced_copied_bytes; } stats.gc_cpu_ns += stats.gc.cpu_ns; stats.gc_elapsed_ns += stats.gc.elapsed_ns; @@ -385,7 +391,8 @@ stat_endGC (Capability *cap, gc_thread *gct, * BLOCK_SIZE, par_n_threads, stats.gc.par_max_copied_bytes, - stats.gc.copied_bytes); + stats.gc.copied_bytes, + stats.gc.par_balanced_copied_bytes); // Post EVENT_GC_END with the same timestamp as used for stats // (though converted from Time=StgInt64 to EventTimestamp=StgWord64). @@ -672,11 +679,11 @@ stat_exit (void) } #if defined(THREADED_RTS) - if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) { + if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) { + // See Note [Work Balance] statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n", - 100 * (((double)stats.par_copied_bytes / (double)stats.cumulative_par_max_copied_bytes) - 1) - / (n_capabilities - 1) - ); + 100 * (double)stats.cumulative_par_balanced_copied_bytes / + (double)stats.par_copied_bytes); } #endif statsPrintf("\n"); @@ -825,6 +832,84 @@ stat_exit (void) } } +/* Note [Work Balance] +---------------------- +Work balance is a measure of how evenly the work done during parallel garbage +collection is spread across threads. To compute work balance we must take care +to account for the number of GC threads changing between GCs. The statistics we +track must have the number of GC threads "integrated out". + +We accumulate two values from each garbage collection: +* par_copied: is a measure of the total work done during parallel garbage + collection +* par_balanced_copied: is a measure of the balanced work done + during parallel garbage collection. + +par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more +complicated: + +For a given garbage collection: +Let gc_copied := total copies during the gc + gc_copied_i := copies by the ith thread during the gc + num_gc_threads := the number of threads participating in the gc + balance_limit := (gc_copied / num_gc_threads) + +If we were to graph gc_copied_i, sorted from largest to smallest we would see +something like: + + |X + ^ |X X + | |X X X X: unbalanced copies +copies |----------- Y: balanced copies by the busiest GC thread + |Y Z Z Z: other balanced copies + |Y Z Z Z + |Y Z Z Z Z + |Y Z Z Z Z Z + |=========== + |1 2 3 4 5 6 + i -> + +where the --- line is at balance_limit. Balanced copies are those under the --- +line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will +always equal balance_limit. Completely balanced gc has every thread copying +balance_limit and a completely unbalanced gc has a single thread copying +gc_copied. + +One could define par_balance_copied as the areas of the Ys and Zs in the graph +above, however we would like the ratio of (par_balance_copied / gc_copied) to +range from 0 to 1, so that work_balance will be a nice percentage, also ranging +from 0 to 1. We therefore define par_balanced_copied as: + + ( num_gc_threads ) +{Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------) + i (num_gc_threads - 1) + vvv vvv + S T + +Where the S and T terms serve to remove the area of the Ys, and +to normalize the result to lie between 0 and gc_copied. + +Note that the implementation orders these operations differently to minimize +error due to integer rounding. + +Then cumulative work balance is computed as +(cumulative_par_balanced_copied_bytes / par_copied_byes) + +Previously, cumulative work balance was computed as: + +(cumulative_par_max_copied_bytes) +(-------------------------------) - 1 +( par_copied_bytes ) +------------------------------------- + (n_capabilities - 1) + +This was less accurate than the current method, and invalid whenever a garbage +collection had occurred with num_gc_threads /= n_capabilities; which can happen +when setNumCapabilities is called, when -qn is passed as an RTS option, or when +the number of gc threads is limited to the number of cores. +See #13830 +*/ + /* ----------------------------------------------------------------------------- stat_describe_gens diff --git a/rts/Stats.h b/rts/Stats.h index ff018716c4..5d9cf04fa7 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -30,7 +30,7 @@ void stat_startGCSync(struct gc_thread_ *_gct); void stat_startGC(Capability *cap, struct gc_thread_ *_gct); void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live, W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads, - W_ par_max_copied); + W_ par_max_copied, W_ par_balanced_copied); #if defined(PROFILING) void stat_startRP(void); diff --git a/rts/Trace.c b/rts/Trace.c index 8978e2722b..71403f8a57 100644 --- a/rts/Trace.c +++ b/rts/Trace.c @@ -347,7 +347,8 @@ void traceEventGcStats_ (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied) + W_ par_tot_copied, + W_ par_balanced_copied) { #if defined(DEBUG) if (RtsFlags.TraceFlags.tracing == TRACE_STDERR) { @@ -357,7 +358,8 @@ void traceEventGcStats_ (Capability *cap, { postEventGcStats(cap, heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } } diff --git a/rts/Trace.h b/rts/Trace.h index 2058ac2e05..624d459420 100644 --- a/rts/Trace.h +++ b/rts/Trace.h @@ -151,7 +151,8 @@ void traceEventGcStats_ (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied); + W_ par_tot_copied, + W_ par_balanced_copied); /* * Record a spark event @@ -302,7 +303,8 @@ void traceHeapProfSampleCostCentre(StgWord8 profile_id, #define traceGcEventAtT(cap, ts, tag) /* nothing */ #define traceEventGcStats_(cap, heap_capset, gen, \ copied, slop, fragmentation, \ - par_n_threads, par_max_copied, par_tot_copied) /* nothing */ + par_n_threads, par_max_copied, \ + par_tot_copied, par_balanced_copied) /* nothing */ #define traceHeapEvent(cap, tag, heap_capset, info1) /* nothing */ #define traceEventHeapInfo_(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ @@ -395,11 +397,13 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg); copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ - par_tot_copied) \ + par_tot_copied, \ + par_balanced_copied) HASKELLEVENT_GC_STATS(heap_capset, gens, \ copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ + par_balanced_copied, \ par_tot_copied) #define dtraceHeapInfo(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ @@ -470,7 +474,8 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg); copies, slop, fragmentation, \ par_n_threads, \ par_max_copied, \ - par_tot_copied) /* nothing */ + par_tot_copied, \ + par_balanced_copied) /* nothing */ #define dtraceHeapInfo(heap_capset, gens, \ maxHeapSize, allocAreaSize, \ mblockSize, blockSize) /* nothing */ @@ -663,16 +668,19 @@ INLINE_HEADER void traceEventGcStats(Capability *cap STG_UNUSED, W_ fragmentation STG_UNUSED, uint32_t par_n_threads STG_UNUSED, W_ par_max_copied STG_UNUSED, - W_ par_tot_copied STG_UNUSED) + W_ par_tot_copied STG_UNUSED, + W_ par_balanced_copied STG_UNUSED) { if (RTS_UNLIKELY(TRACE_gc)) { traceEventGcStats_(cap, heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } dtraceEventGcStats(heap_capset, gen, copied, slop, fragmentation, - par_n_threads, par_max_copied, par_tot_copied); + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied); } INLINE_HEADER void traceEventHeapInfo(CapsetID heap_capset STG_UNUSED, diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index c175aecd3e..ed297b879e 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -412,12 +412,14 @@ initEventLogging(const EventLogWriter *ev_writer) case EVENT_GC_STATS_GHC: // (heap_capset, generation, // copied_bytes, slop_bytes, frag_bytes, // par_n_threads, - // par_max_copied, par_tot_copied) + // par_max_copied, par_tot_copied, + // par_balanced_copied + // ) eventTypes[t].size = sizeof(EventCapsetID) + sizeof(StgWord16) + sizeof(StgWord64) * 3 + sizeof(StgWord32) - + sizeof(StgWord64) * 2; + + sizeof(StgWord64) * 3; break; case EVENT_TASK_CREATE: // (taskId, cap, tid) @@ -903,7 +905,8 @@ void postEventGcStats (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied) + W_ par_tot_copied, + W_ par_balanced_copied) { EventsBuf *eb = &capEventBuf[cap->no]; ensureRoomForEvent(eb, EVENT_GC_STATS_GHC); @@ -911,7 +914,8 @@ void postEventGcStats (Capability *cap, postEventHeader(eb, EVENT_GC_STATS_GHC); /* EVENT_GC_STATS_GHC (heap_capset, generation, copied_bytes, slop_bytes, frag_bytes, - par_n_threads, par_max_copied, par_tot_copied) */ + par_n_threads, par_max_copied, + par_tot_copied, par_balanced_copied) */ postCapsetID(eb, heap_capset); postWord16(eb, gen); postWord64(eb, copied); @@ -920,6 +924,7 @@ void postEventGcStats (Capability *cap, postWord32(eb, par_n_threads); postWord64(eb, par_max_copied); postWord64(eb, par_tot_copied); + postWord64(eb, par_balanced_copied); } void postTaskCreateEvent (EventTaskId taskId, diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h index 11c2577745..eae11ede45 100644 --- a/rts/eventlog/EventLog.h +++ b/rts/eventlog/EventLog.h @@ -121,7 +121,8 @@ void postEventGcStats (Capability *cap, W_ fragmentation, uint32_t par_n_threads, W_ par_max_copied, - W_ par_tot_copied); + W_ par_tot_copied, + W_ par_balanced_copied); void postTaskCreateEvent (EventTaskId taskId, EventCapNo cap, diff --git a/rts/sm/GC.c b/rts/sm/GC.c index 515a7fe6d8..aa804a8b76 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -188,7 +188,7 @@ GarbageCollect (uint32_t collect_gen, { bdescr *bd; generation *gen; - StgWord live_blocks, live_words, par_max_copied; + StgWord live_blocks, live_words, par_max_copied, par_balanced_copied; #if defined(THREADED_RTS) gc_thread *saved_gct; #endif @@ -460,8 +460,14 @@ GarbageCollect (uint32_t collect_gen, copied = 0; par_max_copied = 0; + par_balanced_copied = 0; { uint32_t i; + uint64_t par_balanced_copied_acc = 0; + + for (i=0; i < n_gc_threads; i++) { + copied += gc_threads[i]->copied; + } for (i=0; i < n_gc_threads; i++) { if (n_gc_threads > 1) { debugTrace(DEBUG_gc,"thread %d:", i); @@ -471,11 +477,20 @@ GarbageCollect (uint32_t collect_gen, debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); } - copied += gc_threads[i]->copied; par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied); + par_balanced_copied_acc += + stg_min(n_gc_threads * gc_threads[i]->copied, copied); } if (n_gc_threads == 1) { par_max_copied = 0; + par_balanced_copied = 0; + } + else + { + // See Note [Work Balance] for an explanation of this computation + par_balanced_copied = + (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) / + (n_gc_threads - 1); } } @@ -782,7 +797,7 @@ GarbageCollect (uint32_t collect_gen, // ok, GC over: tell the stats department what happened. stat_endGC(cap, gct, live_words, copied, live_blocks * BLOCK_SIZE_W - live_words /* slop */, - N, n_gc_threads, par_max_copied); + N, n_gc_threads, par_max_copied, par_balanced_copied); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { |