summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDouglas Wilson <douglas.wilson@gmail.com>2017-07-11 11:54:09 -0400
committerBen Gamari <ben@smart-cactus.org>2017-07-11 13:41:54 -0400
commit7c9e356de1114ab3e31f2d6d03e83672076dd533 (patch)
tree5a89a142a31a53486e41c75673024548ce4c39ce
parent905dc8bc74bebf5370eb9237cc8756cd9fe871ae (diff)
downloadhaskell-7c9e356de1114ab3e31f2d6d03e83672076dd533.tar.gz
Fix Work Balance computation in RTS stats
An additional stat is tracked per gc: par_balanced_copied This is the the number of bytes copied by each gc thread under the balanced lmit, which is simply (copied_bytes / num_gc_threads). The stat is added to all the appropriate GC structures, so is visible in the eventlog and in GHC.Stats. A note is added explaining how work balance is computed. Remove some end of line whitespace Test Plan: ./validate experiment with the program attached to the ticket examine code changes carefully Reviewers: simonmar, austin, hvr, bgamari, erikd Reviewed By: simonmar Subscribers: Phyx, rwbarton, thomie GHC Trac Issues: #13830 Differential Revision: https://phabricator.haskell.org/D3658
-rw-r--r--includes/RtsAPI.h4
-rw-r--r--includes/rts/EventLogFormat.h11
-rw-r--r--libraries/base/GHC/Stats.hsc28
-rw-r--r--rts/RtsProbes.d2
-rw-r--r--rts/Stats.c97
-rw-r--r--rts/Stats.h2
-rw-r--r--rts/Trace.c6
-rw-r--r--rts/Trace.h22
-rw-r--r--rts/eventlog/EventLog.c13
-rw-r--r--rts/eventlog/EventLog.h3
-rw-r--r--rts/sm/GC.c21
11 files changed, 176 insertions, 33 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index f1c04fa360..1ed5fb06f0 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -135,6 +135,8 @@ typedef struct GCDetails_ {
uint64_t copied_bytes;
// In parallel GC, the max amount of data copied by any one thread
uint64_t par_max_copied_bytes;
+ // In parallel GC, the amount of balanced data copied by all threads
+ uint64_t par_balanced_copied_bytes;
// The time elapsed during synchronisation before GC
Time sync_elapsed_ns;
// The CPU time used during GC itself
@@ -176,6 +178,8 @@ typedef struct _RTSStats {
uint64_t par_copied_bytes;
// Sum of par_max_copied_bytes across all parallel GCs
uint64_t cumulative_par_max_copied_bytes;
+ // Sum of par_balanced_copied_byes across all parallel GCs.
+ uint64_t cumulative_par_balanced_copied_bytes;
// -----------------------------------
// Cumulative stats about time use
diff --git a/includes/rts/EventLogFormat.h b/includes/rts/EventLogFormat.h
index b63f842008..f839be04a1 100644
--- a/includes/rts/EventLogFormat.h
+++ b/includes/rts/EventLogFormat.h
@@ -3,16 +3,16 @@
* (c) The GHC Team, 2008-2009
*
* Event log format
- *
+ *
* The log format is designed to be extensible: old tools should be
* able to parse (but not necessarily understand all of) new versions
* of the format, and new tools will be able to understand old log
* files.
- *
+ *
* Each event has a specific format. If you add new events, give them
* new numbers: we never re-use old event numbers.
*
- * - The format is endian-independent: all values are represented in
+ * - The format is endian-independent: all values are represented in
* bigendian order.
*
* - The format is extensible:
@@ -51,7 +51,7 @@
* Word8* -- extra info (for future extensions)
* EVENT_ET_END
*
- * Event :
+ * Event :
* Word16 -- event_type
* Word64 -- time (nanosecs)
* [Word16] -- length of the rest (for variable-sized events only)
@@ -154,7 +154,8 @@
#define EVENT_GC_STATS_GHC 53 /* (heap_capset, generation,
copied_bytes, slop_bytes, frag_bytes,
par_n_threads,
- par_max_copied, par_tot_copied) */
+ par_max_copied,
+ par_tot_copied, par_balanced_copied) */
#define EVENT_GC_GLOBAL_SYNC 54 /* () */
#define EVENT_TASK_CREATE 55 /* (taskID, cap, tid) */
#define EVENT_TASK_MIGRATE 56 /* (taskID, cap, new_cap) */
diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc
index c4e2e809f8..58fb12592f 100644
--- a/libraries/base/GHC/Stats.hsc
+++ b/libraries/base/GHC/Stats.hsc
@@ -81,8 +81,10 @@ data RTSStats = RTSStats {
, copied_bytes :: Word64
-- | Sum of copied_bytes across all parallel GCs
, par_copied_bytes :: Word64
- -- | Sum of par_max_copied_bytes across all parallel GCs
+ -- | Sum of par_max_copied_bytes across all parallel GCs. Deprecated.
, cumulative_par_max_copied_bytes :: Word64
+ -- | Sum of par_balanced_copied bytes across all parallel GCs
+ , cumulative_par_balanced_copied_bytes :: Word64
-- -----------------------------------
-- Cumulative stats about time use
@@ -130,8 +132,11 @@ data GCDetails = GCDetails {
, gcdetails_mem_in_use_bytes :: Word64
-- | Total amount of data copied during this GC
, gcdetails_copied_bytes :: Word64
- -- | In parallel GC, the max amount of data copied by any one thread
+ -- | In parallel GC, the max amount of data copied by any one thread.
+ -- Deprecated.
, gcdetails_par_max_copied_bytes :: Word64
+ -- | In parallel GC, the amount of balanced data copied by all threads
+ , gcdetails_par_balanced_copied_bytes :: Word64
-- | The time elapsed during synchronisation before GC
, gcdetails_sync_elapsed_ns :: RtsTime
-- | The CPU time used during GC itself
@@ -170,6 +175,8 @@ getRTSStats = do
par_copied_bytes <- (# peek RTSStats, par_copied_bytes) p
cumulative_par_max_copied_bytes <-
(# peek RTSStats, cumulative_par_max_copied_bytes) p
+ cumulative_par_balanced_copied_bytes <-
+ (# peek RTSStats, cumulative_par_balanced_copied_bytes) p
mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p
mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p
gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p
@@ -190,6 +197,8 @@ getRTSStats = do
gcdetails_copied_bytes <- (# peek GCDetails, copied_bytes) pgc
gcdetails_par_max_copied_bytes <-
(# peek GCDetails, par_max_copied_bytes) pgc
+ gcdetails_par_balanced_copied_bytes <-
+ (# peek GCDetails, par_balanced_copied_bytes) pgc
gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
@@ -259,8 +268,19 @@ data GCStats = GCStats
-- thread each GC. The ratio of 'parTotBytesCopied' divided by
-- 'parMaxBytesCopied' approaches 1 for a maximally sequential
-- run and approaches the number of threads (set by the RTS flag
- -- @-N@) for a maximally parallel run.
+ -- @-N@) for a maximally parallel run. This is included for
+ -- backwards compatibility; to compute work balance use
+ -- `parBalancedBytesCopied`.
, parMaxBytesCopied :: !Int64
+
+ -- | Sum of number of balanced bytes copied on each thread of each GC.
+ -- Balanced bytes are those up to a
+ -- limit = (parTotBytesCopied / num_gc_threads).
+ -- This number is normalized so that when balance is perfect
+ -- @parBalancedBytesCopied = parTotBytesCopied@ and when all
+ -- gc is done by a single thread @parBalancedBytesCopied = 0@.
+ , parBalancedBytesCopied :: !Int64
+
} deriving (Show, Read)
-- | Retrieves garbage collection and memory statistics as of the last
@@ -306,6 +326,8 @@ getGCStats = do
wallSeconds <- nsToSecs <$> (# peek RTSStats, elapsed_ns) p
parTotBytesCopied <- (# peek RTSStats, par_copied_bytes) p
parMaxBytesCopied <- (# peek RTSStats, cumulative_par_max_copied_bytes) p
+ parBalancedBytesCopied <-
+ (# peek RTSStats, cumulative_par_balanced_copied_bytes) p
return GCStats { .. }
nsToSecs :: Int64 -> Double
diff --git a/rts/RtsProbes.d b/rts/RtsProbes.d
index e5d8378845..277a494632 100644
--- a/rts/RtsProbes.d
+++ b/rts/RtsProbes.d
@@ -62,7 +62,7 @@ provider HaskellEvent {
probe gc__work (EventCapNo);
probe gc__done (EventCapNo);
probe gc__global__sync (EventCapNo);
- probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord);
+ probe gc__stats (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord, StgWord);
probe heap__info (EventCapsetID, StgWord, StgWord, StgWord, StgWord, StgWord);
probe heap__allocated (EventCapNo, EventCapsetID, StgWord64);
probe heap__size (EventCapsetID, StgWord);
diff --git a/rts/Stats.c b/rts/Stats.c
index b0c1be0582..6a5f80130e 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -149,6 +149,7 @@ initStats0(void)
.copied_bytes = 0,
.par_copied_bytes = 0,
.cumulative_par_max_copied_bytes = 0,
+ .cumulative_par_balanced_copied_bytes = 0,
.mutator_cpu_ns = 0,
.mutator_elapsed_ns = 0,
.gc_cpu_ns = 0,
@@ -166,6 +167,7 @@ initStats0(void)
.mem_in_use_bytes = 0,
.copied_bytes = 0,
.par_max_copied_bytes = 0,
+ .par_balanced_copied_bytes = 0,
.sync_elapsed_ns = 0,
.cpu_ns = 0,
.elapsed_ns = 0
@@ -283,7 +285,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
void
stat_endGC (Capability *cap, gc_thread *gct,
W_ live, W_ copied, W_ slop, uint32_t gen,
- uint32_t par_n_threads, W_ par_max_copied)
+ uint32_t par_n_threads, W_ par_max_copied,
+ W_ par_balanced_copied)
{
// -------------------------------------------------
// Collect all the stats about this GC in stats.gc. We always do this since
@@ -305,6 +308,7 @@ stat_endGC (Capability *cap, gc_thread *gct,
stats.gc.mem_in_use_bytes = mblocks_allocated * MBLOCK_SIZE;
stats.gc.copied_bytes = copied * sizeof(W_);
stats.gc.par_max_copied_bytes = par_max_copied * sizeof(W_);
+ stats.gc.par_balanced_copied_bytes = par_balanced_copied * sizeof(W_);
// -------------------------------------------------
// Update the cumulative stats
@@ -324,6 +328,8 @@ stat_endGC (Capability *cap, gc_thread *gct,
stats.par_copied_bytes += stats.gc.copied_bytes;
stats.cumulative_par_max_copied_bytes +=
stats.gc.par_max_copied_bytes;
+ stats.cumulative_par_balanced_copied_bytes +=
+ stats.gc.par_balanced_copied_bytes;
}
stats.gc_cpu_ns += stats.gc.cpu_ns;
stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -385,7 +391,8 @@ stat_endGC (Capability *cap, gc_thread *gct,
* BLOCK_SIZE,
par_n_threads,
stats.gc.par_max_copied_bytes,
- stats.gc.copied_bytes);
+ stats.gc.copied_bytes,
+ stats.gc.par_balanced_copied_bytes);
// Post EVENT_GC_END with the same timestamp as used for stats
// (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
@@ -672,11 +679,11 @@ stat_exit (void)
}
#if defined(THREADED_RTS)
- if (RtsFlags.ParFlags.parGcEnabled && n_capabilities > 1) {
+ if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
+ // See Note [Work Balance]
statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
- 100 * (((double)stats.par_copied_bytes / (double)stats.cumulative_par_max_copied_bytes) - 1)
- / (n_capabilities - 1)
- );
+ 100 * (double)stats.cumulative_par_balanced_copied_bytes /
+ (double)stats.par_copied_bytes);
}
#endif
statsPrintf("\n");
@@ -825,6 +832,84 @@ stat_exit (void)
}
}
+/* Note [Work Balance]
+----------------------
+Work balance is a measure of how evenly the work done during parallel garbage
+collection is spread across threads. To compute work balance we must take care
+to account for the number of GC threads changing between GCs. The statistics we
+track must have the number of GC threads "integrated out".
+
+We accumulate two values from each garbage collection:
+* par_copied: is a measure of the total work done during parallel garbage
+ collection
+* par_balanced_copied: is a measure of the balanced work done
+ during parallel garbage collection.
+
+par_copied is simple to compute, but par_balanced_copied_bytes is somewhat more
+complicated:
+
+For a given garbage collection:
+Let gc_copied := total copies during the gc
+ gc_copied_i := copies by the ith thread during the gc
+ num_gc_threads := the number of threads participating in the gc
+ balance_limit := (gc_copied / num_gc_threads)
+
+If we were to graph gc_copied_i, sorted from largest to smallest we would see
+something like:
+
+ |X
+ ^ |X X
+ | |X X X X: unbalanced copies
+copies |----------- Y: balanced copies by the busiest GC thread
+ |Y Z Z Z: other balanced copies
+ |Y Z Z Z
+ |Y Z Z Z Z
+ |Y Z Z Z Z Z
+ |===========
+ |1 2 3 4 5 6
+ i ->
+
+where the --- line is at balance_limit. Balanced copies are those under the ---
+line, i.e. the area of the Ys and Zs. Note that the area occupied by the Ys will
+always equal balance_limit. Completely balanced gc has every thread copying
+balance_limit and a completely unbalanced gc has a single thread copying
+gc_copied.
+
+One could define par_balance_copied as the areas of the Ys and Zs in the graph
+above, however we would like the ratio of (par_balance_copied / gc_copied) to
+range from 0 to 1, so that work_balance will be a nice percentage, also ranging
+from 0 to 1. We therefore define par_balanced_copied as:
+
+ ( num_gc_threads )
+{Sum[Min(gc_copied_i,balance_limit)] - balance_limit} * (------------------)
+ i (num_gc_threads - 1)
+ vvv vvv
+ S T
+
+Where the S and T terms serve to remove the area of the Ys, and
+to normalize the result to lie between 0 and gc_copied.
+
+Note that the implementation orders these operations differently to minimize
+error due to integer rounding.
+
+Then cumulative work balance is computed as
+(cumulative_par_balanced_copied_bytes / par_copied_byes)
+
+Previously, cumulative work balance was computed as:
+
+(cumulative_par_max_copied_bytes)
+(-------------------------------) - 1
+( par_copied_bytes )
+-------------------------------------
+ (n_capabilities - 1)
+
+This was less accurate than the current method, and invalid whenever a garbage
+collection had occurred with num_gc_threads /= n_capabilities; which can happen
+when setNumCapabilities is called, when -qn is passed as an RTS option, or when
+the number of gc threads is limited to the number of cores.
+See #13830
+*/
+
/* -----------------------------------------------------------------------------
stat_describe_gens
diff --git a/rts/Stats.h b/rts/Stats.h
index ff018716c4..5d9cf04fa7 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,7 +30,7 @@ void stat_startGCSync(struct gc_thread_ *_gct);
void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live,
W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
- W_ par_max_copied);
+ W_ par_max_copied, W_ par_balanced_copied);
#if defined(PROFILING)
void stat_startRP(void);
diff --git a/rts/Trace.c b/rts/Trace.c
index 8978e2722b..71403f8a57 100644
--- a/rts/Trace.c
+++ b/rts/Trace.c
@@ -347,7 +347,8 @@ void traceEventGcStats_ (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
- W_ par_tot_copied)
+ W_ par_tot_copied,
+ W_ par_balanced_copied)
{
#if defined(DEBUG)
if (RtsFlags.TraceFlags.tracing == TRACE_STDERR) {
@@ -357,7 +358,8 @@ void traceEventGcStats_ (Capability *cap,
{
postEventGcStats(cap, heap_capset, gen,
copied, slop, fragmentation,
- par_n_threads, par_max_copied, par_tot_copied);
+ par_n_threads, par_max_copied,
+ par_tot_copied, par_balanced_copied);
}
}
diff --git a/rts/Trace.h b/rts/Trace.h
index 2058ac2e05..624d459420 100644
--- a/rts/Trace.h
+++ b/rts/Trace.h
@@ -151,7 +151,8 @@ void traceEventGcStats_ (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
- W_ par_tot_copied);
+ W_ par_tot_copied,
+ W_ par_balanced_copied);
/*
* Record a spark event
@@ -302,7 +303,8 @@ void traceHeapProfSampleCostCentre(StgWord8 profile_id,
#define traceGcEventAtT(cap, ts, tag) /* nothing */
#define traceEventGcStats_(cap, heap_capset, gen, \
copied, slop, fragmentation, \
- par_n_threads, par_max_copied, par_tot_copied) /* nothing */
+ par_n_threads, par_max_copied, \
+ par_tot_copied, par_balanced_copied) /* nothing */
#define traceHeapEvent(cap, tag, heap_capset, info1) /* nothing */
#define traceEventHeapInfo_(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
@@ -395,11 +397,13 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg);
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
- par_tot_copied) \
+ par_tot_copied, \
+ par_balanced_copied)
HASKELLEVENT_GC_STATS(heap_capset, gens, \
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
+ par_balanced_copied, \
par_tot_copied)
#define dtraceHeapInfo(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
@@ -470,7 +474,8 @@ void dtraceUserMarkerWrapper(Capability *cap, char *msg);
copies, slop, fragmentation, \
par_n_threads, \
par_max_copied, \
- par_tot_copied) /* nothing */
+ par_tot_copied, \
+ par_balanced_copied) /* nothing */
#define dtraceHeapInfo(heap_capset, gens, \
maxHeapSize, allocAreaSize, \
mblockSize, blockSize) /* nothing */
@@ -663,16 +668,19 @@ INLINE_HEADER void traceEventGcStats(Capability *cap STG_UNUSED,
W_ fragmentation STG_UNUSED,
uint32_t par_n_threads STG_UNUSED,
W_ par_max_copied STG_UNUSED,
- W_ par_tot_copied STG_UNUSED)
+ W_ par_tot_copied STG_UNUSED,
+ W_ par_balanced_copied STG_UNUSED)
{
if (RTS_UNLIKELY(TRACE_gc)) {
traceEventGcStats_(cap, heap_capset, gen,
copied, slop, fragmentation,
- par_n_threads, par_max_copied, par_tot_copied);
+ par_n_threads, par_max_copied,
+ par_tot_copied, par_balanced_copied);
}
dtraceEventGcStats(heap_capset, gen,
copied, slop, fragmentation,
- par_n_threads, par_max_copied, par_tot_copied);
+ par_n_threads, par_max_copied,
+ par_tot_copied, par_balanced_copied);
}
INLINE_HEADER void traceEventHeapInfo(CapsetID heap_capset STG_UNUSED,
diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c
index c175aecd3e..ed297b879e 100644
--- a/rts/eventlog/EventLog.c
+++ b/rts/eventlog/EventLog.c
@@ -412,12 +412,14 @@ initEventLogging(const EventLogWriter *ev_writer)
case EVENT_GC_STATS_GHC: // (heap_capset, generation,
// copied_bytes, slop_bytes, frag_bytes,
// par_n_threads,
- // par_max_copied, par_tot_copied)
+ // par_max_copied, par_tot_copied,
+ // par_balanced_copied
+ // )
eventTypes[t].size = sizeof(EventCapsetID)
+ sizeof(StgWord16)
+ sizeof(StgWord64) * 3
+ sizeof(StgWord32)
- + sizeof(StgWord64) * 2;
+ + sizeof(StgWord64) * 3;
break;
case EVENT_TASK_CREATE: // (taskId, cap, tid)
@@ -903,7 +905,8 @@ void postEventGcStats (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
- W_ par_tot_copied)
+ W_ par_tot_copied,
+ W_ par_balanced_copied)
{
EventsBuf *eb = &capEventBuf[cap->no];
ensureRoomForEvent(eb, EVENT_GC_STATS_GHC);
@@ -911,7 +914,8 @@ void postEventGcStats (Capability *cap,
postEventHeader(eb, EVENT_GC_STATS_GHC);
/* EVENT_GC_STATS_GHC (heap_capset, generation,
copied_bytes, slop_bytes, frag_bytes,
- par_n_threads, par_max_copied, par_tot_copied) */
+ par_n_threads, par_max_copied,
+ par_tot_copied, par_balanced_copied) */
postCapsetID(eb, heap_capset);
postWord16(eb, gen);
postWord64(eb, copied);
@@ -920,6 +924,7 @@ void postEventGcStats (Capability *cap,
postWord32(eb, par_n_threads);
postWord64(eb, par_max_copied);
postWord64(eb, par_tot_copied);
+ postWord64(eb, par_balanced_copied);
}
void postTaskCreateEvent (EventTaskId taskId,
diff --git a/rts/eventlog/EventLog.h b/rts/eventlog/EventLog.h
index 11c2577745..eae11ede45 100644
--- a/rts/eventlog/EventLog.h
+++ b/rts/eventlog/EventLog.h
@@ -121,7 +121,8 @@ void postEventGcStats (Capability *cap,
W_ fragmentation,
uint32_t par_n_threads,
W_ par_max_copied,
- W_ par_tot_copied);
+ W_ par_tot_copied,
+ W_ par_balanced_copied);
void postTaskCreateEvent (EventTaskId taskId,
EventCapNo cap,
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 515a7fe6d8..aa804a8b76 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -188,7 +188,7 @@ GarbageCollect (uint32_t collect_gen,
{
bdescr *bd;
generation *gen;
- StgWord live_blocks, live_words, par_max_copied;
+ StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
#if defined(THREADED_RTS)
gc_thread *saved_gct;
#endif
@@ -460,8 +460,14 @@ GarbageCollect (uint32_t collect_gen,
copied = 0;
par_max_copied = 0;
+ par_balanced_copied = 0;
{
uint32_t i;
+ uint64_t par_balanced_copied_acc = 0;
+
+ for (i=0; i < n_gc_threads; i++) {
+ copied += gc_threads[i]->copied;
+ }
for (i=0; i < n_gc_threads; i++) {
if (n_gc_threads > 1) {
debugTrace(DEBUG_gc,"thread %d:", i);
@@ -471,11 +477,20 @@ GarbageCollect (uint32_t collect_gen,
debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work);
debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work);
}
- copied += gc_threads[i]->copied;
par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+ par_balanced_copied_acc +=
+ stg_min(n_gc_threads * gc_threads[i]->copied, copied);
}
if (n_gc_threads == 1) {
par_max_copied = 0;
+ par_balanced_copied = 0;
+ }
+ else
+ {
+ // See Note [Work Balance] for an explanation of this computation
+ par_balanced_copied =
+ (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
+ (n_gc_threads - 1);
}
}
@@ -782,7 +797,7 @@ GarbageCollect (uint32_t collect_gen,
// ok, GC over: tell the stats department what happened.
stat_endGC(cap, gct, live_words, copied,
live_blocks * BLOCK_SIZE_W - live_words /* slop */,
- N, n_gc_threads, par_max_copied);
+ N, n_gc_threads, par_max_copied, par_balanced_copied);
#if defined(RTS_USER_SIGNALS)
if (RtsFlags.MiscFlags.install_signal_handlers) {