summaryrefslogtreecommitdiff
path: root/rts
diff options
context:
space:
mode:
authorÖmer Sinan Ağacan <omeragacan@gmail.com>2018-12-28 13:51:28 +0300
committerÖmer Sinan Ağacan <omeragacan@gmail.com>2019-01-12 06:52:38 -0500
commit19670bc397d858b04eb9b4eb01480f7f8c59e2f5 (patch)
tree0e84040446183e032bcd05f2a8c1fd620ba1b9b2 /rts
parent74cd4ec5d2f9321aad5db3285cb60d78f2562996 (diff)
downloadhaskell-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.tar.gz
Fix negative mutator time in GC stats in prof builds
Because garbage collector calls `retainerProfile()` and `heapCensus()`, GC times normally include some of PROF times too. To fix this we have these lines: // heapCensus() is called by the GC, so RP and HC time are // included in the GC stats. We therefore subtract them to // obtain the actual GC cpu time. stats.gc_cpu_ns -= prof_cpu; stats.gc_elapsed_ns -= prof_elapsed; These variables are later used for calculating GC time excluding the final GC (which should be attributed to EXIT). exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; The problem is if we subtract PROF times from `gc_elapsed_ns` and then subtract `start_exit_gc_elapsed` from the result, we end up subtracting PROF times twice, because `start_exit_gc_elapsed` also includes PROF times. We now subtract PROF times from GC after the calculations for EXIT and MUT times. The existing assertion that checks INIT + MUT + GC + EXIT = TOTAL now holds. When we subtract PROF numbers from GC, and a new assertion INIT + MUT + GC + PROF + EXIT = TOTAL also holds. Fixes #15897. New assertions added in this commit also revealed #16102, which is also fixed by this commit.
Diffstat (limited to 'rts')
-rw-r--r--rts/Schedule.c4
-rw-r--r--rts/Stats.c114
-rw-r--r--rts/Stats.h1
3 files changed, 70 insertions, 49 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c
index eb9203f783..02055d2566 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -2018,6 +2018,10 @@ forkProcess(HsStablePtr *entry
} else { // child
+ // Current process times reset in the child process, so we should reset
+ // the stats too. See #16102.
+ resetChildProcessStats();
+
#if defined(THREADED_RTS)
initMutex(&sched_mutex);
initMutex(&sm_mutex);
diff --git a/rts/Stats.c b/rts/Stats.c
index e42524f924..b2e48c48cd 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -43,12 +43,6 @@ static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
#endif
-#if defined(PROFILING)
-#define PROF_VAL(x) (x)
-#else
-#define PROF_VAL(x) 0
-#endif
-
#if defined(PROF_SPIN)
volatile StgWord64 whitehole_lockClosure_spin = 0;
volatile StgWord64 whitehole_lockClosure_yield = 0;
@@ -204,11 +198,11 @@ initStats0(void)
initStats1() can be called after setupRtsFlags()
------------------------------------------------------------------------ */
+void initGenerationStats(void);
+
void
initStats1 (void)
{
- uint32_t i;
-
if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
statsPrintf(" bytes bytes bytes user elap user elap\n");
@@ -225,13 +219,29 @@ initStats1 (void)
(Time *)stgMallocBytes(
sizeof(Time)*RtsFlags.GcFlags.generations,
"initStats");
- for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
+ initGenerationStats();
+}
+
+void
+initGenerationStats()
+{
+ for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
GC_coll_cpu[i] = 0;
GC_coll_elapsed[i] = 0;
GC_coll_max_pause[i] = 0;
}
}
+/* ---------------------------------------------------------------------------
+ Reset stats of child process after fork()
+ ------------------------------------------------------------------------ */
+
+void resetChildProcessStats()
+{
+ initStats0();
+ initGenerationStats();
+}
+
/* -----------------------------------------------------------------------------
Initialisation time...
-------------------------------------------------------------------------- */
@@ -517,14 +527,14 @@ stat_endRP(
RP_tot_time += user - RP_start_time;
RPe_tot_time += elapsed - RPe_start_time;
- fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
- retainerGeneration, mut_user_time_during_RP());
+ fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
+ retainerGeneration, mut_user_time_during_RP());
#if defined(DEBUG_RETAINER)
- fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
- fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
+ fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
+ fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
#endif
- fprintf(prof_file, "\tAverage number of visits per object = %f\n",
- averageNumVisit);
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+ averageNumVisit);
}
#endif /* PROFILING */
@@ -1049,21 +1059,15 @@ void
stat_exit (void)
{
RTSSummaryStats sum;
- uint32_t g;
-
init_RTSSummaryStats(&sum);
+
if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
// First we tidy the times in stats, and populate the times in sum.
// In particular, we adjust the gc_* time in stats to remove
// profiling times.
{
Time now_cpu_ns, now_elapsed_ns;
- Time exit_gc_cpu = 0;
- Time exit_gc_elapsed = 0;
- Time prof_cpu = 0;
- Time prof_elapsed = 0;
-
- getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+ getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
stats.cpu_ns = now_cpu_ns - start_init_cpu;
stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
@@ -1072,15 +1076,6 @@ stat_exit (void)
if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
- prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
- prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
-
- // heapCensus() is called by the GC, so RP and HC time are
- // included in the GC stats. We therefore subtract them to
- // obtain the actual GC cpu time.
- stats.gc_cpu_ns -= prof_cpu;
- stats.gc_elapsed_ns -= prof_elapsed;
-
#if defined(PROFILING)
sum.rp_cpu_ns = RP_tot_time;
sum.rp_elapsed_ns = RPe_tot_time;
@@ -1091,8 +1086,11 @@ stat_exit (void)
// We do a GC during the EXIT phase. We'll attribute the cost of
// that to GC instead of EXIT, so carefully subtract it from the
// EXIT time.
- exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
- exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+ // Note that exit_gc includes RP and HC for the exit GC too.
+ Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
+ Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+ ASSERT(exit_gc_elapsed > 0);
sum.exit_cpu_ns = end_exit_cpu
- start_exit_cpu
@@ -1101,14 +1099,16 @@ stat_exit (void)
- start_exit_elapsed
- exit_gc_elapsed;
+ ASSERT(sum.exit_elapsed_ns >= 0);
+
stats.mutator_cpu_ns = start_exit_cpu
- end_init_cpu
- - (stats.gc_cpu_ns - exit_gc_cpu)
- - prof_cpu;
+ - (stats.gc_cpu_ns - exit_gc_cpu);
stats.mutator_elapsed_ns = start_exit_elapsed
- end_init_elapsed
- - (stats.gc_elapsed_ns - exit_gc_elapsed)
- - prof_elapsed;
+ - (stats.gc_elapsed_ns - exit_gc_elapsed);
+
+ ASSERT(stats.mutator_elapsed_ns >= 0);
if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
@@ -1116,12 +1116,29 @@ stat_exit (void)
// and subtracting, so the parts should add up to the total exactly.
// Note that stats->total_ns is captured a tiny bit later than
// end_exit_elapsed, so we don't use it here.
- ASSERT(stats.init_elapsed_ns \
- + stats.mutator_elapsed_ns \
- + stats.gc_elapsed_ns \
- + sum.exit_elapsed_ns \
+ ASSERT(stats.init_elapsed_ns // INIT
+ + stats.mutator_elapsed_ns // MUT
+ + stats.gc_elapsed_ns // GC
+ + sum.exit_elapsed_ns // EXIT
== end_exit_elapsed - start_init_elapsed);
+ // heapCensus() is called by the GC, so RP and HC time are
+ // included in the GC stats. We therefore subtract them to
+ // obtain the actual GC cpu time.
+ Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns;
+ Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;
+
+ stats.gc_cpu_ns -= prof_cpu;
+ stats.gc_elapsed_ns -= prof_elapsed;
+
+ // This assertion is probably not necessary; make sure the
+ // subdivision with PROF also makes sense
+ ASSERT(stats.init_elapsed_ns // INIT
+ + stats.mutator_elapsed_ns // MUT
+ + stats.gc_elapsed_ns // GC
+ + sum.exit_elapsed_ns // EXIT
+ + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
+ == end_exit_elapsed - start_init_elapsed);
}
// REVIEWERS: it's not clear to me why the following isn't done in
@@ -1142,10 +1159,9 @@ stat_exit (void)
// We populate the remainder (non-time elements) of sum
{
#if defined(THREADED_RTS)
- uint32_t i;
sum.bound_task_count = taskCount - workerCount;
- for (i = 0; i < n_capabilities; i++) {
+ for (uint32_t i = 0; i < n_capabilities; i++) {
sum.sparks.created += capabilities[i]->spark_stats.created;
sum.sparks.dud += capabilities[i]->spark_stats.dud;
sum.sparks.overflowed+=
@@ -1195,22 +1211,22 @@ stat_exit (void)
sum.productivity_cpu_percent =
TimeToSecondsDbl(stats.cpu_ns
- stats.gc_cpu_ns
- - sum.rp_cpu_ns
- - sum.hc_cpu_ns
- stats.init_cpu_ns
- sum.exit_cpu_ns)
/ TimeToSecondsDbl(stats.cpu_ns);
+ ASSERT(sum.productivity_cpu_percent >= 0);
+
sum.productivity_elapsed_percent =
TimeToSecondsDbl(stats.elapsed_ns
- stats.gc_elapsed_ns
- - sum.rp_elapsed_ns
- - sum.hc_elapsed_ns
- stats.init_elapsed_ns
- sum.exit_elapsed_ns)
/ TimeToSecondsDbl(stats.elapsed_ns);
- for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+ ASSERT(sum.productivity_elapsed_percent >= 0);
+
+ for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
const generation* gen = &generations[g];
GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
gen_stats->collections = gen->collections;
diff --git a/rts/Stats.h b/rts/Stats.h
index fbcca110e9..ed2479863a 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -59,6 +59,7 @@ void stat_workerStop(void);
void initStats0(void);
void initStats1(void);
+void resetChildProcessStats(void);
double mut_user_time_until(Time t);
double mut_user_time(void);