summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorÖmer Sinan Ağacan <omeragacan@gmail.com>2018-12-28 13:51:28 +0300
committerMoritz Angermann <moritz.angermann@gmail.com>2020-09-18 08:46:17 +0000
commit4571d4ea00e6ea86df45be3e890f9fd18aae1a79 (patch)
treef5a5ac870e88bc9cae965349a3953426b7cc5f8f
parent675deaefde126c77281b2777b5ab29bc43806d02 (diff)
downloadhaskell-4571d4ea00e6ea86df45be3e890f9fd18aae1a79.tar.gz
Fix negative mutator time in GC stats in prof builds
Because garbage collector calls `retainerProfile()` and `heapCensus()`, GC times normally include some of PROF times too. To fix this we have these lines: // heapCensus() is called by the GC, so RP and HC time are // included in the GC stats. We therefore subtract them to // obtain the actual GC cpu time. stats.gc_cpu_ns -= prof_cpu; stats.gc_elapsed_ns -= prof_elapsed; These variables are later used for calculating GC time excluding the final GC (which should be attributed to EXIT). exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; The problem is if we subtract PROF times from `gc_elapsed_ns` and then subtract `start_exit_gc_elapsed` from the result, we end up subtracting PROF times twice, because `start_exit_gc_elapsed` also includes PROF times. We now subtract PROF times from GC after the calculations for EXIT and MUT times. The existing assertion that checks INIT + MUT + GC + EXIT = TOTAL now holds. When we subtract PROF numbers from GC, and a new assertion INIT + MUT + GC + PROF + EXIT = TOTAL also holds. Fixes #15897. New assertions added in this commit also revealed #16102, which is also fixed by this commit.
-rw-r--r--rts/Schedule.c4
-rw-r--r--rts/Stats.c111
-rw-r--r--rts/Stats.h1
-rw-r--r--testsuite/tests/profiling/should_run/Makefile8
-rw-r--r--testsuite/tests/profiling/should_run/T15897.hs7
-rw-r--r--testsuite/tests/profiling/should_run/all.T5
6 files changed, 89 insertions, 47 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c
index f95247e703..51ef7c830b 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -2029,6 +2029,10 @@ forkProcess(HsStablePtr *entry
} else { // child
+ // Current process times reset in the child process, so we should reset
+ // the stats too. See #16102.
+ resetChildProcessStats();
+
#if defined(THREADED_RTS)
initMutex(&sched_mutex);
initMutex(&sm_mutex);
diff --git a/rts/Stats.c b/rts/Stats.c
index 88ea47c156..8999b27844 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -43,12 +43,6 @@ static Time HC_start_time, HC_tot_time = 0; // heap census prof user time
static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
#endif
-#if defined(PROFILING)
-#define PROF_VAL(x) (x)
-#else
-#define PROF_VAL(x) 0
-#endif
-
#if defined(PROF_SPIN)
volatile StgWord64 whitehole_lockClosure_spin = 0;
volatile StgWord64 whitehole_lockClosure_yield = 0;
@@ -204,11 +198,11 @@ initStats0(void)
initStats1() can be called after setupRtsFlags()
------------------------------------------------------------------------ */
+void initGenerationStats(void);
+
void
initStats1 (void)
{
- uint32_t i;
-
if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n");
statsPrintf(" bytes bytes bytes user elap user elap\n");
@@ -225,13 +219,29 @@ initStats1 (void)
(Time *)stgMallocBytes(
sizeof(Time)*RtsFlags.GcFlags.generations,
"initStats");
- for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
+ initGenerationStats();
+}
+
+void
+initGenerationStats()
+{
+ for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
GC_coll_cpu[i] = 0;
GC_coll_elapsed[i] = 0;
GC_coll_max_pause[i] = 0;
}
}
+/* ---------------------------------------------------------------------------
+ Reset stats of child process after fork()
+ ------------------------------------------------------------------------ */
+
+void resetChildProcessStats()
+{
+ initStats0();
+ initGenerationStats();
+}
+
/* -----------------------------------------------------------------------------
Initialisation time...
-------------------------------------------------------------------------- */
@@ -521,14 +531,14 @@ stat_endRP(
RP_tot_time += user - RP_start_time;
RPe_tot_time += elapsed - RPe_start_time;
- fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
- retainerGeneration, mut_user_time_during_RP());
+ fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
+ retainerGeneration, mut_user_time_during_RP());
#if defined(DEBUG_RETAINER)
- fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
- fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
+ fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
+ fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
#endif
- fprintf(prof_file, "\tAverage number of visits per object = %f\n",
- averageNumVisit);
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+ averageNumVisit);
}
#endif /* PROFILING */
@@ -1064,12 +1074,7 @@ stat_exit (void)
// profiling times.
{
Time now_cpu_ns, now_elapsed_ns;
- Time exit_gc_cpu = 0;
- Time exit_gc_elapsed = 0;
- Time prof_cpu = 0;
- Time prof_elapsed = 0;
-
- getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+ getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
stats.cpu_ns = now_cpu_ns - start_init_cpu;
stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
@@ -1078,15 +1083,6 @@ stat_exit (void)
if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
- prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
- prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
-
- // heapCensus() is called by the GC, so RP and HC time are
- // included in the GC stats. We therefore subtract them to
- // obtain the actual GC cpu time.
- stats.gc_cpu_ns -= prof_cpu;
- stats.gc_elapsed_ns -= prof_elapsed;
-
#if defined(PROFILING)
sum.rp_cpu_ns = RP_tot_time;
sum.rp_elapsed_ns = RPe_tot_time;
@@ -1097,8 +1093,11 @@ stat_exit (void)
// We do a GC during the EXIT phase. We'll attribute the cost of
// that to GC instead of EXIT, so carefully subtract it from the
// EXIT time.
- exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
- exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+ // Note that exit_gc includes RP and HC for the exit GC too.
+ Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
+ Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+ ASSERT(exit_gc_elapsed > 0);
sum.exit_cpu_ns = end_exit_cpu
- start_exit_cpu
@@ -1107,14 +1106,16 @@ stat_exit (void)
- start_exit_elapsed
- exit_gc_elapsed;
+ ASSERT(sum.exit_elapsed_ns >= 0);
+
stats.mutator_cpu_ns = start_exit_cpu
- end_init_cpu
- - (stats.gc_cpu_ns - exit_gc_cpu)
- - prof_cpu;
+ - (stats.gc_cpu_ns - exit_gc_cpu);
stats.mutator_elapsed_ns = start_exit_elapsed
- end_init_elapsed
- - (stats.gc_elapsed_ns - exit_gc_elapsed)
- - prof_elapsed;
+ - (stats.gc_elapsed_ns - exit_gc_elapsed);
+
+ ASSERT(stats.mutator_elapsed_ns >= 0);
if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
@@ -1122,12 +1123,29 @@ stat_exit (void)
// and subtracting, so the parts should add up to the total exactly.
// Note that stats->total_ns is captured a tiny bit later than
// end_exit_elapsed, so we don't use it here.
- ASSERT(stats.init_elapsed_ns \
- + stats.mutator_elapsed_ns \
- + stats.gc_elapsed_ns \
- + sum.exit_elapsed_ns \
+ ASSERT(stats.init_elapsed_ns // INIT
+ + stats.mutator_elapsed_ns // MUT
+ + stats.gc_elapsed_ns // GC
+ + sum.exit_elapsed_ns // EXIT
== end_exit_elapsed - start_init_elapsed);
+ // heapCensus() is called by the GC, so RP and HC time are
+ // included in the GC stats. We therefore subtract them to
+ // obtain the actual GC cpu time.
+ Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns;
+ Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;
+
+ stats.gc_cpu_ns -= prof_cpu;
+ stats.gc_elapsed_ns -= prof_elapsed;
+
+ // This assertion is probably not necessary; make sure the
+ // subdivision with PROF also makes sense
+ ASSERT(stats.init_elapsed_ns // INIT
+ + stats.mutator_elapsed_ns // MUT
+ + stats.gc_elapsed_ns // GC
+ + sum.exit_elapsed_ns // EXIT
+ + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
+ == end_exit_elapsed - start_init_elapsed);
}
// REVIEWERS: it's not clear to me why the following isn't done in
@@ -1148,10 +1166,9 @@ stat_exit (void)
// We populate the remainder (non-time elements) of sum
{
#if defined(THREADED_RTS)
- uint32_t i;
sum.bound_task_count = taskCount - workerCount;
- for (i = 0; i < n_capabilities; i++) {
+ for (uint32_t i = 0; i < n_capabilities; i++) {
sum.sparks.created += capabilities[i]->spark_stats.created;
sum.sparks.dud += capabilities[i]->spark_stats.dud;
sum.sparks.overflowed+=
@@ -1201,22 +1218,22 @@ stat_exit (void)
sum.productivity_cpu_percent =
TimeToSecondsDbl(stats.cpu_ns
- stats.gc_cpu_ns
- - sum.rp_cpu_ns
- - sum.hc_cpu_ns
- stats.init_cpu_ns
- sum.exit_cpu_ns)
/ TimeToSecondsDbl(stats.cpu_ns);
+ ASSERT(sum.productivity_cpu_percent >= 0);
+
sum.productivity_elapsed_percent =
TimeToSecondsDbl(stats.elapsed_ns
- stats.gc_elapsed_ns
- - sum.rp_elapsed_ns
- - sum.hc_elapsed_ns
- stats.init_elapsed_ns
- sum.exit_elapsed_ns)
/ TimeToSecondsDbl(stats.elapsed_ns);
- for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+ ASSERT(sum.productivity_elapsed_percent >= 0);
+
+ for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
const generation* gen = &generations[g];
GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
gen_stats->collections = gen->collections;
diff --git a/rts/Stats.h b/rts/Stats.h
index fbcca110e9..ed2479863a 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -59,6 +59,7 @@ void stat_workerStop(void);
void initStats0(void);
void initStats1(void);
+void resetChildProcessStats(void);
double mut_user_time_until(Time t);
double mut_user_time(void);
diff --git a/testsuite/tests/profiling/should_run/Makefile b/testsuite/tests/profiling/should_run/Makefile
index 33d8d9a51d..e3a6ec40d5 100644
--- a/testsuite/tests/profiling/should_run/Makefile
+++ b/testsuite/tests/profiling/should_run/Makefile
@@ -41,3 +41,11 @@ T14257:
./T14257 +RTS -hc
# Make sure that samples are monotonically increasing
awk 'BEGIN{t=0} /BEGIN_SAMPLE/{if ($$2 < t) print "uh oh", $$t, $$0; t=$$2;}' T14257.hp
+
+.PHONY: T15897
+T15897:
+ # The bug is caught by an assertion so we run the tests with debug runtime
+ # and ignore the outputs
+ "$(TEST_HC)" -prof -fprof-auto -debug -v0 T15897.hs
+ ./T15897 10000000 +RTS -s -hc 2>/dev/null
+ ./T15897 10000000 +RTS -s -hr 2>/dev/null
diff --git a/testsuite/tests/profiling/should_run/T15897.hs b/testsuite/tests/profiling/should_run/T15897.hs
new file mode 100644
index 0000000000..b004cd1248
--- /dev/null
+++ b/testsuite/tests/profiling/should_run/T15897.hs
@@ -0,0 +1,7 @@
+import Control.Monad
+import Data.IORef
+import System.Environment
+
+main = do
+ [n] <- getArgs
+ replicateM (read n) (newIORef [1,2,3])
diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T
index c3d34af2bf..326afee39d 100644
--- a/testsuite/tests/profiling/should_run/all.T
+++ b/testsuite/tests/profiling/should_run/all.T
@@ -145,3 +145,8 @@ test('toplevel_scc_1',
test('T12962', [], compile_and_run, [''])
test('T14257', [], run_command, ['$MAKE -s --no-print-directory T14257'])
+
+test('T15897',
+ [extra_ways(['profasm']), only_ways(['profasm']), run_timeout_multiplier(2)],
+ run_command,
+ ['$MAKE -s --no-print-directory T15897'])