diff options
author | Ömer Sinan Ağacan <omeragacan@gmail.com> | 2018-12-28 13:51:28 +0300 |
---|---|---|
committer | Moritz Angermann <moritz.angermann@gmail.com> | 2020-09-18 08:46:17 +0000 |
commit | 4571d4ea00e6ea86df45be3e890f9fd18aae1a79 (patch) | |
tree | f5a5ac870e88bc9cae965349a3953426b7cc5f8f | |
parent | 675deaefde126c77281b2777b5ab29bc43806d02 (diff) | |
download | haskell-4571d4ea00e6ea86df45be3e890f9fd18aae1a79.tar.gz |
Fix negative mutator time in GC stats in prof builds
Because garbage collector calls `retainerProfile()` and `heapCensus()`,
GC times normally include some of PROF times too. To fix this we have
these lines:
// heapCensus() is called by the GC, so RP and HC time are
// included in the GC stats. We therefore subtract them to
// obtain the actual GC cpu time.
stats.gc_cpu_ns -= prof_cpu;
stats.gc_elapsed_ns -= prof_elapsed;
These variables are later used for calculating GC time excluding the
final GC (which should be attributed to EXIT).
exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
The problem is if we subtract PROF times from `gc_elapsed_ns` and then
subtract `start_exit_gc_elapsed` from the result, we end up subtracting
PROF times twice, because `start_exit_gc_elapsed` also includes PROF
times.
We now subtract PROF times from GC after the calculations for EXIT and
MUT times. The existing assertion that checks
INIT + MUT + GC + EXIT = TOTAL
now holds. When we subtract PROF numbers from GC, and a new assertion
INIT + MUT + GC + PROF + EXIT = TOTAL
also holds.
Fixes #15897. New assertions added in this commit also revealed #16102,
which is also fixed by this commit.
-rw-r--r-- | rts/Schedule.c | 4 | ||||
-rw-r--r-- | rts/Stats.c | 111 | ||||
-rw-r--r-- | rts/Stats.h | 1 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/Makefile | 8 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/T15897.hs | 7 | ||||
-rw-r--r-- | testsuite/tests/profiling/should_run/all.T | 5 |
6 files changed, 89 insertions, 47 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c index f95247e703..51ef7c830b 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -2029,6 +2029,10 @@ forkProcess(HsStablePtr *entry } else { // child + // Current process times reset in the child process, so we should reset + // the stats too. See #16102. + resetChildProcessStats(); + #if defined(THREADED_RTS) initMutex(&sched_mutex); initMutex(&sm_mutex); diff --git a/rts/Stats.c b/rts/Stats.c index 88ea47c156..8999b27844 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -43,12 +43,6 @@ static Time HC_start_time, HC_tot_time = 0; // heap census prof user time static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time #endif -#if defined(PROFILING) -#define PROF_VAL(x) (x) -#else -#define PROF_VAL(x) 0 -#endif - #if defined(PROF_SPIN) volatile StgWord64 whitehole_lockClosure_spin = 0; volatile StgWord64 whitehole_lockClosure_yield = 0; @@ -204,11 +198,11 @@ initStats0(void) initStats1() can be called after setupRtsFlags() ------------------------------------------------------------------------ */ +void initGenerationStats(void); + void initStats1 (void) { - uint32_t i; - if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) { statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n"); statsPrintf(" bytes bytes bytes user elap user elap\n"); @@ -225,13 +219,29 @@ initStats1 (void) (Time *)stgMallocBytes( sizeof(Time)*RtsFlags.GcFlags.generations, "initStats"); - for (i = 0; i < RtsFlags.GcFlags.generations; i++) { + initGenerationStats(); +} + +void +initGenerationStats() +{ + for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) { GC_coll_cpu[i] = 0; GC_coll_elapsed[i] = 0; GC_coll_max_pause[i] = 0; } } +/* --------------------------------------------------------------------------- + Reset stats of child process after fork() + ------------------------------------------------------------------------ */ + +void resetChildProcessStats() +{ + initStats0(); + initGenerationStats(); +} + /* ----------------------------------------------------------------------------- Initialisation time... -------------------------------------------------------------------------- */ @@ -521,14 +531,14 @@ stat_endRP( RP_tot_time += user - RP_start_time; RPe_tot_time += elapsed - RPe_start_time; - fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n", - retainerGeneration, mut_user_time_during_RP()); + fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n", + retainerGeneration, mut_user_time_during_RP()); #if defined(DEBUG_RETAINER) - fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize); - fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize); + fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize); + fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize); #endif - fprintf(prof_file, "\tAverage number of visits per object = %f\n", - averageNumVisit); + fprintf(prof_file, "\tAverage number of visits per object = %f\n", + averageNumVisit); } #endif /* PROFILING */ @@ -1064,12 +1074,7 @@ stat_exit (void) // profiling times. { Time now_cpu_ns, now_elapsed_ns; - Time exit_gc_cpu = 0; - Time exit_gc_elapsed = 0; - Time prof_cpu = 0; - Time prof_elapsed = 0; - - getProcessTimes( &now_cpu_ns, &now_elapsed_ns); + getProcessTimes(&now_cpu_ns, &now_elapsed_ns); stats.cpu_ns = now_cpu_ns - start_init_cpu; stats.elapsed_ns = now_elapsed_ns - start_init_elapsed; @@ -1078,15 +1083,6 @@ stat_exit (void) if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; } if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; } - prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time); - prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time); - - // heapCensus() is called by the GC, so RP and HC time are - // included in the GC stats. We therefore subtract them to - // obtain the actual GC cpu time. - stats.gc_cpu_ns -= prof_cpu; - stats.gc_elapsed_ns -= prof_elapsed; - #if defined(PROFILING) sum.rp_cpu_ns = RP_tot_time; sum.rp_elapsed_ns = RPe_tot_time; @@ -1097,8 +1093,11 @@ stat_exit (void) // We do a GC during the EXIT phase. We'll attribute the cost of // that to GC instead of EXIT, so carefully subtract it from the // EXIT time. - exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu; - exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; + // Note that exit_gc includes RP and HC for the exit GC too. + Time exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu; + Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; + + ASSERT(exit_gc_elapsed > 0); sum.exit_cpu_ns = end_exit_cpu - start_exit_cpu @@ -1107,14 +1106,16 @@ stat_exit (void) - start_exit_elapsed - exit_gc_elapsed; + ASSERT(sum.exit_elapsed_ns >= 0); + stats.mutator_cpu_ns = start_exit_cpu - end_init_cpu - - (stats.gc_cpu_ns - exit_gc_cpu) - - prof_cpu; + - (stats.gc_cpu_ns - exit_gc_cpu); stats.mutator_elapsed_ns = start_exit_elapsed - end_init_elapsed - - (stats.gc_elapsed_ns - exit_gc_elapsed) - - prof_elapsed; + - (stats.gc_elapsed_ns - exit_gc_elapsed); + + ASSERT(stats.mutator_elapsed_ns >= 0); if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; } @@ -1122,12 +1123,29 @@ stat_exit (void) // and subtracting, so the parts should add up to the total exactly. // Note that stats->total_ns is captured a tiny bit later than // end_exit_elapsed, so we don't use it here. - ASSERT(stats.init_elapsed_ns \ - + stats.mutator_elapsed_ns \ - + stats.gc_elapsed_ns \ - + sum.exit_elapsed_ns \ + ASSERT(stats.init_elapsed_ns // INIT + + stats.mutator_elapsed_ns // MUT + + stats.gc_elapsed_ns // GC + + sum.exit_elapsed_ns // EXIT == end_exit_elapsed - start_init_elapsed); + // heapCensus() is called by the GC, so RP and HC time are + // included in the GC stats. We therefore subtract them to + // obtain the actual GC cpu time. + Time prof_cpu = sum.rp_cpu_ns + sum.hc_cpu_ns; + Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns; + + stats.gc_cpu_ns -= prof_cpu; + stats.gc_elapsed_ns -= prof_elapsed; + + // This assertion is probably not necessary; make sure the + // subdivision with PROF also makes sense + ASSERT(stats.init_elapsed_ns // INIT + + stats.mutator_elapsed_ns // MUT + + stats.gc_elapsed_ns // GC + + sum.exit_elapsed_ns // EXIT + + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF + == end_exit_elapsed - start_init_elapsed); } // REVIEWERS: it's not clear to me why the following isn't done in @@ -1148,10 +1166,9 @@ stat_exit (void) // We populate the remainder (non-time elements) of sum { #if defined(THREADED_RTS) - uint32_t i; sum.bound_task_count = taskCount - workerCount; - for (i = 0; i < n_capabilities; i++) { + for (uint32_t i = 0; i < n_capabilities; i++) { sum.sparks.created += capabilities[i]->spark_stats.created; sum.sparks.dud += capabilities[i]->spark_stats.dud; sum.sparks.overflowed+= @@ -1201,22 +1218,22 @@ stat_exit (void) sum.productivity_cpu_percent = TimeToSecondsDbl(stats.cpu_ns - stats.gc_cpu_ns - - sum.rp_cpu_ns - - sum.hc_cpu_ns - stats.init_cpu_ns - sum.exit_cpu_ns) / TimeToSecondsDbl(stats.cpu_ns); + ASSERT(sum.productivity_cpu_percent >= 0); + sum.productivity_elapsed_percent = TimeToSecondsDbl(stats.elapsed_ns - stats.gc_elapsed_ns - - sum.rp_elapsed_ns - - sum.hc_elapsed_ns - stats.init_elapsed_ns - sum.exit_elapsed_ns) / TimeToSecondsDbl(stats.elapsed_ns); - for(g = 0; g < RtsFlags.GcFlags.generations; ++g) { + ASSERT(sum.productivity_elapsed_percent >= 0); + + for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) { const generation* gen = &generations[g]; GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g]; gen_stats->collections = gen->collections; diff --git a/rts/Stats.h b/rts/Stats.h index fbcca110e9..ed2479863a 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -59,6 +59,7 @@ void stat_workerStop(void); void initStats0(void); void initStats1(void); +void resetChildProcessStats(void); double mut_user_time_until(Time t); double mut_user_time(void); diff --git a/testsuite/tests/profiling/should_run/Makefile b/testsuite/tests/profiling/should_run/Makefile index 33d8d9a51d..e3a6ec40d5 100644 --- a/testsuite/tests/profiling/should_run/Makefile +++ b/testsuite/tests/profiling/should_run/Makefile @@ -41,3 +41,11 @@ T14257: ./T14257 +RTS -hc # Make sure that samples are monotonically increasing awk 'BEGIN{t=0} /BEGIN_SAMPLE/{if ($$2 < t) print "uh oh", $$t, $$0; t=$$2;}' T14257.hp + +.PHONY: T15897 +T15897: + # The bug is caught by an assertion so we run the tests with debug runtime + # and ignore the outputs + "$(TEST_HC)" -prof -fprof-auto -debug -v0 T15897.hs + ./T15897 10000000 +RTS -s -hc 2>/dev/null + ./T15897 10000000 +RTS -s -hr 2>/dev/null diff --git a/testsuite/tests/profiling/should_run/T15897.hs b/testsuite/tests/profiling/should_run/T15897.hs new file mode 100644 index 0000000000..b004cd1248 --- /dev/null +++ b/testsuite/tests/profiling/should_run/T15897.hs @@ -0,0 +1,7 @@ +import Control.Monad +import Data.IORef +import System.Environment + +main = do + [n] <- getArgs + replicateM (read n) (newIORef [1,2,3]) diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T index c3d34af2bf..326afee39d 100644 --- a/testsuite/tests/profiling/should_run/all.T +++ b/testsuite/tests/profiling/should_run/all.T @@ -145,3 +145,8 @@ test('toplevel_scc_1', test('T12962', [], compile_and_run, ['']) test('T14257', [], run_command, ['$MAKE -s --no-print-directory T14257']) + +test('T15897', + [extra_ways(['profasm']), only_ways(['profasm']), run_timeout_multiplier(2)], + run_command, + ['$MAKE -s --no-print-directory T15897']) |