diff options
author | Ben Gamari <ben@smart-cactus.org> | 2018-03-20 12:17:43 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2018-03-20 12:18:17 -0400 |
commit | 0a778ebeccefe6c3c2540a06d5a1c585f18e01ab (patch) | |
tree | 18d125a173019985b5827e9ed6f5ab6845cad0af | |
parent | 57001d93da7b50a58e67a712bd4d4cee1d004b6a (diff) | |
download | haskell-0a778ebeccefe6c3c2540a06d5a1c585f18e01ab.tar.gz |
Revert "rts, base: Refactor stats.c to improve --machine-readable report"
This reverts commit 2d4bda2e4ac68816baba0afab00da6f769ea75a7.
-rw-r--r-- | includes/RtsAPI.h | 34 | ||||
-rw-r--r-- | libraries/base/GHC/Stats.hsc | 8 | ||||
-rw-r--r-- | rts/Stats.c | 921 | ||||
-rw-r--r-- | rts/Stats.h | 45 |
4 files changed, 317 insertions, 691 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h index 69ec6075db..6f011cbf6e 100644 --- a/includes/RtsAPI.h +++ b/includes/RtsAPI.h @@ -184,7 +184,7 @@ typedef struct _RTSStats { uint64_t par_copied_bytes; // Sum of par_max_copied_bytes across all parallel GCs uint64_t cumulative_par_max_copied_bytes; - // Sum of par_balanced_copied_byes across all parallel GCs. + // Sum of par_balanced_copied_byes across all parallel GCs. uint64_t cumulative_par_balanced_copied_bytes; // ----------------------------------- @@ -192,10 +192,6 @@ typedef struct _RTSStats { // (we use signed values here because due to inaccuracies in timers // the values can occasionally go slightly negative) - // Total CPU time used by the init phase - Time init_cpu_ns; - // Total elapsed time used by the init phase - Time init_elapsed_ns; // Total CPU time used by the mutator Time mutator_cpu_ns; // Total elapsed time used by the mutator @@ -217,26 +213,26 @@ typedef struct _RTSStats { // ----------------------------------- // Internal Counters - // The number of times a GC thread spun on its 'gc_spin' lock. - // Will be zero if the rts was not built with PROF_SPIN + // The number of times a GC thread spun on its 'gc_spin' lock. + // Will be zero if the rts was not built with PROF_SPIN uint64_t gc_spin_spin; - // The number of times a GC thread yielded on its 'gc_spin' lock. - // Will be zero if the rts was not built with PROF_SPIN + // The number of times a GC thread yielded on its 'gc_spin' lock. + // Will be zero if the rts was not built with PROF_SPIN uint64_t gc_spin_yield; - // The number of times a GC thread spun on its 'mut_spin' lock. - // Will be zero if the rts was not built with PROF_SPIN + // The number of times a GC thread spun on its 'mut_spin' lock. + // Will be zero if the rts was not built with PROF_SPIN uint64_t mut_spin_spin; - // The number of times a GC thread yielded on its 'mut_spin' lock. - // Will be zero if the rts was not built with PROF_SPIN + // The number of times a GC thread yielded on its 'mut_spin' lock. + // Will be zero if the rts was not built with PROF_SPIN uint64_t mut_spin_yield; - // The number of times a GC thread has checked for work across all parallel - // GCs + // The number of times a GC thread has checked for work across all parallel + // GCs uint64_t any_work; - // The number of times a GC thread has checked for work and found none - // across all parallel GCs + // The number of times a GC thread has checked for work and found none across + // all parallel GCs uint64_t no_work; - // The number of times a GC thread has iterated it's outer loop across all - // parallel GCs + // The number of times a GC thread has iterated it's outer loop across all + // parallel GCs uint64_t scav_find_work; } RTSStats; diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc index d81173fe58..3497ff5185 100644 --- a/libraries/base/GHC/Stats.hsc +++ b/libraries/base/GHC/Stats.hsc @@ -83,12 +83,6 @@ data RTSStats = RTSStats { -- (we use signed values here because due to inaccuracies in timers -- the values can occasionally go slightly negative) - -- | Total CPU time used by the init phase - -- @since 4.12.0.0 - , init_cpu_ns :: RtsTime - -- | Total elapsed time used by the init phase - -- @since 4.12.0.0 - , init_elapsed_ns :: RtsTime -- | Total CPU time used by the mutator , mutator_cpu_ns :: RtsTime -- | Total elapsed time used by the mutator @@ -181,8 +175,6 @@ getRTSStats = do (# peek RTSStats, cumulative_par_max_copied_bytes) p cumulative_par_balanced_copied_bytes <- (# peek RTSStats, cumulative_par_balanced_copied_bytes) p - init_cpu_ns <- (# peek RTSStats, init_cpu_ns) p - init_elapsed_ns <- (# peek RTSStats, init_elapsed_ns) p mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p diff --git a/rts/Stats.c b/rts/Stats.c index 8769551160..7eb93bef33 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -24,7 +24,6 @@ #include "ThreadPaused.h" #include "Messages.h" -#include <string.h> // for memset #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION) @@ -93,9 +92,6 @@ mut_user_time_until( Time t ) return TimeToSecondsDbl(t - stats.gc_cpu_ns); // heapCensus() time is included in GC_tot_cpu, so we don't need // to subtract it here. - - // TODO: This seems wrong to me. Surely we should be subtracting - // (at least) start_init_cpu? } double @@ -173,8 +169,6 @@ initStats0(void) .any_work = 0, .no_work = 0, .scav_find_work = 0, - .init_cpu_ns = 0, - .init_elapsed_ns = 0, .mutator_cpu_ns = 0, .mutator_elapsed_ns = 0, .gc_cpu_ns = 0, @@ -246,8 +240,6 @@ void stat_endInit(void) { getProcessTimes(&end_init_cpu, &end_init_elapsed); - stats.init_cpu_ns = end_init_cpu - start_init_cpu; - stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed; } /* ----------------------------------------------------------------------------- @@ -427,8 +419,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop, stats.gc.copied_bytes, stats.gc.slop_bytes, /* current loss due to fragmentation */ - (mblocks_allocated * BLOCKS_PER_MBLOCK - - n_alloc_blocks) * BLOCK_SIZE, + (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks) + * BLOCK_SIZE, par_n_threads, stats.gc.par_max_copied_bytes, stats.gc.copied_bytes, @@ -523,8 +515,7 @@ stat_endRP( fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize); fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize); #endif - fprintf(prof_file, "\tAverage number of visits per object = %f\n", - averageNumVisit); + fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit); } #endif /* PROFILING */ @@ -582,7 +573,7 @@ StgInt TOTAL_CALLS=1; #define REPORT(counter) \ { \ showStgWord64(counter,temp,true/*commas*/); \ - statsPrintf(" (" #counter ") : %s\n",temp); \ + statsPrintf(" (" #counter ") : %s\n",temp); \ } /* Report the value of a counter as a percentage of another counter */ @@ -601,647 +592,340 @@ StgInt TOTAL_CALLS=1; statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \ SLOW_CALLS_##arity * 100.0/TOTAL_CALLS) -/* -Note [RTS Stats Reporting] -========================== - -There are currently three reporting functions: - * report_summary: - Responsible for producing '+RTS -s' output. - Will report internal counters if the RTS flag --internal-counters is - passed. See [Internal Counters Stats] - * report_machine_readable: - Responsible for producing '+RTS -t --machine-readable' output. - * report_one_line: - Responsible for productin '+RTS -t' output - -Stats are accumulated into the global variable 'stats' as the program runs, then -in 'stat_exit' we do the following: - * Finalise 'stats'. This involves setting final running times and allocations - that have not yet been accounted for. - * Create a RTSSummaryStats. This contains all data for reports that is not - included in stats (because they do not make sense before the program has - completed) or in a global variable. - * call the appropriate report_* function, passing the newly constructed - RTSSummaryStats. - -To ensure that the data in the different reports is kept consistent, the -report_* functions should not do any calculation, excepting unit changes and -formatting. If you need to add a new calculated field, add it to -RTSSummaryStats. - -*/ - - -static void init_RTSSummaryStats(RTSSummaryStats* sum) -{ - const size_t sizeof_gc_summary_stats = - RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats); - - memset(sum, 0, sizeof(RTSSummaryStats)); - sum->gc_summary_stats = - stgMallocBytes(sizeof_gc_summary_stats, - "alloc_RTSSummaryStats.gc_summary_stats"); - memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats); +STATIC_INLINE Time get_init_cpu(void) { + return end_init_cpu - start_init_cpu; } -static void free_RTSSummaryStats(RTSSummaryStats * sum) -{ - if (!sum) { return; } - if (!sum->gc_summary_stats) { - stgFree(sum->gc_summary_stats); - sum->gc_summary_stats = NULL; - } +STATIC_INLINE Time get_init_elapsed(void) { + return end_init_elapsed - start_init_elapsed; } -static void report_summary(const RTSSummaryStats* sum) -{ - // We should do no calculation, other than unit changes and formatting, and - // we should not not use any data from outside of globals, sum and stats - // here. See Note [RTS Stats Reporting] - - uint32_t g; - char temp[512]; - showStgWord64(stats.allocated_bytes, temp, true/*commas*/); - statsPrintf("%16s bytes allocated in the heap\n", temp); - - showStgWord64(stats.copied_bytes, temp, true/*commas*/); - statsPrintf("%16s bytes copied during GC\n", temp); - - if ( stats.major_gcs > 0 ) { - showStgWord64(stats.max_live_bytes, temp, true/*commas*/); - statsPrintf("%16s bytes maximum residency (%" FMT_Word32 - " sample(s))\n", - temp, stats.major_gcs); - } - showStgWord64(stats.max_slop_bytes, temp, true/*commas*/); - statsPrintf("%16s bytes maximum slop\n", temp); - - statsPrintf("%16" FMT_SizeT " MB total memory in use (%" - FMT_SizeT " MB lost due to fragmentation)\n\n", - stats.max_live_bytes / (1024 * 1024), - sum->fragmentation_bytes / (1024 * 1024)); - - /* Print garbage collections in each gen */ - statsPrintf(" Tot time (elapsed) Avg pause Max pause\n"); - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - const GenerationSummaryStats * gen_stats = - &sum->gc_summary_stats[g]; - statsPrintf(" Gen %2d %5d colls" - ", %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n", - g, // REVIEWERS: this used to be gen->no - //, this can't ever be different right? - gen_stats->collections, - gen_stats->par_collections, - TimeToSecondsDbl(gen_stats->cpu_ns), - TimeToSecondsDbl(gen_stats->elapsed_ns), - TimeToSecondsDbl(gen_stats->avg_pause_ns), - TimeToSecondsDbl(gen_stats->max_pause_ns)); - } +void +stat_exit (void) +{ + generation *gen; + Time gc_cpu = 0; + Time gc_elapsed = 0; + Time init_cpu = 0; + Time init_elapsed = 0; + Time mut_cpu = 0; + Time mut_elapsed = 0; + Time exit_cpu = 0; + Time exit_elapsed = 0; + Time exit_gc_cpu = 0; + Time exit_gc_elapsed = 0; - statsPrintf("\n"); + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) { -#if defined(THREADED_RTS) - if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) { - // See Note [Work Balance] - statsPrintf(" Parallel GC work balance: " - "%.2f%% (serial 0%%, perfect 100%%)\n\n", - sum->work_balance * 100); - } + char temp[512]; + Time tot_cpu; + Time tot_elapsed; + uint32_t g; - statsPrintf(" TASKS: %d " - "(%d bound, %d peak workers (%d total), using -N%d)\n\n", - taskCount, sum->bound_task_count, - peakWorkerCount, workerCount, - n_capabilities); - - statsPrintf(" SPARKS: %" FMT_Word - "(%" FMT_Word " converted, %" FMT_Word " overflowed, %" - FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n", - sum->sparks_count, - sum->sparks.converted, sum->sparks.overflowed, - sum->sparks.dud, sum->sparks.gcd, - sum->sparks.fizzled); -#endif + getProcessTimes( &tot_cpu, &tot_elapsed ); + tot_cpu -= start_init_cpu; + tot_elapsed -= start_init_elapsed; - statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(stats.init_cpu_ns), - TimeToSecondsDbl(stats.init_elapsed_ns)); + uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_); - statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(stats.mutator_cpu_ns), - TimeToSecondsDbl(stats.mutator_elapsed_ns)); - statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(stats.gc_cpu_ns), - TimeToSecondsDbl(stats.gc_elapsed_ns)); + // allocated since the last GC + stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes; + stats.allocated_bytes = tot_alloc_bytes; -#if defined(PROFILING) - statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(sum->rc_cpu_ns), - TimeToSecondsDbl(sum->rc_elapsed_ns)); - statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(sum->hc_cpu_ns), - TimeToSecondsDbl(sum->hc_elapsed_ns)); -#endif - statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n", - TimeToSecondsDbl(sum->exit_cpu_ns), - TimeToSecondsDbl(sum->exit_elapsed_ns)); - statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n", - TimeToSecondsDbl(stats.cpu_ns), - TimeToSecondsDbl(stats.elapsed_ns)); -#if !defined(THREADED_RTS) - statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", - sum->gc_cpu_percent * 100, - sum->gc_elapsed_percent * 100); -#endif + /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */ + if (tot_cpu <= 0) tot_cpu = 1; + if (tot_elapsed <= 0) tot_elapsed = 1; - showStgWord64(sum->alloc_rate, temp, true/*commas*/); - - statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp); - - statsPrintf(" Productivity %5.1f%% of total user, " - "%.1f%% of total elapsed\n\n", - sum->productivity_cpu_percent * 100, - sum->productivity_elapsed_percent * 100); - - // See Note [Internal Counter Stats] for a description of the - // following counters. If you add a counter here, please remember - // to update the Note. - if (RtsFlags.MiscFlags.internalCounters) { -#if defined(THREADED_RTS) && defined(PROF_SPIN) - const int32_t col_width[] = {4, -30, 14, 14}; - statsPrintf("Internal Counters:\n"); - statsPrintf("%*s" "%*s" "%*s" "%*s" "\n" - , col_width[0], "" - , col_width[1], "SpinLock" - , col_width[2], "Spins" - , col_width[3], "Yields"); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "gc_alloc_block_sync" - , col_width[2], gc_alloc_block_sync.spin - , col_width[3], gc_alloc_block_sync.yield); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "gc_spin" - , col_width[2], stats.gc_spin_spin - , col_width[3], stats.gc_spin_yield); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "mut_spin" - , col_width[2], stats.mut_spin_spin - , col_width[3], stats.mut_spin_yield); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" - , col_width[0], "" - , col_width[1], "whitehole_gc" - , col_width[2], whitehole_gc_spin - , col_width[3], "n/a"); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" - , col_width[0], "" - , col_width[1], "whitehole_threadPaused" - , col_width[2], whitehole_threadPaused_spin - , col_width[3], "n/a"); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" - , col_width[0], "" - , col_width[1], "whitehole_executeMessage" - , col_width[2], whitehole_executeMessage_spin - , col_width[3], "n/a"); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "whitehole_lockClosure" - , col_width[2], whitehole_lockClosure_spin - , col_width[3], whitehole_lockClosure_yield); - // waitForGcThreads isn't really spin-locking(see the function) - // but these numbers still seem useful. - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "waitForGcThreads" - , col_width[2], waitForGcThreads_spin - , col_width[3], waitForGcThreads_yield); - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - int prefix_length = 0; - statsPrintf("%*s" "gen[%" FMT_Word32 "%n", - col_width[0], "", g, &prefix_length); - prefix_length -= col_width[0]; - int suffix_length = col_width[1] + prefix_length; - suffix_length = - suffix_length > 0 ? col_width[1] : suffix_length; - - statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" - , suffix_length, "].sync" - , col_width[2], generations[g].sync.spin - , col_width[3], generations[g].sync.yield); + if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) { + statsPrintf("%9" FMT_Word " %9.9s %9.9s", + (W_)stats.gc.allocated_bytes, "", ""); + statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0); } - statsPrintf("\n"); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "any_work" - , col_width[2], stats.any_work); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "no_work" - , col_width[2], stats.no_work); - statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" - , col_width[0], "" - , col_width[1], "scav_find_work" - , col_width[2], stats.scav_find_work); -#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN - statsPrintf("Internal Counters require the RTS to be built " - "with PROF_SPIN"); // PROF_SPIN is not #defined here -#else // THREADED_RTS - statsPrintf("Internal Counters require the threaded RTS"); -#endif - } -} -static void report_machine_readable (const RTSSummaryStats * sum) -{ - // We should do no calculation, other than unit changes and formatting, and - // we should not not use any data from outside of globals, sum and stats - // here. See Note [RTS Stats Reporting] - uint32_t g; - -#define MR_STAT(field_name,format,value) \ - statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value) -#define MR_STAT_GEN(gen,field_name,format,value) \ - statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \ - format "\")\n", g, value) - - // These first values are for backwards compatibility. - // Some of these first fields are duplicated with more machine-readable - // names, or to match the name in RtsStats. - - // we don't use for the first field helper macro here because the prefix is - // different - statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated", - stats.allocated_bytes); - MR_STAT("num_GCs", FMT_Word32, stats.gcs); - MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used); - MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes); - MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs); - MR_STAT("peak_megabytes_allocated", FMT_SizeT, - stats.max_mem_in_use_bytes / (1024 * 1024)); - - MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns)); - MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns)); - MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns)); - MR_STAT("mut_wall_seconds", "f", - TimeToSecondsDbl(stats.mutator_elapsed_ns)); - MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns)); - MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns)); - - // end backward compatibility - - // First, the rest of the times - - MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns)); - MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns)); -#if defined(PROFILING) - MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns)); - MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns)); - MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hp_cpu_ns)); - MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hp_elapsed_ns)); -#endif - MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns)); - MR_STAT("total_wall_seconds", "f", - TimeToSecondsDbl(stats.elapsed_ns)); - - // next, the remainder of the fields of RTSStats, except internal counters - - // The first two are duplicates of those above, but have more machine - // readable names that match the field names in RTSStats. - - - // gcs has been done as num_GCs above - MR_STAT("major_gcs", FMT_Word32, stats.major_gcs); - MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes); - MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes); - MR_STAT("max_large_objects_bytes", FMT_Word64, - stats.max_large_objects_bytes); - MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes); - MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes); - // This duplicates, except for unit, peak_megabytes_allocated above - MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes); - MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes); - MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes); - MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes); - MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64, - stats.cumulative_par_max_copied_bytes); - MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64, - stats.cumulative_par_balanced_copied_bytes); - - // next, the computed fields in RTSSummaryStats -#if !defined(THREADED_RTS) // THREADED_RTS - MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent); - MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent); -#endif - MR_STAT("fragmentation_bytes", FMT_SizeT, sum->fragmentation_bytes); - // average_bytes_used is done above - MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate); - MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent); - MR_STAT("productivity_wall_percent", "f", - sum->productivity_elapsed_percent); + // heapCensus() is called by the GC, so RP and HC time are + // included in the GC stats. We therefore subtract them to + // obtain the actual GC cpu time. + gc_cpu = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time); + gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time); - // next, the THREADED_RTS fields in RTSSummaryStats + init_cpu = get_init_cpu(); + init_elapsed = get_init_elapsed(); -#if defined(THREADED_RTS) - MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count); - MR_STAT("sparks_count", FMT_Word64, sum->sparks_count); - MR_STAT("sparks_converted", FMT_Word64, sum->sparks.converted); - MR_STAT("sparks_overflowed", FMT_Word64, sum->sparks.overflowed); - MR_STAT("sparks_dud ", FMT_Word64, sum->sparks.dud); - MR_STAT("sparks_gcd", FMT_Word64, sum->sparks.gcd); - MR_STAT("sparks_fizzled", FMT_Word64, sum->sparks.fizzled); - MR_STAT("work_balance", "f", sum->work_balance); - - // next, globals (other than internal counters) - MR_STAT("n_capabilities", FMT_Word32, n_capabilities); - MR_STAT("task_count", FMT_Word32, taskCount); - MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount); - MR_STAT("worker_count", FMT_Word32, workerCount); - - // next, internal counters -#if defined(PROF_SPIN) - MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin); - MR_STAT("gc_alloc_block_sync_yield", FMT_Word64, - gc_alloc_block_sync.yield); - MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin); - MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin); - MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield); - MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin); - MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield); - MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin); - MR_STAT("waitForGcThreads_yield", FMT_Word64, - waitForGcThreads_yield); - MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin); - MR_STAT("whitehole_lockClosure_spin", FMT_Word64, - whitehole_lockClosure_spin); - MR_STAT("whitehole_lockClosure_yield", FMT_Word64, - whitehole_lockClosure_yield); - MR_STAT("whitehole_executeMessage_spin", FMT_Word64, - whitehole_executeMessage_spin); - MR_STAT("whitehole_threadPaused_spin", FMT_Word64, - whitehole_threadPaused_spin); - MR_STAT("any_work", FMT_Word64, - stats.any_work); - MR_STAT("no_work", FMT_Word64, - stats.no_work); - MR_STAT("scav_find_work", FMT_Word64, - stats.scav_find_work); -#endif // PROF_SPIN -#endif // THREADED_RTS - - // finally, per-generation stats. Named as, for example for generation 0, - // gen_0_collections - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g]; - MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections); - MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections); - MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns)); - MR_STAT_GEN(g, "wall_seconds", "f", - TimeToSecondsDbl(gc_sum->elapsed_ns)); - MR_STAT_GEN(g, "max_pause_seconds", "f", - TimeToSecondsDbl(gc_sum->max_pause_ns)); - MR_STAT_GEN(g, "avg_pause_seconds", "f", - TimeToSecondsDbl(gc_sum->avg_pause_ns)); -#if defined(THREADED_RTS) && defined(PROF_SPIN) - MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin); - MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield); -#endif - } + // We do a GC during the EXIT phase. We'll attribute the cost of that + // to GC instead of EXIT, so carefully subtract it from the EXIT time. + exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu; + exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; + exit_cpu = end_exit_cpu - start_exit_cpu - exit_gc_cpu; + exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed; - statsPrintf(" ]\n"); -} + mut_elapsed = start_exit_elapsed - end_init_elapsed - + (gc_elapsed - exit_gc_elapsed) - + PROF_VAL(RPe_tot_time + HCe_tot_time); -static void report_one_line(const RTSSummaryStats * sum) -{ - // We should do no calculation, other than unit changes and formatting, and - // we should not not use any data from outside of globals, sum and stats - // here. See Note [RTS Stats Reporting] + mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu) + - PROF_VAL(RP_tot_time + HC_tot_time); + if (mut_cpu < 0) { mut_cpu = 0; } - /* print the long long separately to avoid bugginess on mingwin (2001-07-02, - mingw-0.5) */ - statsPrintf("<<ghc: %" FMT_Word64 " bytes, " - "%" FMT_Word32 " GCs, " - "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency " - "(%" FMT_Word32 " samples), " - "%" FMT_Word64 "M in use, " - "%.3f INIT (%.3f elapsed), " - "%.3f MUT (%.3f elapsed), " - "%.3f GC (%.3f elapsed) :ghc>>\n", - stats.allocated_bytes, - stats.gcs, - sum->average_bytes_used, - stats.max_live_bytes, - stats.major_gcs, - stats.max_mem_in_use_bytes / (1024 * 1024), - TimeToSecondsDbl(stats.init_cpu_ns), - TimeToSecondsDbl(stats.init_elapsed_ns), - TimeToSecondsDbl(stats.mutator_cpu_ns), - TimeToSecondsDbl(stats.mutator_elapsed_ns), - TimeToSecondsDbl(stats.gc_cpu_ns), - TimeToSecondsDbl(stats.gc_elapsed_ns)); -} + // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and + // subtracting, so the parts should add up to the total exactly. Note + // that tot_elapsed is captured a tiny bit later than end_exit_elapsed, + // so we don't use it here. + ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed + == end_exit_elapsed - start_init_elapsed); -void -stat_exit (void) -{ - RTSSummaryStats sum; - uint32_t g; - init_RTSSummaryStats(&sum); - if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) { - // First we tidy the times in stats, and populate the times in sum. - // In particular, we adjust the gc_* time in stats to remove - // profiling times. - { - Time now_cpu_ns, now_elapsed_ns; - Time exit_gc_cpu = 0; - Time exit_gc_elapsed = 0; - Time prof_cpu = 0; - Time prof_elapsed = 0; - - getProcessTimes( &now_cpu_ns, &now_elapsed_ns); - - stats.cpu_ns = now_cpu_ns - start_init_cpu; - stats.elapsed_ns = now_elapsed_ns - start_init_elapsed; - /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00 - seconds -- SDM */ - if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; } - if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; } - - prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time); - prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time); - - // heapCensus() is called by the GC, so RP and HC time are - // included in the GC stats. We therefore subtract them to - // obtain the actual GC cpu time. - stats.gc_cpu_ns -= prof_cpu; - stats.gc_elapsed_ns -= prof_elapsed; - -#if defined(PROFILING) - sum.rp_cpu_ns = RP_tot_time; - sum.rp_elapsed_ns = RPe_tot_time; - sum.hc_cpu_ns = HC_tot_time; - sum.hc_elapsed_ns = HCe_tot_time; -#endif // PROFILING - - // We do a GC during the EXIT phase. We'll attribute the cost of - // that to GC instead of EXIT, so carefully subtract it from the - // EXIT time. - exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu; - exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; - - sum.exit_cpu_ns = end_exit_cpu - - start_exit_cpu - - exit_gc_cpu; - sum.exit_elapsed_ns = end_exit_elapsed - - start_exit_elapsed - - exit_gc_elapsed; - - stats.mutator_cpu_ns = start_exit_cpu - - end_init_cpu - - (stats.gc_cpu_ns - exit_gc_cpu) - - prof_cpu; - stats.mutator_elapsed_ns = start_exit_elapsed - - end_init_elapsed - - (stats.gc_elapsed_ns - exit_gc_elapsed) - - prof_elapsed; - - if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; } - - // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding - // and subtracting, so the parts should add up to the total exactly. - // Note that stats->total_ns is captured a tiny bit later than - // end_exit_elapsed, so we don't use it here. - ASSERT(stats.init_elapsed_ns \ - + stats.mutator_elapsed_ns \ - + stats.gc_elapsed_ns \ - + sum.exit_elapsed_ns \ - == end_exit_elapsed - start_init_elapsed); + if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) { + showStgWord64(stats.allocated_bytes, temp, true/*commas*/); + statsPrintf("%16s bytes allocated in the heap\n", temp); - } + showStgWord64(stats.copied_bytes, temp, true/*commas*/); + statsPrintf("%16s bytes copied during GC\n", temp); - // REVIEWERS: it's not clear to me why the following isn't done in - // stat_endGC of the last garbage collection? - - // We account for the last garbage collection - { - uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_); - stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes; - stats.allocated_bytes = tot_alloc_bytes; - if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) { - statsPrintf("%9" FMT_Word " %9.9s %9.9s", - (W_)stats.gc.allocated_bytes, "", ""); - statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0); + if ( stats.major_gcs > 0 ) { + showStgWord64(stats.max_live_bytes, temp, true/*commas*/); + statsPrintf("%16s bytes maximum residency (%" FMT_Word32 + " sample(s))\n", + temp, stats.major_gcs); } - } - // We populate the remainder (non-time elements) of sum - { - #if defined(THREADED_RTS) - uint32_t i; - sum.bound_task_count = taskCount - workerCount; - - for (i = 0; i < n_capabilities; i++) { - sum.sparks.created += capabilities[i]->spark_stats.created; - sum.sparks.dud += capabilities[i]->spark_stats.dud; - sum.sparks.overflowed+= - capabilities[i]->spark_stats.overflowed; - sum.sparks.converted += - capabilities[i]->spark_stats.converted; - sum.sparks.gcd += capabilities[i]->spark_stats.gcd; - sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled; + showStgWord64(stats.max_slop_bytes, temp, true/*commas*/); + statsPrintf("%16s bytes maximum slop\n", temp); + + statsPrintf("%16" FMT_SizeT " MB total memory in use (%" + FMT_SizeT " MB lost due to fragmentation)\n\n", + (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)), + (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_))); + + /* Print garbage collections in each gen */ + statsPrintf(" Tot time (elapsed) Avg pause Max pause\n"); + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + gen = &generations[g]; + statsPrintf(" Gen %2d %5d colls, %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n", + gen->no, + gen->collections, + gen->par_collections, + TimeToSecondsDbl(GC_coll_cpu[g]), + TimeToSecondsDbl(GC_coll_elapsed[g]), + gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections), + TimeToSecondsDbl(GC_coll_max_pause[g])); } - sum.sparks_count = sum.sparks.created - + sum.sparks.dud - + sum.sparks.overflowed; - +#if defined(THREADED_RTS) if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) { // See Note [Work Balance] - sum.work_balance = - (double)stats.cumulative_par_balanced_copied_bytes - / (double)stats.par_copied_bytes; - } else { - sum.work_balance = 0; + statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n", + 100 * (double)stats.cumulative_par_balanced_copied_bytes / + (double)stats.par_copied_bytes); } +#endif + statsPrintf("\n"); + +#if defined(THREADED_RTS) + statsPrintf(" TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n", + taskCount, taskCount - workerCount, + peakWorkerCount, workerCount, + n_capabilities); + + statsPrintf("\n"); + + { + uint32_t i; + SparkCounters sparks = { 0, 0, 0, 0, 0, 0}; + for (i = 0; i < n_capabilities; i++) { + sparks.created += capabilities[i]->spark_stats.created; + sparks.dud += capabilities[i]->spark_stats.dud; + sparks.overflowed+= capabilities[i]->spark_stats.overflowed; + sparks.converted += capabilities[i]->spark_stats.converted; + sparks.gcd += capabilities[i]->spark_stats.gcd; + sparks.fizzled += capabilities[i]->spark_stats.fizzled; + } + + statsPrintf(" SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n", + sparks.created + sparks.dud + sparks.overflowed, + sparks.converted, sparks.overflowed, sparks.dud, + sparks.gcd, sparks.fizzled); + } +#endif + + statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed)); + + statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed)); + statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed)); +#if defined(PROFILING) + statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time)); + statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time)); +#endif + statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n", + TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed)); + statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n", + TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed)); +#if !defined(THREADED_RTS) + statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", + TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu), + TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed)); +#endif - #else // THREADED_RTS - sum.gc_cpu_percent = stats.gc_cpu_ns - / stats.cpu_ns; - sum.gc_elapsed_percent = stats.gc_elapsed_ns - / stats.elapsed_ns; - #endif // THREADED_RTS - - sum.fragmentation_bytes = - (uint64_t)(peak_mblocks_allocated - * BLOCKS_PER_MBLOCK - * BLOCK_SIZE_W - - hw_alloc_blocks * BLOCK_SIZE_W) - / (uint64_t)sizeof(W_); - - sum.average_bytes_used = stats.major_gcs == 0 ? 0 : - stats.cumulative_live_bytes/stats.major_gcs, - - sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 : - (uint64_t)((double)stats.allocated_bytes - / TimeToSecondsDbl(stats.mutator_cpu_ns)); - - // REVIEWERS: These two values didn't used to include the exit times - sum.productivity_cpu_percent = - TimeToSecondsDbl(stats.cpu_ns - - stats.gc_cpu_ns - - sum.rp_cpu_ns - - sum.hc_cpu_ns - - stats.init_cpu_ns - - sum.exit_cpu_ns) - / TimeToSecondsDbl(stats.cpu_ns); - - sum.productivity_elapsed_percent = - TimeToSecondsDbl(stats.elapsed_ns - - stats.gc_elapsed_ns - - sum.rp_elapsed_ns - - sum.hc_elapsed_ns - - stats.init_elapsed_ns - - sum.exit_elapsed_ns) - / TimeToSecondsDbl(stats.elapsed_ns); - - for(g = 0; g < RtsFlags.GcFlags.generations; ++g) { - const generation* gen = &generations[g]; - GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g]; - gen_stats->collections = gen->collections; - gen_stats->par_collections = gen->par_collections; - gen_stats->cpu_ns = GC_coll_cpu[g]; - gen_stats->elapsed_ns = GC_coll_elapsed[g]; - gen_stats->max_pause_ns = GC_coll_max_pause[g]; - gen_stats->avg_pause_ns = gen->collections == 0 ? - 0 : (GC_coll_elapsed[g] / gen->collections); - #if defined(THREADED_RTS) && defined(PROF_SPIN) - gen_stats->sync_spin = gen->sync.spin; - gen_stats->sync_yield = gen->sync.yield; - #endif // PROF_SPIN + if (mut_cpu == 0) { + showStgWord64(0, temp, true/*commas*/); + } else { + showStgWord64( + (StgWord64)((double)stats.allocated_bytes / + TimeToSecondsDbl(mut_cpu)), + temp, true/*commas*/); } - } - // Now we generate the report - if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) { - report_summary(&sum); + statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp); + + statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n", + TimeToSecondsDbl(tot_cpu - gc_cpu - + PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 + / TimeToSecondsDbl(tot_cpu), + TimeToSecondsDbl(tot_elapsed - gc_elapsed - + PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100 + / TimeToSecondsDbl(tot_elapsed)); + + // See Note [Internal Counter Stats] for a description of the + // following counters. If you add a counter here, please remember + // to update the Note. + if (RtsFlags.MiscFlags.internalCounters) { +#if defined(THREADED_RTS) && defined(PROF_SPIN) + uint32_t g; + const int32_t col_width[] = {4, -30, 14, 14}; + statsPrintf("Internal Counters:\n"); + statsPrintf("%*s" "%*s" "%*s" "%*s" "\n" + , col_width[0], "" + , col_width[1], "SpinLock" + , col_width[2], "Spins" + , col_width[3], "Yields"); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "gc_alloc_block_sync" + , col_width[2], gc_alloc_block_sync.spin + , col_width[3], gc_alloc_block_sync.yield); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "gc_spin" + , col_width[2], stats.gc_spin_spin + , col_width[3], stats.gc_spin_yield); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "mut_spin" + , col_width[2], stats.mut_spin_spin + , col_width[3], stats.mut_spin_yield); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" + , col_width[0], "" + , col_width[1], "whitehole_gc" + , col_width[2], whitehole_gc_spin + , col_width[3], "n/a"); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" + , col_width[0], "" + , col_width[1], "whitehole_threadPaused" + , col_width[2], whitehole_threadPaused_spin + , col_width[3], "n/a"); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n" + , col_width[0], "" + , col_width[1], "whitehole_executeMessage" + , col_width[2], whitehole_executeMessage_spin + , col_width[3], "n/a"); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "whitehole_lockClosure" + , col_width[2], whitehole_lockClosure_spin + , col_width[3], whitehole_lockClosure_yield); + // waitForGcThreads isn't really spin-locking(see the function) + // but these numbers still seem useful. + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "waitForGcThreads" + , col_width[2], waitForGcThreads_spin + , col_width[3], waitForGcThreads_yield); + + for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + int prefix_length = 0; + statsPrintf("%*s" "gen[%" FMT_Word32 "%n", + col_width[0], "", g, &prefix_length); + prefix_length -= col_width[0]; + int suffix_length = col_width[1] + prefix_length; + suffix_length = + suffix_length > 0 ? col_width[1] : suffix_length; + + statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n" + , suffix_length, "].sync" + , col_width[2], generations[g].sync.spin + , col_width[3], generations[g].sync.yield); + } + statsPrintf("\n"); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "any_work" + , col_width[2], stats.any_work); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "no_work" + , col_width[2], stats.no_work); + statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n" + , col_width[0], "" + , col_width[1], "scav_find_work" + , col_width[2], stats.scav_find_work); +#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN + statsPrintf("Internal Counters require the RTS to be built " + "with PROF_SPIN"); // PROF_SPIN is not #defined here +#else // THREADED_RTS + statsPrintf("Internal Counters require the threaded RTS"); +#endif + } } if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) { - if (RtsFlags.MiscFlags.machineReadable) { - report_machine_readable(&sum); - } - else { - report_one_line(&sum); - } + char *fmt; + if (RtsFlags.MiscFlags.machineReadable) { + fmt = + " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n" + " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n" + " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n" + " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n" + " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n" + " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n" + " ,(\"init_cpu_seconds\", \"%.3f\")\n" + " ,(\"init_wall_seconds\", \"%.3f\")\n" + " ,(\"mutator_cpu_seconds\", \"%.3f\")\n" + " ,(\"mutator_wall_seconds\", \"%.3f\")\n" + " ,(\"GC_cpu_seconds\", \"%.3f\")\n" + " ,(\"GC_wall_seconds\", \"%.3f\")\n" + " ]\n"; + } + else { + fmt = + "<<ghc: %" FMT_Word64 " bytes, " + "%" FMT_Word32 " GCs, " + "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), " + "%" FMT_Word64 "M in use, " + "%.3f INIT (%.3f elapsed), " + "%.3f MUT (%.3f elapsed), " + "%.3f GC (%.3f elapsed) :ghc>>\n"; + } + /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */ + statsPrintf(fmt, + stats.allocated_bytes, + stats.gcs, + (uint64_t) + (stats.major_gcs == 0 ? 0 : + stats.cumulative_live_bytes/stats.major_gcs), + stats.max_live_bytes, + stats.major_gcs, + (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)), + TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed), + TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed), + TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed)); } - free_RTSSummaryStats(&sum); statsFlush(); statsClose(); } @@ -1371,7 +1055,7 @@ waitForGcThreads: for a parallel garbage collection. We yield more than we spin in this case. In several places in the runtime we must take a lock on a closure. To do this, -we replace its info table with stg_WHITEHOLE_info, spinning if it is already +we replace it's info table with stg_WHITEHOLE_info, spinning if it is already a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we don't. We count these white-hole spins and include them in the SpinLocks table. If a particular loop does not yield, we put "n/a" in the table. They are named @@ -1457,9 +1141,8 @@ statDescribeGens(void) gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live; - debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n", - gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_), - gen_slop*(W_)sizeof(W_)); + debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts, + gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_)); tot_live += gen_live; tot_slop += gen_slop; } diff --git a/rts/Stats.h b/rts/Stats.h index 7cd49fc768..1c56344f81 100644 --- a/rts/Stats.h +++ b/rts/Stats.h @@ -66,49 +66,4 @@ void statDescribeGens( void ); Time stat_getElapsedGCTime(void); Time stat_getElapsedTime(void); -typedef struct GenerationSummaryStats_ { - uint32_t collections; - uint32_t par_collections; - Time cpu_ns; - Time elapsed_ns; - Time max_pause_ns; - Time avg_pause_ns; -#if defined(THREADED_RTS) && defined(PROF_SPIN) - uint64_t sync_spin; - uint64_t sync_yield; -#endif -} GenerationSummaryStats; - -typedef struct RTSSummaryStats_ { - // These profiling times could potentially be in RTSStats. However, I'm not - // confident enough to do this now, since there is some logic depending on - // global state that I do not understand. (Or if I do understand it, it's - // wrong) - Time rp_cpu_ns; - Time rp_elapsed_ns; - Time hc_cpu_ns; - Time hc_elapsed_ns; - - Time exit_cpu_ns; - Time exit_elapsed_ns; - -#if defined(THREADED_RTS) - uint32_t bound_task_count; - uint64_t sparks_count; - SparkCounters sparks; - double work_balance; -#else // THREADED_RTS - double gc_cpu_percent; - double gc_elapsed_percent; -#endif - uint64_t fragmentation_bytes; - uint64_t average_bytes_used; // This is not shown in the '+RTS -s' report - uint64_t alloc_rate; - double productivity_cpu_percent; - double productivity_elapsed_percent; - - // one for each generation, 0 first - GenerationSummaryStats* gc_summary_stats; -} RTSSummaryStats; - #include "EndPrivate.h" |