-rw-r--r--   includes/RtsAPI.h              34
-rw-r--r--   libraries/base/GHC/Stats.hsc    8
-rw-r--r--   rts/Stats.c                   921
-rw-r--r--   rts/Stats.h                    45
4 files changed, 317 insertions, 691 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index 69ec6075db..6f011cbf6e 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -184,7 +184,7 @@ typedef struct _RTSStats {
uint64_t par_copied_bytes;
// Sum of par_max_copied_bytes across all parallel GCs
uint64_t cumulative_par_max_copied_bytes;
- // Sum of par_balanced_copied_byes across all parallel GCs.
+ // Sum of par_balanced_copied_bytes across all parallel GCs.
uint64_t cumulative_par_balanced_copied_bytes;
// -----------------------------------
@@ -192,10 +192,6 @@ typedef struct _RTSStats {
// (we use signed values here because due to inaccuracies in timers
// the values can occasionally go slightly negative)
- // Total CPU time used by the init phase
- Time init_cpu_ns;
- // Total elapsed time used by the init phase
- Time init_elapsed_ns;
// Total CPU time used by the mutator
Time mutator_cpu_ns;
// Total elapsed time used by the mutator
@@ -217,26 +213,26 @@ typedef struct _RTSStats {
// -----------------------------------
// Internal Counters
- // The number of times a GC thread spun on its 'gc_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
+ // The number of times a GC thread spun on its 'gc_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
uint64_t gc_spin_spin;
- // The number of times a GC thread yielded on its 'gc_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
+ // The number of times a GC thread yielded on its 'gc_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
uint64_t gc_spin_yield;
- // The number of times a GC thread spun on its 'mut_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
+ // The number of times a GC thread spun on its 'mut_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
uint64_t mut_spin_spin;
- // The number of times a GC thread yielded on its 'mut_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
+ // The number of times a GC thread yielded on its 'mut_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
uint64_t mut_spin_yield;
- // The number of times a GC thread has checked for work across all parallel
- // GCs
+ // The number of times a GC thread has checked for work across all parallel
+ // GCs
uint64_t any_work;
- // The number of times a GC thread has checked for work and found none
- // across all parallel GCs
+ // The number of times a GC thread has checked for work and found none across
+ // all parallel GCs
uint64_t no_work;
- // The number of times a GC thread has iterated it's outer loop across all
- // parallel GCs
+ // The number of times a GC thread has iterated its outer loop across all
+ // parallel GCs
uint64_t scav_find_work;
} RTSStats;
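
These counters are part of the public RTSStats struct and can be read by a program linked with the RTS via getRTSStats() (declared earlier in RtsAPI.h). A minimal sketch, assuming the RTS has already been initialised with hs_init(); the spin/yield counters simply read back as zero when the RTS was not built with PROF_SPIN:

    #include <stdio.h>
    #include "Rts.h"   /* pulls in RtsAPI.h and the RTSStats definition */

    /* Sketch: print a few of the internal counters documented above. */
    static void dump_internal_counters(void)
    {
        RTSStats s;
        getRTSStats(&s);

        /* Zero unless the RTS was built with PROF_SPIN. */
        printf("gc_spin:  %llu spins, %llu yields\n",
               (unsigned long long)s.gc_spin_spin,
               (unsigned long long)s.gc_spin_yield);
        printf("mut_spin: %llu spins, %llu yields\n",
               (unsigned long long)s.mut_spin_spin,
               (unsigned long long)s.mut_spin_yield);
        printf("any_work=%llu no_work=%llu scav_find_work=%llu\n",
               (unsigned long long)s.any_work,
               (unsigned long long)s.no_work,
               (unsigned long long)s.scav_find_work);
    }
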
diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc
index d81173fe58..3497ff5185 100644
--- a/libraries/base/GHC/Stats.hsc
+++ b/libraries/base/GHC/Stats.hsc
@@ -83,12 +83,6 @@ data RTSStats = RTSStats {
-- (we use signed values here because due to inaccuracies in timers
-- the values can occasionally go slightly negative)
- -- | Total CPU time used by the init phase
- -- @since 4.12.0.0
- , init_cpu_ns :: RtsTime
- -- | Total elapsed time used by the init phase
- -- @since 4.12.0.0
- , init_elapsed_ns :: RtsTime
-- | Total CPU time used by the mutator
, mutator_cpu_ns :: RtsTime
-- | Total elapsed time used by the mutator
@@ -181,8 +175,6 @@ getRTSStats = do
(# peek RTSStats, cumulative_par_max_copied_bytes) p
cumulative_par_balanced_copied_bytes <-
(# peek RTSStats, cumulative_par_balanced_copied_bytes) p
- init_cpu_ns <- (# peek RTSStats, init_cpu_ns) p
- init_elapsed_ns <- (# peek RTSStats, init_elapsed_ns) p
mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p
mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p
gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p
diff --git a/rts/Stats.c b/rts/Stats.c
index 8769551160..7eb93bef33 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -24,7 +24,6 @@
#include "ThreadPaused.h"
#include "Messages.h"
-#include <string.h> // for memset
#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
@@ -93,9 +92,6 @@ mut_user_time_until( Time t )
return TimeToSecondsDbl(t - stats.gc_cpu_ns);
// heapCensus() time is included in GC_tot_cpu, so we don't need
// to subtract it here.
-
- // TODO: This seems wrong to me. Surely we should be subtracting
- // (at least) start_init_cpu?
}
double
@@ -173,8 +169,6 @@ initStats0(void)
.any_work = 0,
.no_work = 0,
.scav_find_work = 0,
- .init_cpu_ns = 0,
- .init_elapsed_ns = 0,
.mutator_cpu_ns = 0,
.mutator_elapsed_ns = 0,
.gc_cpu_ns = 0,
@@ -246,8 +240,6 @@ void
stat_endInit(void)
{
getProcessTimes(&end_init_cpu, &end_init_elapsed);
- stats.init_cpu_ns = end_init_cpu - start_init_cpu;
- stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
}
/* -----------------------------------------------------------------------------
@@ -427,8 +419,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
stats.gc.copied_bytes,
stats.gc.slop_bytes,
/* current loss due to fragmentation */
- (mblocks_allocated * BLOCKS_PER_MBLOCK
- - n_alloc_blocks) * BLOCK_SIZE,
+ (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
+ * BLOCK_SIZE,
par_n_threads,
stats.gc.par_max_copied_bytes,
stats.gc.copied_bytes,
@@ -523,8 +515,7 @@ stat_endRP(
fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
#endif
- fprintf(prof_file, "\tAverage number of visits per object = %f\n",
- averageNumVisit);
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
}
#endif /* PROFILING */
@@ -582,7 +573,7 @@ StgInt TOTAL_CALLS=1;
#define REPORT(counter) \
{ \
showStgWord64(counter,temp,true/*commas*/); \
- statsPrintf(" (" #counter ") : %s\n",temp); \
+ statsPrintf(" (" #counter ") : %s\n",temp); \
}
/* Report the value of a counter as a percentage of another counter */
@@ -601,647 +592,340 @@ StgInt TOTAL_CALLS=1;
statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
-/*
-Note [RTS Stats Reporting]
-==========================
-
-There are currently three reporting functions:
- * report_summary:
- Responsible for producing '+RTS -s' output.
- Will report internal counters if the RTS flag --internal-counters is
- passed. See [Internal Counters Stats]
- * report_machine_readable:
- Responsible for producing '+RTS -t --machine-readable' output.
- * report_one_line:
- Responsible for productin '+RTS -t' output
-
-Stats are accumulated into the global variable 'stats' as the program runs, then
-in 'stat_exit' we do the following:
- * Finalise 'stats'. This involves setting final running times and allocations
- that have not yet been accounted for.
- * Create a RTSSummaryStats. This contains all data for reports that is not
- included in stats (because they do not make sense before the program has
- completed) or in a global variable.
- * call the appropriate report_* function, passing the newly constructed
- RTSSummaryStats.
-
-To ensure that the data in the different reports is kept consistent, the
-report_* functions should not do any calculation, excepting unit changes and
-formatting. If you need to add a new calculated field, add it to
-RTSSummaryStats.
-
-*/
-
-
-static void init_RTSSummaryStats(RTSSummaryStats* sum)
-{
- const size_t sizeof_gc_summary_stats =
- RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
-
- memset(sum, 0, sizeof(RTSSummaryStats));
- sum->gc_summary_stats =
- stgMallocBytes(sizeof_gc_summary_stats,
- "alloc_RTSSummaryStats.gc_summary_stats");
- memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
+STATIC_INLINE Time get_init_cpu(void) {
+ return end_init_cpu - start_init_cpu;
}
-static void free_RTSSummaryStats(RTSSummaryStats * sum)
-{
- if (!sum) { return; }
- if (!sum->gc_summary_stats) {
- stgFree(sum->gc_summary_stats);
- sum->gc_summary_stats = NULL;
- }
+STATIC_INLINE Time get_init_elapsed(void) {
+ return end_init_elapsed - start_init_elapsed;
}
-static void report_summary(const RTSSummaryStats* sum)
-{
- // We should do no calculation, other than unit changes and formatting, and
- // we should not not use any data from outside of globals, sum and stats
- // here. See Note [RTS Stats Reporting]
-
- uint32_t g;
- char temp[512];
- showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes allocated in the heap\n", temp);
-
- showStgWord64(stats.copied_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes copied during GC\n", temp);
-
- if ( stats.major_gcs > 0 ) {
- showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes maximum residency (%" FMT_Word32
- " sample(s))\n",
- temp, stats.major_gcs);
- }
- showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes maximum slop\n", temp);
-
- statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
- FMT_SizeT " MB lost due to fragmentation)\n\n",
- stats.max_live_bytes / (1024 * 1024),
- sum->fragmentation_bytes / (1024 * 1024));
-
- /* Print garbage collections in each gen */
- statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
- for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- const GenerationSummaryStats * gen_stats =
- &sum->gc_summary_stats[g];
- statsPrintf(" Gen %2d %5d colls"
- ", %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
- g, // REVIEWERS: this used to be gen->no
- //, this can't ever be different right?
- gen_stats->collections,
- gen_stats->par_collections,
- TimeToSecondsDbl(gen_stats->cpu_ns),
- TimeToSecondsDbl(gen_stats->elapsed_ns),
- TimeToSecondsDbl(gen_stats->avg_pause_ns),
- TimeToSecondsDbl(gen_stats->max_pause_ns));
- }
+void
+stat_exit (void)
+{
+ generation *gen;
+ Time gc_cpu = 0;
+ Time gc_elapsed = 0;
+ Time init_cpu = 0;
+ Time init_elapsed = 0;
+ Time mut_cpu = 0;
+ Time mut_elapsed = 0;
+ Time exit_cpu = 0;
+ Time exit_elapsed = 0;
+ Time exit_gc_cpu = 0;
+ Time exit_gc_elapsed = 0;
- statsPrintf("\n");
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
-#if defined(THREADED_RTS)
- if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
- // See Note [Work Balance]
- statsPrintf(" Parallel GC work balance: "
- "%.2f%% (serial 0%%, perfect 100%%)\n\n",
- sum->work_balance * 100);
- }
+ char temp[512];
+ Time tot_cpu;
+ Time tot_elapsed;
+ uint32_t g;
- statsPrintf(" TASKS: %d "
- "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
- taskCount, sum->bound_task_count,
- peakWorkerCount, workerCount,
- n_capabilities);
-
- statsPrintf(" SPARKS: %" FMT_Word
- "(%" FMT_Word " converted, %" FMT_Word " overflowed, %"
- FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
- sum->sparks_count,
- sum->sparks.converted, sum->sparks.overflowed,
- sum->sparks.dud, sum->sparks.gcd,
- sum->sparks.fizzled);
-#endif
+ getProcessTimes( &tot_cpu, &tot_elapsed );
+ tot_cpu -= start_init_cpu;
+ tot_elapsed -= start_init_elapsed;
- statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(stats.init_cpu_ns),
- TimeToSecondsDbl(stats.init_elapsed_ns));
+ uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
- statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(stats.mutator_cpu_ns),
- TimeToSecondsDbl(stats.mutator_elapsed_ns));
- statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(stats.gc_cpu_ns),
- TimeToSecondsDbl(stats.gc_elapsed_ns));
+ // allocated since the last GC
+ stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
+ stats.allocated_bytes = tot_alloc_bytes;
-#if defined(PROFILING)
- statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(sum->rc_cpu_ns),
- TimeToSecondsDbl(sum->rc_elapsed_ns));
- statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(sum->hc_cpu_ns),
- TimeToSecondsDbl(sum->hc_elapsed_ns));
-#endif
- statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(sum->exit_cpu_ns),
- TimeToSecondsDbl(sum->exit_elapsed_ns));
- statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
- TimeToSecondsDbl(stats.cpu_ns),
- TimeToSecondsDbl(stats.elapsed_ns));
-#if !defined(THREADED_RTS)
- statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
- sum->gc_cpu_percent * 100,
- sum->gc_elapsed_percent * 100);
-#endif
+ /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
+ if (tot_cpu <= 0) tot_cpu = 1;
+ if (tot_elapsed <= 0) tot_elapsed = 1;
- showStgWord64(sum->alloc_rate, temp, true/*commas*/);
-
- statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
-
- statsPrintf(" Productivity %5.1f%% of total user, "
- "%.1f%% of total elapsed\n\n",
- sum->productivity_cpu_percent * 100,
- sum->productivity_elapsed_percent * 100);
-
- // See Note [Internal Counter Stats] for a description of the
- // following counters. If you add a counter here, please remember
- // to update the Note.
- if (RtsFlags.MiscFlags.internalCounters) {
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
- const int32_t col_width[] = {4, -30, 14, 14};
- statsPrintf("Internal Counters:\n");
- statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
- , col_width[0], ""
- , col_width[1], "SpinLock"
- , col_width[2], "Spins"
- , col_width[3], "Yields");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_alloc_block_sync"
- , col_width[2], gc_alloc_block_sync.spin
- , col_width[3], gc_alloc_block_sync.yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_spin"
- , col_width[2], stats.gc_spin_spin
- , col_width[3], stats.gc_spin_yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "mut_spin"
- , col_width[2], stats.mut_spin_spin
- , col_width[3], stats.mut_spin_yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_gc"
- , col_width[2], whitehole_gc_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_threadPaused"
- , col_width[2], whitehole_threadPaused_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_executeMessage"
- , col_width[2], whitehole_executeMessage_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "whitehole_lockClosure"
- , col_width[2], whitehole_lockClosure_spin
- , col_width[3], whitehole_lockClosure_yield);
- // waitForGcThreads isn't really spin-locking(see the function)
- // but these numbers still seem useful.
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "waitForGcThreads"
- , col_width[2], waitForGcThreads_spin
- , col_width[3], waitForGcThreads_yield);
-
- for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- int prefix_length = 0;
- statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
- col_width[0], "", g, &prefix_length);
- prefix_length -= col_width[0];
- int suffix_length = col_width[1] + prefix_length;
- suffix_length =
- suffix_length > 0 ? col_width[1] : suffix_length;
-
- statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , suffix_length, "].sync"
- , col_width[2], generations[g].sync.spin
- , col_width[3], generations[g].sync.yield);
+ if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+ statsPrintf("%9" FMT_Word " %9.9s %9.9s",
+ (W_)stats.gc.allocated_bytes, "", "");
+ statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
}
- statsPrintf("\n");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "any_work"
- , col_width[2], stats.any_work);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "no_work"
- , col_width[2], stats.no_work);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "scav_find_work"
- , col_width[2], stats.scav_find_work);
-#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
- statsPrintf("Internal Counters require the RTS to be built "
- "with PROF_SPIN"); // PROF_SPIN is not #defined here
-#else // THREADED_RTS
- statsPrintf("Internal Counters require the threaded RTS");
-#endif
- }
-}
-static void report_machine_readable (const RTSSummaryStats * sum)
-{
- // We should do no calculation, other than unit changes and formatting, and
- // we should not not use any data from outside of globals, sum and stats
- // here. See Note [RTS Stats Reporting]
- uint32_t g;
-
-#define MR_STAT(field_name,format,value) \
- statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
-#define MR_STAT_GEN(gen,field_name,format,value) \
- statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
- format "\")\n", g, value)
-
- // These first values are for backwards compatibility.
- // Some of these first fields are duplicated with more machine-readable
- // names, or to match the name in RtsStats.
-
- // we don't use for the first field helper macro here because the prefix is
- // different
- statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
- stats.allocated_bytes);
- MR_STAT("num_GCs", FMT_Word32, stats.gcs);
- MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
- MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
- MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
- MR_STAT("peak_megabytes_allocated", FMT_SizeT,
- stats.max_mem_in_use_bytes / (1024 * 1024));
-
- MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
- MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
- MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
- MR_STAT("mut_wall_seconds", "f",
- TimeToSecondsDbl(stats.mutator_elapsed_ns));
- MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
- MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
-
- // end backward compatibility
-
- // First, the rest of the times
-
- MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
- MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
-#if defined(PROFILING)
- MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
- MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
- MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hp_cpu_ns));
- MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hp_elapsed_ns));
-#endif
- MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
- MR_STAT("total_wall_seconds", "f",
- TimeToSecondsDbl(stats.elapsed_ns));
-
- // next, the remainder of the fields of RTSStats, except internal counters
-
- // The first two are duplicates of those above, but have more machine
- // readable names that match the field names in RTSStats.
-
-
- // gcs has been done as num_GCs above
- MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
- MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
- MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
- MR_STAT("max_large_objects_bytes", FMT_Word64,
- stats.max_large_objects_bytes);
- MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
- MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
- // This duplicates, except for unit, peak_megabytes_allocated above
- MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
- MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
- MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
- MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
- MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
- stats.cumulative_par_max_copied_bytes);
- MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
- stats.cumulative_par_balanced_copied_bytes);
-
- // next, the computed fields in RTSSummaryStats
-#if !defined(THREADED_RTS) // THREADED_RTS
- MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
- MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent);
-#endif
- MR_STAT("fragmentation_bytes", FMT_SizeT, sum->fragmentation_bytes);
- // average_bytes_used is done above
- MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
- MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
- MR_STAT("productivity_wall_percent", "f",
- sum->productivity_elapsed_percent);
+ // heapCensus() is called by the GC, so RP and HC time are
+ // included in the GC stats. We therefore subtract them to
+ // obtain the actual GC cpu time.
+ gc_cpu = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
+ gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);
- // next, the THREADED_RTS fields in RTSSummaryStats
+ init_cpu = get_init_cpu();
+ init_elapsed = get_init_elapsed();
-#if defined(THREADED_RTS)
- MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
- MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
- MR_STAT("sparks_converted", FMT_Word64, sum->sparks.converted);
- MR_STAT("sparks_overflowed", FMT_Word64, sum->sparks.overflowed);
- MR_STAT("sparks_dud ", FMT_Word64, sum->sparks.dud);
- MR_STAT("sparks_gcd", FMT_Word64, sum->sparks.gcd);
- MR_STAT("sparks_fizzled", FMT_Word64, sum->sparks.fizzled);
- MR_STAT("work_balance", "f", sum->work_balance);
-
- // next, globals (other than internal counters)
- MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
- MR_STAT("task_count", FMT_Word32, taskCount);
- MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
- MR_STAT("worker_count", FMT_Word32, workerCount);
-
- // next, internal counters
-#if defined(PROF_SPIN)
- MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
- MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
- gc_alloc_block_sync.yield);
- MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
- MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
- MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
- MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
- MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
- MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
- MR_STAT("waitForGcThreads_yield", FMT_Word64,
- waitForGcThreads_yield);
- MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
- MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
- whitehole_lockClosure_spin);
- MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
- whitehole_lockClosure_yield);
- MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
- whitehole_executeMessage_spin);
- MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
- whitehole_threadPaused_spin);
- MR_STAT("any_work", FMT_Word64,
- stats.any_work);
- MR_STAT("no_work", FMT_Word64,
- stats.no_work);
- MR_STAT("scav_find_work", FMT_Word64,
- stats.scav_find_work);
-#endif // PROF_SPIN
-#endif // THREADED_RTS
-
- // finally, per-generation stats. Named as, for example for generation 0,
- // gen_0_collections
- for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
- MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
- MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
- MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
- MR_STAT_GEN(g, "wall_seconds", "f",
- TimeToSecondsDbl(gc_sum->elapsed_ns));
- MR_STAT_GEN(g, "max_pause_seconds", "f",
- TimeToSecondsDbl(gc_sum->max_pause_ns));
- MR_STAT_GEN(g, "avg_pause_seconds", "f",
- TimeToSecondsDbl(gc_sum->avg_pause_ns));
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
- MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
- MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
-#endif
- }
+ // We do a GC during the EXIT phase. We'll attribute the cost of that
+ // to GC instead of EXIT, so carefully subtract it from the EXIT time.
+ exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
+ exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+ exit_cpu = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
+ exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;
- statsPrintf(" ]\n");
-}
+ mut_elapsed = start_exit_elapsed - end_init_elapsed -
+ (gc_elapsed - exit_gc_elapsed) -
+ PROF_VAL(RPe_tot_time + HCe_tot_time);
-static void report_one_line(const RTSSummaryStats * sum)
-{
- // We should do no calculation, other than unit changes and formatting, and
- // we should not not use any data from outside of globals, sum and stats
- // here. See Note [RTS Stats Reporting]
+ mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
+ - PROF_VAL(RP_tot_time + HC_tot_time);
+ if (mut_cpu < 0) { mut_cpu = 0; }
- /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
- mingw-0.5) */
- statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
- "%" FMT_Word32 " GCs, "
- "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
- "(%" FMT_Word32 " samples), "
- "%" FMT_Word64 "M in use, "
- "%.3f INIT (%.3f elapsed), "
- "%.3f MUT (%.3f elapsed), "
- "%.3f GC (%.3f elapsed) :ghc>>\n",
- stats.allocated_bytes,
- stats.gcs,
- sum->average_bytes_used,
- stats.max_live_bytes,
- stats.major_gcs,
- stats.max_mem_in_use_bytes / (1024 * 1024),
- TimeToSecondsDbl(stats.init_cpu_ns),
- TimeToSecondsDbl(stats.init_elapsed_ns),
- TimeToSecondsDbl(stats.mutator_cpu_ns),
- TimeToSecondsDbl(stats.mutator_elapsed_ns),
- TimeToSecondsDbl(stats.gc_cpu_ns),
- TimeToSecondsDbl(stats.gc_elapsed_ns));
-}
+ // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
+ // subtracting, so the parts should add up to the total exactly. Note
+ // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
+ // so we don't use it here.
+ ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
+ == end_exit_elapsed - start_init_elapsed);
-void
-stat_exit (void)
-{
- RTSSummaryStats sum;
- uint32_t g;
- init_RTSSummaryStats(&sum);
- if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
- // First we tidy the times in stats, and populate the times in sum.
- // In particular, we adjust the gc_* time in stats to remove
- // profiling times.
- {
- Time now_cpu_ns, now_elapsed_ns;
- Time exit_gc_cpu = 0;
- Time exit_gc_elapsed = 0;
- Time prof_cpu = 0;
- Time prof_elapsed = 0;
-
- getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
-
- stats.cpu_ns = now_cpu_ns - start_init_cpu;
- stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
- /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
- seconds -- SDM */
- if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
- if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
-
- prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
- prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
-
- // heapCensus() is called by the GC, so RP and HC time are
- // included in the GC stats. We therefore subtract them to
- // obtain the actual GC cpu time.
- stats.gc_cpu_ns -= prof_cpu;
- stats.gc_elapsed_ns -= prof_elapsed;
-
-#if defined(PROFILING)
- sum.rp_cpu_ns = RP_tot_time;
- sum.rp_elapsed_ns = RPe_tot_time;
- sum.hc_cpu_ns = HC_tot_time;
- sum.hc_elapsed_ns = HCe_tot_time;
-#endif // PROFILING
-
- // We do a GC during the EXIT phase. We'll attribute the cost of
- // that to GC instead of EXIT, so carefully subtract it from the
- // EXIT time.
- exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
- exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
-
- sum.exit_cpu_ns = end_exit_cpu
- - start_exit_cpu
- - exit_gc_cpu;
- sum.exit_elapsed_ns = end_exit_elapsed
- - start_exit_elapsed
- - exit_gc_elapsed;
-
- stats.mutator_cpu_ns = start_exit_cpu
- - end_init_cpu
- - (stats.gc_cpu_ns - exit_gc_cpu)
- - prof_cpu;
- stats.mutator_elapsed_ns = start_exit_elapsed
- - end_init_elapsed
- - (stats.gc_elapsed_ns - exit_gc_elapsed)
- - prof_elapsed;
-
- if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
-
- // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
- // and subtracting, so the parts should add up to the total exactly.
- // Note that stats->total_ns is captured a tiny bit later than
- // end_exit_elapsed, so we don't use it here.
- ASSERT(stats.init_elapsed_ns \
- + stats.mutator_elapsed_ns \
- + stats.gc_elapsed_ns \
- + sum.exit_elapsed_ns \
- == end_exit_elapsed - start_init_elapsed);
+ if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+ showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes allocated in the heap\n", temp);
- }
+ showStgWord64(stats.copied_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes copied during GC\n", temp);
- // REVIEWERS: it's not clear to me why the following isn't done in
- // stat_endGC of the last garbage collection?
-
- // We account for the last garbage collection
- {
- uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
- stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
- stats.allocated_bytes = tot_alloc_bytes;
- if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
- statsPrintf("%9" FMT_Word " %9.9s %9.9s",
- (W_)stats.gc.allocated_bytes, "", "");
- statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
+ if ( stats.major_gcs > 0 ) {
+ showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes maximum residency (%" FMT_Word32
+ " sample(s))\n",
+ temp, stats.major_gcs);
}
- }
- // We populate the remainder (non-time elements) of sum
- {
- #if defined(THREADED_RTS)
- uint32_t i;
- sum.bound_task_count = taskCount - workerCount;
-
- for (i = 0; i < n_capabilities; i++) {
- sum.sparks.created += capabilities[i]->spark_stats.created;
- sum.sparks.dud += capabilities[i]->spark_stats.dud;
- sum.sparks.overflowed+=
- capabilities[i]->spark_stats.overflowed;
- sum.sparks.converted +=
- capabilities[i]->spark_stats.converted;
- sum.sparks.gcd += capabilities[i]->spark_stats.gcd;
- sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled;
+ showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes maximum slop\n", temp);
+
+ statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
+ FMT_SizeT " MB lost due to fragmentation)\n\n",
+ (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
+ (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
+
+ /* Print garbage collections in each gen */
+ statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ gen = &generations[g];
+ statsPrintf(" Gen %2d %5d colls, %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
+ gen->no,
+ gen->collections,
+ gen->par_collections,
+ TimeToSecondsDbl(GC_coll_cpu[g]),
+ TimeToSecondsDbl(GC_coll_elapsed[g]),
+ gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
+ TimeToSecondsDbl(GC_coll_max_pause[g]));
}
- sum.sparks_count = sum.sparks.created
- + sum.sparks.dud
- + sum.sparks.overflowed;
-
+#if defined(THREADED_RTS)
if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
// See Note [Work Balance]
- sum.work_balance =
- (double)stats.cumulative_par_balanced_copied_bytes
- / (double)stats.par_copied_bytes;
- } else {
- sum.work_balance = 0;
+ statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
+ 100 * (double)stats.cumulative_par_balanced_copied_bytes /
+ (double)stats.par_copied_bytes);
}
+#endif
+ statsPrintf("\n");
+
+#if defined(THREADED_RTS)
+ statsPrintf(" TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
+ taskCount, taskCount - workerCount,
+ peakWorkerCount, workerCount,
+ n_capabilities);
+
+ statsPrintf("\n");
+
+ {
+ uint32_t i;
+ SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
+ for (i = 0; i < n_capabilities; i++) {
+ sparks.created += capabilities[i]->spark_stats.created;
+ sparks.dud += capabilities[i]->spark_stats.dud;
+ sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
+ sparks.converted += capabilities[i]->spark_stats.converted;
+ sparks.gcd += capabilities[i]->spark_stats.gcd;
+ sparks.fizzled += capabilities[i]->spark_stats.fizzled;
+ }
+
+ statsPrintf(" SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
+ sparks.created + sparks.dud + sparks.overflowed,
+ sparks.converted, sparks.overflowed, sparks.dud,
+ sparks.gcd, sparks.fizzled);
+ }
+#endif
+
+ statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));
+
+ statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
+ statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+#if defined(PROFILING)
+ statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
+ statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
+#endif
+ statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
+ statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
+ TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
+#if !defined(THREADED_RTS)
+ statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
+ TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
+ TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
+#endif
- #else // THREADED_RTS
- sum.gc_cpu_percent = stats.gc_cpu_ns
- / stats.cpu_ns;
- sum.gc_elapsed_percent = stats.gc_elapsed_ns
- / stats.elapsed_ns;
- #endif // THREADED_RTS
-
- sum.fragmentation_bytes =
- (uint64_t)(peak_mblocks_allocated
- * BLOCKS_PER_MBLOCK
- * BLOCK_SIZE_W
- - hw_alloc_blocks * BLOCK_SIZE_W)
- / (uint64_t)sizeof(W_);
-
- sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
- stats.cumulative_live_bytes/stats.major_gcs,
-
- sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
- (uint64_t)((double)stats.allocated_bytes
- / TimeToSecondsDbl(stats.mutator_cpu_ns));
-
- // REVIEWERS: These two values didn't used to include the exit times
- sum.productivity_cpu_percent =
- TimeToSecondsDbl(stats.cpu_ns
- - stats.gc_cpu_ns
- - sum.rp_cpu_ns
- - sum.hc_cpu_ns
- - stats.init_cpu_ns
- - sum.exit_cpu_ns)
- / TimeToSecondsDbl(stats.cpu_ns);
-
- sum.productivity_elapsed_percent =
- TimeToSecondsDbl(stats.elapsed_ns
- - stats.gc_elapsed_ns
- - sum.rp_elapsed_ns
- - sum.hc_elapsed_ns
- - stats.init_elapsed_ns
- - sum.exit_elapsed_ns)
- / TimeToSecondsDbl(stats.elapsed_ns);
-
- for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
- const generation* gen = &generations[g];
- GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
- gen_stats->collections = gen->collections;
- gen_stats->par_collections = gen->par_collections;
- gen_stats->cpu_ns = GC_coll_cpu[g];
- gen_stats->elapsed_ns = GC_coll_elapsed[g];
- gen_stats->max_pause_ns = GC_coll_max_pause[g];
- gen_stats->avg_pause_ns = gen->collections == 0 ?
- 0 : (GC_coll_elapsed[g] / gen->collections);
- #if defined(THREADED_RTS) && defined(PROF_SPIN)
- gen_stats->sync_spin = gen->sync.spin;
- gen_stats->sync_yield = gen->sync.yield;
- #endif // PROF_SPIN
+ if (mut_cpu == 0) {
+ showStgWord64(0, temp, true/*commas*/);
+ } else {
+ showStgWord64(
+ (StgWord64)((double)stats.allocated_bytes /
+ TimeToSecondsDbl(mut_cpu)),
+ temp, true/*commas*/);
}
- }
- // Now we generate the report
- if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
- report_summary(&sum);
+ statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
+
+ statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
+ TimeToSecondsDbl(tot_cpu - gc_cpu -
+ PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
+ / TimeToSecondsDbl(tot_cpu),
+ TimeToSecondsDbl(tot_elapsed - gc_elapsed -
+ PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
+ / TimeToSecondsDbl(tot_elapsed));
+
+ // See Note [Internal Counter Stats] for a description of the
+ // following counters. If you add a counter here, please remember
+ // to update the Note.
+ if (RtsFlags.MiscFlags.internalCounters) {
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+ uint32_t g;
+ const int32_t col_width[] = {4, -30, 14, 14};
+ statsPrintf("Internal Counters:\n");
+ statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+ , col_width[0], ""
+ , col_width[1], "SpinLock"
+ , col_width[2], "Spins"
+ , col_width[3], "Yields");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_alloc_block_sync"
+ , col_width[2], gc_alloc_block_sync.spin
+ , col_width[3], gc_alloc_block_sync.yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_spin"
+ , col_width[2], stats.gc_spin_spin
+ , col_width[3], stats.gc_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "mut_spin"
+ , col_width[2], stats.mut_spin_spin
+ , col_width[3], stats.mut_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_gc"
+ , col_width[2], whitehole_gc_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_threadPaused"
+ , col_width[2], whitehole_threadPaused_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_executeMessage"
+ , col_width[2], whitehole_executeMessage_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_lockClosure"
+ , col_width[2], whitehole_lockClosure_spin
+ , col_width[3], whitehole_lockClosure_yield);
+ // waitForGcThreads isn't really spin-locking (see the function)
+ // but these numbers still seem useful.
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "waitForGcThreads"
+ , col_width[2], waitForGcThreads_spin
+ , col_width[3], waitForGcThreads_yield);
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ int prefix_length = 0;
+ statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+ col_width[0], "", g, &prefix_length);
+ prefix_length -= col_width[0];
+ int suffix_length = col_width[1] + prefix_length;
+ suffix_length =
+ suffix_length > 0 ? col_width[1] : suffix_length;
+
+ statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , suffix_length, "].sync"
+ , col_width[2], generations[g].sync.spin
+ , col_width[3], generations[g].sync.yield);
+ }
+ statsPrintf("\n");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "any_work"
+ , col_width[2], stats.any_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "no_work"
+ , col_width[2], stats.no_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "scav_find_work"
+ , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+ statsPrintf("Internal Counters require the RTS to be built "
+ "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+ statsPrintf("Internal Counters require the threaded RTS");
+#endif
+ }
}
if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
- if (RtsFlags.MiscFlags.machineReadable) {
- report_machine_readable(&sum);
- }
- else {
- report_one_line(&sum);
- }
+ char *fmt;
+ if (RtsFlags.MiscFlags.machineReadable) {
+ fmt =
+ " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
+ " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
+ " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
+ " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
+ " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
+ " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
+ " ,(\"init_cpu_seconds\", \"%.3f\")\n"
+ " ,(\"init_wall_seconds\", \"%.3f\")\n"
+ " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
+ " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
+ " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
+ " ,(\"GC_wall_seconds\", \"%.3f\")\n"
+ " ]\n";
+ }
+ else {
+ fmt =
+ "<<ghc: %" FMT_Word64 " bytes, "
+ "%" FMT_Word32 " GCs, "
+ "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
+ "%" FMT_Word64 "M in use, "
+ "%.3f INIT (%.3f elapsed), "
+ "%.3f MUT (%.3f elapsed), "
+ "%.3f GC (%.3f elapsed) :ghc>>\n";
+ }
+ /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
+ statsPrintf(fmt,
+ stats.allocated_bytes,
+ stats.gcs,
+ (uint64_t)
+ (stats.major_gcs == 0 ? 0 :
+ stats.cumulative_live_bytes/stats.major_gcs),
+ stats.max_live_bytes,
+ stats.major_gcs,
+ (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
+ TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
+ TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
+ TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
}
- free_RTSSummaryStats(&sum);
statsFlush();
statsClose();
}
@@ -1371,7 +1055,7 @@ waitForGcThreads:
for a parallel garbage collection. We yield more than we spin in this case.
In several places in the runtime we must take a lock on a closure. To do this,
-we replace its info table with stg_WHITEHOLE_info, spinning if it is already
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
don't. We count these white-hole spins and include them in the SpinLocks table.
If a particular loop does not yield, we put "n/a" in the table. They are named
@@ -1457,9 +1141,8 @@ statDescribeGens(void)
gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
- debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
- gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
- gen_slop*(W_)sizeof(W_));
+ debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts,
+ gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
tot_live += gen_live;
tot_slop += gen_slop;
}
diff --git a/rts/Stats.h b/rts/Stats.h
index 7cd49fc768..1c56344f81 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -66,49 +66,4 @@ void statDescribeGens( void );
Time stat_getElapsedGCTime(void);
Time stat_getElapsedTime(void);
-typedef struct GenerationSummaryStats_ {
- uint32_t collections;
- uint32_t par_collections;
- Time cpu_ns;
- Time elapsed_ns;
- Time max_pause_ns;
- Time avg_pause_ns;
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
- uint64_t sync_spin;
- uint64_t sync_yield;
-#endif
-} GenerationSummaryStats;
-
-typedef struct RTSSummaryStats_ {
- // These profiling times could potentially be in RTSStats. However, I'm not
- // confident enough to do this now, since there is some logic depending on
- // global state that I do not understand. (Or if I do understand it, it's
- // wrong)
- Time rp_cpu_ns;
- Time rp_elapsed_ns;
- Time hc_cpu_ns;
- Time hc_elapsed_ns;
-
- Time exit_cpu_ns;
- Time exit_elapsed_ns;
-
-#if defined(THREADED_RTS)
- uint32_t bound_task_count;
- uint64_t sparks_count;
- SparkCounters sparks;
- double work_balance;
-#else // THREADED_RTS
- double gc_cpu_percent;
- double gc_elapsed_percent;
-#endif
- uint64_t fragmentation_bytes;
- uint64_t average_bytes_used; // This is not shown in the '+RTS -s' report
- uint64_t alloc_rate;
- double productivity_cpu_percent;
- double productivity_elapsed_percent;
-
- // one for each generation, 0 first
- GenerationSummaryStats* gc_summary_stats;
-} RTSSummaryStats;
-
#include "EndPrivate.h"
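
The struct removed above only held figures derived from the public RTSStats counters, so the same numbers can still be recomputed by a client; this mirrors the arithmetic stat_exit now performs inline. A sketch, assuming an RTSStats value filled in by getRTSStats() (the _ns fields are nanoseconds):

    #include "Rts.h"

    /* See Note [Work Balance]: 0 behaves like serial GC, 1 is perfectly
     * balanced parallel copying. */
    static double work_balance(const RTSStats *s)
    {
        if (s->par_copied_bytes == 0) return 0.0;
        return (double)s->cumulative_par_balanced_copied_bytes
             / (double)s->par_copied_bytes;
    }

    /* Allocation rate in bytes per second of mutator CPU time, as in the
     * "+RTS -s" report. */
    static double alloc_rate(const RTSStats *s)
    {
        if (s->mutator_cpu_ns == 0) return 0.0;
        return (double)s->allocated_bytes
             / ((double)s->mutator_cpu_ns / 1e9);
    }
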