summaryrefslogtreecommitdiff
path: root/rts/Stats.c
diff options
context:
space:
mode:
authorDouglas Wilson <douglas.wilson@gmail.com>2018-03-19 13:26:41 -0400
committerBen Gamari <ben@smart-cactus.org>2018-03-19 13:26:42 -0400
commit2d4bda2e4ac68816baba0afab00da6f769ea75a7 (patch)
treecaab3da4c7506d6589b6ca89ecae2d234b4810ad /rts/Stats.c
parentb3b394b44e42f19ab7c23668a4008e4f728b51ba (diff)
downloadhaskell-2d4bda2e4ac68816baba0afab00da6f769ea75a7.tar.gz
rts, base: Refactor stats.c to improve --machine-readable report
There should be no change in the output of the '+RTS -s' (summary) report, or the 'RTS -t' (one-line) report. All data shown in the summary report is now shown in the machine readable report. All data in RTSStats is now shown in the machine readable report. init times are added to RTSStats and added to GHC.Stats. Example of the new output: ``` [("bytes allocated", "375016384") ,("num_GCs", "113") ,("average_bytes_used", "148348") ,("max_bytes_used", "206552") ,("num_byte_usage_samples", "2") ,("peak_megabytes_allocated", "6") ,("init_cpu_seconds", "0.001642") ,("init_wall_seconds", "0.001027") ,("mut_cpu_seconds", "3.020166") ,("mut_wall_seconds", "0.757244") ,("GC_cpu_seconds", "0.037750") ,("GC_wall_seconds", "0.009569") ,("exit_cpu_seconds", "0.000890") ,("exit_wall_seconds", "0.002551") ,("total_cpu_seconds", "3.060452") ,("total_wall_seconds", "0.770395") ,("major_gcs", "2") ,("allocated_bytes", "375016384") ,("max_live_bytes", "206552") ,("max_large_objects_bytes", "159344") ,("max_compact_bytes", "0") ,("max_slop_bytes", "59688") ,("max_mem_in_use_bytes", "6291456") ,("cumulative_live_bytes", "296696") ,("copied_bytes", "541024") ,("par_copied_bytes", "493976") ,("cumulative_par_max_copied_bytes", "104104") ,("cumulative_par_balanced_copied_bytes", "274456") ,("fragmentation_bytes", "2112") ,("alloc_rate", "124170795") ,("productivity_cpu_percent", "0.986838") ,("productivity_wall_percent", "0.982935") ,("bound_task_count", "1") ,("sparks_count", "5836258") ,("sparks_converted", "237") ,("sparks_overflowed", "1990408") ,("sparks_dud ", "0") ,("sparks_gcd", "3455553") ,("sparks_fizzled", "390060") ,("work_balance", "0.555606") ,("n_capabilities", "4") ,("task_count", "10") ,("peak_worker_count", "9") ,("worker_count", "9") ,("gc_alloc_block_sync_spin", "162") ,("gc_alloc_block_sync_yield", "0") ,("gc_alloc_block_sync_spin", "162") ,("gc_spin_spin", "18840855") ,("gc_spin_yield", "10355") ,("mut_spin_spin", "70331392") ,("mut_spin_yield", "61700") 
,("waitForGcThreads_spin", "241") ,("waitForGcThreads_yield", "2797") ,("whitehole_gc_spin", "0") ,("whitehole_lockClosure_spin", "0") ,("whitehole_lockClosure_yield", "0") ,("whitehole_executeMessage_spin", "0") ,("whitehole_threadPaused_spin", "0") ,("any_work", "1667") ,("no_work", "1662") ,("scav_find_work", "1026") ,("gen_0_collections", "111") ,("gen_0_par_collections", "111") ,("gen_0_cpu_seconds", "0.036126") ,("gen_0_wall_seconds", "0.036126") ,("gen_0_max_pause_seconds", "0.036126") ,("gen_0_avg_pause_seconds", "0.000081") ,("gen_0_sync_spin", "21") ,("gen_0_sync_yield", "0") ,("gen_1_collections", "2") ,("gen_1_par_collections", "1") ,("gen_1_cpu_seconds", "0.001624") ,("gen_1_wall_seconds", "0.001624") ,("gen_1_max_pause_seconds", "0.001624") ,("gen_1_avg_pause_seconds", "0.000272") ,("gen_1_sync_spin", "3") ,("gen_1_sync_yield", "0") ] ``` Test Plan: Ensure that one-line and summary reports are unchanged. Reviewers: bgamari, erikd, simonmar, hvr Reviewed By: simonmar Subscribers: rwbarton, thomie, carter GHC Trac Issues: #14660 Differential Revision: https://phabricator.haskell.org/D4303
Diffstat (limited to 'rts/Stats.c')
-rw-r--r--rts/Stats.c921
1 files changed, 619 insertions, 302 deletions
diff --git a/rts/Stats.c b/rts/Stats.c
index 7eb93bef33..8769551160 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -24,6 +24,7 @@
#include "ThreadPaused.h"
#include "Messages.h"
+#include <string.h> // for memset
#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
@@ -92,6 +93,9 @@ mut_user_time_until( Time t )
return TimeToSecondsDbl(t - stats.gc_cpu_ns);
// heapCensus() time is included in GC_tot_cpu, so we don't need
// to subtract it here.
+
+ // TODO: This seems wrong to me. Surely we should be subtracting
+ // (at least) start_init_cpu?
}
double
@@ -169,6 +173,8 @@ initStats0(void)
.any_work = 0,
.no_work = 0,
.scav_find_work = 0,
+ .init_cpu_ns = 0,
+ .init_elapsed_ns = 0,
.mutator_cpu_ns = 0,
.mutator_elapsed_ns = 0,
.gc_cpu_ns = 0,
@@ -240,6 +246,8 @@ void
stat_endInit(void)
{
getProcessTimes(&end_init_cpu, &end_init_elapsed);
+ stats.init_cpu_ns = end_init_cpu - start_init_cpu;
+ stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
}
/* -----------------------------------------------------------------------------
@@ -419,8 +427,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
stats.gc.copied_bytes,
stats.gc.slop_bytes,
/* current loss due to fragmentation */
- (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
- * BLOCK_SIZE,
+ (mblocks_allocated * BLOCKS_PER_MBLOCK
+ - n_alloc_blocks) * BLOCK_SIZE,
par_n_threads,
stats.gc.par_max_copied_bytes,
stats.gc.copied_bytes,
@@ -515,7 +523,8 @@ stat_endRP(
fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
#endif
- fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
+ fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+ averageNumVisit);
}
#endif /* PROFILING */
@@ -573,7 +582,7 @@ StgInt TOTAL_CALLS=1;
#define REPORT(counter) \
{ \
showStgWord64(counter,temp,true/*commas*/); \
- statsPrintf(" (" #counter ") : %s\n",temp); \
+ statsPrintf(" (" #counter ") : %s\n",temp); \
}
/* Report the value of a counter as a percentage of another counter */
@@ -592,340 +601,647 @@ StgInt TOTAL_CALLS=1;
statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
-STATIC_INLINE Time get_init_cpu(void) {
- return end_init_cpu - start_init_cpu;
-}
+/*
+Note [RTS Stats Reporting]
+==========================
+
+There are currently three reporting functions:
+ * report_summary:
+ Responsible for producing '+RTS -s' output.
+ Will report internal counters if the RTS flag --internal-counters is
+ passed. See [Internal Counters Stats]
+ * report_machine_readable:
+ Responsible for producing '+RTS -t --machine-readable' output.
+ * report_one_line:
+ Responsible for producing '+RTS -t' output
+
+Stats are accumulated into the global variable 'stats' as the program runs, then
+in 'stat_exit' we do the following:
+ * Finalise 'stats'. This involves setting final running times and allocations
+ that have not yet been accounted for.
+ * Create a RTSSummaryStats. This contains all data for reports that is not
+ included in stats (because they do not make sense before the program has
+ completed) or in a global variable.
+ * call the appropriate report_* function, passing the newly constructed
+ RTSSummaryStats.
+
+To ensure that the data in the different reports is kept consistent, the
+report_* functions should not do any calculation, excepting unit changes and
+formatting. If you need to add a new calculated field, add it to
+RTSSummaryStats.
+
+*/
+
-STATIC_INLINE Time get_init_elapsed(void) {
- return end_init_elapsed - start_init_elapsed;
+static void init_RTSSummaryStats(RTSSummaryStats* sum)
+{
+ const size_t sizeof_gc_summary_stats =
+ RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
+
+ memset(sum, 0, sizeof(RTSSummaryStats));
+ sum->gc_summary_stats =
+ stgMallocBytes(sizeof_gc_summary_stats,
+ "alloc_RTSSummaryStats.gc_summary_stats");
+ memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
}
+static void free_RTSSummaryStats(RTSSummaryStats * sum)
+{
+ if (!sum) { return; }
+ if (!sum->gc_summary_stats) {
+ stgFree(sum->gc_summary_stats);
+ sum->gc_summary_stats = NULL;
+ }
+}
-void
-stat_exit (void)
+static void report_summary(const RTSSummaryStats* sum)
{
- generation *gen;
- Time gc_cpu = 0;
- Time gc_elapsed = 0;
- Time init_cpu = 0;
- Time init_elapsed = 0;
- Time mut_cpu = 0;
- Time mut_elapsed = 0;
- Time exit_cpu = 0;
- Time exit_elapsed = 0;
- Time exit_gc_cpu = 0;
- Time exit_gc_elapsed = 0;
+ // We should do no calculation, other than unit changes and formatting, and
+ // we should not use any data from outside of globals, sum and stats
+ // here. See Note [RTS Stats Reporting]
+
+ uint32_t g;
+ char temp[512];
+ showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes allocated in the heap\n", temp);
+
+ showStgWord64(stats.copied_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes copied during GC\n", temp);
+
+ if ( stats.major_gcs > 0 ) {
+ showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes maximum residency (%" FMT_Word32
+ " sample(s))\n",
+ temp, stats.major_gcs);
+ }
- if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+ showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
+ statsPrintf("%16s bytes maximum slop\n", temp);
+
+ statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
+ FMT_SizeT " MB lost due to fragmentation)\n\n",
+ stats.max_live_bytes / (1024 * 1024),
+ sum->fragmentation_bytes / (1024 * 1024));
+
+ /* Print garbage collections in each gen */
+ statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ const GenerationSummaryStats * gen_stats =
+ &sum->gc_summary_stats[g];
+ statsPrintf(" Gen %2d %5d colls"
+ ", %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
+ g, // REVIEWERS: this used to be gen->no
+ //, this can't ever be different right?
+ gen_stats->collections,
+ gen_stats->par_collections,
+ TimeToSecondsDbl(gen_stats->cpu_ns),
+ TimeToSecondsDbl(gen_stats->elapsed_ns),
+ TimeToSecondsDbl(gen_stats->avg_pause_ns),
+ TimeToSecondsDbl(gen_stats->max_pause_ns));
+ }
- char temp[512];
- Time tot_cpu;
- Time tot_elapsed;
- uint32_t g;
+ statsPrintf("\n");
- getProcessTimes( &tot_cpu, &tot_elapsed );
- tot_cpu -= start_init_cpu;
- tot_elapsed -= start_init_elapsed;
+#if defined(THREADED_RTS)
+ if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
+ // See Note [Work Balance]
+ statsPrintf(" Parallel GC work balance: "
+ "%.2f%% (serial 0%%, perfect 100%%)\n\n",
+ sum->work_balance * 100);
+ }
- uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+ statsPrintf(" TASKS: %d "
+ "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
+ taskCount, sum->bound_task_count,
+ peakWorkerCount, workerCount,
+ n_capabilities);
+
+ statsPrintf(" SPARKS: %" FMT_Word
+ "(%" FMT_Word " converted, %" FMT_Word " overflowed, %"
+ FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
+ sum->sparks_count,
+ sum->sparks.converted, sum->sparks.overflowed,
+ sum->sparks.dud, sum->sparks.gcd,
+ sum->sparks.fizzled);
+#endif
- // allocated since the last GC
- stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
- stats.allocated_bytes = tot_alloc_bytes;
+ statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(stats.init_cpu_ns),
+ TimeToSecondsDbl(stats.init_elapsed_ns));
- /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
- if (tot_cpu <= 0) tot_cpu = 1;
- if (tot_elapsed <= 0) tot_elapsed = 1;
+ statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(stats.mutator_cpu_ns),
+ TimeToSecondsDbl(stats.mutator_elapsed_ns));
+ statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(stats.gc_cpu_ns),
+ TimeToSecondsDbl(stats.gc_elapsed_ns));
- if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
- statsPrintf("%9" FMT_Word " %9.9s %9.9s",
- (W_)stats.gc.allocated_bytes, "", "");
- statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
- }
+#if defined(PROFILING)
+ statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(sum->rc_cpu_ns),
+ TimeToSecondsDbl(sum->rc_elapsed_ns));
+ statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(sum->hc_cpu_ns),
+ TimeToSecondsDbl(sum->hc_elapsed_ns));
+#endif
+ statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
+ TimeToSecondsDbl(sum->exit_cpu_ns),
+ TimeToSecondsDbl(sum->exit_elapsed_ns));
+ statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
+ TimeToSecondsDbl(stats.cpu_ns),
+ TimeToSecondsDbl(stats.elapsed_ns));
+#if !defined(THREADED_RTS)
+ statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
+ sum->gc_cpu_percent * 100,
+ sum->gc_elapsed_percent * 100);
+#endif
- // heapCensus() is called by the GC, so RP and HC time are
- // included in the GC stats. We therefore subtract them to
- // obtain the actual GC cpu time.
- gc_cpu = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
- gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);
+ showStgWord64(sum->alloc_rate, temp, true/*commas*/);
- init_cpu = get_init_cpu();
- init_elapsed = get_init_elapsed();
+ statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
- // We do a GC during the EXIT phase. We'll attribute the cost of that
- // to GC instead of EXIT, so carefully subtract it from the EXIT time.
- exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
- exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
- exit_cpu = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
- exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;
+ statsPrintf(" Productivity %5.1f%% of total user, "
+ "%.1f%% of total elapsed\n\n",
+ sum->productivity_cpu_percent * 100,
+ sum->productivity_elapsed_percent * 100);
- mut_elapsed = start_exit_elapsed - end_init_elapsed -
- (gc_elapsed - exit_gc_elapsed) -
- PROF_VAL(RPe_tot_time + HCe_tot_time);
+ // See Note [Internal Counter Stats] for a description of the
+ // following counters. If you add a counter here, please remember
+ // to update the Note.
+ if (RtsFlags.MiscFlags.internalCounters) {
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+ const int32_t col_width[] = {4, -30, 14, 14};
+ statsPrintf("Internal Counters:\n");
+ statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+ , col_width[0], ""
+ , col_width[1], "SpinLock"
+ , col_width[2], "Spins"
+ , col_width[3], "Yields");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_alloc_block_sync"
+ , col_width[2], gc_alloc_block_sync.spin
+ , col_width[3], gc_alloc_block_sync.yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_spin"
+ , col_width[2], stats.gc_spin_spin
+ , col_width[3], stats.gc_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "mut_spin"
+ , col_width[2], stats.mut_spin_spin
+ , col_width[3], stats.mut_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_gc"
+ , col_width[2], whitehole_gc_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_threadPaused"
+ , col_width[2], whitehole_threadPaused_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_executeMessage"
+ , col_width[2], whitehole_executeMessage_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_lockClosure"
+ , col_width[2], whitehole_lockClosure_spin
+ , col_width[3], whitehole_lockClosure_yield);
+ // waitForGcThreads isn't really spin-locking (see the function)
+ // but these numbers still seem useful.
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "waitForGcThreads"
+ , col_width[2], waitForGcThreads_spin
+ , col_width[3], waitForGcThreads_yield);
+
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ int prefix_length = 0;
+ statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+ col_width[0], "", g, &prefix_length);
+ prefix_length -= col_width[0];
+ int suffix_length = col_width[1] + prefix_length;
+ suffix_length =
+ suffix_length > 0 ? col_width[1] : suffix_length;
+
+ statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , suffix_length, "].sync"
+ , col_width[2], generations[g].sync.spin
+ , col_width[3], generations[g].sync.yield);
+ }
+ statsPrintf("\n");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "any_work"
+ , col_width[2], stats.any_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "no_work"
+ , col_width[2], stats.no_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "scav_find_work"
+ , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+ statsPrintf("Internal Counters require the RTS to be built "
+ "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+ statsPrintf("Internal Counters require the threaded RTS");
+#endif
+ }
+}
- mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
- - PROF_VAL(RP_tot_time + HC_tot_time);
- if (mut_cpu < 0) { mut_cpu = 0; }
+static void report_machine_readable (const RTSSummaryStats * sum)
+{
+ // We should do no calculation, other than unit changes and formatting, and
+ // we should not use any data from outside of globals, sum and stats
+ // here. See Note [RTS Stats Reporting]
+ uint32_t g;
+
+#define MR_STAT(field_name,format,value) \
+ statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
+#define MR_STAT_GEN(gen,field_name,format,value) \
+ statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
+ format "\")\n", g, value)
+
+ // These first values are for backwards compatibility.
+ // Some of these first fields are duplicated with more machine-readable
+ // names, or to match the name in RtsStats.
+
+ // we don't use the helper macro for the first field here because the
+ // prefix is different
+ statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
+ stats.allocated_bytes);
+ MR_STAT("num_GCs", FMT_Word32, stats.gcs);
+ MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
+ MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
+ MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
+ MR_STAT("peak_megabytes_allocated", FMT_SizeT,
+ stats.max_mem_in_use_bytes / (1024 * 1024));
+
+ MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
+ MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
+ MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
+ MR_STAT("mut_wall_seconds", "f",
+ TimeToSecondsDbl(stats.mutator_elapsed_ns));
+ MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
+ MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
+
+ // end backward compatibility
+
+ // First, the rest of the times
+
+ MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
+ MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
+#if defined(PROFILING)
+ MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
+ MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
+ MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hp_cpu_ns));
+ MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hp_elapsed_ns));
+#endif
+ MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
+ MR_STAT("total_wall_seconds", "f",
+ TimeToSecondsDbl(stats.elapsed_ns));
+
+ // next, the remainder of the fields of RTSStats, except internal counters
+
+ // The first two are duplicates of those above, but have more machine
+ // readable names that match the field names in RTSStats.
+
+
+ // gcs has been done as num_GCs above
+ MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
+ MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
+ MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
+ MR_STAT("max_large_objects_bytes", FMT_Word64,
+ stats.max_large_objects_bytes);
+ MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
+ MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
+ // This duplicates, except for unit, peak_megabytes_allocated above
+ MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
+ MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
+ MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
+ MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
+ MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
+ stats.cumulative_par_max_copied_bytes);
+ MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
+ stats.cumulative_par_balanced_copied_bytes);
+
+ // next, the computed fields in RTSSummaryStats
+#if !defined(THREADED_RTS) // THREADED_RTS
+ MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
+ MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent);
+#endif
+ MR_STAT("fragmentation_bytes", FMT_SizeT, sum->fragmentation_bytes);
+ // average_bytes_used is done above
+ MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
+ MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
+ MR_STAT("productivity_wall_percent", "f",
+ sum->productivity_elapsed_percent);
- // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
- // subtracting, so the parts should add up to the total exactly. Note
- // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
- // so we don't use it here.
- ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
- == end_exit_elapsed - start_init_elapsed);
+ // next, the THREADED_RTS fields in RTSSummaryStats
+#if defined(THREADED_RTS)
+ MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
+ MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
+ MR_STAT("sparks_converted", FMT_Word64, sum->sparks.converted);
+ MR_STAT("sparks_overflowed", FMT_Word64, sum->sparks.overflowed);
+ MR_STAT("sparks_dud ", FMT_Word64, sum->sparks.dud);
+ MR_STAT("sparks_gcd", FMT_Word64, sum->sparks.gcd);
+ MR_STAT("sparks_fizzled", FMT_Word64, sum->sparks.fizzled);
+ MR_STAT("work_balance", "f", sum->work_balance);
+
+ // next, globals (other than internal counters)
+ MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
+ MR_STAT("task_count", FMT_Word32, taskCount);
+ MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
+ MR_STAT("worker_count", FMT_Word32, workerCount);
+
+ // next, internal counters
+#if defined(PROF_SPIN)
+ MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+ MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
+ gc_alloc_block_sync.yield);
+ MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+ MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
+ MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
+ MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
+ MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
+ MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
+ MR_STAT("waitForGcThreads_yield", FMT_Word64,
+ waitForGcThreads_yield);
+ MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
+ MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
+ whitehole_lockClosure_spin);
+ MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
+ whitehole_lockClosure_yield);
+ MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
+ whitehole_executeMessage_spin);
+ MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
+ whitehole_threadPaused_spin);
+ MR_STAT("any_work", FMT_Word64,
+ stats.any_work);
+ MR_STAT("no_work", FMT_Word64,
+ stats.no_work);
+ MR_STAT("scav_find_work", FMT_Word64,
+ stats.scav_find_work);
+#endif // PROF_SPIN
+#endif // THREADED_RTS
+
+ // finally, per-generation stats. Named as, for example for generation 0,
+ // gen_0_collections
+ for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+ const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
+ MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
+ MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
+ MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
+ MR_STAT_GEN(g, "wall_seconds", "f",
+ TimeToSecondsDbl(gc_sum->elapsed_ns));
+ MR_STAT_GEN(g, "max_pause_seconds", "f",
+ TimeToSecondsDbl(gc_sum->max_pause_ns));
+ MR_STAT_GEN(g, "avg_pause_seconds", "f",
+ TimeToSecondsDbl(gc_sum->avg_pause_ns));
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+ MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
+ MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
+#endif
+ }
- if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
- showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes allocated in the heap\n", temp);
+ statsPrintf(" ]\n");
+}
- showStgWord64(stats.copied_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes copied during GC\n", temp);
+static void report_one_line(const RTSSummaryStats * sum)
+{
+ // We should do no calculation, other than unit changes and formatting, and
+ // we should not use any data from outside of globals, sum and stats
+ // here. See Note [RTS Stats Reporting]
- if ( stats.major_gcs > 0 ) {
- showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes maximum residency (%" FMT_Word32
- " sample(s))\n",
- temp, stats.major_gcs);
- }
+ /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
+ mingw-0.5) */
+ statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
+ "%" FMT_Word32 " GCs, "
+ "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
+ "(%" FMT_Word32 " samples), "
+ "%" FMT_Word64 "M in use, "
+ "%.3f INIT (%.3f elapsed), "
+ "%.3f MUT (%.3f elapsed), "
+ "%.3f GC (%.3f elapsed) :ghc>>\n",
+ stats.allocated_bytes,
+ stats.gcs,
+ sum->average_bytes_used,
+ stats.max_live_bytes,
+ stats.major_gcs,
+ stats.max_mem_in_use_bytes / (1024 * 1024),
+ TimeToSecondsDbl(stats.init_cpu_ns),
+ TimeToSecondsDbl(stats.init_elapsed_ns),
+ TimeToSecondsDbl(stats.mutator_cpu_ns),
+ TimeToSecondsDbl(stats.mutator_elapsed_ns),
+ TimeToSecondsDbl(stats.gc_cpu_ns),
+ TimeToSecondsDbl(stats.gc_elapsed_ns));
+}
- showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
- statsPrintf("%16s bytes maximum slop\n", temp);
-
- statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
- FMT_SizeT " MB lost due to fragmentation)\n\n",
- (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
- (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
-
- /* Print garbage collections in each gen */
- statsPrintf(" Tot time (elapsed) Avg pause Max pause\n");
- for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- gen = &generations[g];
- statsPrintf(" Gen %2d %5d colls, %5d par %6.3fs %6.3fs %3.4fs %3.4fs\n",
- gen->no,
- gen->collections,
- gen->par_collections,
- TimeToSecondsDbl(GC_coll_cpu[g]),
- TimeToSecondsDbl(GC_coll_elapsed[g]),
- gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
- TimeToSecondsDbl(GC_coll_max_pause[g]));
- }
+void
+stat_exit (void)
+{
+ RTSSummaryStats sum;
+ uint32_t g;
-#if defined(THREADED_RTS)
- if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
- // See Note [Work Balance]
- statsPrintf("\n Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
- 100 * (double)stats.cumulative_par_balanced_copied_bytes /
- (double)stats.par_copied_bytes);
- }
-#endif
- statsPrintf("\n");
+ init_RTSSummaryStats(&sum);
+ if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+ // First we tidy the times in stats, and populate the times in sum.
+ // In particular, we adjust the gc_* time in stats to remove
+ // profiling times.
+ {
+ Time now_cpu_ns, now_elapsed_ns;
+ Time exit_gc_cpu = 0;
+ Time exit_gc_elapsed = 0;
+ Time prof_cpu = 0;
+ Time prof_elapsed = 0;
+
+ getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+
+ stats.cpu_ns = now_cpu_ns - start_init_cpu;
+ stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
+ /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
+ seconds -- SDM */
+ if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
+ if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
+
+ prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
+ prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
+
+ // heapCensus() is called by the GC, so RP and HC time are
+ // included in the GC stats. We therefore subtract them to
+ // obtain the actual GC cpu time.
+ stats.gc_cpu_ns -= prof_cpu;
+ stats.gc_elapsed_ns -= prof_elapsed;
-#if defined(THREADED_RTS)
- statsPrintf(" TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
- taskCount, taskCount - workerCount,
- peakWorkerCount, workerCount,
- n_capabilities);
-
- statsPrintf("\n");
-
- {
- uint32_t i;
- SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
- for (i = 0; i < n_capabilities; i++) {
- sparks.created += capabilities[i]->spark_stats.created;
- sparks.dud += capabilities[i]->spark_stats.dud;
- sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
- sparks.converted += capabilities[i]->spark_stats.converted;
- sparks.gcd += capabilities[i]->spark_stats.gcd;
- sparks.fizzled += capabilities[i]->spark_stats.fizzled;
- }
-
- statsPrintf(" SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
- sparks.created + sparks.dud + sparks.overflowed,
- sparks.converted, sparks.overflowed, sparks.dud,
- sparks.gcd, sparks.fizzled);
- }
-#endif
+#if defined(PROFILING)
+ sum.rp_cpu_ns = RP_tot_time;
+ sum.rp_elapsed_ns = RPe_tot_time;
+ sum.hc_cpu_ns = HC_tot_time;
+ sum.hc_elapsed_ns = HCe_tot_time;
+#endif // PROFILING
+
+ // We do a GC during the EXIT phase. We'll attribute the cost of
+ // that to GC instead of EXIT, so carefully subtract it from the
+ // EXIT time.
+ exit_gc_cpu = stats.gc_cpu_ns - start_exit_gc_cpu;
+ exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+ sum.exit_cpu_ns = end_exit_cpu
+ - start_exit_cpu
+ - exit_gc_cpu;
+ sum.exit_elapsed_ns = end_exit_elapsed
+ - start_exit_elapsed
+ - exit_gc_elapsed;
+
+ stats.mutator_cpu_ns = start_exit_cpu
+ - end_init_cpu
+ - (stats.gc_cpu_ns - exit_gc_cpu)
+ - prof_cpu;
+ stats.mutator_elapsed_ns = start_exit_elapsed
+ - end_init_elapsed
+ - (stats.gc_elapsed_ns - exit_gc_elapsed)
+ - prof_elapsed;
+
+ if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
+
+ // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
+ // and subtracting, so the parts should add up to the total exactly.
+ // Note that stats->total_ns is captured a tiny bit later than
+ // end_exit_elapsed, so we don't use it here.
+ ASSERT(stats.init_elapsed_ns \
+ + stats.mutator_elapsed_ns \
+ + stats.gc_elapsed_ns \
+ + sum.exit_elapsed_ns \
+ == end_exit_elapsed - start_init_elapsed);
- statsPrintf(" INIT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));
+ }
- statsPrintf(" MUT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
- statsPrintf(" GC time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+ // REVIEWERS: it's not clear to me why the following isn't done in
+ // stat_endGC of the last garbage collection?
+
+ // We account for the last garbage collection
+ {
+ uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+ stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
+ stats.allocated_bytes = tot_alloc_bytes;
+ if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+ statsPrintf("%9" FMT_Word " %9.9s %9.9s",
+ (W_)stats.gc.allocated_bytes, "", "");
+ statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
+ }
+ }
-#if defined(PROFILING)
- statsPrintf(" RP time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
- statsPrintf(" PROF time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
-#endif
- statsPrintf(" EXIT time %7.3fs (%7.3fs elapsed)\n",
- TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
- statsPrintf(" Total time %7.3fs (%7.3fs elapsed)\n\n",
- TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
-#if !defined(THREADED_RTS)
- statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n",
- TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
- TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
-#endif
+ // We populate the remainder (non-time elements) of sum
+ {
+ #if defined(THREADED_RTS)
+ uint32_t i;
+ sum.bound_task_count = taskCount - workerCount;
+
+ for (i = 0; i < n_capabilities; i++) {
+ sum.sparks.created += capabilities[i]->spark_stats.created;
+ sum.sparks.dud += capabilities[i]->spark_stats.dud;
+ sum.sparks.overflowed+=
+ capabilities[i]->spark_stats.overflowed;
+ sum.sparks.converted +=
+ capabilities[i]->spark_stats.converted;
+ sum.sparks.gcd += capabilities[i]->spark_stats.gcd;
+ sum.sparks.fizzled += capabilities[i]->spark_stats.fizzled;
+ }
- if (mut_cpu == 0) {
- showStgWord64(0, temp, true/*commas*/);
+ sum.sparks_count = sum.sparks.created
+ + sum.sparks.dud
+ + sum.sparks.overflowed;
+
+ if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
+ // See Note [Work Balance]
+ sum.work_balance =
+ (double)stats.cumulative_par_balanced_copied_bytes
+ / (double)stats.par_copied_bytes;
} else {
- showStgWord64(
- (StgWord64)((double)stats.allocated_bytes /
- TimeToSecondsDbl(mut_cpu)),
- temp, true/*commas*/);
+ sum.work_balance = 0;
}
- statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp);
- statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
- TimeToSecondsDbl(tot_cpu - gc_cpu -
- PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
- / TimeToSecondsDbl(tot_cpu),
- TimeToSecondsDbl(tot_elapsed - gc_elapsed -
- PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
- / TimeToSecondsDbl(tot_elapsed));
-
- // See Note [Internal Counter Stats] for a description of the
- // following counters. If you add a counter here, please remember
- // to update the Note.
- if (RtsFlags.MiscFlags.internalCounters) {
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
- uint32_t g;
- const int32_t col_width[] = {4, -30, 14, 14};
- statsPrintf("Internal Counters:\n");
- statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
- , col_width[0], ""
- , col_width[1], "SpinLock"
- , col_width[2], "Spins"
- , col_width[3], "Yields");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_alloc_block_sync"
- , col_width[2], gc_alloc_block_sync.spin
- , col_width[3], gc_alloc_block_sync.yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_spin"
- , col_width[2], stats.gc_spin_spin
- , col_width[3], stats.gc_spin_yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "mut_spin"
- , col_width[2], stats.mut_spin_spin
- , col_width[3], stats.mut_spin_yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_gc"
- , col_width[2], whitehole_gc_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_threadPaused"
- , col_width[2], whitehole_threadPaused_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
- , col_width[0], ""
- , col_width[1], "whitehole_executeMessage"
- , col_width[2], whitehole_executeMessage_spin
- , col_width[3], "n/a");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "whitehole_lockClosure"
- , col_width[2], whitehole_lockClosure_spin
- , col_width[3], whitehole_lockClosure_yield);
- // waitForGcThreads isn't really spin-locking(see the function)
- // but these numbers still seem useful.
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "waitForGcThreads"
- , col_width[2], waitForGcThreads_spin
- , col_width[3], waitForGcThreads_yield);
-
- for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- int prefix_length = 0;
- statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
- col_width[0], "", g, &prefix_length);
- prefix_length -= col_width[0];
- int suffix_length = col_width[1] + prefix_length;
- suffix_length =
- suffix_length > 0 ? col_width[1] : suffix_length;
-
- statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , suffix_length, "].sync"
- , col_width[2], generations[g].sync.spin
- , col_width[3], generations[g].sync.yield);
- }
- statsPrintf("\n");
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "any_work"
- , col_width[2], stats.any_work);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "no_work"
- , col_width[2], stats.no_work);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "scav_find_work"
- , col_width[2], stats.scav_find_work);
-#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
- statsPrintf("Internal Counters require the RTS to be built "
- "with PROF_SPIN"); // PROF_SPIN is not #defined here
-#else // THREADED_RTS
- statsPrintf("Internal Counters require the threaded RTS");
-#endif
+ #else // THREADED_RTS
+ sum.gc_cpu_percent = stats.gc_cpu_ns
+ / stats.cpu_ns;
+ sum.gc_elapsed_percent = stats.gc_elapsed_ns
+ / stats.elapsed_ns;
+ #endif // THREADED_RTS
+
+ sum.fragmentation_bytes =
+ (uint64_t)(peak_mblocks_allocated
+ * BLOCKS_PER_MBLOCK
+ * BLOCK_SIZE_W
+ - hw_alloc_blocks * BLOCK_SIZE_W)
+ / (uint64_t)sizeof(W_);
+
+ sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
+ stats.cumulative_live_bytes/stats.major_gcs;
+
+ sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
+ (uint64_t)((double)stats.allocated_bytes
+ / TimeToSecondsDbl(stats.mutator_cpu_ns));
+
+ // REVIEWERS: these two values previously did not include the exit times
+ sum.productivity_cpu_percent =
+ TimeToSecondsDbl(stats.cpu_ns
+ - stats.gc_cpu_ns
+ - sum.rp_cpu_ns
+ - sum.hc_cpu_ns
+ - stats.init_cpu_ns
+ - sum.exit_cpu_ns)
+ / TimeToSecondsDbl(stats.cpu_ns);
+
+ sum.productivity_elapsed_percent =
+ TimeToSecondsDbl(stats.elapsed_ns
+ - stats.gc_elapsed_ns
+ - sum.rp_elapsed_ns
+ - sum.hc_elapsed_ns
+ - stats.init_elapsed_ns
+ - sum.exit_elapsed_ns)
+ / TimeToSecondsDbl(stats.elapsed_ns);
+
+ for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+ const generation* gen = &generations[g];
+ GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
+ gen_stats->collections = gen->collections;
+ gen_stats->par_collections = gen->par_collections;
+ gen_stats->cpu_ns = GC_coll_cpu[g];
+ gen_stats->elapsed_ns = GC_coll_elapsed[g];
+ gen_stats->max_pause_ns = GC_coll_max_pause[g];
+ gen_stats->avg_pause_ns = gen->collections == 0 ?
+ 0 : (GC_coll_elapsed[g] / gen->collections);
+ #if defined(THREADED_RTS) && defined(PROF_SPIN)
+ gen_stats->sync_spin = gen->sync.spin;
+ gen_stats->sync_yield = gen->sync.yield;
+ #endif // THREADED_RTS && PROF_SPIN
}
}
+ // Now we generate the report
+ if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+ report_summary(&sum);
+ }
+
if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
- char *fmt;
- if (RtsFlags.MiscFlags.machineReadable) {
- fmt =
- " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
- " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
- " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
- " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
- " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
- " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
- " ,(\"init_cpu_seconds\", \"%.3f\")\n"
- " ,(\"init_wall_seconds\", \"%.3f\")\n"
- " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
- " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
- " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
- " ,(\"GC_wall_seconds\", \"%.3f\")\n"
- " ]\n";
- }
- else {
- fmt =
- "<<ghc: %" FMT_Word64 " bytes, "
- "%" FMT_Word32 " GCs, "
- "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
- "%" FMT_Word64 "M in use, "
- "%.3f INIT (%.3f elapsed), "
- "%.3f MUT (%.3f elapsed), "
- "%.3f GC (%.3f elapsed) :ghc>>\n";
- }
- /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
- statsPrintf(fmt,
- stats.allocated_bytes,
- stats.gcs,
- (uint64_t)
- (stats.major_gcs == 0 ? 0 :
- stats.cumulative_live_bytes/stats.major_gcs),
- stats.max_live_bytes,
- stats.major_gcs,
- (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
- TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
- TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
- TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+ if (RtsFlags.MiscFlags.machineReadable) {
+ report_machine_readable(&sum);
+ }
+ else {
+ report_one_line(&sum);
+ }
}
+ free_RTSSummaryStats(&sum);
statsFlush();
statsClose();
}
@@ -1055,7 +1371,7 @@ waitForGcThreads:
for a parallel garbage collection. We yield more than we spin in this case.
In several places in the runtime we must take a lock on a closure. To do this,
-we replace it's info table with stg_WHITEHOLE_info, spinning if it is already
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
don't. We count these white-hole spins and include them in the SpinLocks table.
If a particular loop does not yield, we put "n/a" in the table. They are named
@@ -1141,8 +1457,9 @@ statDescribeGens(void)
gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
- debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts,
- gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
+ debugBelch("%8" FMT_Word " %8d %8d %9" FMT_Word " %9" FMT_Word "\n",
+ gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
+ gen_slop*(W_)sizeof(W_));
tot_live += gen_live;
tot_slop += gen_slop;
}