4 files changed, 691 insertions, 317 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index 6f011cbf6e..69ec6075db 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -184,7 +184,7 @@ typedef struct _RTSStats {
   uint64_t par_copied_bytes;
     // Sum of par_max_copied_bytes across all parallel GCs
   uint64_t cumulative_par_max_copied_bytes;
-  // Sum of par_balanced_copied_byes across all parallel GCs.
+    // Sum of par_balanced_copied_byes across all parallel GCs.
   uint64_t cumulative_par_balanced_copied_bytes;
 
   // -----------------------------------
@@ -192,6 +192,10 @@ typedef struct _RTSStats {
   // (we use signed values here because due to inaccuracies in timers
   // the values can occasionally go slightly negative)
 
+    // Total CPU time used by the init phase
+  Time init_cpu_ns;
+    // Total elapsed time used by the init phase
+  Time init_elapsed_ns;
     // Total CPU time used by the mutator
   Time mutator_cpu_ns;
     // Total elapsed time used by the mutator
@@ -213,26 +217,26 @@ typedef struct _RTSStats {
   // -----------------------------------
   // Internal Counters
 
-  // The number of times a GC thread spun on its 'gc_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread spun on its 'gc_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t gc_spin_spin;
-  // The number of times a GC thread yielded on its 'gc_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread yielded on its 'gc_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t gc_spin_yield;
-  // The number of times a GC thread spun on its 'mut_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread spun on its 'mut_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t mut_spin_spin;
-  // The number of times a GC thread yielded on its 'mut_spin' lock.
-  // Will be zero if the rts was not built with PROF_SPIN
+    // The number of times a GC thread yielded on its 'mut_spin' lock.
+    // Will be zero if the rts was not built with PROF_SPIN
   uint64_t mut_spin_yield;
-  // The number of times a GC thread has checked for work across all parallel
-  // GCs
+    // The number of times a GC thread has checked for work across all parallel
+    // GCs
   uint64_t any_work;
-  // The number of times a GC thread has checked for work and found none across
-  // all parallel GCs
+    // The number of times a GC thread has checked for work and found none
+    // across all parallel GCs
   uint64_t no_work;
-  // The number of times a GC thread has iterated it's outer loop across all
-  // parallel GCs
+    // The number of times a GC thread has iterated it's outer loop across all
+    // parallel GCs
   uint64_t scav_find_work;
 } RTSStats;
 
diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc
index 3497ff5185..d81173fe58 100644
--- a/libraries/base/GHC/Stats.hsc
+++ b/libraries/base/GHC/Stats.hsc
@@ -83,6 +83,12 @@ data RTSStats = RTSStats {
   -- (we use signed values here because due to inaccuracies in timers
   -- the values can occasionally go slightly negative)
 
+    -- | Total CPU time used by the init phase
+    -- @since 4.12.0.0
+  , init_cpu_ns :: RtsTime
+    -- | Total elapsed time used by the init phase
+    -- @since 4.12.0.0
+  , init_elapsed_ns :: RtsTime
     -- | Total CPU time used by the mutator
   , mutator_cpu_ns :: RtsTime
     -- | Total elapsed time used by the mutator
@@ -175,6 +181,8 @@ getRTSStats = do
       (# peek RTSStats, cumulative_par_max_copied_bytes) p
     cumulative_par_balanced_copied_bytes <-
       (# peek RTSStats, cumulative_par_balanced_copied_bytes) p
+    init_cpu_ns <- (# peek RTSStats, init_cpu_ns) p
+    init_elapsed_ns <- (# peek RTSStats, init_elapsed_ns) p
     mutator_cpu_ns <- (# peek RTSStats, mutator_cpu_ns) p
     mutator_elapsed_ns <- (# peek RTSStats, mutator_elapsed_ns) p
     gc_cpu_ns <- (# peek RTSStats, gc_cpu_ns) p
diff --git a/rts/Stats.c b/rts/Stats.c
index 7eb93bef33..8769551160 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -24,6 +24,7 @@
 #include "ThreadPaused.h"
 #include "Messages.h"
 
+#include <string.h> // for memset
 
 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
@@ -92,6 +93,9 @@ mut_user_time_until( Time t )
     return TimeToSecondsDbl(t - stats.gc_cpu_ns);
     // heapCensus() time is included in GC_tot_cpu, so we don't need
     // to subtract it here.
+
+    // TODO: This seems wrong to me. Surely we should be subtracting
+    // (at least) start_init_cpu?
 }
 
 double
@@ -169,6 +173,8 @@ initStats0(void)
         .any_work = 0,
         .no_work = 0,
         .scav_find_work = 0,
+        .init_cpu_ns = 0,
+        .init_elapsed_ns = 0,
         .mutator_cpu_ns = 0,
         .mutator_elapsed_ns = 0,
         .gc_cpu_ns = 0,
@@ -240,6 +246,8 @@ void
 stat_endInit(void)
 {
     getProcessTimes(&end_init_cpu, &end_init_elapsed);
+    stats.init_cpu_ns = end_init_cpu - start_init_cpu;
+    stats.init_elapsed_ns = end_init_elapsed - start_init_elapsed;
 }
 
 /* -----------------------------------------------------------------------------
@@ -419,8 +427,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
                           stats.gc.copied_bytes,
                           stats.gc.slop_bytes,
                           /* current loss due to fragmentation */
-                          (mblocks_allocated * BLOCKS_PER_MBLOCK - n_alloc_blocks)
-                                 * BLOCK_SIZE,
+                          (mblocks_allocated * BLOCKS_PER_MBLOCK
+                           - n_alloc_blocks) * BLOCK_SIZE,
                           par_n_threads,
                           stats.gc.par_max_copied_bytes,
                           stats.gc.copied_bytes,
@@ -515,7 +523,8 @@ stat_endRP(
   fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
   fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
 #endif
-  fprintf(prof_file, "\tAverage number of visits per object = %f\n", averageNumVisit);
+  fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+          averageNumVisit);
 }
 #endif /* PROFILING */
 
@@ -573,7 +582,7 @@ StgInt TOTAL_CALLS=1;
 #define REPORT(counter) \
   { \
     showStgWord64(counter,temp,true/*commas*/); \
-    statsPrintf("  (" #counter ")  : %s\n",temp);                               \
+    statsPrintf("  (" #counter ")  : %s\n",temp); \
   }
 
 /* Report the value of a counter as a percentage of another counter */
@@ -592,340 +601,647 @@ StgInt TOTAL_CALLS=1;
   statsPrintf("  (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \
               SLOW_CALLS_##arity * 100.0/TOTAL_CALLS)
 
-STATIC_INLINE Time get_init_cpu(void) {
-    return end_init_cpu - start_init_cpu;
-}
+/*
+Note [RTS Stats Reporting]
+==========================
+
+There are currently three reporting functions:
+  * report_summary:
+      Responsible for producing '+RTS -s' output.
+      Will report internal counters if the RTS flag --internal-counters is
+      passed. See [Internal Counters Stats]
+  * report_machine_readable:
+      Responsible for producing '+RTS -t --machine-readable' output.
+  * report_one_line:
+      Responsible for productin '+RTS -t' output
+
+Stats are accumulated into the global variable 'stats' as the program runs, then
+in 'stat_exit' we do the following:
+  * Finalise 'stats'. This involves setting final running times and allocations
+    that have not yet been accounted for.
+  * Create a RTSSummaryStats. This contains all data for reports that is not
+    included in stats (because they do not make sense before the program has
+    completed) or in a global variable.
+  * call the appropriate report_* function, passing the newly constructed
+    RTSSummaryStats.
+
+To ensure that the data in the different reports is kept consistent, the
+report_* functions should not do any calculation, excepting unit changes and
+formatting. If you need to add a new calculated field, add it to
+RTSSummaryStats.
+
+*/
+
 
-STATIC_INLINE Time get_init_elapsed(void) {
-    return end_init_elapsed - start_init_elapsed;
+static void init_RTSSummaryStats(RTSSummaryStats* sum)
+{
+    const size_t sizeof_gc_summary_stats =
+      RtsFlags.GcFlags.generations * sizeof(GenerationSummaryStats);
+
+    memset(sum, 0, sizeof(RTSSummaryStats));
+    sum->gc_summary_stats =
+      stgMallocBytes(sizeof_gc_summary_stats,
+                     "alloc_RTSSummaryStats.gc_summary_stats");
+    memset(sum->gc_summary_stats, 0, sizeof_gc_summary_stats);
 }
 
+static void free_RTSSummaryStats(RTSSummaryStats * sum)
+{
+    if (!sum) { return; }
+    if (!sum->gc_summary_stats) {
+        stgFree(sum->gc_summary_stats);
+        sum->gc_summary_stats = NULL;
+    }
+}
 
-void
-stat_exit (void)
+static void report_summary(const RTSSummaryStats* sum)
 {
-    generation *gen;
-    Time gc_cpu = 0;
-    Time gc_elapsed = 0;
-    Time init_cpu = 0;
-    Time init_elapsed = 0;
-    Time mut_cpu = 0;
-    Time mut_elapsed = 0;
-    Time exit_cpu = 0;
-    Time exit_elapsed = 0;
-    Time exit_gc_cpu = 0;
-    Time exit_gc_elapsed = 0;
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
+
+    uint32_t g;
+    char temp[512];
+    showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes allocated in the heap\n", temp);
+
+    showStgWord64(stats.copied_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes copied during GC\n", temp);
+
+    if ( stats.major_gcs > 0 ) {
+        showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
+        statsPrintf("%16s bytes maximum residency (%" FMT_Word32
+                    " sample(s))\n",
+                    temp, stats.major_gcs);
+    }
 
-    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+    showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
+    statsPrintf("%16s bytes maximum slop\n", temp);
+
+    statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
+                FMT_SizeT " MB lost due to fragmentation)\n\n",
+                stats.max_live_bytes  / (1024 * 1024),
+                sum->fragmentation_bytes / (1024 * 1024));
+
+    /* Print garbage collections in each gen */
+    statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        const GenerationSummaryStats * gen_stats =
+            &sum->gc_summary_stats[g];
+        statsPrintf("  Gen %2d     %5d colls"
+                    ", %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
+                    g, // REVIEWERS: this used to be gen->no
+                    //, this can't ever be different right?
+                    gen_stats->collections,
+                    gen_stats->par_collections,
+                    TimeToSecondsDbl(gen_stats->cpu_ns),
+                    TimeToSecondsDbl(gen_stats->elapsed_ns),
+                    TimeToSecondsDbl(gen_stats->avg_pause_ns),
+                    TimeToSecondsDbl(gen_stats->max_pause_ns));
+    }
 
-        char temp[512];
-        Time tot_cpu;
-        Time tot_elapsed;
-        uint32_t g;
+    statsPrintf("\n");
 
-        getProcessTimes( &tot_cpu, &tot_elapsed );
-        tot_cpu -= start_init_cpu;
-        tot_elapsed -= start_init_elapsed;
+#if defined(THREADED_RTS)
+    if (RtsFlags.ParFlags.parGcEnabled && sum->work_balance > 0) {
+        // See Note [Work Balance]
+        statsPrintf("  Parallel GC work balance: "
+                    "%.2f%% (serial 0%%, perfect 100%%)\n\n",
+                    sum->work_balance * 100);
+    }
 
-        uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+    statsPrintf("  TASKS: %d "
+                "(%d bound, %d peak workers (%d total), using -N%d)\n\n",
+                taskCount, sum->bound_task_count,
+                peakWorkerCount, workerCount,
+                n_capabilities);
+
+    statsPrintf("  SPARKS: %" FMT_Word
+                "(%" FMT_Word " converted, %" FMT_Word " overflowed, %"
+                FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
+                sum->sparks_count,
+                sum->sparks.converted, sum->sparks.overflowed,
+                sum->sparks.dud, sum->sparks.gcd,
+                sum->sparks.fizzled);
+#endif
 
-        // allocated since the last GC
-        stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
-        stats.allocated_bytes = tot_alloc_bytes;
+    statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.init_cpu_ns),
+                TimeToSecondsDbl(stats.init_elapsed_ns));
 
-        /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */
-        if (tot_cpu <= 0)  tot_cpu = 1;
-        if (tot_elapsed <= 0) tot_elapsed = 1;
+    statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.mutator_cpu_ns),
+                TimeToSecondsDbl(stats.mutator_elapsed_ns));
+    statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.gc_cpu_ns),
+                TimeToSecondsDbl(stats.gc_elapsed_ns));
 
-        if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
-            statsPrintf("%9" FMT_Word " %9.9s %9.9s",
-                        (W_)stats.gc.allocated_bytes, "", "");
-            statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
-        }
+#if defined(PROFILING)
+    statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->rc_cpu_ns),
+                TimeToSecondsDbl(sum->rc_elapsed_ns));
+    statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->hc_cpu_ns),
+                TimeToSecondsDbl(sum->hc_elapsed_ns));
+#endif
+    statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(sum->exit_cpu_ns),
+                TimeToSecondsDbl(sum->exit_elapsed_ns));
+    statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
+                TimeToSecondsDbl(stats.cpu_ns),
+                TimeToSecondsDbl(stats.elapsed_ns));
+#if !defined(THREADED_RTS)
+    statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
+                sum->gc_cpu_percent * 100,
+                sum->gc_elapsed_percent * 100);
+#endif
 
-        // heapCensus() is called by the GC, so RP and HC time are
-        // included in the GC stats.  We therefore subtract them to
-        // obtain the actual GC cpu time.
-        gc_cpu     = stats.gc_cpu_ns - PROF_VAL(RP_tot_time + HC_tot_time);
-        gc_elapsed = stats.gc_elapsed_ns - PROF_VAL(RPe_tot_time + HCe_tot_time);
+    showStgWord64(sum->alloc_rate, temp, true/*commas*/);
 
-        init_cpu     = get_init_cpu();
-        init_elapsed = get_init_elapsed();
+    statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
 
-        // We do a GC during the EXIT phase.  We'll attribute the cost of that
-        // to GC instead of EXIT, so carefully subtract it from the EXIT time.
-        exit_gc_cpu  = stats.gc_cpu_ns - start_exit_gc_cpu;
-        exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
-        exit_cpu     = end_exit_cpu - start_exit_cpu - exit_gc_cpu;
-        exit_elapsed = end_exit_elapsed - start_exit_elapsed - exit_gc_elapsed;
+    statsPrintf("  Productivity %5.1f%% of total user, "
+                "%.1f%% of total elapsed\n\n",
+                sum->productivity_cpu_percent * 100,
+                sum->productivity_elapsed_percent * 100);
 
-        mut_elapsed = start_exit_elapsed - end_init_elapsed -
-            (gc_elapsed - exit_gc_elapsed) -
-            PROF_VAL(RPe_tot_time + HCe_tot_time);
+    // See Note [Internal Counter Stats] for a description of the
+    // following counters. If you add a counter here, please remember
+    // to update the Note.
+    if (RtsFlags.MiscFlags.internalCounters) {
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+        const int32_t col_width[] = {4, -30, 14, 14};
+        statsPrintf("Internal Counters:\n");
+        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+                    , col_width[0], ""
+                    , col_width[1], "SpinLock"
+                    , col_width[2], "Spins"
+                    , col_width[3], "Yields");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_alloc_block_sync"
+                    , col_width[2], gc_alloc_block_sync.spin
+                    , col_width[3], gc_alloc_block_sync.yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_spin"
+                    , col_width[2], stats.gc_spin_spin
+                    , col_width[3], stats.gc_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "mut_spin"
+                    , col_width[2], stats.mut_spin_spin
+                    , col_width[3], stats.mut_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_gc"
+                    , col_width[2], whitehole_gc_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_threadPaused"
+                    , col_width[2], whitehole_threadPaused_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_executeMessage"
+                    , col_width[2], whitehole_executeMessage_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_lockClosure"
+                    , col_width[2], whitehole_lockClosure_spin
+                    , col_width[3], whitehole_lockClosure_yield);
+        // waitForGcThreads isn't really spin-locking(see the function)
+        // but these numbers still seem useful.
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "waitForGcThreads"
+                    , col_width[2], waitForGcThreads_spin
+                    , col_width[3], waitForGcThreads_yield);
+
+        for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+            int prefix_length = 0;
+            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+                        col_width[0], "", g, &prefix_length);
+            prefix_length -= col_width[0];
+            int suffix_length = col_width[1] + prefix_length;
+            suffix_length =
+                  suffix_length > 0 ? col_width[1] : suffix_length;
+
+            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                        , suffix_length, "].sync"
+                        , col_width[2], generations[g].sync.spin
+                        , col_width[3], generations[g].sync.yield);
+        }
+        statsPrintf("\n");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "any_work"
+                    , col_width[2], stats.any_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "no_work"
+                    , col_width[2], stats.no_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "scav_find_work"
+                    , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+        statsPrintf("Internal Counters require the RTS to be built "
+                "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+        statsPrintf("Internal Counters require the threaded RTS");
+#endif
+    }
+}
 
-        mut_cpu = start_exit_cpu - end_init_cpu - (gc_cpu - exit_gc_cpu)
-            - PROF_VAL(RP_tot_time + HC_tot_time);
-        if (mut_cpu < 0) { mut_cpu = 0; }
+static void report_machine_readable (const RTSSummaryStats * sum)
+{
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
+    uint32_t g;
+
+#define MR_STAT(field_name,format,value) \
+    statsPrintf(" ,(\"" field_name "\", \"%" format "\")\n", value)
+#define MR_STAT_GEN(gen,field_name,format,value) \
+    statsPrintf(" ,(\"gen_%" FMT_Word32 "_" field_name "\", \"%" \
+      format "\")\n", g, value)
+
+    // These first values are for backwards compatibility.
+    // Some of these first fields are duplicated with more machine-readable
+    // names, or to match the name in RtsStats.
+
+    // we don't use for the first field helper macro here because the prefix is
+    // different
+    statsPrintf(" [(\"%s\", \"%" FMT_Word64 "\")\n", "bytes allocated",
+                stats.allocated_bytes);
+    MR_STAT("num_GCs", FMT_Word32, stats.gcs);
+    MR_STAT("average_bytes_used", FMT_Word64, sum->average_bytes_used);
+    MR_STAT("max_bytes_used", FMT_Word64, stats.max_live_bytes);
+    MR_STAT("num_byte_usage_samples", FMT_Word32, stats.major_gcs);
+    MR_STAT("peak_megabytes_allocated", FMT_SizeT,
+      stats.max_mem_in_use_bytes / (1024 * 1024));
+
+    MR_STAT("init_cpu_seconds", "f", TimeToSecondsDbl(stats.init_cpu_ns));
+    MR_STAT("init_wall_seconds", "f", TimeToSecondsDbl(stats.init_elapsed_ns));
+    MR_STAT("mut_cpu_seconds", "f", TimeToSecondsDbl(stats.mutator_cpu_ns));
+    MR_STAT("mut_wall_seconds", "f",
+            TimeToSecondsDbl(stats.mutator_elapsed_ns));
+    MR_STAT("GC_cpu_seconds", "f", TimeToSecondsDbl(stats.gc_cpu_ns));
+    MR_STAT("GC_wall_seconds", "f", TimeToSecondsDbl(stats.gc_elapsed_ns));
+
+    // end backward compatibility
+
+    // First, the rest of the times
+
+    MR_STAT("exit_cpu_seconds", "f", TimeToSecondsDbl(sum->exit_cpu_ns));
+    MR_STAT("exit_wall_seconds", "f", TimeToSecondsDbl(sum->exit_elapsed_ns));
+#if defined(PROFILING)
+    MR_STAT("rp_cpu_seconds", "f", TimeToSecondsDbl(sum->rp_cpu_ns));
+    MR_STAT("rp_wall_seconds", "f", TimeToSecondsDbl(sum->rp_elapsed_ns));
+    MR_STAT("hc_cpu_seconds", "f", TimeToSecondsDbl(sum->hp_cpu_ns));
+    MR_STAT("hc_wall_seconds", "f", TimeToSecondsDbl(sum->hp_elapsed_ns));
+#endif
+    MR_STAT("total_cpu_seconds", "f", TimeToSecondsDbl(stats.cpu_ns));
+    MR_STAT("total_wall_seconds", "f",
+            TimeToSecondsDbl(stats.elapsed_ns));
+
+    // next, the remainder of the fields of RTSStats, except internal counters
+
+    // The first two are duplicates of those above, but have more machine
+    // readable names that match the field names in RTSStats.
+
+
+    // gcs has been done as num_GCs above
+    MR_STAT("major_gcs", FMT_Word32, stats.major_gcs);
+    MR_STAT("allocated_bytes", FMT_Word64, stats.allocated_bytes);
+    MR_STAT("max_live_bytes", FMT_Word64, stats.max_live_bytes);
+    MR_STAT("max_large_objects_bytes", FMT_Word64,
+            stats.max_large_objects_bytes);
+    MR_STAT("max_compact_bytes", FMT_Word64, stats.max_compact_bytes);
+    MR_STAT("max_slop_bytes", FMT_Word64, stats.max_slop_bytes);
+    // This duplicates, except for unit, peak_megabytes_allocated above
+    MR_STAT("max_mem_in_use_bytes", FMT_Word64, stats.max_mem_in_use_bytes);
+    MR_STAT("cumulative_live_bytes", FMT_Word64, stats.cumulative_live_bytes);
+    MR_STAT("copied_bytes", FMT_Word64, stats.copied_bytes);
+    MR_STAT("par_copied_bytes", FMT_Word64, stats.par_copied_bytes);
+    MR_STAT("cumulative_par_max_copied_bytes", FMT_Word64,
+            stats.cumulative_par_max_copied_bytes);
+    MR_STAT("cumulative_par_balanced_copied_bytes", FMT_Word64,
+            stats.cumulative_par_balanced_copied_bytes);
+
+    // next, the computed fields in RTSSummaryStats
+#if !defined(THREADED_RTS) // THREADED_RTS
+    MR_STAT("gc_cpu_percent", "f", sum->gc_cpu_percent);
+    MR_STAT("gc_wall_percent", "f", sum->gc_cpu_percent);
+#endif
+    MR_STAT("fragmentation_bytes", FMT_SizeT, sum->fragmentation_bytes);
+    // average_bytes_used is done above
+    MR_STAT("alloc_rate", FMT_Word64, sum->alloc_rate);
+    MR_STAT("productivity_cpu_percent", "f", sum->productivity_cpu_percent);
+    MR_STAT("productivity_wall_percent", "f",
+            sum->productivity_elapsed_percent);
 
-        // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding and
-        // subtracting, so the parts should add up to the total exactly.  Note
-        // that tot_elapsed is captured a tiny bit later than end_exit_elapsed,
-        // so we don't use it here.
-        ASSERT(init_elapsed + mut_elapsed + gc_elapsed + exit_elapsed
-               == end_exit_elapsed - start_init_elapsed);
+    // next, the THREADED_RTS fields in RTSSummaryStats
 
+#if defined(THREADED_RTS)
+    MR_STAT("bound_task_count", FMT_Word32, sum->bound_task_count);
+    MR_STAT("sparks_count", FMT_Word64, sum->sparks_count);
+    MR_STAT("sparks_converted", FMT_Word64, sum->sparks.converted);
+    MR_STAT("sparks_overflowed", FMT_Word64, sum->sparks.overflowed);
+    MR_STAT("sparks_dud ", FMT_Word64, sum->sparks.dud);
+    MR_STAT("sparks_gcd", FMT_Word64, sum->sparks.gcd);
+    MR_STAT("sparks_fizzled", FMT_Word64, sum->sparks.fizzled);
+    MR_STAT("work_balance", "f", sum->work_balance);
+
+    // next, globals (other than internal counters)
+    MR_STAT("n_capabilities", FMT_Word32, n_capabilities);
+    MR_STAT("task_count", FMT_Word32, taskCount);
+    MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount);
+    MR_STAT("worker_count", FMT_Word32, workerCount);
+
+    // next, internal counters
+#if defined(PROF_SPIN)
+    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+    MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
+            gc_alloc_block_sync.yield);
+    MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
+    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
+    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
+    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
+    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
+    MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
+    MR_STAT("waitForGcThreads_yield", FMT_Word64,
+            waitForGcThreads_yield);
+    MR_STAT("whitehole_gc_spin", FMT_Word64, whitehole_gc_spin);
+    MR_STAT("whitehole_lockClosure_spin", FMT_Word64,
+            whitehole_lockClosure_spin);
+    MR_STAT("whitehole_lockClosure_yield", FMT_Word64,
+            whitehole_lockClosure_yield);
+    MR_STAT("whitehole_executeMessage_spin", FMT_Word64,
+            whitehole_executeMessage_spin);
+    MR_STAT("whitehole_threadPaused_spin", FMT_Word64,
+            whitehole_threadPaused_spin);
+    MR_STAT("any_work", FMT_Word64,
+            stats.any_work);
+    MR_STAT("no_work", FMT_Word64,
+            stats.no_work);
+    MR_STAT("scav_find_work", FMT_Word64,
+            stats.scav_find_work);
+#endif // PROF_SPIN
+#endif // THREADED_RTS
+
+    // finally, per-generation stats. Named as, for example for generation 0,
+    // gen_0_collections
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        const GenerationSummaryStats* gc_sum = &sum->gc_summary_stats[g];
+        MR_STAT_GEN(g, "collections", FMT_Word32, gc_sum->collections);
+        MR_STAT_GEN(g, "par_collections", FMT_Word32, gc_sum->par_collections);
+        MR_STAT_GEN(g, "cpu_seconds", "f", TimeToSecondsDbl(gc_sum->cpu_ns));
+        MR_STAT_GEN(g, "wall_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->elapsed_ns));
+        MR_STAT_GEN(g, "max_pause_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->max_pause_ns));
+        MR_STAT_GEN(g, "avg_pause_seconds", "f",
+                    TimeToSecondsDbl(gc_sum->avg_pause_ns));
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+        MR_STAT_GEN(g, "sync_spin", FMT_Word64, gc_sum->sync_spin);
+        MR_STAT_GEN(g, "sync_yield", FMT_Word64, gc_sum->sync_yield);
+#endif
+    }
 
-        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
-            showStgWord64(stats.allocated_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes allocated in the heap\n", temp);
+    statsPrintf(" ]\n");
+}
 
-            showStgWord64(stats.copied_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes copied during GC\n", temp);
+static void report_one_line(const RTSSummaryStats * sum)
+{
+    // We should do no calculation, other than unit changes and formatting, and
+    // we should not not use any data from outside of globals, sum and stats
+    // here. See Note [RTS Stats Reporting]
 
-            if ( stats.major_gcs > 0 ) {
-                showStgWord64(stats.max_live_bytes, temp, true/*commas*/);
-                statsPrintf("%16s bytes maximum residency (%" FMT_Word32
-                            " sample(s))\n",
-                            temp, stats.major_gcs);
-            }
+    /* print the long long separately to avoid bugginess on mingwin (2001-07-02,
+    mingw-0.5) */
+    statsPrintf("<<ghc: %" FMT_Word64 " bytes, "
+                "%" FMT_Word32 " GCs, "
+                "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency "
+                "(%" FMT_Word32 " samples), "
+                "%" FMT_Word64 "M in use, "
+                "%.3f INIT (%.3f elapsed), "
+                "%.3f MUT (%.3f elapsed), "
+                "%.3f GC (%.3f elapsed) :ghc>>\n",
+                stats.allocated_bytes,
+                stats.gcs,
+                sum->average_bytes_used,
+                stats.max_live_bytes,
+                stats.major_gcs,
+                stats.max_mem_in_use_bytes / (1024 * 1024),
+                TimeToSecondsDbl(stats.init_cpu_ns),
+                TimeToSecondsDbl(stats.init_elapsed_ns),
+                TimeToSecondsDbl(stats.mutator_cpu_ns),
+                TimeToSecondsDbl(stats.mutator_elapsed_ns),
+                TimeToSecondsDbl(stats.gc_cpu_ns),
+                TimeToSecondsDbl(stats.gc_elapsed_ns));
+}
 
-            showStgWord64(stats.max_slop_bytes, temp, true/*commas*/);
-            statsPrintf("%16s bytes maximum slop\n", temp);
-
-            statsPrintf("%16" FMT_SizeT " MB total memory in use (%"
-                        FMT_SizeT " MB lost due to fragmentation)\n\n",
-                        (size_t)(peak_mblocks_allocated * MBLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)),
-                        (size_t)(peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_)));
-
-            /* Print garbage collections in each gen */
-            statsPrintf("                                     Tot time (elapsed)  Avg pause  Max pause\n");
-            for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                gen = &generations[g];
-                statsPrintf("  Gen %2d     %5d colls, %5d par   %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
-                            gen->no,
-                            gen->collections,
-                            gen->par_collections,
-                            TimeToSecondsDbl(GC_coll_cpu[g]),
-                            TimeToSecondsDbl(GC_coll_elapsed[g]),
-                            gen->collections == 0 ? 0 : TimeToSecondsDbl(GC_coll_elapsed[g] / gen->collections),
-                            TimeToSecondsDbl(GC_coll_max_pause[g]));
-            }
+void
+stat_exit (void)
+{
+    RTSSummaryStats sum;
+    uint32_t g;
 
-#if defined(THREADED_RTS)
-            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
-                // See Note [Work Balance]
-                statsPrintf("\n  Parallel GC work balance: %.2f%% (serial 0%%, perfect 100%%)\n",
-                    100 * (double)stats.cumulative_par_balanced_copied_bytes /
-                          (double)stats.par_copied_bytes);
-            }
-#endif
-            statsPrintf("\n");
+    init_RTSSummaryStats(&sum);
+    if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
+        // First we tidy the times in stats, and populate the times in sum.
+        // In particular, we adjust the gc_* time in stats to remove
+        // profiling times.
+        {
+            Time now_cpu_ns, now_elapsed_ns;
+            Time exit_gc_cpu = 0;
+            Time exit_gc_elapsed = 0;
+            Time prof_cpu = 0;
+            Time prof_elapsed = 0;
+
+            getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+
+            stats.cpu_ns = now_cpu_ns - start_init_cpu;
+            stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
+            /* avoid divide by zero if stats.total_cpu_ns is measured as 0.00
+               seconds -- SDM */
+            if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
+            if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
+
+            prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
+            prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
+
+            // heapCensus() is called by the GC, so RP and HC time are
+            // included in the GC stats.  We therefore subtract them to
+            // obtain the actual GC cpu time.
+            stats.gc_cpu_ns      -=  prof_cpu;
+            stats.gc_elapsed_ns  -=  prof_elapsed;
 
-#if defined(THREADED_RTS)
-            statsPrintf("  TASKS: %d (%d bound, %d peak workers (%d total), using -N%d)\n",
-                        taskCount, taskCount - workerCount,
-                        peakWorkerCount, workerCount,
-                        n_capabilities);
-
-            statsPrintf("\n");
-
-            {
-                uint32_t i;
-                SparkCounters sparks = { 0, 0, 0, 0, 0, 0};
-                for (i = 0; i < n_capabilities; i++) {
-                    sparks.created   += capabilities[i]->spark_stats.created;
-                    sparks.dud       += capabilities[i]->spark_stats.dud;
-                    sparks.overflowed+= capabilities[i]->spark_stats.overflowed;
-                    sparks.converted += capabilities[i]->spark_stats.converted;
-                    sparks.gcd       += capabilities[i]->spark_stats.gcd;
-                    sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
-                }
-
-                statsPrintf("  SPARKS: %" FMT_Word " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" FMT_Word " dud, %" FMT_Word " GC'd, %" FMT_Word " fizzled)\n\n",
-                            sparks.created + sparks.dud + sparks.overflowed,
-                            sparks.converted, sparks.overflowed, sparks.dud,
-                            sparks.gcd, sparks.fizzled);
-            }
-#endif
+#if defined(PROFILING)
+            sum.rp_cpu_ns = RP_tot_time;
+            sum.rp_elapsed_ns = RPe_tot_time;
+            sum.hc_cpu_ns = HC_tot_time;
+            sum.hc_elapsed_ns = HCe_tot_time;
+#endif // PROFILING
+
+            // We do a GC during the EXIT phase. We'll attribute the cost of
+            // that to GC instead of EXIT, so carefully subtract it from the
+            // EXIT time.
+            exit_gc_cpu          = stats.gc_cpu_ns - start_exit_gc_cpu;
+            exit_gc_elapsed      = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+            sum.exit_cpu_ns     = end_exit_cpu
+                                      - start_exit_cpu
+                                      - exit_gc_cpu;
+            sum.exit_elapsed_ns = end_exit_elapsed
+                                       - start_exit_elapsed
+                                       - exit_gc_elapsed;
+
+            stats.mutator_cpu_ns     = start_exit_cpu
+                                 - end_init_cpu
+                                 - (stats.gc_cpu_ns - exit_gc_cpu)
+                                 - prof_cpu;
+            stats.mutator_elapsed_ns = start_exit_elapsed
+                                 - end_init_elapsed
+                                 - (stats.gc_elapsed_ns - exit_gc_elapsed)
+                                 - prof_elapsed;
+
+            if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
+
+            // The subdivision of runtime into INIT/EXIT/GC/MUT is just adding
+            // and subtracting, so the parts should add up to the total exactly.
+            // Note that stats->total_ns is captured a tiny bit later than
+            // end_exit_elapsed, so we don't use it here.
+            ASSERT(stats.init_elapsed_ns \
+                   + stats.mutator_elapsed_ns \
+                   + stats.gc_elapsed_ns \
+                   + sum.exit_elapsed_ns \
+                   == end_exit_elapsed - start_init_elapsed);
 
-            statsPrintf("  INIT    time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed));
+        }
 
-            statsPrintf("  MUT     time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed));
-            statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
-                        TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+        // REVIEWERS: it's not clear to me why the following isn't done in
+        // stat_endGC of the last garbage collection?
+
+        // We account for the last garbage collection
+        {
+            uint64_t tot_alloc_bytes = calcTotalAllocated() * sizeof(W_);
+            stats.gc.allocated_bytes = tot_alloc_bytes - stats.allocated_bytes;
+            stats.allocated_bytes = tot_alloc_bytes;
+            if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
+                statsPrintf("%9" FMT_Word " %9.9s %9.9s",
+                            (W_)stats.gc.allocated_bytes, "", "");
+                statsPrintf(" %6.3f %6.3f\n\n", 0.0, 0.0);
+            }
+        }
 
-#if defined(PROFILING)
-            statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(RP_tot_time), TimeToSecondsDbl(RPe_tot_time));
-            statsPrintf("  PROF    time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(HC_tot_time), TimeToSecondsDbl(HCe_tot_time));
-#endif
-            statsPrintf("  EXIT    time  %7.3fs  (%7.3fs elapsed)\n",
-                    TimeToSecondsDbl(exit_cpu), TimeToSecondsDbl(exit_elapsed));
-            statsPrintf("  Total   time  %7.3fs  (%7.3fs elapsed)\n\n",
-                    TimeToSecondsDbl(tot_cpu), TimeToSecondsDbl(tot_elapsed));
-#if !defined(THREADED_RTS)
-            statsPrintf("  %%GC     time     %5.1f%%  (%.1f%% elapsed)\n\n",
-                    TimeToSecondsDbl(gc_cpu)*100/TimeToSecondsDbl(tot_cpu),
-                    TimeToSecondsDbl(gc_elapsed)*100/TimeToSecondsDbl(tot_elapsed));
-#endif
+        // We populate the remainder (non-time elements) of sum
+        {
+    #if defined(THREADED_RTS)
+            uint32_t i;
+            sum.bound_task_count = taskCount - workerCount;
+
+            for (i = 0; i < n_capabilities; i++) {
+                sum.sparks.created   += capabilities[i]->spark_stats.created;
+                sum.sparks.dud       += capabilities[i]->spark_stats.dud;
+                sum.sparks.overflowed+=
+                  capabilities[i]->spark_stats.overflowed;
+                sum.sparks.converted +=
+                  capabilities[i]->spark_stats.converted;
+                sum.sparks.gcd       += capabilities[i]->spark_stats.gcd;
+                sum.sparks.fizzled   += capabilities[i]->spark_stats.fizzled;
+            }
 
-            if (mut_cpu == 0) {
-                showStgWord64(0, temp, true/*commas*/);
+            sum.sparks_count = sum.sparks.created
+                + sum.sparks.dud
+                + sum.sparks.overflowed;
+
+            if (RtsFlags.ParFlags.parGcEnabled && stats.par_copied_bytes > 0) {
+                // See Note [Work Balance]
+                sum.work_balance =
+                    (double)stats.cumulative_par_balanced_copied_bytes
+                    / (double)stats.par_copied_bytes;
             } else {
-                showStgWord64(
-                    (StgWord64)((double)stats.allocated_bytes /
-                                TimeToSecondsDbl(mut_cpu)),
-                    temp, true/*commas*/);
+                sum.work_balance = 0;
             }
 
-            statsPrintf("  Alloc rate    %s bytes per MUT second\n\n", temp);
 
-            statsPrintf("  Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n",
-                    TimeToSecondsDbl(tot_cpu - gc_cpu -
-                                PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100
-                    / TimeToSecondsDbl(tot_cpu),
-                    TimeToSecondsDbl(tot_elapsed - gc_elapsed -
-                                PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
-                    / TimeToSecondsDbl(tot_elapsed));
-
-            // See Note [Internal Counter Stats] for a description of the
-            // following counters. If you add a counter here, please remember
-            // to update the Note.
-            if (RtsFlags.MiscFlags.internalCounters) {
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
-                uint32_t g;
-                const int32_t col_width[] = {4, -30, 14, 14};
-                statsPrintf("Internal Counters:\n");
-                statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
-                            , col_width[0], ""
-                            , col_width[1], "SpinLock"
-                            , col_width[2], "Spins"
-                            , col_width[3], "Yields");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "gc_alloc_block_sync"
-                            , col_width[2], gc_alloc_block_sync.spin
-                            , col_width[3], gc_alloc_block_sync.yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "gc_spin"
-                            , col_width[2], stats.gc_spin_spin
-                            , col_width[3], stats.gc_spin_yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "mut_spin"
-                            , col_width[2], stats.mut_spin_spin
-                            , col_width[3], stats.mut_spin_yield);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_gc"
-                            , col_width[2], whitehole_gc_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_threadPaused"
-                            , col_width[2], whitehole_threadPaused_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_executeMessage"
-                            , col_width[2], whitehole_executeMessage_spin
-                            , col_width[3], "n/a");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "whitehole_lockClosure"
-                            , col_width[2], whitehole_lockClosure_spin
-                            , col_width[3], whitehole_lockClosure_yield);
-                // waitForGcThreads isn't really spin-locking(see the function)
-                // but these numbers still seem useful.
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "waitForGcThreads"
-                            , col_width[2], waitForGcThreads_spin
-                            , col_width[3], waitForGcThreads_yield);
-
-                for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-                    int prefix_length = 0;
-                    statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
-                                col_width[0], "", g, &prefix_length);
-                    prefix_length -= col_width[0];
-                    int suffix_length = col_width[1] + prefix_length;
-                    suffix_length =
-                      suffix_length > 0 ? col_width[1] : suffix_length;
-
-                    statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                                , suffix_length, "].sync"
-                                , col_width[2], generations[g].sync.spin
-                                , col_width[3], generations[g].sync.yield);
-                }
-                statsPrintf("\n");
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "any_work"
-                            , col_width[2], stats.any_work);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "no_work"
-                            , col_width[2], stats.no_work);
-                statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
-                            , col_width[0], ""
-                            , col_width[1], "scav_find_work"
-                            , col_width[2], stats.scav_find_work);
-#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
-                statsPrintf("Internal Counters require the RTS to be built "
-                        "with PROF_SPIN"); // PROF_SPIN is not #defined here
-#else // THREADED_RTS
-                statsPrintf("Internal Counters require the threaded RTS");
-#endif
+    #else // THREADED_RTS
+            sum.gc_cpu_percent     = stats.gc_cpu_ns
+                                  / stats.cpu_ns;
+            sum.gc_elapsed_percent = stats.gc_elapsed_ns
+                                  / stats.elapsed_ns;
+    #endif // THREADED_RTS
+
+            sum.fragmentation_bytes =
+                (uint64_t)(peak_mblocks_allocated
+                         * BLOCKS_PER_MBLOCK
+                         * BLOCK_SIZE_W
+                         - hw_alloc_blocks * BLOCK_SIZE_W)
+                / (uint64_t)sizeof(W_);
+
+            sum.average_bytes_used = stats.major_gcs == 0 ? 0 :
+                 stats.cumulative_live_bytes/stats.major_gcs,
+
+            sum.alloc_rate = stats.mutator_cpu_ns == 0 ? 0 :
+                (uint64_t)((double)stats.allocated_bytes
+                / TimeToSecondsDbl(stats.mutator_cpu_ns));
+
+            // REVIEWERS: These two values didn't used to include the exit times
+            sum.productivity_cpu_percent =
+                TimeToSecondsDbl(stats.cpu_ns
+                                - stats.gc_cpu_ns
+                                - sum.rp_cpu_ns
+                                - sum.hc_cpu_ns
+                                - stats.init_cpu_ns
+                                - sum.exit_cpu_ns)
+                / TimeToSecondsDbl(stats.cpu_ns);
+
+            sum.productivity_elapsed_percent =
+                TimeToSecondsDbl(stats.elapsed_ns
+                                - stats.gc_elapsed_ns
+                                - sum.rp_elapsed_ns
+                                - sum.hc_elapsed_ns
+                                - stats.init_elapsed_ns
+                                - sum.exit_elapsed_ns)
+                / TimeToSecondsDbl(stats.elapsed_ns);
+
+            for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+                const generation* gen = &generations[g];
+                GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
+                gen_stats->collections = gen->collections;
+                gen_stats->par_collections = gen->par_collections;
+                gen_stats->cpu_ns = GC_coll_cpu[g];
+                gen_stats->elapsed_ns = GC_coll_elapsed[g];
+                gen_stats->max_pause_ns = GC_coll_max_pause[g];
+                gen_stats->avg_pause_ns = gen->collections == 0 ?
+                    0 : (GC_coll_elapsed[g] / gen->collections);
+    #if defined(THREADED_RTS) && defined(PROF_SPIN)
+                gen_stats->sync_spin = gen->sync.spin;
+                gen_stats->sync_yield = gen->sync.yield;
+    #endif // PROF_SPIN
             }
         }
 
+        // Now we generate the report
+        if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) {
+            report_summary(&sum);
+        }
+
         if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
-          char *fmt;
-          if (RtsFlags.MiscFlags.machineReadable) {
-            fmt =
-                " [(\"bytes allocated\", \"%" FMT_Word64 "\")\n"
-                " ,(\"num_GCs\", \"%" FMT_Word32 "\")\n"
-                " ,(\"average_bytes_used\", \"%" FMT_Word64 "\")\n"
-                " ,(\"max_bytes_used\", \"%" FMT_Word64 "\")\n"
-                " ,(\"num_byte_usage_samples\", \"%" FMT_Word32 "\")\n"
-                " ,(\"peak_megabytes_allocated\", \"%" FMT_Word64 "\")\n"
-                " ,(\"init_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"init_wall_seconds\", \"%.3f\")\n"
-                " ,(\"mutator_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"mutator_wall_seconds\", \"%.3f\")\n"
-                " ,(\"GC_cpu_seconds\", \"%.3f\")\n"
-                " ,(\"GC_wall_seconds\", \"%.3f\")\n"
-                " ]\n";
-          }
-          else {
-            fmt =
-                "<<ghc: %" FMT_Word64 " bytes, "
-                "%" FMT_Word32 " GCs, "
-                "%" FMT_Word64 "/%" FMT_Word64 " avg/max bytes residency (%" FMT_Word32 " samples), "
-                "%" FMT_Word64 "M in use, "
-                "%.3f INIT (%.3f elapsed), "
-                "%.3f MUT (%.3f elapsed), "
-                "%.3f GC (%.3f elapsed) :ghc>>\n";
-          }
-          /* print the long long separately to avoid bugginess on mingwin (2001-07-02, mingw-0.5) */
-          statsPrintf(fmt,
-                    stats.allocated_bytes,
-                    stats.gcs,
-                     (uint64_t)
-                      (stats.major_gcs == 0 ? 0 :
-                       stats.cumulative_live_bytes/stats.major_gcs),
-                    stats.max_live_bytes,
-                    stats.major_gcs,
-                    (uint64_t) (peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)),
-                    TimeToSecondsDbl(init_cpu), TimeToSecondsDbl(init_elapsed),
-                    TimeToSecondsDbl(mut_cpu), TimeToSecondsDbl(mut_elapsed),
-                    TimeToSecondsDbl(gc_cpu), TimeToSecondsDbl(gc_elapsed));
+            if (RtsFlags.MiscFlags.machineReadable) {
+                report_machine_readable(&sum);
+            }
+            else {
+                report_one_line(&sum);
+            }
         }
 
+        free_RTSSummaryStats(&sum);
         statsFlush();
         statsClose();
     }
@@ -1055,7 +1371,7 @@ waitForGcThreads:
   for a parallel garbage collection. We yield more than we spin in this case.
 
 In several places in the runtime we must take a lock on a closure. To do this,
-we replace it's info table with stg_WHITEHOLE_info, spinning if it is already
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
 a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
 don't. We count these white-hole spins and include them in the SpinLocks table.
 If a particular loop does not yield, we put "n/a" in the table. They are named
@@ -1141,8 +1457,9 @@ statDescribeGens(void)
 
       gen_slop = gen_blocks * BLOCK_SIZE_W - gen_live;
 
-      debugBelch("%8" FMT_Word " %8d  %8d %9" FMT_Word " %9" FMT_Word "\n", gen_blocks, lge, compacts,
-                 gen_live*(W_)sizeof(W_), gen_slop*(W_)sizeof(W_));
+      debugBelch("%8" FMT_Word " %8d  %8d %9" FMT_Word " %9" FMT_Word "\n",
+                 gen_blocks, lge, compacts, gen_live*(W_)sizeof(W_),
+                 gen_slop*(W_)sizeof(W_));
       tot_live += gen_live;
       tot_slop += gen_slop;
   }
diff --git a/rts/Stats.h b/rts/Stats.h
index 1c56344f81..7cd49fc768 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -66,4 +66,49 @@ void      statDescribeGens( void );
 Time      stat_getElapsedGCTime(void);
 Time      stat_getElapsedTime(void);
 
+typedef struct GenerationSummaryStats_ {
+    uint32_t collections;
+    uint32_t par_collections;
+    Time cpu_ns;
+    Time elapsed_ns;
+    Time max_pause_ns;
+    Time avg_pause_ns;
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+    uint64_t sync_spin;
+    uint64_t sync_yield;
+#endif
+} GenerationSummaryStats;
+
+typedef struct RTSSummaryStats_ {
+    // These profiling times could potentially be in RTSStats. However, I'm not
+    // confident enough to do this now, since there is some logic depending on
+    // global state that I do not understand. (Or if I do understand it, it's
+    // wrong)
+    Time rp_cpu_ns;
+    Time rp_elapsed_ns;
+    Time hc_cpu_ns;
+    Time hc_elapsed_ns;
+
+    Time exit_cpu_ns;
+    Time exit_elapsed_ns;
+
+#if defined(THREADED_RTS)
+    uint32_t bound_task_count;
+    uint64_t sparks_count;
+    SparkCounters sparks;
+    double work_balance;
+#else // THREADED_RTS
+    double gc_cpu_percent;
+    double gc_elapsed_percent;
+#endif
+    uint64_t fragmentation_bytes;
+    uint64_t average_bytes_used; // This is not shown in the '+RTS -s' report
+    uint64_t alloc_rate;
+    double productivity_cpu_percent;
+    double productivity_elapsed_percent;
+
+    // one for each generation, 0 first
+    GenerationSummaryStats* gc_summary_stats;
+} RTSSummaryStats;
+
 #include "EndPrivate.h"