10 files changed, 286 insertions, 24 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index 488cab5f3f..3cf02c0d8a 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -151,6 +151,23 @@ typedef struct GCDetails_ {
   Time cpu_ns;
     // The time elapsed during GC itself
   Time elapsed_ns;
+
+    //
+    // Concurrent garbage collector
+    //
+
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_elapsed_ns;
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_elapsed_ns;
 } GCDetails;
 
 //
@@ -241,6 +258,28 @@ typedef struct _RTSStats {
     // The number of times a GC thread has iterated it's outer loop across all
     // parallel GCs
   uint64_t scav_find_work;
+
+  // ----------------------------------
+  // Concurrent garbage collector
+
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_sync_elapsed_ns;
+    // The maximum time elapsed during the post-mark pause phase of the
+    // concurrent nonmoving GC.
+  Time nonmoving_gc_sync_max_elapsed_ns;
+    // The CPU time used during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_cpu_ns;
+    // The time elapsed during the post-mark pause phase of the concurrent
+    // nonmoving GC.
+  Time nonmoving_gc_elapsed_ns;
+    // The maximum time elapsed during the post-mark pause phase of the
+    // concurrent nonmoving GC.
+  Time nonmoving_gc_max_elapsed_ns;
 } RTSStats;
 
 void getRTSStats (RTSStats *s);
diff --git a/includes/rts/Time.h b/includes/rts/Time.h
index 00d98b4f09..ab291cd6c5 100644
--- a/includes/rts/Time.h
+++ b/includes/rts/Time.h
@@ -33,6 +33,7 @@ typedef int64_t Time;
 
 #define SecondsToTime(t) ((Time)(t) * TIME_RESOLUTION)
 #define TimeToSeconds(t) ((t) / TIME_RESOLUTION)
+#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
 // Use instead of SecondsToTime() when we have a floating-point
 // seconds value, to avoid truncating it.
diff --git a/libraries/base/GHC/Stats.hsc b/libraries/base/GHC/Stats.hsc
index 58b5e22d04..92a77504b6 100644
--- a/libraries/base/GHC/Stats.hsc
+++ b/libraries/base/GHC/Stats.hsc
@@ -103,6 +103,25 @@ data RTSStats = RTSStats {
     -- | Total elapsed time (at the previous GC)
   , elapsed_ns :: RtsTime
 
+    -- | The CPU time used during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_sync_cpu_ns :: RtsTime
+    -- | The time elapsed during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_sync_elapsed_ns :: RtsTime
+    -- | The maximum time elapsed during the post-mark pause phase of the
+    -- concurrent nonmoving GC.
+  , nonmoving_gc_sync_max_elapsed_ns :: RtsTime
+    -- | The CPU time used during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_cpu_ns :: RtsTime
+    -- | The time elapsed during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , nonmoving_gc_elapsed_ns :: RtsTime
+    -- | The maximum time elapsed during the post-mark pause phase of the
+    -- concurrent nonmoving GC.
+  , nonmoving_gc_max_elapsed_ns :: RtsTime
+
     -- | Details about the most recent GC
   , gc :: GCDetails
   } deriving ( Read -- ^ @since 4.10.0.0
@@ -146,6 +165,13 @@ data GCDetails = GCDetails {
   , gcdetails_cpu_ns :: RtsTime
     -- | The time elapsed during GC itself
   , gcdetails_elapsed_ns :: RtsTime
+
+    -- | The CPU time used during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , gcdetails_nonmoving_gc_sync_cpu_ns :: RtsTime
+    -- | The time elapsed during the post-mark pause phase of the concurrent
+    -- nonmoving GC.
+  , gcdetails_nonmoving_gc_sync_elapsed_ns :: RtsTime
   } deriving ( Read -- ^ @since 4.10.0.0
              , Show -- ^ @since 4.10.0.0
              )
@@ -192,6 +218,12 @@ getRTSStats = do
     gc_elapsed_ns <- (# peek RTSStats, gc_elapsed_ns) p
     cpu_ns <- (# peek RTSStats, cpu_ns) p
     elapsed_ns <- (# peek RTSStats, elapsed_ns) p
+    nonmoving_gc_sync_cpu_ns <- (# peek RTSStats, nonmoving_gc_sync_cpu_ns) p
+    nonmoving_gc_sync_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_elapsed_ns) p
+    nonmoving_gc_sync_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_sync_max_elapsed_ns) p
+    nonmoving_gc_cpu_ns <- (# peek RTSStats, nonmoving_gc_cpu_ns) p
+    nonmoving_gc_elapsed_ns <- (# peek RTSStats, nonmoving_gc_elapsed_ns) p
+    nonmoving_gc_max_elapsed_ns <- (# peek RTSStats, nonmoving_gc_max_elapsed_ns) p
     let pgc = (# ptr RTSStats, gc) p
     gc <- do
       gcdetails_gen <- (# peek GCDetails, gen) pgc
@@ -211,5 +243,7 @@ getRTSStats = do
       gcdetails_sync_elapsed_ns <- (# peek GCDetails, sync_elapsed_ns) pgc
       gcdetails_cpu_ns <- (# peek GCDetails, cpu_ns) pgc
       gcdetails_elapsed_ns <- (# peek GCDetails, elapsed_ns) pgc
+      gcdetails_nonmoving_gc_sync_cpu_ns <- (# peek GCDetails, nonmoving_gc_sync_cpu_ns) pgc
+      gcdetails_nonmoving_gc_sync_elapsed_ns <- (# peek GCDetails, nonmoving_gc_sync_elapsed_ns) pgc
       return GCDetails{..}
     return RTSStats{..}
diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c
index bc439c1678..efb312e899 100644
--- a/rts/ProfHeap.c
+++ b/rts/ProfHeap.c
@@ -1212,6 +1212,8 @@ heapCensusChain( Census *census, bdescr *bd )
     }
 }
 
+// Time is process CPU time of beginning of current GC and is used as
+// the mutator CPU time reported as the census timestamp.
 void heapCensus (Time t)
 {
   uint32_t g, n;
@@ -1219,7 +1221,7 @@ void heapCensus (Time t)
   gen_workspace *ws;
 
   census = &censuses[era];
-  census->time  = mut_user_time_until(t);
+  census->time  = TimeToSecondsDbl(t);
   census->rtime = TimeToNS(stat_getElapsedTime());
 
 
diff --git a/rts/Stats.c b/rts/Stats.c
index 889f6d92f3..0e6cb09b27 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -26,14 +26,14 @@
 
 #include <string.h> // for memset
 
-#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
-
 static Time
     start_init_cpu, start_init_elapsed,
     end_init_cpu,   end_init_elapsed,
     start_exit_cpu, start_exit_elapsed,
     start_exit_gc_elapsed, start_exit_gc_cpu,
-    end_exit_cpu,   end_exit_elapsed;
+    end_exit_cpu,   end_exit_elapsed,
+    start_nonmoving_gc_cpu, start_nonmoving_gc_elapsed,
+    start_nonmoving_gc_sync_elapsed;
 
 #if defined(PROFILING)
 static Time RP_start_time  = 0, RP_tot_time  = 0;  // retainer prof user time
@@ -84,7 +84,7 @@ Time stat_getElapsedTime(void)
 double
 mut_user_time_until( Time t )
 {
-    return TimeToSecondsDbl(t - stats.gc_cpu_ns);
+    return TimeToSecondsDbl(t - stats.gc_cpu_ns - stats.nonmoving_gc_cpu_ns);
     // heapCensus() time is included in GC_tot_cpu, so we don't need
     // to subtract it here.
 
@@ -125,6 +125,10 @@ initStats0(void)
     end_init_cpu     = 0;
     end_init_elapsed  = 0;
 
+    start_nonmoving_gc_cpu = 0;
+    start_nonmoving_gc_elapsed = 0;
+    start_nonmoving_gc_sync_elapsed = 0;
+
     start_exit_cpu    = 0;
     start_exit_elapsed = 0;
     start_exit_gc_cpu    = 0;
@@ -175,6 +179,11 @@ initStats0(void)
         .gc_elapsed_ns = 0,
         .cpu_ns = 0,
         .elapsed_ns = 0,
+        .nonmoving_gc_cpu_ns = 0,
+        .nonmoving_gc_elapsed_ns = 0,
+        .nonmoving_gc_max_elapsed_ns = 0,
+        .nonmoving_gc_sync_elapsed_ns = 0,
+        .nonmoving_gc_sync_max_elapsed_ns = 0,
         .gc = {
             .gen = 0,
             .threads = 0,
@@ -189,7 +198,10 @@ initStats0(void)
             .par_balanced_copied_bytes = 0,
             .sync_elapsed_ns = 0,
             .cpu_ns = 0,
-            .elapsed_ns = 0
+            .elapsed_ns = 0,
+            .nonmoving_gc_cpu_ns = 0,
+            .nonmoving_gc_elapsed_ns = 0,
+            .nonmoving_gc_sync_elapsed_ns = 0,
         }
     };
 }
@@ -274,6 +286,11 @@ stat_startExit(void)
     start_exit_gc_cpu = stats.gc_cpu_ns;
 }
 
+/* -----------------------------------------------------------------------------
+   Nonmoving (concurrent) collector statistics
+
+   These two measure the time taken in the concurrent mark & sweep collector.
+   -------------------------------------------------------------------------- */
 void
 stat_endExit(void)
 {
@@ -286,10 +303,115 @@ stat_startGCSync (gc_thread *gct)
     gct->gc_sync_start_elapsed = getProcessElapsedTime();
 }
 
+void
+stat_startNonmovingGc ()
+{
+    start_nonmoving_gc_cpu = getCurrentThreadCPUTime();
+    start_nonmoving_gc_elapsed = getProcessCPUTime();
+}
+
+void
+stat_endNonmovingGc ()
+{
+    Time cpu = getCurrentThreadCPUTime();
+    Time elapsed = getProcessCPUTime();
+    stats.gc.nonmoving_gc_elapsed_ns = elapsed - start_nonmoving_gc_elapsed;
+    stats.nonmoving_gc_elapsed_ns += stats.gc.nonmoving_gc_elapsed_ns;
+
+    stats.gc.nonmoving_gc_cpu_ns = cpu - start_nonmoving_gc_cpu;
+    stats.nonmoving_gc_cpu_ns += stats.gc.nonmoving_gc_cpu_ns;
+
+    stats.nonmoving_gc_max_elapsed_ns =
+      stg_max(stats.gc.nonmoving_gc_elapsed_ns,
+              stats.nonmoving_gc_max_elapsed_ns);
+}
+
+void
+stat_startNonmovingGcSync ()
+{
+    start_nonmoving_gc_sync_elapsed = getProcessElapsedTime();
+    traceConcSyncBegin();
+}
+
+void
+stat_endNonmovingGcSync ()
+{
+    Time end_elapsed = getProcessElapsedTime();
+    stats.gc.nonmoving_gc_sync_elapsed_ns = end_elapsed - start_nonmoving_gc_sync_elapsed;
+    stats.nonmoving_gc_sync_elapsed_ns +=  stats.gc.nonmoving_gc_sync_elapsed_ns;
+    stats.nonmoving_gc_sync_max_elapsed_ns =
+      stg_max(stats.gc.nonmoving_gc_sync_elapsed_ns,
+              stats.nonmoving_gc_sync_max_elapsed_ns);
+    traceConcSyncEnd();
+}
+
 /* -----------------------------------------------------------------------------
    Called at the beginning of each GC
    -------------------------------------------------------------------------- */
 
+/*
+ * Note [Time accounting]
+ * ~~~~~~~~~~~~~~~~~~~~~~
+ * In the "vanilla" configuration (using the standard copying GC) GHC keeps
+ * track of a two different sinks of elapsed and CPU time:
+ *
+ *  - time spent synchronising to initiate garbage collection
+ *  - garbage collection (per generation)
+ *  - mutation
+ *
+ * When using the (concurrent) non-moving garbage collector (see Note
+ * [Non-moving garbage collector]) we also track a few more sinks:
+ *
+ *  - minor GC
+ *  - major GC (namly time spent in the preparatory phase)
+ *  - concurrent mark
+ *  - final synchronization (elapsed only)
+ *  - mutation
+ *
+ * To keep track of these CPU times we rely on the system's per-thread CPU time
+ * clock (exposed via the runtime's getCurrentThreadCPUTime utility).
+ *
+ * CPU time spent in the copying garbage collector is tracked in each GC
+ * worker's gc_thread struct. At the beginning of scavenging each worker
+ * records its OS thread's CPU time its gc_thread (by stat_startGCWorker). At
+ * the end of scavenging we again record the CPU time (in stat_endGCworker).
+ * The differences of these are then summed over by the thread leading the GC
+ * at the end of collection in stat_endGC. By contrast, the elapsed time is
+ * recorded only by the leader.
+ *
+ * Mutator time is derived from the process's CPU time, subtracting out
+ * contributions from stop-the-world and concurrent GCs.
+ *
+ * Time spent in concurrent marking is recorded by stat_{start,end}NonmovingGc.
+ * Likewise, elapsed time spent in the final synchronization is recorded by
+ * stat_{start,end}NonmovingGcSync.
+ */
+
+void
+stat_startGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
+{
+    bool stats_enabled =
+        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
+        rtsConfig.gcDoneHook != NULL;
+
+    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
+        gct->gc_start_cpu = getCurrentThreadCPUTime();
+    }
+}
+
+void
+stat_endGCWorker (Capability *cap STG_UNUSED, gc_thread *gct)
+{
+    bool stats_enabled =
+        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
+        rtsConfig.gcDoneHook != NULL;
+
+    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
+        gct->gc_end_cpu = getCurrentThreadCPUTime();
+        ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
+    }
+}
+
 void
 stat_startGC (Capability *cap, gc_thread *gct)
 {
@@ -297,7 +419,15 @@ stat_startGC (Capability *cap, gc_thread *gct)
         debugBelch("\007");
     }
 
-    getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed);
+    bool stats_enabled =
+        RtsFlags.GcFlags.giveStats != NO_GC_STATS ||
+        rtsConfig.gcDoneHook != NULL;
+
+    if (stats_enabled || RtsFlags.ProfFlags.doHeapProfile) {
+        gct->gc_start_cpu = getCurrentThreadCPUTime();
+    }
+
+    gct->gc_start_elapsed = getProcessElapsedTime();
 
     // Post EVENT_GC_START with the same timestamp as used for stats
     // (though converted from Time=StgInt64 to EventTimestamp=StgWord64).
@@ -320,9 +450,9 @@ stat_startGC (Capability *cap, gc_thread *gct)
    -------------------------------------------------------------------------- */
 
 void
-stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
-            uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
-            W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
+stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
+            uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
+            W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
             W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
             W_ scav_find_work)
 {
@@ -364,9 +494,14 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
         stats.elapsed_ns = current_elapsed - start_init_elapsed;
 
         stats.gc.sync_elapsed_ns =
-            gct->gc_start_elapsed - gct->gc_sync_start_elapsed;
-        stats.gc.elapsed_ns = current_elapsed - gct->gc_start_elapsed;
-        stats.gc.cpu_ns = current_cpu - gct->gc_start_cpu;
+            initiating_gct->gc_start_elapsed - initiating_gct->gc_sync_start_elapsed;
+        stats.gc.elapsed_ns = current_elapsed - initiating_gct->gc_start_elapsed;
+        stats.gc.cpu_ns = 0;
+        for (unsigned int i=0; i < par_n_threads; i++) {
+            gc_thread *gct = gc_threads[i];
+            ASSERT(gct->gc_end_cpu >= gct->gc_start_cpu);
+            stats.gc.cpu_ns += gct->gc_end_cpu - gct->gc_start_cpu;
+        }
     }
     // -------------------------------------------------
     // Update the cumulative stats
@@ -473,8 +608,8 @@ stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
                     TimeToSecondsDbl(stats.gc.elapsed_ns),
                     TimeToSecondsDbl(stats.cpu_ns),
                     TimeToSecondsDbl(stats.elapsed_ns),
-                    faults - gct->gc_start_faults,
-                        gct->gc_start_faults - GC_end_faults,
+                    faults - initiating_gct->gc_start_faults,
+                        initiating_gct->gc_start_faults - GC_end_faults,
                     gen);
 
             GC_end_faults = faults;
@@ -700,6 +835,21 @@ static void report_summary(const RTSSummaryStats* sum)
                     TimeToSecondsDbl(gen_stats->avg_pause_ns),
                     TimeToSecondsDbl(gen_stats->max_pause_ns));
     }
+    if (RtsFlags.GcFlags.useNonmoving) {
+        const int n_major_colls = sum->gc_summary_stats[RtsFlags.GcFlags.generations-1].collections;
+        statsPrintf("  Gen  1     %5d syncs"
+                    ",                      %6.3fs     %3.4fs    %3.4fs\n",
+                    n_major_colls,
+                    TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns),
+                    TimeToSecondsDbl(stats.nonmoving_gc_sync_elapsed_ns) / n_major_colls,
+                    TimeToSecondsDbl(stats.nonmoving_gc_sync_max_elapsed_ns));
+        statsPrintf("  Gen  1      concurrent"
+                    ",             %6.3fs  %6.3fs     %3.4fs    %3.4fs\n",
+                    TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
+                    TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns),
+                    TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns) / n_major_colls,
+                    TimeToSecondsDbl(stats.nonmoving_gc_max_elapsed_ns));
+    }
 
     statsPrintf("\n");
 
@@ -736,6 +886,12 @@ static void report_summary(const RTSSummaryStats* sum)
     statsPrintf("  GC      time  %7.3fs  (%7.3fs elapsed)\n",
                 TimeToSecondsDbl(stats.gc_cpu_ns),
                 TimeToSecondsDbl(stats.gc_elapsed_ns));
+    if (RtsFlags.GcFlags.useNonmoving) {
+        statsPrintf(
+                "  CONC GC time  %7.3fs  (%7.3fs elapsed)\n",
+                TimeToSecondsDbl(stats.nonmoving_gc_cpu_ns),
+                TimeToSecondsDbl(stats.nonmoving_gc_elapsed_ns));
+    }
 
 #if defined(PROFILING)
     statsPrintf("  RP      time  %7.3fs  (%7.3fs elapsed)\n",
@@ -1094,7 +1250,8 @@ stat_exit (void)
 
             stats.mutator_cpu_ns     = start_exit_cpu
                                  - end_init_cpu
-                                 - (stats.gc_cpu_ns - exit_gc_cpu);
+                                 - (stats.gc_cpu_ns - exit_gc_cpu)
+                                 - stats.nonmoving_gc_cpu_ns;
             stats.mutator_elapsed_ns = start_exit_elapsed
                                  - end_init_elapsed
                                  - (stats.gc_elapsed_ns - exit_gc_elapsed);
@@ -1504,7 +1661,8 @@ void getRTSStats( RTSStats *s )
     s->cpu_ns = current_cpu - end_init_cpu;
     s->elapsed_ns = current_elapsed - end_init_elapsed;
 
-    s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns;
+    s->mutator_cpu_ns = current_cpu - end_init_cpu - stats.gc_cpu_ns -
+        stats.nonmoving_gc_cpu_ns;
     s->mutator_elapsed_ns = current_elapsed - end_init_elapsed -
         stats.gc_elapsed_ns;
 }
diff --git a/rts/Stats.h b/rts/Stats.h
index 1f498aea23..f5b8ce9991 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,13 +30,21 @@ void      stat_endInit(void);
 
 void      stat_startGCSync(struct gc_thread_ *_gct);
 void      stat_startGC(Capability *cap, struct gc_thread_ *_gct);
-void      stat_endGC  (Capability *cap, struct gc_thread_ *_gct, W_ live,
-                       W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
+void      stat_startGCWorker (Capability *cap, struct gc_thread_ *_gct);
+void      stat_endGCWorker (Capability *cap, struct gc_thread_ *_gct);
+void      stat_endGC  (Capability *cap, struct gc_thread_ *initiating_gct, W_ live,
+                       W_ copied, W_ slop, uint32_t gen,
+                       uint32_t n_gc_threads, struct gc_thread_ **gc_threads,
                        W_ par_max_copied, W_ par_balanced_copied,
                        W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
                        W_ mut_spin_yield, W_ any_work, W_ no_work,
                        W_ scav_find_work);
 
+void      stat_startNonmovingGcSync(void);
+void      stat_endNonmovingGcSync(void);
+void      stat_startNonmovingGc (void);
+void      stat_endNonmovingGc (void);
+
 #if defined(PROFILING)
 void      stat_startRP(void);
 void      stat_endRP(uint32_t, int, double);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 526c84f411..16d8c77453 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -209,6 +209,14 @@ GarbageCollect (uint32_t collect_gen,
   gc_thread *saved_gct;
 #endif
   uint32_t g, n;
+  // The time we should report our heap census as occurring at, if necessary.
+  Time mut_time = 0;
+
+  if (do_heap_census) {
+      RTSStats stats;
+      getRTSStats(&stats);
+      mut_time = stats.mutator_cpu_ns;
+  }
 
   // necessary if we stole a callee-saves register for gct:
 #if defined(THREADED_RTS)
@@ -846,7 +854,7 @@ GarbageCollect (uint32_t collect_gen,
   if (do_heap_census) {
       debugTrace(DEBUG_sched, "performing heap census");
       RELEASE_SM_LOCK;
-      heapCensus(gct->gc_start_cpu);
+      heapCensus(mut_time);
       ACQUIRE_SM_LOCK;
   }
 
@@ -927,9 +935,11 @@ GarbageCollect (uint32_t collect_gen,
 #endif
 
   // ok, GC over: tell the stats department what happened.
+  stat_endGCWorker(cap, gct);
   stat_endGC(cap, gct, live_words, copied,
              live_blocks * BLOCK_SIZE_W - live_words /* slop */,
-             N, n_gc_threads, par_max_copied, par_balanced_copied,
+             N, n_gc_threads, gc_threads,
+             par_max_copied, par_balanced_copied,
              gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
              any_work, no_work, scav_find_work);
 
@@ -1209,6 +1219,7 @@ gcWorkerThread (Capability *cap)
 
     SET_GCT(gc_threads[cap->no]);
     gct->id = osThreadId();
+    stat_startGCWorker (cap, gct);
 
     // Wait until we're told to wake up
     RELEASE_SPIN_LOCK(&gct->mut_spin);
@@ -1247,6 +1258,7 @@ gcWorkerThread (Capability *cap)
     gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE;
     debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
                gct->thread_index);
+    stat_endGCWorker (cap, gct);
     ACQUIRE_SPIN_LOCK(&gct->mut_spin);
     debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
 
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 3012f52f28..723bb00bf1 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -185,9 +185,11 @@ typedef struct gc_thread_ {
     W_ no_work;
     W_ scav_find_work;
 
-    Time gc_start_cpu;   // process CPU time
-    Time gc_sync_start_elapsed;  // start of GC sync
-    Time gc_start_elapsed;  // process elapsed time
+    Time gc_start_cpu;             // thread CPU time
+    Time gc_end_cpu;               // thread CPU time
+    Time gc_sync_start_elapsed;    // start of GC sync
+    Time gc_start_elapsed;         // process elapsed time
+    Time gc_end_elapsed;           // process elapsed time
     W_ gc_start_faults;
 
     // -------------------
diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c
index 0bd96d1800..2dd201f0f9 100644
--- a/rts/sm/NonMoving.c
+++ b/rts/sm/NonMoving.c
@@ -17,6 +17,7 @@
 #include "GCThread.h"
 #include "GCTDecl.h"
 #include "Schedule.h"
+#include "Stats.h"
 
 #include "NonMoving.h"
 #include "NonMovingMark.h"
@@ -947,6 +948,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO *
 {
     ACQUIRE_LOCK(&nonmoving_collection_mutex);
     debugTrace(DEBUG_nonmoving_gc, "Starting mark...");
+    stat_startNonmovingGc();
 
     // Do concurrent marking; most of the heap will get marked here.
     nonmovingMarkThreadsWeaks(mark_queue);
@@ -1098,6 +1100,7 @@ finish:
 
     // We are done...
     mark_thread = 0;
+    stat_endNonmovingGc();
 
     // Signal that the concurrent collection is finished, allowing the next
     // non-moving collection to proceed
diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c
index 6d70ca21dd..3113e7f013 100644
--- a/rts/sm/NonMovingMark.c
+++ b/rts/sm/NonMovingMark.c
@@ -21,6 +21,7 @@
 #include "Printer.h"
 #include "Schedule.h"
 #include "Weak.h"
+#include "Stats.h"
 #include "STM.h"
 #include "MarkWeak.h"
 #include "sm/Storage.h"
@@ -316,6 +317,7 @@ void nonmovingBeginFlush(Task *task)
     debugTrace(DEBUG_nonmoving_gc, "Starting update remembered set flush...");
     traceConcSyncBegin();
     upd_rem_set_flush_count = 0;
+    stat_startNonmovingGcSync();
     stopAllCapabilitiesWith(NULL, task, SYNC_FLUSH_UPD_REM_SET);
 
     // XXX: We may have been given a capability via releaseCapability (i.e. a
@@ -407,6 +409,7 @@ void nonmovingFinishFlush(Task *task)
 
     debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush...");
     traceConcSyncEnd();
+    stat_endNonmovingGcSync();
     releaseAllCapabilities(n_capabilities, NULL, task);
 }
 #endif