summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDouglas Wilson <douglas.wilson@gmail.com>2018-03-19 11:55:37 -0400
committerBen Gamari <ben@smart-cactus.org>2018-03-19 12:05:07 -0400
commit2918abf75594001deed51ee252a05b146f844489 (patch)
treeecb789d307060dc887e2dfe01eb390a9c475e8f7
parentf9a6d4207fb0e551821fee847ac064ac31d96bba (diff)
downloadhaskell-2918abf75594001deed51ee252a05b146f844489.tar.gz
rts: Add --internal-counters RTS flag and several counters
The existing internal counters: * gc_alloc_block_sync * whitehole_spin * gen[g].sync * gen[1].sync are now not shown in the -s report unless --internal-counters is also passed. If --internal-counters is passed we now show the counters above, reformatted, as well as several other counters. In particular, we now count the yieldThread() calls that SpinLocks do as well as their spins. The added counters are: * gc_spin (spin and yield) * mut_spin (spin and yield) * whitehole_threadPaused (spin only) * whitehole_executeMessage (spin only) * whitehole_lockClosure (spin only) * waitForGcThreadsd (spin and yield) As well as the following, which are not SpinLock-like things: * any_work * do_work * scav_find_work See the Note for descriptions of what these counters are. We add busy_wait_nops in these loops along with the counter increment where it was absent. Old internal counters output: ``` gc_alloc_block_sync: 0 whitehole_gc_spin: 0 gen[0].sync: 0 gen[1].sync: 0 ``` New internal counters output: ``` Internal Counters: Spins Yields gc_alloc_block_sync 323 0 gc_spin 9016713 752 mut_spin 57360944 47716 whitehole_gc 0 n/a whitehole_threadPaused 0 n/a whitehole_executeMessage 0 n/a whitehole_lockClosure 0 0 waitForGcThreads 2 415 gen[0].sync 6 0 gen[1].sync 1 0 any_work 2017 no_work 2014 scav_find_work 1004 ``` Test Plan: ./validate Check it builds with #define PROF_SPIN removed from includes/rts/Config.h Reviewers: bgamari, erikd, simonmar, hvr Reviewed By: simonmar Subscribers: rwbarton, thomie, carter GHC Trac Issues: #3553, #9221 Differential Revision: https://phabricator.haskell.org/D4302
-rw-r--r--docs/users_guide/runtime_control.rst6
-rw-r--r--includes/RtsAPI.h24
-rw-r--r--includes/rts/Flags.h1
-rw-r--r--includes/rts/SpinLock.h5
-rw-r--r--includes/rts/storage/GC.h2
-rw-r--r--libraries/base/GHC/RTS/Flags.hsc3
-rw-r--r--rts/Messages.c3
-rw-r--r--rts/Messages.h4
-rw-r--r--rts/RtsFlags.c6
-rw-r--r--rts/SMPClosureOps.h12
-rw-r--r--rts/Stats.c189
-rw-r--r--rts/Stats.h5
-rw-r--r--rts/StgMiscClosures.cmm10
-rw-r--r--rts/ThreadPaused.c4
-rw-r--r--rts/ThreadPaused.h8
-rw-r--r--rts/sm/Evac.c9
-rw-r--r--rts/sm/GC.c71
-rw-r--r--rts/sm/GC.h2
-rw-r--r--testsuite/tests/rts/InternalCounters.stdout1
-rw-r--r--testsuite/tests/rts/Makefile19
-rw-r--r--testsuite/tests/rts/all.T2
21 files changed, 346 insertions, 40 deletions
diff --git a/docs/users_guide/runtime_control.rst b/docs/users_guide/runtime_control.rst
index 009e3ae887..92bc739dfe 100644
--- a/docs/users_guide/runtime_control.rst
+++ b/docs/users_guide/runtime_control.rst
@@ -776,6 +776,7 @@ RTS options to produce runtime statistics
-s [⟨file⟩]
-S [⟨file⟩]
--machine-readable
+ --internal-counters
These options produce runtime-system statistics, such as the amount
of time spent executing the program and in the garbage collector,
@@ -785,7 +786,10 @@ RTS options to produce runtime statistics
line of output in the same format as GHC's ``-Rghc-timing`` option,
``-s`` produces a more detailed summary at the end of the program,
and ``-S`` additionally produces information about each and every
- garbage collection.
+ garbage collection. Passing ``--internal-counters`` to a threaded
+ runtime will cause a detailed summary to include various internal
+ counts accumulated during the run; note that these are unspecified
+ and may change between releases.
The output is placed in ⟨file⟩. If ⟨file⟩ is omitted, then the
output is sent to ``stderr``.
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index 27a5080220..6f011cbf6e 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -210,6 +210,30 @@ typedef struct _RTSStats {
GCDetails gc;
+ // -----------------------------------
+ // Internal Counters
+
+ // The number of times a GC thread spun on its 'gc_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
+ uint64_t gc_spin_spin;
+ // The number of times a GC thread yielded on its 'gc_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
+ uint64_t gc_spin_yield;
+ // The number of times a GC thread spun on its 'mut_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
+ uint64_t mut_spin_spin;
+ // The number of times a GC thread yielded on its 'mut_spin' lock.
+ // Will be zero if the rts was not built with PROF_SPIN
+ uint64_t mut_spin_yield;
+ // The number of times a GC thread has checked for work across all parallel
+ // GCs
+ uint64_t any_work;
+ // The number of times a GC thread has checked for work and found none across
+ // all parallel GCs
+ uint64_t no_work;
+ // The number of times a GC thread has iterated it's outer loop across all
+ // parallel GCs
+ uint64_t scav_find_work;
} RTSStats;
void getRTSStats (RTSStats *s);
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index aed4dca384..6487947749 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -195,6 +195,7 @@ typedef struct _MISC_FLAGS {
bool generate_dump_file;
bool generate_stack_trace;
bool machineReadable;
+ bool internalCounters; /* See Note [Internal Counter Stats] */
StgWord linkerMemBase; /* address to ask the OS for memory
* for the linker, NULL ==> off */
} MISC_FLAGS;
diff --git a/includes/rts/SpinLock.h b/includes/rts/SpinLock.h
index 6530a3a2f0..1dca02f795 100644
--- a/includes/rts/SpinLock.h
+++ b/includes/rts/SpinLock.h
@@ -27,7 +27,8 @@
typedef struct SpinLock_
{
StgWord lock;
- StgWord64 spin; // DEBUG version counts how much it spins
+ StgWord64 spin; // incremented every time we spin in ACQUIRE_SPIN_LOCK
+ StgWord64 yield; // incremented every time we yield in ACQUIRE_SPIN_LOCK
} SpinLock;
#else
typedef StgWord SpinLock;
@@ -49,6 +50,7 @@ INLINE_HEADER void ACQUIRE_SPIN_LOCK(SpinLock * p)
p->spin++;
busy_wait_nop();
}
+ p->yield++;
yieldThread();
} while (1);
}
@@ -66,6 +68,7 @@ INLINE_HEADER void initSpinLock(SpinLock * p)
write_barrier();
p->lock = 1;
p->spin = 0;
+ p->yield = 0;
}
#else
diff --git a/includes/rts/storage/GC.h b/includes/rts/storage/GC.h
index 2aed7c57ee..d4182dd7f9 100644
--- a/includes/rts/storage/GC.h
+++ b/includes/rts/storage/GC.h
@@ -120,7 +120,7 @@ typedef struct generation_ {
// stats information
uint32_t collections;
uint32_t par_collections;
- uint32_t failed_promotions;
+ uint32_t failed_promotions; // Currently unused
// ------------------------------------
// Fields below are used during GC only
diff --git a/libraries/base/GHC/RTS/Flags.hsc b/libraries/base/GHC/RTS/Flags.hsc
index 3e712ca900..046975577e 100644
--- a/libraries/base/GHC/RTS/Flags.hsc
+++ b/libraries/base/GHC/RTS/Flags.hsc
@@ -138,6 +138,7 @@ data MiscFlags = MiscFlags
, generateCrashDumpFile :: Bool
, generateStackTrace :: Bool
, machineReadable :: Bool
+ , internalCounters :: Bool
, linkerMemBase :: Word
-- ^ address to ask the OS for memory for the linker, 0 ==> off
} deriving ( Show -- ^ @since 4.8.0.0
@@ -441,6 +442,8 @@ getMiscFlags = do
(#{peek MISC_FLAGS, generate_stack_trace} ptr :: IO CBool))
<*> (toBool <$>
(#{peek MISC_FLAGS, machineReadable} ptr :: IO CBool))
+ <*> (toBool <$>
+ (#{peek MISC_FLAGS, internalCounters} ptr :: IO CBool))
<*> #{peek MISC_FLAGS, linkerMemBase} ptr
getDebugFlags :: IO DebugFlags
diff --git a/rts/Messages.c b/rts/Messages.c
index 8fab314bc4..a9c794d823 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -129,6 +129,9 @@ loop:
}
else if (i == &stg_WHITEHOLE_info)
{
+#if defined(PROF_SPIN)
+ ++whitehole_executeMessage_spin;
+#endif
goto loop;
}
else
diff --git a/rts/Messages.h b/rts/Messages.h
index e60f19dc1d..18371564c4 100644
--- a/rts/Messages.h
+++ b/rts/Messages.h
@@ -31,3 +31,7 @@ doneWithMsgThrowTo (MessageThrowTo *m)
}
#include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_executeMessage_spin;
+#endif
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 7b38bbd6fd..b674e9b685 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -232,6 +232,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.MiscFlags.generate_stack_trace = true;
RtsFlags.MiscFlags.generate_dump_file = false;
RtsFlags.MiscFlags.machineReadable = false;
+ RtsFlags.MiscFlags.internalCounters = false;
RtsFlags.MiscFlags.linkerMemBase = 0;
#if defined(THREADED_RTS)
@@ -888,6 +889,11 @@ error = true;
OPTION_UNSAFE;
RtsFlags.MiscFlags.machineReadable = true;
}
+ else if (strequal("internal-counters",
+ &rts_argv[arg][2])) {
+ OPTION_SAFE;
+ RtsFlags.MiscFlags.internalCounters = true;
+ }
else if (strequal("info",
&rts_argv[arg][2])) {
OPTION_SAFE;
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h
index 4ea1c55976..1d18e1b018 100644
--- a/rts/SMPClosureOps.h
+++ b/rts/SMPClosureOps.h
@@ -38,6 +38,11 @@ EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
#if defined(THREADED_RTS)
+#if defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_lockClosure_spin;
+extern volatile StgWord64 whitehole_lockClosure_yield;
+#endif
+
/* -----------------------------------------------------------------------------
* Locking/unlocking closures
*
@@ -56,7 +61,14 @@ EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
do {
info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+#if defined(PROF_SPIN)
+ ++whitehole_lockClosure_spin;
+#endif
+ busy_wait_nop();
} while (++i < SPIN_COUNT);
+#if defined(PROF_SPIN)
+ ++whitehole_lockClosure_yield;
+#endif
yieldThread();
} while (1);
}
diff --git a/rts/Stats.c b/rts/Stats.c
index 26bdac0ea5..7eb93bef33 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -16,10 +16,15 @@
#include "Profiling.h"
#include "GetTime.h"
#include "sm/Storage.h"
-#include "sm/GC.h" // gc_alloc_block_sync, whitehole_gc_spin
#include "sm/GCThread.h"
#include "sm/BlockAlloc.h"
+// for spin/yield counters
+#include "sm/GC.h"
+#include "ThreadPaused.h"
+#include "Messages.h"
+
+
#define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
static Time
@@ -43,6 +48,13 @@ static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
#define PROF_VAL(x) 0
#endif
+#if defined(PROF_SPIN)
+volatile StgWord64 whitehole_lockClosure_spin = 0;
+volatile StgWord64 whitehole_lockClosure_yield = 0;
+volatile StgWord64 whitehole_threadPaused_spin = 0;
+volatile StgWord64 whitehole_executeMessage_spin = 0;
+#endif
+
//
// All the stats!
//
@@ -150,6 +162,13 @@ initStats0(void)
.par_copied_bytes = 0,
.cumulative_par_max_copied_bytes = 0,
.cumulative_par_balanced_copied_bytes = 0,
+ .gc_spin_spin = 0,
+ .gc_spin_yield = 0,
+ .mut_spin_spin = 0,
+ .mut_spin_yield = 0,
+ .any_work = 0,
+ .no_work = 0,
+ .scav_find_work = 0,
.mutator_cpu_ns = 0,
.mutator_elapsed_ns = 0,
.gc_cpu_ns = 0,
@@ -283,10 +302,11 @@ stat_startGC (Capability *cap, gc_thread *gct)
-------------------------------------------------------------------------- */
void
-stat_endGC (Capability *cap, gc_thread *gct,
- W_ live, W_ copied, W_ slop, uint32_t gen,
- uint32_t par_n_threads, W_ par_max_copied,
- W_ par_balanced_copied)
+stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
+ uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
+ W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
+ W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
+ W_ scav_find_work)
{
// -------------------------------------------------
// Collect all the stats about this GC in stats.gc. We always do this since
@@ -350,6 +370,13 @@ stat_endGC (Capability *cap, gc_thread *gct,
stats.gc.par_max_copied_bytes;
stats.cumulative_par_balanced_copied_bytes +=
stats.gc.par_balanced_copied_bytes;
+ stats.any_work += any_work;
+ stats.no_work += no_work;
+ stats.scav_find_work += scav_find_work;
+ stats.gc_spin_spin += gc_spin_spin;
+ stats.gc_spin_yield += gc_spin_yield;
+ stats.mut_spin_spin += mut_spin_spin;
+ stats.mut_spin_yield += mut_spin_yield;
}
stats.gc_cpu_ns += stats.gc.cpu_ns;
stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -764,18 +791,96 @@ stat_exit (void)
PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
/ TimeToSecondsDbl(tot_elapsed));
+ // See Note [Internal Counter Stats] for a description of the
+ // following counters. If you add a counter here, please remember
+ // to update the Note.
+ if (RtsFlags.MiscFlags.internalCounters) {
#if defined(THREADED_RTS) && defined(PROF_SPIN)
- {
uint32_t g;
+ const int32_t col_width[] = {4, -30, 14, 14};
+ statsPrintf("Internal Counters:\n");
+ statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+ , col_width[0], ""
+ , col_width[1], "SpinLock"
+ , col_width[2], "Spins"
+ , col_width[3], "Yields");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_alloc_block_sync"
+ , col_width[2], gc_alloc_block_sync.spin
+ , col_width[3], gc_alloc_block_sync.yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "gc_spin"
+ , col_width[2], stats.gc_spin_spin
+ , col_width[3], stats.gc_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "mut_spin"
+ , col_width[2], stats.mut_spin_spin
+ , col_width[3], stats.mut_spin_yield);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_gc"
+ , col_width[2], whitehole_gc_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_threadPaused"
+ , col_width[2], whitehole_threadPaused_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_executeMessage"
+ , col_width[2], whitehole_executeMessage_spin
+ , col_width[3], "n/a");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "whitehole_lockClosure"
+ , col_width[2], whitehole_lockClosure_spin
+ , col_width[3], whitehole_lockClosure_yield);
+ // waitForGcThreads isn't really spin-locking(see the function)
+ // but these numbers still seem useful.
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "waitForGcThreads"
+ , col_width[2], waitForGcThreads_spin
+ , col_width[3], waitForGcThreads_yield);
- statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
- statsPrintf("whitehole_gc_spin: %"FMT_Word64"\n"
- , whitehole_gc_spin);
for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
- statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
+ int prefix_length = 0;
+ statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+ col_width[0], "", g, &prefix_length);
+ prefix_length -= col_width[0];
+ int suffix_length = col_width[1] + prefix_length;
+ suffix_length =
+ suffix_length > 0 ? col_width[1] : suffix_length;
+
+ statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+ , suffix_length, "].sync"
+ , col_width[2], generations[g].sync.spin
+ , col_width[3], generations[g].sync.yield);
}
- }
+ statsPrintf("\n");
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "any_work"
+ , col_width[2], stats.any_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "no_work"
+ , col_width[2], stats.no_work);
+ statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+ , col_width[0], ""
+ , col_width[1], "scav_find_work"
+ , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+ statsPrintf("Internal Counters require the RTS to be built "
+ "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+ statsPrintf("Internal Counters require the threaded RTS");
#endif
+ }
}
if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
@@ -917,6 +1022,68 @@ the number of gc threads is limited to the number of cores.
See #13830
*/
+/*
+Note [Internal Counter Stats]
+-----------------------------
+What do the counts at the end of a '+RTS -s --internal-counters' report mean?
+They are detailed below. Most of these counters are used by multiple threads
+with no attempt at synchronisation. This means that reported values may be
+lower than the true value and this becomes more likely and more severe as
+contention increases.
+
+The first counters are for various SpinLock-like constructs in the RTS. See
+Spinlock.h for the definition of a SpinLock. We maintain up two counters per
+SpinLock:
+* spin: The number of busy-spins over the length of the program.
+* yield: The number of times the SpinLock spun SPIN_COUNT times without success
+ and called yieldThread().
+Not all of these are actual SpinLocks, see the details below.
+
+Actual SpinLocks:
+* gc_alloc_block:
+ This SpinLock protects the block allocator and free list manager. See
+ BlockAlloc.c.
+* gc_spin and mut_spin:
+ These SpinLocks are used to herd gc worker threads during parallel garbage
+ collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
+* gen[g].sync:
+ These SpinLocks, one per generation, protect the generations[g] data
+ structure during garbage collection.
+
+waitForGcThreads:
+ These counters are incremented while we wait for all threads to be ready
+ for a parallel garbage collection. We yield more than we spin in this case.
+
+In several places in the runtime we must take a lock on a closure. To do this,
+we replace it's info table with stg_WHITEHOLE_info, spinning if it is already
+a white-hole. Sometimes we yieldThread() if we spin too long, sometimes we
+don't. We count these white-hole spins and include them in the SpinLocks table.
+If a particular loop does not yield, we put "n/a" in the table. They are named
+for the function that has the spinning loop except that several loops in the
+garbage collector accumulate into whitehole_gc.
+TODO: Should these counters be more or less granular?
+
+white-hole spin counters:
+* whitehole_gc
+* whitehole_lockClosure
+* whitehole_executeMessage
+* whitehole_threadPaused
+
+
+We count the number of calls of several functions in the parallel garbage
+collector.
+
+Parallel garbage collector counters:
+* any_work:
+ A cheap function called whenever a gc_thread is ready for work. Does
+ not do any work.
+* no_work:
+ Incremented whenever any_work finds no work.
+* scav_find_work:
+ Called to do work when any_work return true.
+
+*/
+
/* -----------------------------------------------------------------------------
stat_describe_gens
diff --git a/rts/Stats.h b/rts/Stats.h
index 5d9cf04fa7..1c56344f81 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,7 +30,10 @@ void stat_startGCSync(struct gc_thread_ *_gct);
void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
void stat_endGC (Capability *cap, struct gc_thread_ *_gct, W_ live,
W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
- W_ par_max_copied, W_ par_balanced_copied);
+ W_ par_max_copied, W_ par_balanced_copied,
+ W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
+ W_ mut_spin_yield, W_ any_work, W_ no_work,
+ W_ scav_find_work);
#if defined(PROFILING)
void stat_startRP(void);
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 361989d0d2..595d3ce6c2 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -375,11 +375,21 @@ loop:
// spin until the WHITEHOLE is updated
info = StgHeader_info(node);
if (info == stg_WHITEHOLE_info) {
+#if defined(PROF_SPIN)
+ W_[whitehole_lockClosure_spin] =
+ W_[whitehole_lockClosure_spin] + 1;
+#endif
i = i + 1;
if (i == SPIN_COUNT) {
i = 0;
+#if defined(PROF_SPIN)
+ W_[whitehole_lockClosure_yield] =
+ W_[whitehole_lockClosure_yield] + 1;
+#endif
ccall yieldThread();
}
+ // TODO: We should busy_wait_nop() here, but that's not currently
+ // defined in CMM.
goto loop;
}
jump %ENTRY_CODE(info) (node);
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index 7ade0a64c5..3f7bddeb79 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -329,6 +329,10 @@ threadPaused(Capability *cap, StgTSO *tso)
if (cur_bh_info != bh_info) {
bh_info = cur_bh_info;
+#if defined(PROF_SPIN)
+ ++whitehole_threadPaused_spin;
+#endif
+ busy_wait_nop();
goto retry;
}
#endif
diff --git a/rts/ThreadPaused.h b/rts/ThreadPaused.h
index 4d762f9aed..ee25189c20 100644
--- a/rts/ThreadPaused.h
+++ b/rts/ThreadPaused.h
@@ -8,4 +8,12 @@
#pragma once
+#include "BeginPrivate.h"
+
RTS_PRIVATE void threadPaused ( Capability *cap, StgTSO * );
+
+#include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_threadPaused_spin;
+#endif
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index bb54c7efc3..27f280665e 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1058,9 +1058,14 @@ selector_chain:
// In threaded mode, we'll use WHITEHOLE to lock the selector
// thunk while we evaluate it.
{
- do {
+ while(true) {
info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info);
- } while (info_ptr == (W_)&stg_WHITEHOLE_info);
+ if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; }
+#if defined(PROF_SPIN)
+ ++whitehole_gc_spin;
+#endif
+ busy_wait_nop();
+ }
// make sure someone else didn't get here first...
if (IS_FORWARDING_PTR(info_ptr) ||
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 54797ba0f0..d61ca41a6b 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -139,6 +139,9 @@ uint32_t n_gc_threads;
static long copied; // *words* copied & scavenged during this GC
#if defined(PROF_SPIN) && defined(THREADED_RTS)
+// spin and yield counts for the quasi-SpinLock in waitForGcThreads
+volatile StgWord64 waitForGcThreads_spin = 0;
+volatile StgWord64 waitForGcThreads_yield = 0;
volatile StgWord64 whitehole_gc_spin = 0;
#endif
@@ -198,7 +201,9 @@ GarbageCollect (uint32_t collect_gen,
{
bdescr *bd;
generation *gen;
- StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
+ StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
+ gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+ any_work, no_work, scav_find_work;
#if defined(THREADED_RTS)
gc_thread *saved_gct;
#endif
@@ -471,32 +476,53 @@ GarbageCollect (uint32_t collect_gen,
copied = 0;
par_max_copied = 0;
par_balanced_copied = 0;
+ gc_spin_spin = 0;
+ gc_spin_yield = 0;
+ mut_spin_spin = 0;
+ mut_spin_yield = 0;
+ any_work = 0;
+ no_work = 0;
+ scav_find_work = 0;
{
uint32_t i;
uint64_t par_balanced_copied_acc = 0;
+ const gc_thread* thread;
for (i=0; i < n_gc_threads; i++) {
copied += gc_threads[i]->copied;
}
for (i=0; i < n_gc_threads; i++) {
+ thread = gc_threads[i];
if (n_gc_threads > 1) {
debugTrace(DEBUG_gc,"thread %d:", i);
- debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_));
- debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_));
- debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work);
- debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work);
- debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work);
+ debugTrace(DEBUG_gc," copied %ld",
+ thread->copied * sizeof(W_));
+ debugTrace(DEBUG_gc," scanned %ld",
+ thread->scanned * sizeof(W_));
+ debugTrace(DEBUG_gc," any_work %ld",
+ thread->any_work);
+ debugTrace(DEBUG_gc," no_work %ld",
+ thread->no_work);
+ debugTrace(DEBUG_gc," scav_find_work %ld",
+ thread->scav_find_work);
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+ gc_spin_spin += thread->gc_spin.spin;
+ gc_spin_yield += thread->gc_spin.yield;
+ mut_spin_spin += thread->mut_spin.spin;
+ mut_spin_yield += thread->mut_spin.yield;
+#endif
+
+ any_work += thread->any_work;
+ no_work += thread->no_work;
+ scav_find_work += thread->scav_find_work;
+
+ par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+ par_balanced_copied_acc +=
+ stg_min(n_gc_threads * gc_threads[i]->copied, copied);
}
- par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
- par_balanced_copied_acc +=
- stg_min(n_gc_threads * gc_threads[i]->copied, copied);
- }
- if (n_gc_threads == 1) {
- par_max_copied = 0;
- par_balanced_copied = 0;
}
- else
- {
+ if (n_gc_threads > 1) {
// See Note [Work Balance] for an explanation of this computation
par_balanced_copied =
(par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
@@ -834,7 +860,9 @@ GarbageCollect (uint32_t collect_gen,
// ok, GC over: tell the stats department what happened.
stat_endGC(cap, gct, live_words, copied,
live_blocks * BLOCK_SIZE_W - live_words /* slop */,
- N, n_gc_threads, par_max_copied, par_balanced_copied);
+ N, n_gc_threads, par_max_copied, par_balanced_copied,
+ gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+ any_work, no_work, scav_find_work);
#if defined(RTS_USER_SIGNALS)
if (RtsFlags.MiscFlags.install_signal_handlers) {
@@ -1186,6 +1214,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
}
}
if (!retry) break;
+#if defined(PROF_SPIN)
+ waitForGcThreads_yield++;
+#endif
yieldThread();
}
@@ -1196,6 +1227,14 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
rtsConfig.longGCSync(cap->no, t2 - t0);
t1 = t2;
}
+ if (retry) {
+#if defined(PROF_SPIN)
+ // This is a bit strange, we'll get more yields than spins.
+ // I guess that means it's not a spin-lock at all, but these
+ // numbers are still useful (I think).
+ waitForGcThreads_spin++;
+#endif
+ }
}
if (RtsFlags.GcFlags.longGCSync != 0 &&
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 78f054931a..7fce87edd4 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -47,6 +47,8 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS,
#if defined(PROF_SPIN) && defined(THREADED_RTS)
extern volatile StgWord64 whitehole_gc_spin;
+extern volatile StgWord64 waitForGcThreads_spin;
+extern volatile StgWord64 waitForGcThreads_yield;
#endif
void gcWorkerThread (Capability *cap);
diff --git a/testsuite/tests/rts/InternalCounters.stdout b/testsuite/tests/rts/InternalCounters.stdout
new file mode 100644
index 0000000000..d764d7bc19
--- /dev/null
+++ b/testsuite/tests/rts/InternalCounters.stdout
@@ -0,0 +1 @@
+Internal Counters: \ No newline at end of file
diff --git a/testsuite/tests/rts/Makefile b/testsuite/tests/rts/Makefile
index ded3be1b3b..630508542d 100644
--- a/testsuite/tests/rts/Makefile
+++ b/testsuite/tests/rts/Makefile
@@ -48,13 +48,13 @@ T5423:
.PHONY: T9405
T9405:
@'$(TEST_HC)' $(TEST_HC_OPTS) -ticky -rtsopts T9405.hs; \
- ./T9405 +RTS -rT9405.ticky & \
- sleep 0.2; \
- kill -2 $$!; \
- wait $$!; \
- [ -e T9405.ticky ] || echo "Error: Ticky profile doesn't exist"; \
- [ -s T9405.ticky ] || echo "Error: Ticky profile is empty"; \
- echo Ticky-Ticky;
+ ./T9405 +RTS -rT9405.ticky & \
+ sleep 0.2; \
+ kill -2 $$!; \
+ wait $$!; \
+ [ -e T9405.ticky ] || echo "Error: Ticky profile doesn't exist"; \
+ [ -s T9405.ticky ] || echo "Error: Ticky profile is empty"; \
+ echo Ticky-Ticky;
# Naming convention: 'T5423_' obj-way '_' obj-src
# obj-way ::= v | dyn
@@ -178,3 +178,8 @@ T12497:
.PHONY: T14695
T14695:
echo ":quit" | LD_LIBRARY_PATH="foo:" "$(TEST_HC)" $(filter-out -rtsopts, $(TEST_HC_OPTS_INTERACTIVE))
+
+.PHONY: InternalCounters
+InternalCounters:
+ "$(TEST_HC)" +RTS -s --internal-counters -RTS 2>&1 | grep "Internal Counters"
+ -"$(TEST_HC)" +RTS -s -RTS 2>&1 | grep "Internal Counters"
diff --git a/testsuite/tests/rts/all.T b/testsuite/tests/rts/all.T
index 5000a914db..cd70132610 100644
--- a/testsuite/tests/rts/all.T
+++ b/testsuite/tests/rts/all.T
@@ -390,3 +390,5 @@ test('T14702', [ ignore_stdout
, compile_and_run, [''])
test('T14900', normal, compile_and_run, ['-package ghc-compact'])
+test('InternalCounters', normal, run_command,
+ ['$MAKE -s --no-print-directory InternalCounters'])