Diffstat (limited to 'rts')
-rw-r--r-- | rts/Messages.c          |   3 |
-rw-r--r-- | rts/Messages.h          |   4 |
-rw-r--r-- | rts/RtsFlags.c          |   6 |
-rw-r--r-- | rts/SMPClosureOps.h     |  12 |
-rw-r--r-- | rts/Stats.c             | 189 |
-rw-r--r-- | rts/Stats.h             |   5 |
-rw-r--r-- | rts/StgMiscClosures.cmm |  10 |
-rw-r--r-- | rts/ThreadPaused.c      |   4 |
-rw-r--r-- | rts/ThreadPaused.h      |   8 |
-rw-r--r-- | rts/sm/Evac.c           |   9 |
-rw-r--r-- | rts/sm/GC.c             |  71 |
-rw-r--r-- | rts/sm/GC.h             |   2 |
12 files changed, 293 insertions, 30 deletions
diff --git a/rts/Messages.c b/rts/Messages.c
index 8fab314bc4..a9c794d823 100644
--- a/rts/Messages.c
+++ b/rts/Messages.c
@@ -129,6 +129,9 @@ loop:
     }
     else if (i == &stg_WHITEHOLE_info)
     {
+#if defined(PROF_SPIN)
+        ++whitehole_executeMessage_spin;
+#endif
        goto loop;
     }
     else
diff --git a/rts/Messages.h b/rts/Messages.h
index e60f19dc1d..18371564c4 100644
--- a/rts/Messages.h
+++ b/rts/Messages.h
@@ -31,3 +31,7 @@ doneWithMsgThrowTo (MessageThrowTo *m)
 }
 
 #include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_executeMessage_spin;
+#endif
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 7b38bbd6fd..b674e9b685 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -232,6 +232,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.MiscFlags.generate_stack_trace = true;
     RtsFlags.MiscFlags.generate_dump_file = false;
     RtsFlags.MiscFlags.machineReadable = false;
+    RtsFlags.MiscFlags.internalCounters = false;
     RtsFlags.MiscFlags.linkerMemBase = 0;
 
 #if defined(THREADED_RTS)
@@ -888,6 +889,11 @@ error = true;
                       OPTION_UNSAFE;
                       RtsFlags.MiscFlags.machineReadable = true;
                   }
+                  else if (strequal("internal-counters",
+                                    &rts_argv[arg][2])) {
+                      OPTION_SAFE;
+                      RtsFlags.MiscFlags.internalCounters = true;
+                  }
                   else if (strequal("info",
                                     &rts_argv[arg][2])) {
                       OPTION_SAFE;
diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h
index 4ea1c55976..1d18e1b018 100644
--- a/rts/SMPClosureOps.h
+++ b/rts/SMPClosureOps.h
@@ -38,6 +38,11 @@ EXTERN_INLINE void unlockClosure(StgClosure *p, const StgInfoTable *info);
 
 #if defined(THREADED_RTS)
 
+#if defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_lockClosure_spin;
+extern volatile StgWord64 whitehole_lockClosure_yield;
+#endif
+
 /* -----------------------------------------------------------------------------
  * Locking/unlocking closures
  *
@@ -56,7 +61,14 @@ EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p)
         do {
             info = xchg((P_)(void *)&p->header.info, (W_)&stg_WHITEHOLE_info);
             if (info != (W_)&stg_WHITEHOLE_info) return (StgInfoTable *)info;
+#if defined(PROF_SPIN)
+            ++whitehole_lockClosure_spin;
+#endif
+            busy_wait_nop();
         } while (++i < SPIN_COUNT);
+#if defined(PROF_SPIN)
+        ++whitehole_lockClosure_yield;
+#endif
         yieldThread();
     } while (1);
 }
diff --git a/rts/Stats.c b/rts/Stats.c
index 26bdac0ea5..7eb93bef33 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -16,10 +16,15 @@
 #include "Profiling.h"
 #include "GetTime.h"
 #include "sm/Storage.h"
-#include "sm/GC.h" // gc_alloc_block_sync, whitehole_gc_spin
 #include "sm/GCThread.h"
 #include "sm/BlockAlloc.h"
 
+// for spin/yield counters
+#include "sm/GC.h"
+#include "ThreadPaused.h"
+#include "Messages.h"
+
+
 #define TimeToSecondsDbl(t) ((double)(t) / TIME_RESOLUTION)
 
 static Time
@@ -43,6 +48,13 @@ static Time HCe_start_time, HCe_tot_time = 0; // heap census prof elap time
 #define PROF_VAL(x)   0
 #endif
 
+#if defined(PROF_SPIN)
+volatile StgWord64 whitehole_lockClosure_spin = 0;
+volatile StgWord64 whitehole_lockClosure_yield = 0;
+volatile StgWord64 whitehole_threadPaused_spin = 0;
+volatile StgWord64 whitehole_executeMessage_spin = 0;
+#endif
+
 //
 // All the stats!
 //
@@ -150,6 +162,13 @@ initStats0(void)
         .par_copied_bytes = 0,
         .cumulative_par_max_copied_bytes = 0,
         .cumulative_par_balanced_copied_bytes = 0,
+        .gc_spin_spin = 0,
+        .gc_spin_yield = 0,
+        .mut_spin_spin = 0,
+        .mut_spin_yield = 0,
+        .any_work = 0,
+        .no_work = 0,
+        .scav_find_work = 0,
         .mutator_cpu_ns = 0,
         .mutator_elapsed_ns = 0,
         .gc_cpu_ns = 0,
@@ -283,10 +302,11 @@ stat_startGC (Capability *cap, gc_thread *gct)
    -------------------------------------------------------------------------- */
 
 void
-stat_endGC (Capability *cap, gc_thread *gct,
-            W_ live, W_ copied, W_ slop, uint32_t gen,
-            uint32_t par_n_threads, W_ par_max_copied,
-            W_ par_balanced_copied)
+stat_endGC (Capability *cap, gc_thread *gct, W_ live, W_ copied, W_ slop,
+            uint32_t gen, uint32_t par_n_threads, W_ par_max_copied,
+            W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
+            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ no_work,
+            W_ scav_find_work)
 {
     // -------------------------------------------------
     // Collect all the stats about this GC in stats.gc. We always do this since
@@ -350,6 +370,13 @@ stat_endGC (Capability *cap, gc_thread *gct,
             stats.gc.par_max_copied_bytes;
         stats.cumulative_par_balanced_copied_bytes +=
             stats.gc.par_balanced_copied_bytes;
+        stats.any_work += any_work;
+        stats.no_work += no_work;
+        stats.scav_find_work += scav_find_work;
+        stats.gc_spin_spin += gc_spin_spin;
+        stats.gc_spin_yield += gc_spin_yield;
+        stats.mut_spin_spin += mut_spin_spin;
+        stats.mut_spin_yield += mut_spin_yield;
     }
     stats.gc_cpu_ns += stats.gc.cpu_ns;
     stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -764,18 +791,96 @@ stat_exit (void)
                     PROF_VAL(RPe_tot_time + HCe_tot_time) - init_elapsed) * 100
                     / TimeToSecondsDbl(tot_elapsed));
 
+    // See Note [Internal Counter Stats] for a description of the
+    // following counters. If you add a counter here, please remember
+    // to update the Note.
+    if (RtsFlags.MiscFlags.internalCounters) {
 #if defined(THREADED_RTS) && defined(PROF_SPIN)
-    {
         uint32_t g;
+        const int32_t col_width[] = {4, -30, 14, 14};
+        statsPrintf("Internal Counters:\n");
+        statsPrintf("%*s" "%*s" "%*s" "%*s" "\n"
+                    , col_width[0], ""
+                    , col_width[1], "SpinLock"
+                    , col_width[2], "Spins"
+                    , col_width[3], "Yields");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_alloc_block_sync"
+                    , col_width[2], gc_alloc_block_sync.spin
+                    , col_width[3], gc_alloc_block_sync.yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "gc_spin"
+                    , col_width[2], stats.gc_spin_spin
+                    , col_width[3], stats.gc_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "mut_spin"
+                    , col_width[2], stats.mut_spin_spin
+                    , col_width[3], stats.mut_spin_yield);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_gc"
+                    , col_width[2], whitehole_gc_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_threadPaused"
+                    , col_width[2], whitehole_threadPaused_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_executeMessage"
+                    , col_width[2], whitehole_executeMessage_spin
+                    , col_width[3], "n/a");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "whitehole_lockClosure"
+                    , col_width[2], whitehole_lockClosure_spin
+                    , col_width[3], whitehole_lockClosure_yield);
+        // waitForGcThreads isn't really spin-locking (see the function),
+        // but these numbers still seem useful.
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "waitForGcThreads"
+                    , col_width[2], waitForGcThreads_spin
+                    , col_width[3], waitForGcThreads_yield);
 
-        statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin);
-        statsPrintf("whitehole_gc_spin: %"FMT_Word64"\n"
-                    , whitehole_gc_spin);
         for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-            statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, generations[g].sync.spin);
+            int prefix_length = 0;
+            statsPrintf("%*s" "gen[%" FMT_Word32 "%n",
+                        col_width[0], "", g, &prefix_length);
+            prefix_length -= col_width[0];
+            int suffix_length = col_width[1] + prefix_length;
+            suffix_length =
+                suffix_length > 0 ? col_width[1] : suffix_length;
+
+            statsPrintf("%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
+                        , suffix_length, "].sync"
+                        , col_width[2], generations[g].sync.spin
+                        , col_width[3], generations[g].sync.yield);
         }
-    }
+        statsPrintf("\n");
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "any_work"
+                    , col_width[2], stats.any_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "no_work"
+                    , col_width[2], stats.no_work);
+        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "\n"
+                    , col_width[0], ""
+                    , col_width[1], "scav_find_work"
+                    , col_width[2], stats.scav_find_work);
+#elif defined(THREADED_RTS) // THREADED_RTS && PROF_SPIN
+        statsPrintf("Internal Counters require the RTS to be built "
+                    "with PROF_SPIN"); // PROF_SPIN is not #defined here
+#else // THREADED_RTS
+        statsPrintf("Internal Counters require the threaded RTS");
 #endif
+    }
 }
 
 if (RtsFlags.GcFlags.giveStats == ONELINE_GC_STATS) {
@@ -917,6 +1022,68 @@
 the number of gc threads is limited to the number of cores. See #13830
 */
 
+/*
+Note [Internal Counter Stats]
+-----------------------------
+What do the counts at the end of a '+RTS -s --internal-counters' report mean?
+They are detailed below. Most of these counters are used by multiple threads
+with no attempt at synchronisation. This means that reported values may be
+lower than the true value, and this becomes more likely and more severe as
+contention increases.
+
+The first counters are for various SpinLock-like constructs in the RTS. See
+SpinLock.h for the definition of a SpinLock. We maintain up to two counters
+per SpinLock:
+* spin: The number of busy-spins over the length of the program.
+* yield: The number of times the SpinLock spun SPIN_COUNT times without
+  success and then called yieldThread().
+Not all of these are actual SpinLocks; see the details below.
+
+Actual SpinLocks:
+* gc_alloc_block_sync:
+    This SpinLock protects the block allocator and free list manager. See
+    BlockAlloc.c.
+* gc_spin and mut_spin:
+    These SpinLocks are used to herd gc worker threads during parallel garbage
+    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
+* gen[g].sync:
+    These SpinLocks, one per generation, protect the generations[g] data
+    structure during garbage collection.
+
+waitForGcThreads:
+    These counters are incremented while we wait for all threads to be ready
+    for a parallel garbage collection. We yield more than we spin in this
+    case.
+
+In several places in the runtime we must take a lock on a closure. To do this,
+we replace its info table with stg_WHITEHOLE_info, spinning if it is already
+a white-hole. Sometimes we yieldThread() if we spin too long, and sometimes we
+don't. We count these white-hole spins and include them in the SpinLocks
+table. If a particular loop does not yield, we put "n/a" in the table. They
+are named for the function that has the spinning loop, except that several
+loops in the garbage collector accumulate into whitehole_gc.
+TODO: Should these counters be more or less granular?
+
+White-hole spin counters:
+* whitehole_gc
+* whitehole_lockClosure
+* whitehole_executeMessage
+* whitehole_threadPaused
+
+We count the number of calls of several functions in the parallel garbage
+collector.
+
+Parallel garbage collector counters:
+* any_work:
+    A cheap function called whenever a gc_thread is ready for work. Does
+    not do any work.
+* no_work:
+    Incremented whenever any_work finds no work.
+* scav_find_work:
+    Called to do work when any_work returns true.
+
+*/
+
 /* -----------------------------------------------------------------------------
    stat_describe_gens
diff --git a/rts/Stats.h b/rts/Stats.h
index 5d9cf04fa7..1c56344f81 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -30,7 +30,10 @@ void stat_startGCSync(struct gc_thread_ *_gct);
 void stat_startGC(Capability *cap, struct gc_thread_ *_gct);
 void stat_endGC  (Capability *cap, struct gc_thread_ *_gct, W_ live,
                   W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
-                  W_ par_max_copied, W_ par_balanced_copied);
+                  W_ par_max_copied, W_ par_balanced_copied,
+                  W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
+                  W_ mut_spin_yield, W_ any_work, W_ no_work,
+                  W_ scav_find_work);
 
 #if defined(PROFILING)
 void stat_startRP(void);
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 361989d0d2..595d3ce6c2 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -375,11 +375,21 @@ loop:
     // spin until the WHITEHOLE is updated
     info = StgHeader_info(node);
     if (info == stg_WHITEHOLE_info) {
+#if defined(PROF_SPIN)
+        W_[whitehole_lockClosure_spin] =
+            W_[whitehole_lockClosure_spin] + 1;
+#endif
         i = i + 1;
         if (i == SPIN_COUNT) {
             i = 0;
+#if defined(PROF_SPIN)
+            W_[whitehole_lockClosure_yield] =
+                W_[whitehole_lockClosure_yield] + 1;
+#endif
             ccall yieldThread();
         }
+        // TODO: We should busy_wait_nop() here, but that's not currently
+        // defined in CMM.
         goto loop;
     }
     jump %ENTRY_CODE(info) (node);
diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c
index 7ade0a64c5..3f7bddeb79 100644
--- a/rts/ThreadPaused.c
+++ b/rts/ThreadPaused.c
@@ -329,6 +329,10 @@ threadPaused(Capability *cap, StgTSO *tso)
 
         if (cur_bh_info != bh_info) {
             bh_info = cur_bh_info;
+#if defined(PROF_SPIN)
+            ++whitehole_threadPaused_spin;
+#endif
+            busy_wait_nop();
             goto retry;
         }
 #endif
diff --git a/rts/ThreadPaused.h b/rts/ThreadPaused.h
index 4d762f9aed..ee25189c20 100644
--- a/rts/ThreadPaused.h
+++ b/rts/ThreadPaused.h
@@ -8,4 +8,12 @@
 
 #pragma once
 
+#include "BeginPrivate.h"
+
 RTS_PRIVATE void threadPaused ( Capability *cap, StgTSO * );
+
+#include "EndPrivate.h"
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+extern volatile StgWord64 whitehole_threadPaused_spin;
+#endif
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index bb54c7efc3..27f280665e 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -1058,9 +1058,14 @@ selector_chain:
         // In threaded mode, we'll use WHITEHOLE to lock the selector
         // thunk while we evaluate it.
         {
-            do {
+            while(true) {
                 info_ptr = xchg((StgPtr)&p->header.info, (W_)&stg_WHITEHOLE_info);
-            } while (info_ptr == (W_)&stg_WHITEHOLE_info);
+                if (info_ptr != (W_)&stg_WHITEHOLE_info) { break; }
+#if defined(PROF_SPIN)
+                ++whitehole_gc_spin;
+#endif
+                busy_wait_nop();
+            }
 
             // make sure someone else didn't get here first...
             if (IS_FORWARDING_PTR(info_ptr) ||
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 54797ba0f0..d61ca41a6b 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -139,6 +139,9 @@ uint32_t n_gc_threads;
 static long copied;        // *words* copied & scavenged during this GC
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
+// spin and yield counts for the quasi-SpinLock in waitForGcThreads
+volatile StgWord64 waitForGcThreads_spin = 0;
+volatile StgWord64 waitForGcThreads_yield = 0;
 volatile StgWord64 whitehole_gc_spin = 0;
 #endif
 
@@ -198,7 +201,9 @@ GarbageCollect (uint32_t collect_gen,
 {
   bdescr *bd;
   generation *gen;
-  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied;
+  StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
+      gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+      any_work, no_work, scav_find_work;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
 #endif
@@ -471,32 +476,53 @@ GarbageCollect (uint32_t collect_gen,
   copied = 0;
   par_max_copied = 0;
   par_balanced_copied = 0;
+  gc_spin_spin = 0;
+  gc_spin_yield = 0;
+  mut_spin_spin = 0;
+  mut_spin_yield = 0;
+  any_work = 0;
+  no_work = 0;
+  scav_find_work = 0;
  {
      uint32_t i;
      uint64_t par_balanced_copied_acc = 0;
+     const gc_thread* thread;
 
      for (i=0; i < n_gc_threads; i++) {
          copied += gc_threads[i]->copied;
      }
      for (i=0; i < n_gc_threads; i++) {
+         thread = gc_threads[i];
          if (n_gc_threads > 1) {
              debugTrace(DEBUG_gc,"thread %d:", i);
-             debugTrace(DEBUG_gc,"   copied           %ld", gc_threads[i]->copied * sizeof(W_));
-             debugTrace(DEBUG_gc,"   scanned          %ld", gc_threads[i]->scanned * sizeof(W_));
-             debugTrace(DEBUG_gc,"   any_work         %ld", gc_threads[i]->any_work);
-             debugTrace(DEBUG_gc,"   no_work          %ld", gc_threads[i]->no_work);
-             debugTrace(DEBUG_gc,"   scav_find_work   %ld", gc_threads[i]->scav_find_work);
+             debugTrace(DEBUG_gc,"   copied           %ld",
+                        thread->copied * sizeof(W_));
+             debugTrace(DEBUG_gc,"   scanned          %ld",
+                        thread->scanned * sizeof(W_));
+             debugTrace(DEBUG_gc,"   any_work         %ld",
+                        thread->any_work);
+             debugTrace(DEBUG_gc,"   no_work          %ld",
+                        thread->no_work);
+             debugTrace(DEBUG_gc,"   scav_find_work   %ld",
+                        thread->scav_find_work);
+
+#if defined(THREADED_RTS) && defined(PROF_SPIN)
+             gc_spin_spin += thread->gc_spin.spin;
+             gc_spin_yield += thread->gc_spin.yield;
+             mut_spin_spin += thread->mut_spin.spin;
+             mut_spin_yield += thread->mut_spin.yield;
+#endif
+
+             any_work += thread->any_work;
+             no_work += thread->no_work;
+             scav_find_work += thread->scav_find_work;
+
+             par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
+             par_balanced_copied_acc +=
+                 stg_min(n_gc_threads * gc_threads[i]->copied, copied);
          }
-         par_max_copied = stg_max(gc_threads[i]->copied, par_max_copied);
-         par_balanced_copied_acc +=
-             stg_min(n_gc_threads * gc_threads[i]->copied, copied);
-     }
-     if (n_gc_threads == 1) {
-         par_max_copied = 0;
-         par_balanced_copied = 0;
      }
-     else
-     {
+     if (n_gc_threads > 1) {
          // See Note [Work Balance] for an explanation of this computation
          par_balanced_copied =
              (par_balanced_copied_acc - copied + (n_gc_threads - 1) / 2) /
@@ -834,7 +860,9 @@ GarbageCollect (uint32_t collect_gen,
 
   // ok, GC over: tell the stats department what happened.
   stat_endGC(cap, gct, live_words, copied,
              live_blocks * BLOCK_SIZE_W - live_words /* slop */,
-             N, n_gc_threads, par_max_copied, par_balanced_copied);
+             N, n_gc_threads, par_max_copied, par_balanced_copied,
+             gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
+             any_work, no_work, scav_find_work);
 
 #if defined(RTS_USER_SIGNALS)
   if (RtsFlags.MiscFlags.install_signal_handlers) {
@@ -1186,6 +1214,9 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             }
         }
         if (!retry) break;
+#if defined(PROF_SPIN)
+        waitForGcThreads_yield++;
+#endif
         yieldThread();
     }
 
@@ -1196,6 +1227,14 @@ waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
             rtsConfig.longGCSync(cap->no, t2 - t0);
             t1 = t2;
         }
+        if (retry) {
+#if defined(PROF_SPIN)
+            // This is a bit strange, we'll get more yields than spins.
+            // I guess that means it's not a spin-lock at all, but these
+            // numbers are still useful (I think).
+            waitForGcThreads_spin++;
+#endif
+        }
     }
 
     if (RtsFlags.GcFlags.longGCSync != 0 &&
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 78f054931a..7fce87edd4 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -47,6 +47,8 @@ extern uint32_t mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS,
 
 #if defined(PROF_SPIN) && defined(THREADED_RTS)
 extern volatile StgWord64 whitehole_gc_spin;
+extern volatile StgWord64 waitForGcThreads_spin;
+extern volatile StgWord64 waitForGcThreads_yield;
 #endif
 
 void gcWorkerThread (Capability *cap);
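
The patch repeats one idiom in several places: spin on an atomic exchange, bump a PROF_SPIN counter on every failed attempt, and after SPIN_COUNT failures bump a second counter and yield. The standalone C11 sketch below is purely illustrative and is not RTS code: atomic_flag and sched_yield() are stand-ins for the RTS's xchg() on a closure's info pointer and for yieldThread(), and the names lock_spin/lock_yield are hypothetical echoes of whitehole_lockClosure_spin/_yield.

/* Hypothetical standalone sketch of the spin/yield counting pattern --
 * not the RTS implementation. Requires C11 atomics and POSIX sched_yield(). */
#include <stdatomic.h>
#include <stdint.h>
#include <sched.h>

#define SPIN_COUNT 1000   /* plays the same role as the RTS's SPIN_COUNT */

/* Updated without synchronisation, exactly like the patch's counters,
 * so totals can undercount when threads race on the increment. */
static volatile uint64_t lock_spin  = 0;  /* c.f. whitehole_lockClosure_spin  */
static volatile uint64_t lock_yield = 0;  /* c.f. whitehole_lockClosure_yield */

static void lock_with_counters(atomic_flag *lock)
{
    for (;;) {
        /* Inner loop: busy-spin up to SPIN_COUNT times, counting each
         * failed attempt. (The patch also inserts a busy_wait_nop()
         * pipeline hint here, omitted in this sketch.) */
        for (unsigned i = 0; i < SPIN_COUNT; i++) {
            if (!atomic_flag_test_and_set(lock)) {
                return;          /* acquired the lock */
            }
            ++lock_spin;         /* one busy-spin */
        }
        /* Spun SPIN_COUNT times without success: count a yield and give
         * up the rest of our time slice, as reallyLockClosure does. */
        ++lock_yield;
        sched_yield();
    }
}

As Note [Internal Counter Stats] stresses, the increments are deliberately unsynchronised, so a racing increment can be lost and the printed totals are a lower bound that gets looser exactly when contention is high. The resulting table is printed only when the program runs with '+RTS -s --internal-counters' on a threaded RTS built with PROF_SPIN.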