-rw-r--r-- | includes/RtsAPI.h |  14
-rw-r--r-- | rts/Stats.c       |  30
-rw-r--r-- | rts/Stats.h       |   4
-rw-r--r-- | rts/sm/GC.c       | 185
-rw-r--r-- | rts/sm/GCThread.h |   4
5 files changed, 113 insertions, 124 deletions
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index c896b5b244..e2d1845819 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -251,20 +251,6 @@ typedef struct _RTSStats {
     // -----------------------------------
     // Internal Counters

-    // The number of times a GC thread spun on its 'gc_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t gc_spin_spin;
-    // The number of times a GC thread yielded on its 'gc_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t gc_spin_yield;
-    // The number of times a GC thread spun on its 'mut_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t mut_spin_spin;
-    // The number of times a GC thread yielded on its 'mut_spin' lock.
-    // Will be zero if the rts was not built with PROF_SPIN
-    uint64_t mut_spin_yield;
-    // The number of times a GC thread has checked for work across all parallel
-    // GCs
     uint64_t any_work;
     // The number of times a GC thread has iterated it's outer loop across all
     // parallel GCs
diff --git a/rts/Stats.c b/rts/Stats.c
index 45d40ddcad..53251bb7d6 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -154,10 +154,6 @@ initStats0(void)
         .par_copied_bytes = 0,
         .cumulative_par_max_copied_bytes = 0,
         .cumulative_par_balanced_copied_bytes = 0,
-        .gc_spin_spin = 0,
-        .gc_spin_yield = 0,
-        .mut_spin_spin = 0,
-        .mut_spin_yield = 0,
         .any_work = 0,
         .scav_find_work = 0,
         .max_n_todo_overflow = 0,
@@ -460,9 +456,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
 void
 stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
             uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
-            W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
-            W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
-            W_ max_n_todo_overflow)
+            W_ par_max_copied, W_ par_balanced_copied, W_ any_work,
+            W_ scav_find_work, W_ max_n_todo_overflow)
 {
     ACQUIRE_LOCK(&stats_mutex);

@@ -544,10 +539,6 @@ stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ s
             stats.any_work += any_work;
             stats.scav_find_work += scav_find_work;
             stats.max_n_todo_overflow += stg_max(max_n_todo_overflow, stats.max_n_todo_overflow);
-            stats.gc_spin_spin += gc_spin_spin;
-            stats.gc_spin_yield += gc_spin_yield;
-            stats.mut_spin_spin += mut_spin_spin;
-            stats.mut_spin_yield += mut_spin_yield;
         }
         stats.gc_cpu_ns += stats.gc.cpu_ns;
         stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -968,16 +959,6 @@ static void report_summary(const RTSSummaryStats* sum)
                     , col_width[1], "gc_alloc_block_sync"
                     , col_width[2], gc_alloc_block_sync.spin
                     , col_width[3], gc_alloc_block_sync.yield);
-        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                    , col_width[0], ""
-                    , col_width[1], "gc_spin"
-                    , col_width[2], stats.gc_spin_spin
-                    , col_width[3], stats.gc_spin_yield);
-        statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
-                    , col_width[0], ""
-                    , col_width[1], "mut_spin"
-                    , col_width[2], stats.mut_spin_spin
-                    , col_width[3], stats.mut_spin_yield);
         statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
                     , col_width[0], ""
                     , col_width[1], "whitehole_gc"
@@ -1154,10 +1135,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
     MR_STAT("gc_alloc_block_sync_yield", FMT_Word64, gc_alloc_block_sync.yield);
     MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
-    MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
-    MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
-    MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
-    MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
     MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
     MR_STAT("waitForGcThreads_yield", FMT_Word64, waitForGcThreads_yield);

@@ -1587,9 +1564,6 @@ Actual SpinLocks:
 * gc_alloc_block:
     This SpinLock protects the block allocator and free list manager. See
     BlockAlloc.c.
-* gc_spin and mut_spin:
-    These SpinLocks are used to herd gc worker threads during parallel garbage
-    collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
 * gen[g].sync:
     These SpinLocks, one per generation, protect the generations[g] data
     structure during garbage collection.
diff --git a/rts/Stats.h b/rts/Stats.h
index 19bd707302..5268c3ae63 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -36,9 +36,7 @@ void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ li
                  W_ copied, W_ slop, uint32_t gen, uint32_t n_gc_threads,
                  struct gc_thread_ **gc_threads,
                  W_ par_max_copied, W_ par_balanced_copied,
-                 W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
-                 W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
-                 W_ max_n_todo_overflow);
+                 W_ any_work, W_ scav_find_work, W_ max_n_todo_overflow);

 void stat_startNonmovingGcSync(void);
 void stat_endNonmovingGcSync(void);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 5e986f2296..4ddbcfdafd 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -269,7 +269,6 @@ GarbageCollect (uint32_t collect_gen,
   bdescr *bd;
   generation *gen;
   StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
-      gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
       any_work, scav_find_work, max_n_todo_overflow;
 #if defined(THREADED_RTS)
   gc_thread *saved_gct;
@@ -391,7 +390,10 @@ GarbageCollect (uint32_t collect_gen,
   }
   work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
                   N >= RtsFlags.ParFlags.parGcLoadBalancingGen &&
-                  n_gc_threads > 1;
+                  (gc_type == SYNC_GC_PAR) &&
+                  (n_gc_threads - n_gc_idle_threads) > 1;
+                  // ^ if we are the only non-idle gc thread, turn off work stealing
+                  //
       // It's not always a good idea to do load balancing in parallel
       // GC. In particular, for a parallel program we don't want to
       // lose locality by moving cached data into another CPU's cache
@@ -588,10 +590,6 @@ GarbageCollect (uint32_t collect_gen,
   copied = 0;
   par_max_copied = 0;
   par_balanced_copied = 0;
-  gc_spin_spin = 0;
-  gc_spin_yield = 0;
-  mut_spin_spin = 0;
-  mut_spin_yield = 0;
   any_work = 0;
   scav_find_work = 0;
   max_n_todo_overflow = 0;
@@ -617,13 +615,6 @@ GarbageCollect (uint32_t collect_gen,
         debugTrace(DEBUG_gc," scav_find_work %ld",
                    RELAXED_LOAD(&thread->scav_find_work));

-#if defined(THREADED_RTS) && defined(PROF_SPIN)
-        gc_spin_spin += RELAXED_LOAD(&thread->gc_spin.spin);
-        gc_spin_yield += RELAXED_LOAD(&thread->gc_spin.yield);
-        mut_spin_spin += RELAXED_LOAD(&thread->mut_spin.spin);
-        mut_spin_yield += RELAXED_LOAD(&thread->mut_spin.yield);
-#endif
-
         any_work += RELAXED_LOAD(&thread->any_work);
         scav_find_work += RELAXED_LOAD(&thread->scav_find_work);
         max_n_todo_overflow = stg_max(RELAXED_LOAD(&thread->max_n_todo_overflow), max_n_todo_overflow);
@@ -1044,7 +1035,6 @@ GarbageCollect (uint32_t collect_gen,
                 live_blocks * BLOCK_SIZE_W - live_words /* slop */,
                 N, n_gc_threads, gc_threads,
                 par_max_copied, par_balanced_copied,
-                gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
                 any_work, scav_find_work, max_n_todo_overflow);

 #if defined(RTS_USER_SIGNALS)
@@ -1083,10 +1073,6 @@ new_gc_thread (uint32_t n, gc_thread *t)

 #if defined(THREADED_RTS)
     t->id = 0;
-    initSpinLock(&t->gc_spin);
-    initSpinLock(&t->mut_spin);
-    ACQUIRE_SPIN_LOCK(&t->gc_spin);
-    ACQUIRE_SPIN_LOCK(&t->mut_spin);
     SEQ_CST_STORE(&t->wakeup, GC_THREAD_INACTIVE); // starts true, so we can wait for the
                               // thread to start up, see wakeup_gc_threads
 #endif
@@ -1149,6 +1135,12 @@ initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
     } else {
         gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
                                      "initGcThreads");
+        initMutex(&gc_entry_mutex);
+        initCondition(&gc_entry_arrived_cv);
+        initCondition(&gc_entry_start_now_cv);
+        initMutex(&gc_exit_mutex);
+        initCondition(&gc_exit_arrived_cv);
+        initCondition(&gc_exit_leave_now_cv);
         initMutex(&gc_running_mutex);
         initCondition(&gc_running_cv);
     }
@@ -1185,6 +1177,12 @@ freeGcThreads (void)
         }
         closeCondition(&gc_running_cv);
         closeMutex(&gc_running_mutex);
+        closeCondition(&gc_exit_leave_now_cv);
+        closeCondition(&gc_exit_arrived_cv);
+        closeMutex(&gc_exit_mutex);
+        closeCondition(&gc_entry_start_now_cv);
+        closeCondition(&gc_entry_arrived_cv);
+        closeMutex(&gc_entry_mutex);
         stgFree (gc_threads);
 #else
     for (g = 0; g < RtsFlags.GcFlags.generations; g++)
@@ -1339,16 +1337,16 @@ gcWorkerThread (Capability *cap)
     gct->id = osThreadId();
     stat_startGCWorker (cap, gct);

-    // Wait until we're told to wake up
-    RELEASE_SPIN_LOCK(&gct->mut_spin);
-    // yieldThread();
-    // Strangely, adding a yieldThread() here makes the CPU time
-    // measurements more accurate on Linux, perhaps because it syncs
-    // the CPU time across the multiple cores. Without this, CPU time
-    // is heavily skewed towards GC rather than MUT.
     SEQ_CST_STORE(&gct->wakeup, GC_THREAD_STANDING_BY);
     debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
-    ACQUIRE_SPIN_LOCK(&gct->gc_spin);
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
+    SEQ_CST_ADD(&n_gc_entered, 1);
+    signalCondition(&gc_entry_arrived_cv);
+    while(SEQ_CST_LOAD(&n_gc_entered) != 0) {
+        waitCondition(&gc_entry_start_now_cv, &gc_entry_mutex);
+    }
+    RELEASE_LOCK(&gc_entry_mutex);

     init_gc_thread(gct);

@@ -1372,15 +1370,22 @@ gcWorkerThread (Capability *cap)
 #endif

     // Wait until we're told to continue
-    RELEASE_SPIN_LOCK(&gct->gc_spin);
     debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
                gct->thread_index);
     stat_endGCWorker (cap, gct);
+
     // This must come *after* stat_endGCWorker since it serves to
     // synchronize us with the GC leader, which will later aggregate the
     // GC statistics (#17964,#18717)
+    ACQUIRE_LOCK(&gc_exit_mutex);
     SEQ_CST_STORE(&gct->wakeup, GC_THREAD_WAITING_TO_CONTINUE);
-    ACQUIRE_SPIN_LOCK(&gct->mut_spin);
+    SEQ_CST_ADD(&n_gc_exited, 1);
+    signalCondition(&gc_exit_arrived_cv);
+    while(SEQ_CST_LOAD(&n_gc_exited) != 0) {
+        waitCondition(&gc_exit_leave_now_cv, &gc_exit_mutex);
+    }
+    RELEASE_LOCK(&gc_exit_mutex);
+
     debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);

     SET_GCT(saved_gct);
@@ -1391,56 +1396,53 @@ gcWorkerThread (Capability *cap)
 #if defined(THREADED_RTS)

 void
-waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
+waitForGcThreads (Capability *cap, bool idle_cap[])
 {
-    const uint32_t n_threads = n_capabilities;
+    // n_gc_threads is not valid here, we're too early
+    uint32_t n_threads = n_capabilities;
     const uint32_t me = cap->no;
-    uint32_t i, j;
-    bool retry = true;
+    uint32_t i, cur_n_gc_entered;
     Time t0, t1, t2;

     t0 = t1 = t2 = getProcessElapsedTime();

-    while(retry) {
-        for (i=0; i < n_threads; i++) {
-            if (i == me || idle_cap[i]) continue;
+    for(i = 0; i < n_capabilities; ++i) {
+        if (i == me || idle_cap[i]) {
+            --n_threads;
+        }
+    }
+
+    ASSERT(n_threads < n_capabilities); // must be less becasue we don't count ourself
+    if(n_threads == 0) { return; }
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
+    while((cur_n_gc_entered = SEQ_CST_LOAD(&n_gc_entered)) != n_threads) {
+        ASSERT(cur_n_gc_entered < n_threads);
+        for(i = 0; i < n_capabilities; ++i) {
+            if (i == me || idle_cap[i]) { continue; }
             if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
                 prodCapability(capabilities[i], cap->running_task);
-            }
-        }
-        for (j=0; j < 10; j++) {
-            retry = false;
-            for (i=0; i < n_threads; i++) {
-                if (i == me || idle_cap[i]) continue;
                 write_barrier();
                 interruptCapability(capabilities[i]);
-                if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
-                    retry = true;
-                }
             }
-            if (!retry) break;
-#if defined(PROF_SPIN)
-            waitForGcThreads_yield++;
-#endif
-            yieldThread();
         }
-
+        // this 1ms timeout is not well justified. It's the shortest timeout we
+        // can use on windows. It seems to work well for me.
+        timedWaitCondition(&gc_entry_arrived_cv, &gc_entry_mutex, USToTime(1000));
         t2 = getProcessElapsedTime();
         if (RtsFlags.GcFlags.longGCSync != 0 &&
             t2 - t1 > RtsFlags.GcFlags.longGCSync) {
+            // best not to hold the mutex while we call a hook function
+            RELEASE_LOCK(&gc_entry_mutex);
+
             /* call this every longGCSync of delay */
             rtsConfig.longGCSync(cap->no, t2 - t0);
             t1 = t2;
-        }
-        if (retry) {
-#if defined(PROF_SPIN)
-            // This is a bit strange, we'll get more yields than spins.
-            // I guess that means it's not a spin-lock at all, but these
-            // numbers are still useful (I think).
-            waitForGcThreads_spin++;
-#endif
+
+            ACQUIRE_LOCK(&gc_entry_mutex);
         }
     }
+    RELEASE_LOCK(&gc_entry_mutex);

     if (RtsFlags.GcFlags.longGCSync != 0 &&
         t2 - t0 > RtsFlags.GcFlags.longGCSync) {
@@ -1459,17 +1461,28 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
     if (n_gc_threads == 1) return;

+#if defined(DEBUG)
+    StgInt num_idle = 0;
+    for(i=0; i < n_gc_threads; ++i) {
+        ASSERT(!(i==me && idle_cap[i]));
+        if (idle_cap[i]) { ++num_idle;}
+    }
+    ASSERT(num_idle == n_gc_idle_threads);
+#endif
+
+    ACQUIRE_LOCK(&gc_entry_mutex);
     for (i=0; i < n_gc_threads; i++) {
         if (i == me || idle_cap[i]) continue;
         inc_running();
         debugTrace(DEBUG_gc, "waking up gc thread %d", i);
-        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY)
-            barf("wakeup_gc_threads");
-
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_STANDING_BY);
         SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_RUNNING);
-        ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
-        RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
     }
+    ASSERT(SEQ_CST_LOAD(&n_gc_entered) ==
+        (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads);
+    SEQ_CST_STORE(&n_gc_entered, 0);
+    broadcastCondition(&gc_entry_start_now_cv);
+    RELEASE_LOCK(&gc_entry_mutex);
 #endif
 }

@@ -1477,21 +1490,31 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
 // standby state, otherwise they may still be executing inside
 // scavenge_until_all_done(), and may even remain awake until the next GC starts.
 static void
-shutdown_gc_threads (uint32_t me USED_IF_THREADS,
-                     bool idle_cap[] USED_IF_THREADS)
+shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG,
+                     bool idle_cap[] USED_IF_THREADS USED_IF_DEBUG)
 {
 #if defined(THREADED_RTS)
-    uint32_t i;

     if (n_gc_threads == 1) return;

-    for (i=0; i < n_gc_threads; i++) {
+    // we need to wait for `n_threads` threads. -1 because that's ourself
+    StgInt n_threads = (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads;
+    StgInt cur_n_gc_exited;
+    ACQUIRE_LOCK(&gc_exit_mutex);
+    while((cur_n_gc_exited = SEQ_CST_LOAD(&n_gc_exited)) != n_threads) {
+        ASSERT(cur_n_gc_exited >= 0);
+        ASSERT(cur_n_gc_exited < n_threads);
+        waitCondition(&gc_exit_arrived_cv, &gc_exit_mutex);
+    }
+#if defined(DEBUG)
+    uint32_t i;
+    for (i=0; i < n_capabilities; i++) {
         if (i == me || idle_cap[i]) continue;
-        while (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE) {
-            busy_wait_nop();
-        }
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
     }
-#endif
+#endif // DEBUG
+    RELEASE_LOCK(&gc_exit_mutex);
+#endif // THREADED_RTS
 }

 #if defined(THREADED_RTS)
@@ -1501,15 +1524,25 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
     const uint32_t n_threads = n_capabilities;
     const uint32_t me = cap->no;
     uint32_t i;
+#if defined(DEBUG)
+    uint32_t num_idle = 0;
+    for(i=0; i < n_threads; ++i) {
+        ASSERT(!(i==me && idle_cap[i]));
+        if (idle_cap[i]) { ++num_idle;}
+    }
+#endif
+
     for (i=0; i < n_threads; i++) {
         if (i == me || idle_cap[i]) continue;
-        if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE)
-            barf("releaseGCThreads");
-
+        ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
         SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_INACTIVE);
-        ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
-        RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
     }
+
+    ACQUIRE_LOCK(&gc_exit_mutex);
+    ASSERT(SEQ_CST_LOAD(&n_gc_exited) == (StgInt)n_threads - 1 - (StgInt)num_idle);
+    SEQ_CST_STORE(&n_gc_exited, 0);
+    broadcastCondition(&gc_exit_leave_now_cv);
+    RELEASE_LOCK(&gc_exit_mutex);
 }
 #endif
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 31719ca020..84cf2be939 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -125,9 +125,7 @@ typedef struct gc_thread_ {

 #if defined(THREADED_RTS)
     OSThreadId id;                 // The OS thread that this struct belongs to
-    SpinLock   gc_spin;
-    SpinLock   mut_spin;
-    StgWord    wakeup;             // This should only be accessed via atomic accesses
+    volatile StgWord wakeup;       // NB not StgWord8; only StgWord is guaranteed atomic
 #endif
     uint32_t thread_index;         // a zero based index identifying the thread
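Note on the synchronization pattern: the patch replaces the per-thread gc_spin/mut_spin SpinLocks with two counting "gates", one at GC entry (gc_entry_mutex, gc_entry_arrived_cv, gc_entry_start_now_cv, n_gc_entered) and one at GC exit (the analogous gc_exit_* variables). Workers check in under the mutex and sleep until the leader resets the counter and broadcasts. Below is a minimal, self-contained sketch of that handshake using plain POSIX threads rather than the RTS's Mutex/Condition wrappers; the names gate_t, gate_arrive and gate_release are hypothetical and exist only for illustration, and the sketch omits details of the real code such as the capability prodding and the 1 ms timed wait in waitForGcThreads.

/* Sketch (assumed illustration, not RTS code): a counting gate built from a
 * mutex and two condition variables, mirroring the entry/exit gates added by
 * this patch.  Fields would be initialized with pthread_mutex_init /
 * pthread_cond_init (or the static initializers), much as initGcThreads now
 * calls initMutex and initCondition. */
#include <pthread.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  arrived;    /* a worker has checked in              */
    pthread_cond_t  start_now;  /* the leader has reset the counter     */
    int             n_entered;  /* number of workers currently checked in */
} gate_t;

/* Worker side: check in, wake the leader, then sleep until the leader zeroes
 * the counter.  This mirrors what gcWorkerThread now does at both gates. */
static void gate_arrive(gate_t *g)
{
    pthread_mutex_lock(&g->mutex);
    g->n_entered++;
    pthread_cond_signal(&g->arrived);
    while (g->n_entered != 0) {               /* loop tolerates spurious wakeups */
        pthread_cond_wait(&g->start_now, &g->mutex);
    }
    pthread_mutex_unlock(&g->mutex);
}

/* Leader side: wait until n_threads workers have checked in, then release
 * them all at once.  This mirrors waitForGcThreads/wakeup_gc_threads on entry
 * and shutdown_gc_threads/releaseGCThreads on exit. */
static void gate_release(gate_t *g, int n_threads)
{
    pthread_mutex_lock(&g->mutex);
    while (g->n_entered != n_threads) {
        pthread_cond_wait(&g->arrived, &g->mutex);
    }
    g->n_entered = 0;                         /* reset for the next GC cycle */
    pthread_cond_broadcast(&g->start_now);    /* let every worker proceed    */
    pthread_mutex_unlock(&g->mutex);
}

Resetting the counter on the leader side before broadcasting is what keeps a worker from racing through the same gate twice, and broadcasting to all workers at once replaces the old scheme in which the leader herded each worker individually by acquiring and releasing its gc_spin/mut_spin pair.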