author     Douglas Wilson <douglas.wilson@gmail.com>    2020-12-17 15:05:25 +0000
committer  Marge Bot <ben+marge-bot@smart-cactus.org>   2021-01-17 05:49:54 -0500
commit     d56fdad76c5fe025ad628aad727c8a1c7e745513 (patch)
tree       97fb968d4bd6347bd478574e9ffbc1d2701d7f29
parent     33fc453fd46e54410472a7bad7585e5d1821c0ec (diff)
rts: gc: use mutex+condvar instead of spinlocks in gc entry/exit
Use a timed wait on a condition variable in waitForGcThreads; fix a dodgy timespec calculation.
-rw-r--r--  includes/RtsAPI.h  |  14
-rw-r--r--  rts/Stats.c        |  30
-rw-r--r--  rts/Stats.h        |   4
-rw-r--r--  rts/sm/GC.c        | 185
-rw-r--r--  rts/sm/GCThread.h  |   4
5 files changed, 113 insertions(+), 124 deletions(-)
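
For context, the core of the change is a counter-plus-condition-variable rendezvous: each non-idle GC worker bumps a shared counter under a mutex, signals the leader, and sleeps on a second condition variable until the leader resets the counter and broadcasts. The code below is a minimal, self-contained sketch of that shape using plain pthreads; the names (gc_barrier_t and friends) are illustrative and not the RTS's own, which uses the OSThreads wrappers (ACQUIRE_LOCK, waitCondition, signalCondition, broadcastCondition) plus SEQ_CST atomics on the counters.

/* Minimal sketch (not the RTS implementation) of the mutex+condvar
 * rendezvous that replaces the gc_spin/mut_spin spinlocks.  The patch
 * uses one such pair of condition variables for GC entry and another
 * for GC exit. */
#include <pthread.h>

typedef struct {
    pthread_mutex_t mutex;
    pthread_cond_t  arrived_cv;    /* workers signal: "I have checked in"   */
    pthread_cond_t  start_now_cv;  /* leader broadcasts: "everyone proceed" */
    int             n_arrived;     /* number of workers checked in so far   */
} gc_barrier_t;

#define GC_BARRIER_INIT \
    { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, \
      PTHREAD_COND_INITIALIZER, 0 }

/* Worker side: check in, then sleep until the leader releases everyone
 * by resetting n_arrived to zero (cf. gcWorkerThread in the diff below). */
static void barrier_worker_arrive(gc_barrier_t *b)
{
    pthread_mutex_lock(&b->mutex);
    b->n_arrived++;
    pthread_cond_signal(&b->arrived_cv);
    while (b->n_arrived != 0) {
        pthread_cond_wait(&b->start_now_cv, &b->mutex);
    }
    pthread_mutex_unlock(&b->mutex);
}

/* Leader side: wait for all n_workers to check in, then release them in
 * one go (cf. waitForGcThreads/wakeup_gc_threads and
 * shutdown_gc_threads/releaseGCThreads below). */
static void barrier_leader_release(gc_barrier_t *b, int n_workers)
{
    pthread_mutex_lock(&b->mutex);
    while (b->n_arrived != n_workers) {
        pthread_cond_wait(&b->arrived_cv, &b->mutex);
    }
    b->n_arrived = 0;                   /* lets the workers' wait loops exit */
    pthread_cond_broadcast(&b->start_now_cv);
    pthread_mutex_unlock(&b->mutex);
}

Note that this sketch handles a single round, and the RTS splits the waiting and the release across separate functions (waitForGcThreads/wakeup_gc_threads on entry, shutdown_gc_threads/releaseGCThreads on exit); the sketch folds each pair into one function for brevity.
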
diff --git a/includes/RtsAPI.h b/includes/RtsAPI.h
index c896b5b244..e2d1845819 100644
--- a/includes/RtsAPI.h
+++ b/includes/RtsAPI.h
@@ -251,20 +251,6 @@ typedef struct _RTSStats {
// -----------------------------------
// Internal Counters
- // The number of times a GC thread spun on its 'gc_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
- uint64_t gc_spin_spin;
- // The number of times a GC thread yielded on its 'gc_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
- uint64_t gc_spin_yield;
- // The number of times a GC thread spun on its 'mut_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
- uint64_t mut_spin_spin;
- // The number of times a GC thread yielded on its 'mut_spin' lock.
- // Will be zero if the rts was not built with PROF_SPIN
- uint64_t mut_spin_yield;
- // The number of times a GC thread has checked for work across all parallel
- // GCs
uint64_t any_work;
// The number of times a GC thread has iterated its outer loop across all
// parallel GCs
diff --git a/rts/Stats.c b/rts/Stats.c
index 45d40ddcad..53251bb7d6 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -154,10 +154,6 @@ initStats0(void)
.par_copied_bytes = 0,
.cumulative_par_max_copied_bytes = 0,
.cumulative_par_balanced_copied_bytes = 0,
- .gc_spin_spin = 0,
- .gc_spin_yield = 0,
- .mut_spin_spin = 0,
- .mut_spin_yield = 0,
.any_work = 0,
.scav_find_work = 0,
.max_n_todo_overflow = 0,
@@ -460,9 +456,8 @@ stat_startGC (Capability *cap, gc_thread *gct)
void
stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ slop,
uint32_t gen, uint32_t par_n_threads, gc_thread **gc_threads,
- W_ par_max_copied, W_ par_balanced_copied, W_ gc_spin_spin, W_ gc_spin_yield,
- W_ mut_spin_spin, W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
- W_ max_n_todo_overflow)
+ W_ par_max_copied, W_ par_balanced_copied, W_ any_work,
+ W_ scav_find_work, W_ max_n_todo_overflow)
{
ACQUIRE_LOCK(&stats_mutex);
@@ -544,10 +539,6 @@ stat_endGC (Capability *cap, gc_thread *initiating_gct, W_ live, W_ copied, W_ s
stats.any_work += any_work;
stats.scav_find_work += scav_find_work;
stats.max_n_todo_overflow += stg_max(max_n_todo_overflow, stats.max_n_todo_overflow);
- stats.gc_spin_spin += gc_spin_spin;
- stats.gc_spin_yield += gc_spin_yield;
- stats.mut_spin_spin += mut_spin_spin;
- stats.mut_spin_yield += mut_spin_yield;
}
stats.gc_cpu_ns += stats.gc.cpu_ns;
stats.gc_elapsed_ns += stats.gc.elapsed_ns;
@@ -968,16 +959,6 @@ static void report_summary(const RTSSummaryStats* sum)
, col_width[1], "gc_alloc_block_sync"
, col_width[2], gc_alloc_block_sync.spin
, col_width[3], gc_alloc_block_sync.yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "gc_spin"
- , col_width[2], stats.gc_spin_spin
- , col_width[3], stats.gc_spin_yield);
- statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*" FMT_Word64 "\n"
- , col_width[0], ""
- , col_width[1], "mut_spin"
- , col_width[2], stats.mut_spin_spin
- , col_width[3], stats.mut_spin_yield);
statsPrintf("%*s" "%*s" "%*" FMT_Word64 "%*s\n"
, col_width[0], ""
, col_width[1], "whitehole_gc"
@@ -1154,10 +1135,6 @@ static void report_machine_readable (const RTSSummaryStats * sum)
MR_STAT("gc_alloc_block_sync_yield", FMT_Word64,
gc_alloc_block_sync.yield);
MR_STAT("gc_alloc_block_sync_spin", FMT_Word64, gc_alloc_block_sync.spin);
- MR_STAT("gc_spin_spin", FMT_Word64, stats.gc_spin_spin);
- MR_STAT("gc_spin_yield", FMT_Word64, stats.gc_spin_yield);
- MR_STAT("mut_spin_spin", FMT_Word64, stats.mut_spin_spin);
- MR_STAT("mut_spin_yield", FMT_Word64, stats.mut_spin_yield);
MR_STAT("waitForGcThreads_spin", FMT_Word64, waitForGcThreads_spin);
MR_STAT("waitForGcThreads_yield", FMT_Word64,
waitForGcThreads_yield);
@@ -1587,9 +1564,6 @@ Actual SpinLocks:
* gc_alloc_block:
This SpinLock protects the block allocator and free list manager. See
BlockAlloc.c.
-* gc_spin and mut_spin:
- These SpinLocks are used to herd gc worker threads during parallel garbage
- collection. See gcWorkerThread, wakeup_gc_threads and releaseGCThreads.
* gen[g].sync:
These SpinLocks, one per generation, protect the generations[g] data
structure during garbage collection.
diff --git a/rts/Stats.h b/rts/Stats.h
index 19bd707302..5268c3ae63 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -36,9 +36,7 @@ void stat_endGC (Capability *cap, struct gc_thread_ *initiating_gct, W_ li
W_ copied, W_ slop, uint32_t gen,
uint32_t n_gc_threads, struct gc_thread_ **gc_threads,
W_ par_max_copied, W_ par_balanced_copied,
- W_ gc_spin_spin, W_ gc_spin_yield, W_ mut_spin_spin,
- W_ mut_spin_yield, W_ any_work, W_ scav_find_work,
- W_ max_n_todo_overflow);
+ W_ any_work, W_ scav_find_work, W_ max_n_todo_overflow);
void stat_startNonmovingGcSync(void);
void stat_endNonmovingGcSync(void);
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 5e986f2296..4ddbcfdafd 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -269,7 +269,6 @@ GarbageCollect (uint32_t collect_gen,
bdescr *bd;
generation *gen;
StgWord live_blocks, live_words, par_max_copied, par_balanced_copied,
- gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
any_work, scav_find_work, max_n_todo_overflow;
#if defined(THREADED_RTS)
gc_thread *saved_gct;
@@ -391,7 +390,10 @@ GarbageCollect (uint32_t collect_gen,
}
work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
N >= RtsFlags.ParFlags.parGcLoadBalancingGen &&
- n_gc_threads > 1;
+ (gc_type == SYNC_GC_PAR) &&
+ (n_gc_threads - n_gc_idle_threads) > 1;
+ // ^ if we are the only non-idle gc thread, turn off work stealing
+ //
// It's not always a good idea to do load balancing in parallel
// GC. In particular, for a parallel program we don't want to
// lose locality by moving cached data into another CPU's cache
@@ -588,10 +590,6 @@ GarbageCollect (uint32_t collect_gen,
copied = 0;
par_max_copied = 0;
par_balanced_copied = 0;
- gc_spin_spin = 0;
- gc_spin_yield = 0;
- mut_spin_spin = 0;
- mut_spin_yield = 0;
any_work = 0;
scav_find_work = 0;
max_n_todo_overflow = 0;
@@ -617,13 +615,6 @@ GarbageCollect (uint32_t collect_gen,
debugTrace(DEBUG_gc," scav_find_work %ld",
RELAXED_LOAD(&thread->scav_find_work));
-#if defined(THREADED_RTS) && defined(PROF_SPIN)
- gc_spin_spin += RELAXED_LOAD(&thread->gc_spin.spin);
- gc_spin_yield += RELAXED_LOAD(&thread->gc_spin.yield);
- mut_spin_spin += RELAXED_LOAD(&thread->mut_spin.spin);
- mut_spin_yield += RELAXED_LOAD(&thread->mut_spin.yield);
-#endif
-
any_work += RELAXED_LOAD(&thread->any_work);
scav_find_work += RELAXED_LOAD(&thread->scav_find_work);
max_n_todo_overflow = stg_max(RELAXED_LOAD(&thread->max_n_todo_overflow), max_n_todo_overflow);
@@ -1044,7 +1035,6 @@ GarbageCollect (uint32_t collect_gen,
live_blocks * BLOCK_SIZE_W - live_words /* slop */,
N, n_gc_threads, gc_threads,
par_max_copied, par_balanced_copied,
- gc_spin_spin, gc_spin_yield, mut_spin_spin, mut_spin_yield,
any_work, scav_find_work, max_n_todo_overflow);
#if defined(RTS_USER_SIGNALS)
@@ -1083,10 +1073,6 @@ new_gc_thread (uint32_t n, gc_thread *t)
#if defined(THREADED_RTS)
t->id = 0;
- initSpinLock(&t->gc_spin);
- initSpinLock(&t->mut_spin);
- ACQUIRE_SPIN_LOCK(&t->gc_spin);
- ACQUIRE_SPIN_LOCK(&t->mut_spin);
SEQ_CST_STORE(&t->wakeup, GC_THREAD_INACTIVE); // starts true, so we can wait for the
// thread to start up, see wakeup_gc_threads
#endif
@@ -1149,6 +1135,12 @@ initGcThreads (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS)
} else {
gc_threads = stgMallocBytes (to * sizeof(gc_thread*),
"initGcThreads");
+ initMutex(&gc_entry_mutex);
+ initCondition(&gc_entry_arrived_cv);
+ initCondition(&gc_entry_start_now_cv);
+ initMutex(&gc_exit_mutex);
+ initCondition(&gc_exit_arrived_cv);
+ initCondition(&gc_exit_leave_now_cv);
initMutex(&gc_running_mutex);
initCondition(&gc_running_cv);
}
@@ -1185,6 +1177,12 @@ freeGcThreads (void)
}
closeCondition(&gc_running_cv);
closeMutex(&gc_running_mutex);
+ closeCondition(&gc_exit_leave_now_cv);
+ closeCondition(&gc_exit_arrived_cv);
+ closeMutex(&gc_exit_mutex);
+ closeCondition(&gc_entry_start_now_cv);
+ closeCondition(&gc_entry_arrived_cv);
+ closeMutex(&gc_entry_mutex);
stgFree (gc_threads);
#else
for (g = 0; g < RtsFlags.GcFlags.generations; g++)
@@ -1339,16 +1337,16 @@ gcWorkerThread (Capability *cap)
gct->id = osThreadId();
stat_startGCWorker (cap, gct);
- // Wait until we're told to wake up
- RELEASE_SPIN_LOCK(&gct->mut_spin);
- // yieldThread();
- // Strangely, adding a yieldThread() here makes the CPU time
- // measurements more accurate on Linux, perhaps because it syncs
- // the CPU time across the multiple cores. Without this, CPU time
- // is heavily skewed towards GC rather than MUT.
SEQ_CST_STORE(&gct->wakeup, GC_THREAD_STANDING_BY);
debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index);
- ACQUIRE_SPIN_LOCK(&gct->gc_spin);
+
+ ACQUIRE_LOCK(&gc_entry_mutex);
+ SEQ_CST_ADD(&n_gc_entered, 1);
+ signalCondition(&gc_entry_arrived_cv);
+ while(SEQ_CST_LOAD(&n_gc_entered) != 0) {
+ waitCondition(&gc_entry_start_now_cv, &gc_entry_mutex);
+ }
+ RELEASE_LOCK(&gc_entry_mutex);
init_gc_thread(gct);
@@ -1372,15 +1370,22 @@ gcWorkerThread (Capability *cap)
#endif
// Wait until we're told to continue
- RELEASE_SPIN_LOCK(&gct->gc_spin);
debugTrace(DEBUG_gc, "GC thread %d waiting to continue...",
gct->thread_index);
stat_endGCWorker (cap, gct);
+
// This must come *after* stat_endGCWorker since it serves to
// synchronize us with the GC leader, which will later aggregate the
// GC statistics (#17964,#18717)
+ ACQUIRE_LOCK(&gc_exit_mutex);
SEQ_CST_STORE(&gct->wakeup, GC_THREAD_WAITING_TO_CONTINUE);
- ACQUIRE_SPIN_LOCK(&gct->mut_spin);
+ SEQ_CST_ADD(&n_gc_exited, 1);
+ signalCondition(&gc_exit_arrived_cv);
+ while(SEQ_CST_LOAD(&n_gc_exited) != 0) {
+ waitCondition(&gc_exit_leave_now_cv, &gc_exit_mutex);
+ }
+ RELEASE_LOCK(&gc_exit_mutex);
+
debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index);
SET_GCT(saved_gct);
@@ -1391,56 +1396,53 @@ gcWorkerThread (Capability *cap)
#if defined(THREADED_RTS)
void
-waitForGcThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
+waitForGcThreads (Capability *cap, bool idle_cap[])
{
- const uint32_t n_threads = n_capabilities;
+ // n_gc_threads is not valid here, we're too early
+ uint32_t n_threads = n_capabilities;
const uint32_t me = cap->no;
- uint32_t i, j;
- bool retry = true;
+ uint32_t i, cur_n_gc_entered;
Time t0, t1, t2;
t0 = t1 = t2 = getProcessElapsedTime();
- while(retry) {
- for (i=0; i < n_threads; i++) {
- if (i == me || idle_cap[i]) continue;
+ for(i = 0; i < n_capabilities; ++i) {
+ if (i == me || idle_cap[i]) {
+ --n_threads;
+ }
+ }
+
+ ASSERT(n_threads < n_capabilities); // must be less because we don't count ourselves
+ if(n_threads == 0) { return; }
+
+ ACQUIRE_LOCK(&gc_entry_mutex);
+ while((cur_n_gc_entered = SEQ_CST_LOAD(&n_gc_entered)) != n_threads) {
+ ASSERT(cur_n_gc_entered < n_threads);
+ for(i = 0; i < n_capabilities; ++i) {
+ if (i == me || idle_cap[i]) { continue; }
if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
prodCapability(capabilities[i], cap->running_task);
- }
- }
- for (j=0; j < 10; j++) {
- retry = false;
- for (i=0; i < n_threads; i++) {
- if (i == me || idle_cap[i]) continue;
write_barrier();
interruptCapability(capabilities[i]);
- if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) {
- retry = true;
- }
}
- if (!retry) break;
-#if defined(PROF_SPIN)
- waitForGcThreads_yield++;
-#endif
- yieldThread();
}
-
+ // This 1ms timeout is not well justified. It's the shortest timeout we
+ // can use on Windows. It seems to work well in practice.
+ timedWaitCondition(&gc_entry_arrived_cv, &gc_entry_mutex, USToTime(1000));
t2 = getProcessElapsedTime();
if (RtsFlags.GcFlags.longGCSync != 0 &&
t2 - t1 > RtsFlags.GcFlags.longGCSync) {
+ // best not to hold the mutex while we call a hook function
+ RELEASE_LOCK(&gc_entry_mutex);
+
/* call this every longGCSync of delay */
rtsConfig.longGCSync(cap->no, t2 - t0);
t1 = t2;
- }
- if (retry) {
-#if defined(PROF_SPIN)
- // This is a bit strange, we'll get more yields than spins.
- // I guess that means it's not a spin-lock at all, but these
- // numbers are still useful (I think).
- waitForGcThreads_spin++;
-#endif
+
+ ACQUIRE_LOCK(&gc_entry_mutex);
}
}
+ RELEASE_LOCK(&gc_entry_mutex);
if (RtsFlags.GcFlags.longGCSync != 0 &&
t2 - t0 > RtsFlags.GcFlags.longGCSync) {
@@ -1459,17 +1461,28 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
if (n_gc_threads == 1) return;
+#if defined(DEBUG)
+ StgInt num_idle = 0;
+ for(i=0; i < n_gc_threads; ++i) {
+ ASSERT(!(i==me && idle_cap[i]));
+ if (idle_cap[i]) { ++num_idle;}
+ }
+ ASSERT(num_idle == n_gc_idle_threads);
+#endif
+
+ ACQUIRE_LOCK(&gc_entry_mutex);
for (i=0; i < n_gc_threads; i++) {
if (i == me || idle_cap[i]) continue;
inc_running();
debugTrace(DEBUG_gc, "waking up gc thread %d", i);
- if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY)
- barf("wakeup_gc_threads");
-
+ ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_STANDING_BY);
SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_RUNNING);
- ACQUIRE_SPIN_LOCK(&gc_threads[i]->mut_spin);
- RELEASE_SPIN_LOCK(&gc_threads[i]->gc_spin);
}
+ ASSERT(SEQ_CST_LOAD(&n_gc_entered) ==
+ (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads);
+ SEQ_CST_STORE(&n_gc_entered, 0);
+ broadcastCondition(&gc_entry_start_now_cv);
+ RELEASE_LOCK(&gc_entry_mutex);
#endif
}
@@ -1477,21 +1490,31 @@ wakeup_gc_threads (uint32_t me USED_IF_THREADS,
// standby state, otherwise they may still be executing inside
// scavenge_until_all_done(), and may even remain awake until the next GC starts.
static void
-shutdown_gc_threads (uint32_t me USED_IF_THREADS,
- bool idle_cap[] USED_IF_THREADS)
+shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG,
+ bool idle_cap[] USED_IF_THREADS USED_IF_DEBUG)
{
#if defined(THREADED_RTS)
- uint32_t i;
if (n_gc_threads == 1) return;
- for (i=0; i < n_gc_threads; i++) {
+ // we need to wait for `n_threads` threads; the -1 is because we don't count ourselves
+ StgInt n_threads = (StgInt)n_gc_threads - 1 - (StgInt)n_gc_idle_threads;
+ StgInt cur_n_gc_exited;
+ ACQUIRE_LOCK(&gc_exit_mutex);
+ while((cur_n_gc_exited = SEQ_CST_LOAD(&n_gc_exited)) != n_threads) {
+ ASSERT(cur_n_gc_exited >= 0);
+ ASSERT(cur_n_gc_exited < n_threads);
+ waitCondition(&gc_exit_arrived_cv, &gc_exit_mutex);
+ }
+#if defined(DEBUG)
+ uint32_t i;
+ for (i=0; i < n_capabilities; i++) {
if (i == me || idle_cap[i]) continue;
- while (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE) {
- busy_wait_nop();
- }
+ ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
}
-#endif
+#endif // DEBUG
+ RELEASE_LOCK(&gc_exit_mutex);
+#endif // THREADED_RTS
}
#if defined(THREADED_RTS)
@@ -1501,15 +1524,25 @@ releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[])
const uint32_t n_threads = n_capabilities;
const uint32_t me = cap->no;
uint32_t i;
+#if defined(DEBUG)
+ uint32_t num_idle = 0;
+ for(i=0; i < n_threads; ++i) {
+ ASSERT(!(i==me && idle_cap[i]));
+ if (idle_cap[i]) { ++num_idle;}
+ }
+#endif
+
for (i=0; i < n_threads; i++) {
if (i == me || idle_cap[i]) continue;
- if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_WAITING_TO_CONTINUE)
- barf("releaseGCThreads");
-
+ ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE);
SEQ_CST_STORE(&gc_threads[i]->wakeup, GC_THREAD_INACTIVE);
- ACQUIRE_SPIN_LOCK(&gc_threads[i]->gc_spin);
- RELEASE_SPIN_LOCK(&gc_threads[i]->mut_spin);
}
+
+ ACQUIRE_LOCK(&gc_exit_mutex);
+ ASSERT(SEQ_CST_LOAD(&n_gc_exited) == (StgInt)n_threads - 1 - (StgInt)num_idle);
+ SEQ_CST_STORE(&n_gc_exited, 0);
+ broadcastCondition(&gc_exit_leave_now_cv);
+ RELEASE_LOCK(&gc_exit_mutex);
}
#endif
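
The leader's entry-side wait in waitForGcThreads above also uses a timed condition wait, so it can keep prodding capabilities that have not yet reached GC_THREAD_STANDING_BY and can invoke the longGCSync hook with the mutex dropped. Below is a simplified standalone sketch of that loop, reusing the illustrative gc_barrier_t from earlier and reducing the capability prodding and the rate-limited hook to plain callbacks; none of these names are the RTS's own.

/* Sketch (not the RTS code) of the leader's timed wait in waitForGcThreads:
 * re-check the arrival counter roughly every millisecond, prod stragglers,
 * and call the long-sync hook outside the lock. */
#include <pthread.h>
#include <time.h>

static void timespec_add_us(struct timespec *ts, long us)
{
    ts->tv_nsec += us * 1000L;
    ts->tv_sec  += ts->tv_nsec / 1000000000L;
    ts->tv_nsec %= 1000000000L;
}

static void leader_wait_for_workers(gc_barrier_t *b, int n_workers,
                                    void (*prod_stragglers)(void),
                                    void (*long_sync_hook)(void))
{
    pthread_mutex_lock(&b->mutex);
    while (b->n_arrived != n_workers) {
        prod_stragglers();              /* e.g. prod/interrupt capabilities */

        struct timespec deadline;
        clock_gettime(CLOCK_REALTIME, &deadline);
        timespec_add_us(&deadline, 1000);   /* 1 ms timeout, as in the patch */
        pthread_cond_timedwait(&b->arrived_cv, &b->mutex, &deadline);

        if (long_sync_hook != NULL) {
            /* Best not to hold the mutex while calling a hook; the RTS
             * additionally rate-limits this to every longGCSync interval,
             * which is elided here. */
            pthread_mutex_unlock(&b->mutex);
            long_sync_hook();
            pthread_mutex_lock(&b->mutex);
        }
    }
    pthread_mutex_unlock(&b->mutex);
}

As in the patch, the timed wait bounds the latency between a worker arriving and the leader noticing it, without burning CPU in a spin/yield loop.
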
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index 31719ca020..84cf2be939 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -125,9 +125,7 @@ typedef struct gc_thread_ {
#if defined(THREADED_RTS)
OSThreadId id; // The OS thread that this struct belongs to
- SpinLock gc_spin;
- SpinLock mut_spin;
- StgWord wakeup; // This should only be accessed via atomic accesses
+ volatile StgWord wakeup; // NB not StgWord8; only StgWord is guaranteed atomic
#endif
uint32_t thread_index; // a zero based index identifying the thread