diff options
author | Ben Gamari <ben@smart-cactus.org> | 2020-10-05 18:12:38 -0400 |
---|---|---|
committer | Ben Gamari <ben@smart-cactus.org> | 2020-11-24 12:03:03 -0500 |
commit | 2e76a63116df473e720ab1d95e3aa159b2d2998f (patch) | |
tree | 762c759aec19c4882a5fab527d2be564dbbe26d1 | |
parent | 1bcc9cd05822587be768eff708ebdb50822ab2af (diff) | |
download | haskell-2e76a63116df473e720ab1d95e3aa159b2d2998f.tar.gz |
Document schedulePushWork race
-rw-r--r-- | rts/Capability.c | 95 | ||||
-rw-r--r-- | rts/Schedule.c | 9 |
2 files changed, 70 insertions, 34 deletions
diff --git a/rts/Capability.c b/rts/Capability.c index 3031d283c4..eb52505656 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -643,6 +643,36 @@ enqueueWorker (Capability* cap USED_IF_THREADS) #endif +/* + * Note [Benign data race due to work-pushing] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * #17276 points out a tricky data race (noticed by ThreadSanitizer) between + * waitForWorkerCapability and schedulePushWork. In short, schedulePushWork + * works as follows: + * + * 1. collect the set of all idle capabilities, take cap->lock of each. + * + * 2. sort through each TSO on the calling capability's run queue and push + * some to idle capabilities. This may (if the TSO is a bound thread) + * involve setting tso->bound->task->cap despite not holding + * tso->bound->task->lock. + * + * 3. release cap->lock of all idle capabilities. + * + * Now, step 2 is in principle safe since the capability of the caller of + * schedulePushWork *owns* the TSO and therefore the Task to which it is bound. + * Furthermore, step 3 ensures that the write in step (2) will be visible to + * any core which starts execution of the previously-idle capability. + * + * However, this argument doesn't quite work for waitForWorkerCapability, which + * reads task->cap *without* first owning the capability which owns `task`. + * For this reason, we check again whether the task has been migrated to + * another capability after taking task->cap->lock. See Note [migrated bound + * threads] above. + * + */ + /* ---------------------------------------------------------------------------- * waitForWorkerCapability(task) * @@ -664,6 +694,9 @@ static Capability * waitForWorkerCapability (Task *task) // schedulePushWork, which does owns 'task' when it sets 'task->cap'. TSAN_ANNOTATE_HAPPENS_AFTER(&task->cap); cap = task->cap; + + // See Note [Benign data race due to work-pushing]. + TSAN_ANNOTATE_BENIGN_RACE(&task->cap, "we will double-check this below"); task->wakeup = false; RELEASE_LOCK(&task->lock); @@ -977,33 +1010,41 @@ yieldCapability (Capability** pCap, Task *task, bool gcAllowed) #endif /* THREADED_RTS */ -// Note [migrated bound threads] -// -// There's a tricky case where: -// - cap A is running an unbound thread T1 -// - there is a bound thread T2 at the head of the run queue on cap A -// - T1 makes a safe foreign call, the task bound to T2 is woken up on cap A -// - T1 returns quickly grabbing A again (T2 is still waking up on A) -// - T1 blocks, the scheduler migrates T2 to cap B -// - the task bound to T2 wakes up on cap B -// -// We take advantage of the following invariant: -// -// - A bound thread can only be migrated by the holder of the -// Capability on which the bound thread currently lives. So, if we -// hold Capability C, and task->cap == C, then task cannot be -// migrated under our feet. - -// Note [migrated bound threads 2] -// -// Second tricky case; -// - A bound Task becomes a GC thread -// - scheduleDoGC() migrates the thread belonging to this Task, -// because the Capability it is on is disabled -// - after GC, gcWorkerThread() returns, but now we are -// holding a Capability that is not the same as task->cap -// - Hence we must check for this case and immediately give up the -// cap we hold. +/* + * Note [migrated bound threads] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * There's a tricky case where: + * - cap A is running an unbound thread T1 + * - there is a bound thread T2 at the head of the run queue on cap A + * - T1 makes a safe foreign call, the task bound to T2 is woken up on cap A + * - T1 returns quickly grabbing A again (T2 is still waking up on A) + * - T1 blocks, the scheduler migrates T2 to cap B + * - the task bound to T2 wakes up on cap B + * + * We take advantage of the following invariant: + * + * - A bound thread can only be migrated by the holder of the + * Capability on which the bound thread currently lives. So, if we + * hold Capability C, and task->cap == C, then task cannot be + * migrated under our feet. + * + * See also Note [Benign data race due to work-pushing]. + * + * + * Note [migrated bound threads 2] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Second tricky case; + * - A bound Task becomes a GC thread + * - scheduleDoGC() migrates the thread belonging to this Task, + * because the Capability it is on is disabled + * - after GC, gcWorkerThread() returns, but now we are + * holding a Capability that is not the same as task->cap + * - Hence we must check for this case and immediately give up the + * cap we hold. + * + */ /* ---------------------------------------------------------------------------- * prodCapability diff --git a/rts/Schedule.c b/rts/Schedule.c index e7d60297f1..e542e4006a 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -843,14 +843,9 @@ schedulePushWork(Capability *cap USED_IF_THREADS, appendToRunQueue(free_caps[i],t); traceEventMigrateThread (cap, t, free_caps[i]->no); + // See Note [Benign data race due to work-pushing]. if (t->bound) { - // N.B. we typically would need to hold 't->bound->task->lock' to change 'cap' - // but this is safe because the Task lives on our run queue. See - // Note [Ownership of Task]. - TSAN_ANNOTATE_HAPPENS_BEFORE(&t->bound->task->cap); - // The happens-before matches the happens-after in - // waitForWorkerCapability - RELAXED_STORE(&t->bound->task->cap, free_caps[i]); + t->bound->task->cap = free_caps[i]; } t->cap = free_caps[i]; n--; // we have one fewer threads now |