summaryrefslogtreecommitdiff
path: root/rts/PrimOps.cmm
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2010-02-16 12:34:11 +0000
committerSimon Marlow <marlowsd@gmail.com>2010-02-16 12:34:11 +0000
commitc44aaa1087388b0555a6026b11af2433e86136d0 (patch)
tree2b55e7ab81900c41b2b7f1a270ca6eaeb5532da7 /rts/PrimOps.cmm
parent3fd25ca4ae2b41c283210112c6e710e8a44804ba (diff)
downloadhaskell-c44aaa1087388b0555a6026b11af2433e86136d0.tar.gz
Fix a bug that can lead to noDuplicate# not working sometimes.
The symptom is that under some rare conditions when running in parallel, an unsafePerformIO or unsafeInterleaveIO computation might be duplicated, so e.g. lazy I/O might give the wrong answer (the stream might appear to have duplicate parts or parts missing). I have a program that demonstrates it -N3 or more, some lazy I/O, and a lot of shared mutable state. See the comment with stg_noDuplicatezh in PrimOps.cmm that explains the problem and the fix. This took me about a day to find :-(
Diffstat (limited to 'rts/PrimOps.cmm')
-rw-r--r--rts/PrimOps.cmm79
1 files changed, 71 insertions, 8 deletions
diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm
index d7cc3e82ec..5325c85d1f 100644
--- a/rts/PrimOps.cmm
+++ b/rts/PrimOps.cmm
@@ -1845,12 +1845,71 @@ stg_asyncDoProczh
}
#endif
-// noDuplicate# tries to ensure that none of the thunks under
-// evaluation by the current thread are also under evaluation by
-// another thread. It relies on *both* threads doing noDuplicate#;
-// the second one will get blocked if they are duplicating some work.
+/* -----------------------------------------------------------------------------
+ * noDuplicate#
+ *
+ * noDuplicate# tries to ensure that none of the thunks under
+ * evaluation by the current thread are also under evaluation by
+ * another thread. It relies on *both* threads doing noDuplicate#;
+ * the second one will get blocked if they are duplicating some work.
+ *
+ * The idea is that noDuplicate# is used within unsafePerformIO to
+ * ensure that the IO operation is performed at most once.
+ * noDuplicate# calls threadPaused which acquires an exclusive lock on
+ * all the thunks currently under evaluation by the current thread.
+ *
+ * Consider the following scenario. There is a thunk A, whose
+ * evaluation requires evaluating thunk B, where thunk B is an
+ * unsafePerformIO. Two threads, 1 and 2, bother enter A. Thread 2
+ * is pre-empted before it enters B, and claims A by blackholing it
+ * (in threadPaused). Thread 1 now enters B, and calls noDuplicate#.
+ *
+ * thread 1 thread 2
+ * +-----------+ +---------------+
+ * | -------+-----> A <-------+------- |
+ * | update | BLACKHOLE | marked_update |
+ * +-----------+ +---------------+
+ * | | | |
+ * ... ...
+ * | | +---------------+
+ * +-----------+
+ * | ------+-----> B
+ * | update | BLACKHOLE
+ * +-----------+
+ *
+ * At this point: A is a blackhole, owned by thread 2. noDuplicate#
+ * calls threadPaused, which walks up the stack and
+ * - claims B on behalf of thread 1
+ * - then it reaches the update frame for A, which it sees is already
+ * a BLACKHOLE and is therefore owned by another thread. Since
+ * thread 1 is duplicating work, the computation up to the update
+ * frame for A is suspended, including thunk B.
+ * - thunk B, which is an unsafePerformIO, has now been reverted to
+ * an AP_STACK which could be duplicated - BAD!
+ * - The solution is as follows: before calling threadPaused, we
+ * leave a frame on the stack (stg_noDuplicate_info) that will call
+ * noDuplicate# again if the current computation is suspended and
+ * restarted.
+ *
+ * See the test program in concurrent/prog003 for a way to demonstrate
+ * this. It needs to be run with +RTS -N3 or greater, and the bug
+ * only manifests occasionally (once very 10 runs or so).
+ * -------------------------------------------------------------------------- */
+
+INFO_TABLE_RET(stg_noDuplicate, RET_SMALL)
+{
+ Sp_adj(1);
+ jump stg_noDuplicatezh;
+}
+
stg_noDuplicatezh
{
+ STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh );
+ // leave noDuplicate frame in case the current
+ // computation is suspended and restarted (see above).
+ Sp_adj(-1);
+ Sp(0) = stg_noDuplicate_info;
+
SAVE_THREAD_STATE();
ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") [];
@@ -1860,10 +1919,18 @@ stg_noDuplicatezh
} else {
LOAD_THREAD_STATE();
ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16);
+ // remove the stg_noDuplicate frame if it is still there.
+ if (Sp(0) == stg_noDuplicate_info) {
+ Sp_adj(1);
+ }
jump %ENTRY_CODE(Sp(0));
}
}
+/* -----------------------------------------------------------------------------
+ Misc. primitives
+ -------------------------------------------------------------------------- */
+
stg_getApStackValzh
{
W_ ap_stack, offset, val, ok;
@@ -1882,10 +1949,6 @@ stg_getApStackValzh
RET_NP(ok,val);
}
-/* -----------------------------------------------------------------------------
- Misc. primitives
- -------------------------------------------------------------------------- */
-
// Write the cost center stack of the first argument on stderr; return
// the second. Possibly only makes sense for already evaluated
// things?