author     Michael Cahill <michael.cahill@mongodb.com>    2017-06-06 17:07:21 +1000
committer  Alex Gorrod <alexander.gorrod@mongodb.com>     2017-06-07 07:36:50 +1000
commit     6da948fe3d4dd7262cbd0528d66867f276ce811d (patch)
tree       4cf143145412fd515dff5a9a1e714936cdd03db3
parent     acd71ffa0d88a5e9fabac8ce27e940a550bb6418 (diff)
download   mongo-6da948fe3d4dd7262cbd0528d66867f276ce811d.tar.gz
WT-3356 Use atomic reads of rwlocks. (#3454) (tag: mongodb-3.2.14)
* WT-3356 Use atomic reads of rwlocks.

  Previously we had some conditions that checked several fields within a
  rwlock by indirecting to the live structure. Switch to always doing a
  read of the full 64-bit value, then using local reads from the copy.
  Otherwise, we're relying on the compiler and the memory model to order
  the structure accesses in "code execution order". That could explain
  assertion failures and/or incorrect behavior with the new rwlock
  implementation.

* Change all waits to 10ms.

  Previously, when stalling waiting to get into the lock we would wait
  for 1ms, but once queued we waited forever. The former is probably too
  aggressive (it burns too much CPU when we should be able to wait for a
  notification), and the latter is dangerous if a notification is ever
  lost (a thread with a ticket may never wake up).
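For readers unfamiliar with the pattern, here is a minimal C11 sketch of the
snapshot-then-CAS idea the first bullet describes. The lock_state_t layout,
field widths, and function names are invented for illustration; the real
WT_RWLOCK packs different fields (including a read-group ticket) and uses
WiredTiger's own wrappers such as __wt_atomic_casv64 rather than
<stdatomic.h>.

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>

/* Illustrative state word: four 16-bit fields packed into 64 bits. */
typedef union {
	uint64_t v;			/* whole-word view */
	struct {
		uint16_t current;	/* ticket currently holding the lock */
		uint16_t next;		/* next ticket to hand out */
		uint16_t readers_active;/* readers inside the lock */
		uint16_t readers_queued;/* readers waiting for the next batch */
	} s;
} lock_state_t;

typedef struct {
	_Atomic uint64_t v;		/* the live, shared word */
} rwlock_t;

static int
try_readlock(rwlock_t *l)
{
	lock_state_t old, new;

	/*
	 * One atomic load of the full 64-bit word: every later field read
	 * goes through this local snapshot, so the fields are mutually
	 * consistent.  Reading current and next through the live structure
	 * would be two independent loads the compiler and CPU are free to
	 * reorder.
	 */
	old.v = atomic_load(&l->v);

	/* The read lock can only be granted if there are no active writers. */
	if (old.s.current != old.s.next)
		return (EBUSY);

	/* Build the replacement value from the same snapshot. */
	new.v = old.v;
	if (++new.s.readers_active == 0)	/* 16-bit overflow check */
		return (EBUSY);

	/* The CAS publishes the update and provides the barrier. */
	return (atomic_compare_exchange_strong(&l->v, &old.v, new.v) ?
	    0 : EBUSY);
}

The key property is that both the decision and the replacement value are
derived from one local copy of the word, and a single CAS both publishes the
update and detects any concurrent change.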
-rw-r--r--  src/support/mtx_rw.c  81
1 file changed, 51 insertions(+), 30 deletions(-)
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 1c326a50318..7905241be0e 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -121,23 +121,22 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_read);
- new.u.v = old.u.v = l->u.v;
+ old.u.v = l->u.v;
- /*
- * This read lock can only be granted if there are no active writers.
- *
- * Also check for overflow in case there are 64K active readers.
- */
- if (old.u.s.current != old.u.s.next ||
- new.u.s.readers_active == UINT16_MAX)
+ /* This read lock can only be granted if there are no active writers. */
+ if (old.u.s.current != old.u.s.next)
return (EBUSY);
/*
* The replacement lock value is a result of adding an active reader.
- *
- * We rely on this atomic operation to provide a barrier.
+ * Check for overflow: if the maximum number of readers are already
+ * active, no new readers can enter the lock.
*/
- new.u.s.readers_active++;
+ new.u.v = old.u.v;
+ if (++new.u.s.readers_active == 0)
+ return (EBUSY);
+
+ /* We rely on this atomic operation to provide a barrier. */
return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
}
@@ -179,7 +178,8 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
new.u.v = old.u.v;
/*
* Check for overflow: if the maximum number of readers
- * are already active, wait to try again.
+ * are already active, no new readers can enter the
+ * lock.
*/
if (++new.u.s.readers_active == 0)
goto stall;
@@ -202,8 +202,8 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
*/
writers_active = old.u.s.next - old.u.s.current;
if (old.u.s.readers_queued > writers_active) {
-stall: __wt_cond_wait(
- session, l->cond_readers, WT_THOUSAND, NULL);
+stall: __wt_cond_wait(session,
+ l->cond_readers, 10 * WT_THOUSAND, NULL);
continue;
}
@@ -230,8 +230,8 @@ stall: __wt_cond_wait(
else {
session->current_rwlock = l;
session->current_rwticket = ticket;
- __wt_cond_wait(
- session, l->cond_readers, 0, __read_blocked);
+ __wt_cond_wait(session,
+ l->cond_readers, 10 * WT_THOUSAND, __read_blocked);
}
}
@@ -244,7 +244,9 @@ stall: __wt_cond_wait(
*/
WT_READ_BARRIER();
- WT_ASSERT(session, l->u.s.readers_active > 0);
+ /* Sanity check that we (still) have the lock. */
+ WT_ASSERT(session,
+ ticket == l->u.s.current && l->u.s.readers_active > 0);
}
/*
@@ -337,26 +339,37 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_STAT_CONN_INCR(session, rwlock_write);
for (;;) {
- new.u.v = old.u.v = l->u.v;
+ old.u.v = l->u.v;
+
+ /* Allocate a ticket. */
+ new.u.v = old.u.v;
ticket = new.u.s.next++;
/*
- * Avoid wrapping: if we allocate more than 256 tickets, two
- * lockers will simultaneously be granted the lock.
+ * Check for overflow: if the next ticket is allowed to catch
+ * up with the current batch, two writers could be granted the
+ * lock simultaneously.
*/
if (new.u.s.current == new.u.s.next) {
- __wt_cond_wait(
- session, l->cond_writers, WT_THOUSAND, NULL);
+ __wt_cond_wait(session,
+ l->cond_writers, 10 * WT_THOUSAND, NULL);
continue;
}
if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
break;
}
- /* Wait for our group to start and any readers to drain. */
- for (pause_cnt = 0;
- ticket != l->u.s.current || l->u.s.readers_active != 0;
- pause_cnt++) {
+ /*
+ * Wait for our group to start and any readers to drain.
+ *
+ * We take care here to do an atomic read of the full 64-bit lock
+ * value. Otherwise, reads are not guaranteed to be ordered and we
+ * could see no readers active from a different batch and decide that
+ * we have the lock.
+ */
+ for (pause_cnt = 0, old.u.v = l->u.v;
+ ticket != old.u.s.current || old.u.s.readers_active != 0;
+ pause_cnt++, old.u.v = l->u.v) {
if (pause_cnt < 1000)
WT_PAUSE();
else if (pause_cnt < 1200)
@@ -364,8 +377,8 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
else {
session->current_rwlock = l;
session->current_rwticket = ticket;
- __wt_cond_wait(
- session, l->cond_writers, 0, __write_blocked);
+ __wt_cond_wait(session,
+ l->cond_writers, 10 * WT_THOUSAND, __write_blocked);
}
}
@@ -377,6 +390,10 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
* meantime.
*/
WT_READ_BARRIER();
+
+ /* Sanity check that we (still) have the lock. */
+ WT_ASSERT(session,
+ ticket == l->u.s.current && l->u.s.readers_active == 0);
}
/*
@@ -389,7 +406,7 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_RWLOCK new, old;
do {
- new.u.v = old.u.v = l->u.v;
+ old.u.v = l->u.v;
/*
* We're holding the lock exclusive, there shouldn't be any
@@ -404,6 +421,7 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
* to active: this could race with new readlock requests, so we
* have to spin.
*/
+ new.u.v = old.u.v;
if (++new.u.s.current == new.u.s.reader) {
new.u.s.readers_active = new.u.s.readers_queued;
new.u.s.readers_queued = 0;
@@ -426,8 +444,11 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
bool
__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
+ WT_RWLOCK old;
+
WT_UNUSED(session);
- return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
+ old.u.v = l->u.v;
+ return (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0);
}
#endif
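The second bullet shows up above as the repeated change from WT_THOUSAND (or
0, meaning wait indefinitely) to 10 * WT_THOUSAND: __wt_cond_wait takes its
timeout in microseconds, so every stall now waits a uniform 10ms and then
re-checks, rather than spinning every 1ms or blocking forever on a wakeup
that might be lost. A rough pthreads sketch of that bounded-wait shape
follows; wait_queue_t and the ready predicate are hypothetical stand-ins for
WT_CONDVAR and __wt_cond_wait's run_func argument.

#include <pthread.h>
#include <stdbool.h>
#include <time.h>

/* Hypothetical wait queue: a mutex/condvar pair guarding some predicate. */
typedef struct {
	pthread_mutex_t mtx;
	pthread_cond_t cond;
} wait_queue_t;

/* Block until ready(arg) is true, waking at least every 10ms to re-check. */
static void
bounded_wait(wait_queue_t *q, bool (*ready)(void *), void *arg)
{
	struct timespec deadline;

	pthread_mutex_lock(&q->mtx);
	while (!ready(arg)) {
		/* Compute an absolute deadline 10ms from now. */
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_nsec += 10 * 1000 * 1000;
		if (deadline.tv_nsec >= 1000000000L) {
			deadline.tv_sec++;
			deadline.tv_nsec -= 1000000000L;
		}
		/*
		 * Timing out is not an error: the loop re-checks the
		 * predicate, so a lost pthread_cond_signal costs at most
		 * one 10ms stall instead of a hang.
		 */
		(void)pthread_cond_timedwait(&q->cond, &q->mtx, &deadline);
	}
	pthread_mutex_unlock(&q->mtx);
}

Because the predicate is re-tested after every timeout, a lost notification
costs at most one bounded stall, at the price of a little wasted wakeup work
compared with waiting forever.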