Diffstat (limited to 'src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c')
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c  197
1 file changed, 161 insertions(+), 36 deletions(-)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
index cdd4f8a24e1..df558b12bef 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -38,6 +38,78 @@
* Joseph Seigh. Note that a similar (but not identical) algorithm was published
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ *
+ * The following is an explanation of this code. First, the underlying lock
+ * structure.
+ *
+ *     struct {
+ *             uint16_t writers;       Now serving for writers
+ *             uint16_t readers;       Now serving for readers
+ *             uint16_t users;         Next available ticket number
+ *             uint16_t __notused;     Padding
+ *     }
+ *
+ * Now, imagine a store's 'take a number' ticket algorithm. A customer takes
+ * a unique ticket number and customers are served in ticket order. In the data
+ * structure, 'writers' is the next writer to be served, 'readers' is the next
+ * reader to be served, and 'users' is the next available ticket number.
+ *
+ * Next, consider exclusive (write) locks. The 'now serving' number for writers
+ * is 'writers'. To lock, 'take a number' and wait until that number is being
+ * served; more specifically, atomically copy and increment the current value of
+ * 'users', and then wait until 'writers' equals that copied number.
+ *
+ * Shared (read) locks are similar. Like writers, readers atomically get the
+ * next number available. However, instead of waiting for 'writers' to equal
+ * their number, they wait for 'readers' to equal their number.
+ *
+ * This has the effect of queuing lock requests in the order they arrive
+ * (incidentally avoiding starvation).
+ *
+ * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
+ * In the case of a reader, the 'readers' increment happens when the reader
+ * acquires the lock (to allow read-lock sharing), and the 'writers' increment
+ * happens when the reader releases the lock. In the case of a writer, both
+ * 'readers' and 'writers' are incremented when the writer releases the lock.
+ *
+ * For example, consider the following read (R) and write (W) lock requests:
+ *
+ *                                           writers  readers  users
+ *                                                 0        0      0
+ * R: ticket 0, readers match     OK             0        1      1
+ * R: ticket 1, readers match     OK             0        2      2
+ * R: ticket 2, readers match     OK             0        3      3
+ * W: ticket 3, writers no match  block          0        3      4
+ * R: ticket 2, unlock                           1        3      4
+ * R: ticket 0, unlock                           2        3      4
+ * R: ticket 1, unlock                           3        3      4
+ * W: ticket 3, writers match     OK             3        3      4
+ *
+ * Note that the writer blocks until 'writers' equals its ticket number, and
+ * it does not matter whether the readers unlock in order.
+ *
+ * Readers or writers entering the system after the write lock is queued block,
+ * and the next ticket holder (reader or writer) will unblock when the writer
+ * unlocks. An example, continuing from the last line of the above example:
+ *
+ *                                           writers  readers  users
+ * W: ticket 3, writers match     OK             3        3      4
+ * R: ticket 4, readers no match  block          3        3      5
+ * R: ticket 5, readers no match  block          3        3      6
+ * W: ticket 6, writers no match  block          3        3      7
+ * W: ticket 3, unlock                           4        4      7
+ * R: ticket 4, readers match     OK             4        5      7
+ * R: ticket 5, readers match     OK             4        6      7
+ *
+ * The 'users' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'users'
+ * field cycles and the same ticket is taken by another thread, we could grant
+ * a lock to two separate threads at the same time, and bad things happen: two
+ * writer threads or a reader thread and a writer thread would run in parallel,
+ * and lock waiters could be skipped if the unlocks race. This is unlikely; it
+ * only happens if a lock request is blocked by 64K other requests. The fix
+ * would be to grow the lock structure fields, but the largest atomic
+ * instruction we have is 8 bytes, so the structure has no room to grow.
*/
#include "wt_internal.h"
@@ -69,20 +141,31 @@ __wt_rwlock_alloc(
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, users, writers;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- pad = l->s.pad;
- users = l->s.users;
- writers = l->s.writers;
- old = (pad << 48) + (users << 32) + (users << 16) + writers;
- new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ new = old = *l;
+
+ /*
+ * This read lock can only be granted if the lock was last granted to
+ * a reader and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this read lock).
+ */
+ if (old.s.readers != old.s.users)
+ return (EBUSY);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket and
+ * incrementing the reader value to match it.
+ */
+ new.s.readers = new.s.users = old.s.users + 1;
+ return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
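A sketch of the same try-read fast path on the illustrative ticket_rwlock_t above, using GCC/Clang __atomic builtins in place of WiredTiger's WT_ATOMIC_CAS8 macro: snapshot the lock, give up unless the next ticket would be served for reading immediately, then publish the new ticket together with the advanced 'readers' value in a single 8-byte CAS.

    static int
    ticket_try_readlock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t new, old;

        new = old = *l;
        if (old.s.readers != old.s.users)       /* A writer holds or is queued */
            return (EBUSY);

        /* Allocate a ticket and advance 'readers' to match it, atomically. */
        new.s.readers = new.s.users = old.s.users + 1;
        return (__atomic_compare_exchange_n(&l->u, &old.u, new.u,
            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ? 0 : EBUSY);
    }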
@@ -93,8 +176,7 @@ int
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
int pause_cnt;
WT_RET(__wt_verbose(
@@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- for (pause_cnt = 0; val != l->s.readers;) {
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.readers;) {
/*
* We failed to get the lock; pause before retrying and if we've
* paused enough, sleep so we don't burn CPU to no purpose. This
* situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources. Regardless,
- * don't sleep long, all we need is to schedule the other reader
- * threads to complete a few more instructions and increment the
- * reader count.
+ * system and we're thrashing on shared resources.
+ *
+ * Don't sleep long when waiting on a read lock, hopefully we're
+ * waiting on another read thread to increment the reader count.
*/
if (++pause_cnt < 1000)
WT_PAUSE();
@@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
__wt_sleep(0, 10);
}
+ /*
+ * We're the only writer of the readers field, so the update does not
+ * need to be atomic.
+ */
++l->s.readers;
return (0);
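A sketch of the blocking read-lock path on the illustrative ticket_rwlock_t, again assuming GCC/Clang __atomic builtins; the real code's pause/sleep backoff is elided, and the final increment is written as an explicit release store where WiredTiger relies on a plain increment plus its own barriers.

    static void
    ticket_readlock(ticket_rwlock_t *l)
    {
        uint16_t ticket;

        /* Take the next ticket; only the 2-byte 'users' field is touched. */
        ticket = __atomic_fetch_add(&l->s.users, 1, __ATOMIC_SEQ_CST);

        /* Wait until 'readers' reaches our ticket (real code backs off). */
        while (__atomic_load_n(&l->s.readers, __ATOMIC_ACQUIRE) != ticket)
            ;

        /* We hold the lock; let the next queued reader in as well. */
        __atomic_store_n(&l->s.readers, (uint16_t)(ticket + 1), __ATOMIC_RELEASE);
    }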
@@ -138,6 +229,11 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
l = &rwlock->rwlock;
+
+ /*
+ * Increment the writers value (other readers are doing the same; make
+ * sure we don't race).
+ */
WT_ATOMIC_ADD2(l->s.writers, 1);
return (0);
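The matching read-unlock sketch on the illustrative ticket_rwlock_t: unlike the 'readers' update above, several readers can release concurrently, so the 'writers' increment has to be atomic.

    static void
    ticket_readunlock(ticket_rwlock_t *l)
    {
        /* Serve the next writer ticket; this races with other readers. */
        (void)__atomic_fetch_add(&l->s.writers, 1, __ATOMIC_RELEASE);
    }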
@@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, readers, users;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
l = &rwlock->rwlock;
- pad = l->s.pad;
- readers = l->s.readers;
- users = l->s.users;
- old = (pad << 48) + (users << 32) + (readers << 16) + users;
- new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ old = new = *l;
+
+ /*
+ * This write lock can only be granted if the lock was last granted to
+ * a writer and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this write lock).
+ */
+ if (old.s.writers != old.s.users)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.s.users;
+ return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
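The write-side try path on the illustrative ticket_rwlock_t differs from the read-side sketch only in its test and its update: the next ticket must be the one 'writers' is serving, and only 'users' advances, so queued tickets keep their order.

    static int
    ticket_try_writelock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t new, old;

        new = old = *l;
        if (old.s.writers != old.s.users)       /* Held, or waiters queued */
            return (EBUSY);

        /* Allocate a ticket; 'readers' must not advance for a writer. */
        ++new.s.users;
        return (__atomic_compare_exchange_n(&l->u, &old.u, new.u,
            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ? 0 : EBUSY);
    }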
@@ -174,23 +278,33 @@ int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
+ int pause_cnt;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+ l = &rwlock->rwlock;
+
/*
- * Possibly wrap: if we have more than 64K lockers waiting, the count
- * of writers will wrap and two lockers will simultaneously be granted
- * the write lock.
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
*/
- l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- while (val != l->s.writers)
- WT_PAUSE();
+ ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.writers;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
return (0);
}
@@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
copy = *l;
+ /*
+ * We're the only writer of the writers/readers fields, so the update
+ * does not need to be atomic; we have to update both values at the
+ * same time though, otherwise we'd potentially race with the thread
+ * next granted the lock.
+ *
+ * Use a memory barrier to ensure the compiler doesn't mess with these
+ * instructions and rework the code in a way that avoids the update as
+ * a unit.
+ */
WT_BARRIER();
++copy.s.writers;
++copy.s.readers;
- l->i.us = copy.i.us;
+ l->i.wr = copy.i.wr;
+
return (0);
}
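Finally, a sketch of the write-unlock idea on the illustrative ticket_rwlock_t: the writer is the only thread updating 'writers' and 'readers' at this point, but both counters must become visible together, which is why the diff stores them through the 4-byte l->i.wr alias rather than as two separate 2-byte increments. The sketch uses an explicit release store where the real code uses a plain assignment plus WT_BARRIER().

    static void
    ticket_writeunlock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t copy;

        copy = *l;
        ++copy.s.writers;               /* Serve the next writer ticket... */
        ++copy.s.readers;               /* ...and the next reader ticket */

        /* Publish both counters as one 4-byte store. */
        __atomic_store_n(&l->i.wr, copy.i.wr, __ATOMIC_RELEASE);
    }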