Diffstat (limited to 'src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c')
-rw-r--r--  src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c  197
1 file changed, 161 insertions(+), 36 deletions(-)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
index cdd4f8a24e1..df558b12bef 100644
--- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
+++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c
@@ -38,6 +38,78 @@
* Joseph Seigh. Note that a similar (but not identical) algorithm was published
* by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
* Reader-Writer Synchronization for Shared-Memory Multiprocessors".
+ *
+ * The following is an explanation of this code. First, the underlying lock
+ * structure.
+ *
+ *     struct {
+ *             uint16_t writers;       Now serving for writers
+ *             uint16_t readers;       Now serving for readers
+ *             uint16_t users;         Next available ticket number
+ *             uint16_t __notused;     Padding
+ *     }
+ *
+ * Now, imagine a store's 'take a number' ticket algorithm. A customer takes
+ * a unique ticket number and customers are served in ticket order. In the data
+ * structure, 'writers' is the next writer to be served, 'readers' is the next
+ * reader to be served, and 'users' is the next available ticket number.
+ *
+ * Next, consider exclusive (write) locks. The 'now serving' number for writers
+ * is 'writers'. To lock, 'take a number' and wait until that number is being
+ * served; more specifically, atomically copy and increment the current value of
+ * 'users', and then wait until 'writers' equals that copied number.
+ *
+ * Shared (read) locks are similar. Like writers, readers atomically get the
+ * next number available. However, instead of waiting for 'writers' to equal
+ * their number, they wait for 'readers' to equal their number.
+ *
+ * This has the effect of queuing lock requests in the order they arrive
+ * (incidentally avoiding starvation).
+ *
+ * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
+ * In the case of a reader, the 'readers' increment happens when the reader
+ * acquires the lock (to allow read-lock sharing), and the 'writers' increment
+ * happens when the reader releases the lock. In the case of a writer, both
+ * 'readers' and 'writers' are incremented when the writer releases the lock.
+ *
+ * For example, consider the following read (R) and write (W) lock requests:
+ *
+ *                                           writers  readers  users
+ *                                                 0        0      0
+ * R: ticket 0, readers match     OK             0        1      1
+ * R: ticket 1, readers match     OK             0        2      2
+ * R: ticket 2, readers match     OK             0        3      3
+ * W: ticket 3, writers no match  block          0        3      4
+ * R: ticket 2, unlock                           1        3      4
+ * R: ticket 0, unlock                           2        3      4
+ * R: ticket 1, unlock                           3        3      4
+ * W: ticket 3, writers match     OK             3        3      4
+ *
+ * Note that the writer blocks until 'writers' equals its ticket number, and
+ * it does not matter whether the readers unlock in order.
+ *
+ * Readers or writers entering the system after the write lock is queued block,
+ * and the next ticket holder (reader or writer) will unblock when the writer
+ * unlocks. An example, continuing from the last line of the above example:
+ *
+ *                                           writers  readers  users
+ * W: ticket 3, writers match     OK             3        3      4
+ * R: ticket 4, readers no match  block          3        3      5
+ * R: ticket 5, readers no match  block          3        3      6
+ * W: ticket 6, writers no match  block          3        3      7
+ * W: ticket 3, unlock                           4        4      7
+ * R: ticket 4, readers match     OK             4        5      7
+ * R: ticket 5, readers match     OK             4        6      7
+ *
+ * The 'users' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'users'
+ * field cycles and the same ticket is taken by another thread, we could grant
+ * a lock to two separate threads at the same time, and bad things happen: two
+ * writer threads or a reader thread and a writer thread would run in parallel,
+ * and lock waiters could be skipped if the unlocks race. This is unlikely; it
+ * only happens if a lock request is blocked by 64K other requests. The fix
+ * would be to grow the lock structure fields, but the largest atomic
+ * instruction we have is 8 bytes, so the structure has no room to grow.
*/
#include "wt_internal.h"
@@ -69,20 +141,31 @@ __wt_rwlock_alloc(
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, users, writers;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- pad = l->s.pad;
- users = l->s.users;
- writers = l->s.writers;
- old = (pad << 48) + (users << 32) + (users << 16) + writers;
- new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ new = old = *l;
+
+ /*
+ * This read lock can only be granted if the lock was last granted to
+ * a reader and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this read lock).
+ */
+ if (old.s.readers != old.s.users)
+ return (EBUSY);
+
+ /*
+ * The replacement lock value is a result of allocating a new ticket and
+ * incrementing the reader value to match it.
+ */
+ new.s.readers = new.s.users = old.s.users + 1;
+ return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
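A sketch of the same try-read fast path on the illustrative ticket_rwlock_t above, using GCC/Clang __atomic builtins in place of WiredTiger's WT_ATOMIC_CAS8 macro: snapshot the lock, give up unless the next ticket would be served for reading immediately, then publish the new ticket together with the advanced 'readers' value in a single 8-byte CAS.

    static int
    ticket_try_readlock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t new, old;

        new = old = *l;
        if (old.s.readers != old.s.users)       /* A writer holds or is queued */
            return (EBUSY);

        /* Allocate a ticket and advance 'readers' to match it, atomically. */
        new.s.readers = new.s.users = old.s.users + 1;
        return (__atomic_compare_exchange_n(&l->u, &old.u, new.u,
            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ? 0 : EBUSY);
    }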
@@ -93,8 +176,7 @@ int
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
int pause_cnt;
WT_RET(__wt_verbose(
@@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
WT_STAT_FAST_CONN_INCR(session, rwlock_read);
l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- for (pause_cnt = 0; val != l->s.readers;) {
+
+ /*
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
+ */
+ ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.readers;) {
/*
* We failed to get the lock; pause before retrying and if we've
* paused enough, sleep so we don't burn CPU to no purpose. This
* situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources. Regardless,
- * don't sleep long, all we need is to schedule the other reader
- * threads to complete a few more instructions and increment the
- * reader count.
+ * system and we're thrashing on shared resources.
+ *
+ * Don't sleep long when waiting on a read lock, hopefully we're
+ * waiting on another read thread to increment the reader count.
*/
if (++pause_cnt < 1000)
WT_PAUSE();
@@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
__wt_sleep(0, 10);
}
+ /*
+ * We're the only writer of the readers field, so the update does not
+ * need to be atomic.
+ */
++l->s.readers;
return (0);
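A sketch of the blocking read-lock path on the illustrative ticket_rwlock_t, again assuming GCC/Clang __atomic builtins; the real code's pause/sleep backoff is elided, and the final increment is written as an explicit release store where WiredTiger relies on a plain increment plus its own barriers.

    static void
    ticket_readlock(ticket_rwlock_t *l)
    {
        uint16_t ticket;

        /* Take the next ticket; only the 2-byte 'users' field is touched. */
        ticket = __atomic_fetch_add(&l->s.users, 1, __ATOMIC_SEQ_CST);

        /* Wait until 'readers' reaches our ticket (real code backs off). */
        while (__atomic_load_n(&l->s.readers, __ATOMIC_ACQUIRE) != ticket)
            ;

        /* We hold the lock; let the next queued reader in as well. */
        __atomic_store_n(&l->s.readers, (uint16_t)(ticket + 1), __ATOMIC_RELEASE);
    }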
@@ -138,6 +229,11 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name));
l = &rwlock->rwlock;
+
+ /*
+ * Increment the writers value (other readers are doing the same; make
+ * sure we don't race).
+ */
WT_ATOMIC_ADD2(l->s.writers, 1);
return (0);
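The matching read-unlock sketch on the illustrative ticket_rwlock_t: unlike the 'readers' update above, several readers can release concurrently, so the 'writers' increment has to be atomic.

    static void
    ticket_readunlock(ticket_rwlock_t *l)
    {
        /* Serve the next writer ticket; this races with other readers. */
        (void)__atomic_fetch_add(&l->s.writers, 1, __ATOMIC_RELEASE);
    }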
@@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
- wt_rwlock_t *l;
- uint64_t old, new, pad, readers, users;
+ wt_rwlock_t *l, new, old;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
l = &rwlock->rwlock;
- pad = l->s.pad;
- readers = l->s.readers;
- users = l->s.users;
- old = (pad << 48) + (users << 32) + (readers << 16) + users;
- new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users;
- return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY);
+ old = new = *l;
+
+ /*
+ * This write lock can only be granted if the lock was last granted to
+ * a writer and there are no readers or writers blocked on the lock,
+ * that is, if this thread's ticket would be the next ticket granted.
+ * Do the cheap test to see if this can possibly succeed (and confirm
+ * the lock is in the correct state to grant this write lock).
+ */
+ if (old.s.writers != old.s.users)
+ return (EBUSY);
+
+ /* The replacement lock value is a result of allocating a new ticket. */
+ ++new.s.users;
+ return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY);
}
/*
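The write-side try path on the illustrative ticket_rwlock_t differs from the read-side sketch only in its test and its update: the next ticket must be the one 'writers' is serving, and only 'users' advances, so queued tickets keep their order.

    static int
    ticket_try_writelock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t new, old;

        new = old = *l;
        if (old.s.writers != old.s.users)       /* Held, or waiters queued */
            return (EBUSY);

        /* Allocate a ticket; 'readers' must not advance for a writer. */
        ++new.s.users;
        return (__atomic_compare_exchange_n(&l->u, &old.u, new.u,
            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) ? 0 : EBUSY);
    }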
@@ -174,23 +278,33 @@ int
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
wt_rwlock_t *l;
- uint64_t me;
- uint16_t val;
+ uint16_t ticket;
+ int pause_cnt;
WT_RET(__wt_verbose(
session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name));
WT_STAT_FAST_CONN_INCR(session, rwlock_write);
+ l = &rwlock->rwlock;
+
/*
- * Possibly wrap: if we have more than 64K lockers waiting, the count
- * of writers will wrap and two lockers will simultaneously be granted
- * the write lock.
+ * Possibly wrap: if we have more than 64K lockers waiting, the ticket
+ * value will wrap and two lockers will simultaneously be granted the
+ * lock.
*/
- l = &rwlock->rwlock;
- me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32);
- val = (uint16_t)(me >> 32);
- while (val != l->s.writers)
- WT_PAUSE();
+ ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1);
+ for (pause_cnt = 0; ticket != l->s.writers;) {
+ /*
+ * We failed to get the lock; pause before retrying and if we've
+ * paused enough, sleep so we don't burn CPU to no purpose. This
+ * situation happens if there are more threads than cores in the
+ * system and we're thrashing on shared resources.
+ */
+ if (++pause_cnt < 1000)
+ WT_PAUSE();
+ else
+ __wt_sleep(0, 10);
+ }
return (0);
}
@@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
copy = *l;
+ /*
+ * We're the only writer of the writers/readers fields, so the update
+ * does not need to be atomic; we have to update both values at the
+ * same time though, otherwise we'd potentially race with the thread
+ * next granted the lock.
+ *
+ * Use a memory barrier to ensure the compiler doesn't mess with these
+ * instructions and rework the code in a way that avoids the update as
+ * a unit.
+ */
WT_BARRIER();
++copy.s.writers;
++copy.s.readers;
- l->i.us = copy.i.us;
+ l->i.wr = copy.i.wr;
+
return (0);
}
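Finally, a sketch of the write-unlock idea on the illustrative ticket_rwlock_t: the writer is the only thread updating 'writers' and 'readers' at this point, but both counters must become visible together, which is why the diff stores them through the 4-byte l->i.wr alias rather than as two separate 2-byte increments. The sketch uses an explicit release store where the real code uses a plain assignment plus WT_BARRIER().

    static void
    ticket_writeunlock(ticket_rwlock_t *l)
    {
        ticket_rwlock_t copy;

        copy = *l;
        ++copy.s.writers;               /* Serve the next writer ticket... */
        ++copy.s.readers;               /* ...and the next reader ticket */

        /* Publish both counters as one 4-byte store. */
        __atomic_store_n(&l->i.wr, copy.i.wr, __ATOMIC_RELEASE);
    }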