Diffstat (limited to 'src/support/mtx_rw.c')
-rw-r--r--    src/support/mtx_rw.c    405
1 files changed, 234 insertions, 171 deletions
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 35ad5da23f2..b2ab32bdef1 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -27,7 +27,7 @@
  */
 
 /*
- * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
  * http://locklessinc.com/articles/locks/
  *
  * Dr. Fuerst further credits:
@@ -39,77 +39,46 @@
  * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
  * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
  *
- * The following is an explanation of this code. First, the underlying lock
- * structure.
+ * The following is an explanation of our interpretation and implementation.
+ * First, the underlying lock structure.
  *
+ *    volatile union {
+ *        uint64_t v;                         // Full 64-bit value
+ *        struct {
- *    uint16_t writers;        Now serving for writers
- *    uint16_t readers;        Now serving for readers
- *    uint16_t next;           Next available ticket number
- *    uint16_t __notused;      Padding
- *    }
+ *            uint8_t current;                // Current ticket
+ *            uint8_t next;                   // Next available ticket
+ *            uint8_t reader;                 // Read queue ticket
+ *            uint8_t __notused;              // Padding
+ *            uint16_t readers_active;        // Count of active readers
+ *            uint16_t readers_queued;        // Count of queued readers
+ *        } s;
+ *    } u;
  *
  * First, imagine a store's 'take a number' ticket algorithm. A customer takes
  * a unique ticket number and customers are served in ticket order. In the data
- * structure, 'writers' is the next writer to be served, 'readers' is the next
- * reader to be served, and 'next' is the next available ticket number.
+ * structure, 'next' is the ticket that will be allocated next, and 'current'
+ * is the ticket being served.
  *
- * Next, consider exclusive (write) locks. The 'now serving' number for writers
- * is 'writers'. To lock, 'take a number' and wait until that number is being
- * served; more specifically, atomically copy and increment the current value of
- * 'next', and then wait until 'writers' equals that copied number.
+ * Next, consider exclusive (write) locks. To lock, 'take a number' and wait
+ * until that number is being served; more specifically, atomically increment
+ * 'next', and then wait until 'current' equals that allocated ticket.
  *
- * Shared (read) locks are similar. Like writers, readers atomically get the
- * next number available. However, instead of waiting for 'writers' to equal
- * their number, they wait for 'readers' to equal their number.
+ * Shared (read) locks are similar, except that readers can share a ticket
+ * (both with each other and with a single writer). Readers with a given
+ * ticket execute before the writer with that ticket. In other words, writers
+ * wait for both their ticket to become current and for all readers to exit
+ * the lock.
  *
- * This has the effect of queuing lock requests in the order they arrive
- * (incidentally avoiding starvation).
+ * If there are no active writers (indicated by 'current' == 'next'), readers
+ * can immediately enter the lock by atomically incrementing 'readers_active'.
+ * When there are writers active, readers form a new queue by first setting
+ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers,
+ * avoiding starvation), then atomically incrementing 'readers_queued'.
  *
- * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
- * In the case of a reader, the 'readers' increment happens when the reader
- * acquires the lock (to allow read-lock sharing), and the 'writers' increment
- * happens when the reader releases the lock. In the case of a writer, both
- * 'readers' and 'writers' are incremented when the writer releases the lock.
- *
- * For example, consider the following read (R) and write (W) lock requests:
- *
- *                                              writers readers next
- *                                              0       0       0
- * R: ticket 0, readers match    OK             0       1       1
- * R: ticket 1, readers match    OK             0       2       2
- * R: ticket 2, readers match    OK             0       3       3
- * W: ticket 3, writers no match block          0       3       4
- * R: ticket 2, unlock                          1       3       4
- * R: ticket 0, unlock                          2       3       4
- * R: ticket 1, unlock                          3       3       4
- * W: ticket 3, writers match    OK             3       3       4
- *
- * Note the writer blocks until 'writers' equals its ticket number and it does
- * not matter if readers unlock in order or not.
- *
- * Readers or writers entering the system after the write lock is queued block,
- * and the next ticket holder (reader or writer) will unblock when the writer
- * unlocks. An example, continuing from the last line of the above example:
- *
- *                                              writers readers next
- * W: ticket 3, writers match    OK             3       3       4
- * R: ticket 4, readers no match block          3       3       5
- * R: ticket 5, readers no match block          3       3       6
- * W: ticket 6, writers no match block          3       3       7
- * W: ticket 3, unlock                          4       4       7
- * R: ticket 4, readers match    OK             4       5       7
- * R: ticket 5, readers match    OK             4       6       7
- *
- * The 'next' field is a 2-byte value so the available ticket number wraps at
- * 64K requests. If a thread's lock request is not granted until the 'next'
- * field cycles and the same ticket is taken by another thread, we could grant
- * a lock to two separate threads at the same time, and bad things happen: two
- * writer threads or a reader thread and a writer thread would run in parallel,
- * and lock waiters could be skipped if the unlocks race. This is unlikely, it
- * only happens if a lock request is blocked by 64K other requests. The fix is
- * to grow the lock structure fields, but the largest atomic instruction we have
- * is 8 bytes, the structure has no room to grow.
+ * The 'next' field is a 1-byte value so the available ticket number wraps
+ * after 256 requests. If a thread's write lock request would cause the 'next'
+ * field to catch up with 'current', instead it waits to avoid the same ticket
+ * being allocated to multiple threads.
  */
 
 #include "wt_internal.h"
 
@@ -118,12 +87,14 @@
  * __wt_rwlock_init --
  *    Initialize a read/write lock.
  */
-void
+int
 __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    l->u.v = 0;
 
-    l->u = 0;
+    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers));
+    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers));
+    return (0);
 }
 
 /*
@@ -133,9 +104,10 @@ __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    l->u.v = 0;
 
-    l->u = 0;
+    __wt_cond_destroy(session, &l->cond_readers);
+    __wt_cond_destroy(session, &l->cond_writers);
 }
 
 /*
@@ -149,46 +121,35 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 
     WT_STAT_CONN_INCR(session, rwlock_read);
 
-    new = old = *l;
+    new.u.v = old.u.v = l->u.v;
 
     /*
-     * This read lock can only be granted if the lock was last granted to
-     * a reader and there are no readers or writers blocked on the lock,
-     * that is, if this thread's ticket would be the next ticket granted.
-     * Do the cheap test to see if this can possibly succeed (and confirm
-     * the lock is in the correct state to grant this read lock).
+     * This read lock can only be granted if there are no active writers.
+     *
+     * Also check for overflow in case there are 64K active readers.
      */
-    if (old.s.readers != old.s.next)
+    if (old.u.s.current != old.u.s.next ||
+        new.u.s.readers_active == UINT16_MAX)
         return (EBUSY);
 
     /*
-     * The replacement lock value is a result of allocating a new ticket and
-     * incrementing the reader value to match it.
+     * The replacement lock value is a result of adding an active reader.
+     *
+     * We rely on this atomic operation to provide a barrier.
      */
-    new.s.readers = new.s.next = old.s.next + 1;
-    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+    new.u.s.readers_active++;
+    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
 }
 
 /*
- * __wt_readlock_spin --
- *    Spin to get a read lock: only yield the CPU if the lock is held
- * exclusive.
+ * __read_blocked --
+ *    Check whether the current read lock request should keep waiting.
  */
-void
-__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
+static bool
+__read_blocked(WT_SESSION_IMPL *session)
 {
-    /*
-     * Try to get the lock in a single operation if it is available to
-     * readers. This avoids the situation where multiple readers arrive
-     * concurrently and have to line up in order to enter the lock. For
-     * read-heavy workloads it can make a significant difference.
-     */
-    while (__wt_try_readlock(session, l) != 0) {
-        if (l->s.writers_active > 0)
-            __wt_yield();
-        else
-            WT_PAUSE();
-    }
+    return (session->current_rwticket !=
+        session->current_rwlock->u.s.current);
 }
 
 /*
@@ -198,41 +159,90 @@
 void
 __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    uint16_t ticket;
+    WT_RWLOCK new, old;
     int pause_cnt;
+    int16_t writers_active;
+    uint8_t ticket;
 
     WT_STAT_CONN_INCR(session, rwlock_read);
 
     WT_DIAGNOSTIC_YIELD;
 
-    /*
-     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
-     * value will wrap and two lockers will simultaneously be granted the
-     * lock.
-     */
-    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
-    for (pause_cnt = 0; ticket != l->s.readers;) {
+    for (;;) {
         /*
-         * We failed to get the lock; pause before retrying and if we've
-         * paused enough, yield so we don't burn CPU to no purpose. This
-         * situation happens if there are more threads than cores in the
-         * system and we're thrashing on shared resources.
+         * Fast path: if there is no active writer, join the current
+         * group.
          */
-        if (++pause_cnt < WT_THOUSAND)
+        for (old.u.v = l->u.v;
+            old.u.s.current == old.u.s.next;
+            old.u.v = l->u.v) {
+            new.u.v = old.u.v;
+            /*
+             * Check for overflow: if the maximum number of readers
+             * are already active, wait to try again.
+             */
+            if (++new.u.s.readers_active == 0)
+                goto stall;
+            if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+                return;
             WT_PAUSE();
-        else
+        }
+
+        /*
+         * There is an active writer: join the next group.
+         *
+         * Limit how many readers can queue: don't allow more readers
+         * to queue than there are active writers (calculated as
+         * `next - current`): otherwise, in write-heavy workloads,
+         * readers can keep queuing up in front of writers and
+         * throughput is unstable.
+         *
+         * If the maximum number of readers are already queued, wait
+         * until we can get a valid ticket.
+         */
+        writers_active = old.u.s.next - old.u.s.current;
+        if (old.u.s.readers_queued > writers_active) {
+stall:      __wt_cond_wait(
+                session, l->cond_readers, WT_THOUSAND, NULL);
+            continue;
+        }
+
+        /*
+         * If we are the first reader to queue, set the next read
+         * group. Note: don't re-read from the lock or we could race
+         * with a writer unlocking.
+         */
+        new.u.v = old.u.v;
+        if (new.u.s.readers_queued++ == 0)
+            new.u.s.reader = new.u.s.next;
+        ticket = new.u.s.reader;
+
+        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+            break;
+    }
+
+    /* Wait for our group to start. */
+    for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) {
+        if (pause_cnt < 1000)
+            WT_PAUSE();
+        else if (pause_cnt < 1200)
             __wt_yield();
+        else {
+            session->current_rwlock = l;
+            session->current_rwticket = ticket;
+            __wt_cond_wait(
+                session, l->cond_readers, 0, __read_blocked);
+        }
     }
 
-    /*
-     * We're the only writer of the readers field, so the update does not
-     * need to be atomic.
-     */
-    ++l->s.readers;
+    WT_ASSERT(session, l->u.s.readers_active > 0);
 
     /*
      * Applications depend on a barrier here so that operations holding the
-     * lock see consistent data.
+     * lock see consistent data. The atomic operation above isn't
+     * sufficient here because we don't own the lock until our ticket comes
+     * up and whatever data we are protecting may have changed in the
+     * meantime.
      */
     WT_READ_BARRIER();
 }
 
@@ -244,13 +254,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    WT_RWLOCK new, old;
 
-    /*
-     * Increment the writers value (other readers are doing the same, make
-     * sure we don't race).
-     */
-    (void)__wt_atomic_add16(&l->s.writers, 1);
+    do {
+        old.u.v = l->u.v;
+        WT_ASSERT(session, old.u.s.readers_active > 0);
+
+        /*
+         * Decrement the active reader count (other readers are doing
+         * the same, make sure we don't race).
+         */
+        new.u.v = old.u.v;
+        --new.u.s.readers_active;
+    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
+
+    if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next)
+        __wt_cond_signal(session, l->cond_writers);
 }
 
 /*
@@ -264,22 +283,44 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 
     WT_STAT_CONN_INCR(session, rwlock_write);
 
-    old = new = *l;
-
     /*
-     * This write lock can only be granted if the lock was last granted to
-     * a writer and there are no readers or writers blocked on the lock,
-     * that is, if this thread's ticket would be the next ticket granted.
-     * Do the cheap test to see if this can possibly succeed (and confirm
-     * the lock is in the correct state to grant this write lock).
+     * This write lock can only be granted if no readers or writers blocked
+     * on the lock, that is, if this thread's ticket would be the next
+     * ticket granted. Check if this can possibly succeed (and confirm the
+     * lock is in the correct state to grant this write lock).
      */
-    if (old.s.writers != old.s.next)
+    old.u.v = l->u.v;
+    if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
        return (EBUSY);
 
-    /* The replacement lock value is a result of allocating a new ticket. */
-    ++new.s.next;
-    ++new.s.writers_active;
-    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+    /*
+     * We've checked above that there is no writer active (since
+     * `current == next`), so there should be no readers queued.
+     */
+    WT_ASSERT(session, old.u.s.readers_queued == 0);
+
+    /*
+     * The replacement lock value is a result of allocating a new ticket.
+     *
+     * We rely on this atomic operation to provide a barrier.
+     */
+    new.u.v = old.u.v;
+    new.u.s.next++;
+    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
+}
+
+/*
+ * __write_blocked --
+ *    Check whether the current write lock request should keep waiting.
+ */
+static bool
+__write_blocked(WT_SESSION_IMPL *session)
+{
+    WT_RWLOCK *l;
+
+    l = session->current_rwlock;
+    return (session->current_rwticket != l->u.s.current ||
+        l->u.s.readers_active != 0);
 }
 
 /*
@@ -289,34 +330,51 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    uint16_t ticket;
+    WT_RWLOCK new, old;
     int pause_cnt;
+    uint8_t ticket;
 
     WT_STAT_CONN_INCR(session, rwlock_write);
 
-    /*
-     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
-     * value will wrap and two lockers will simultaneously be granted the
-     * lock.
-     */
-    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
-    (void)__wt_atomic_add16(&l->s.writers_active, 1);
-    for (pause_cnt = 0; ticket != l->s.writers;) {
+    for (;;) {
+        new.u.v = old.u.v = l->u.v;
+        ticket = new.u.s.next++;
+
        /*
-         * We failed to get the lock; pause before retrying and if we've
-         * paused enough, sleep so we don't burn CPU to no purpose. This
-         * situation happens if there are more threads than cores in the
-         * system and we're thrashing on shared resources.
+         * Avoid wrapping: if we allocate more than 256 tickets, two
+         * lockers will simultaneously be granted the lock.
         */
-        if (++pause_cnt < WT_THOUSAND)
+        if (new.u.s.current == new.u.s.next) {
+            __wt_cond_wait(
+                session, l->cond_writers, WT_THOUSAND, NULL);
+            continue;
+        }
+        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+            break;
+    }
+
+    /* Wait for our group to start and any readers to drain. */
+    for (pause_cnt = 0;
+        ticket != l->u.s.current || l->u.s.readers_active != 0;
+        pause_cnt++) {
+        if (pause_cnt < 1000)
             WT_PAUSE();
-        else
-            __wt_sleep(0, 10);
+        else if (pause_cnt < 1200)
+            __wt_yield();
+        else {
+            session->current_rwlock = l;
+            session->current_rwticket = ticket;
+            __wt_cond_wait(
+                session, l->cond_writers, 0, __write_blocked);
+        }
     }
 
     /*
     * Applications depend on a barrier here so that operations holding the
-     * lock see consistent data.
+     * lock see consistent data. The atomic operation above isn't
+     * sufficient here because we don't own the lock until our ticket comes
+     * up and whatever data we are protecting may have changed in the
+     * meantime.
     */
     WT_READ_BARRIER();
 }
 
@@ -328,29 +386,34 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_RWLOCK new;
-
-    WT_UNUSED(session);
+    WT_RWLOCK new, old;
 
-    (void)__wt_atomic_sub16(&l->s.writers_active, 1);
+    do {
+        new.u.v = old.u.v = l->u.v;
 
-    /*
-     * Ensure that all updates made while the lock was held are visible to
-     * the next thread to acquire the lock.
-     */
-    WT_WRITE_BARRIER();
+        /*
+         * We're holding the lock exclusive, there shouldn't be any
+         * active readers.
+         */
+        WT_ASSERT(session, old.u.s.readers_active == 0);
 
-    new = *l;
+        /*
+         * Allow the next batch to start.
+         *
+         * If there are readers in the next group, swap queued readers
+         * to active: this could race with new readlock requests, so we
+         * have to spin.
+         */
+        if (++new.u.s.current == new.u.s.reader) {
+            new.u.s.readers_active = new.u.s.readers_queued;
+            new.u.s.readers_queued = 0;
+        }
+    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
 
-    /*
-     * We're the only writer of the writers/readers fields, so the update
-     * does not need to be atomic; we have to update both values at the
-     * same time though, otherwise we'd potentially race with the thread
-     * next granted the lock.
-     */
-    ++new.s.writers;
-    ++new.s.readers;
-    l->i.wr = new.i.wr;
+    if (new.u.s.readers_active != 0)
+        __wt_cond_signal(session, l->cond_readers);
+    else if (new.u.s.current != new.u.s.next)
+        __wt_cond_signal(session, l->cond_writers);
 
     WT_DIAGNOSTIC_YIELD;
 }
 
@@ -365,6 +428,6 @@ __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
     WT_UNUSED(session);
 
-    return (l->s.writers != l->s.next || l->s.readers != l->s.next);
+    return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
 }
 #endif
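What makes the new design work is that every piece of lock state (the served ticket, the next free ticket, the read-group ticket and the two reader counts) is packed into a single 64-bit union, so both lock attempts and the unlock hand-off reduce to: read the word, edit a private copy, publish it with one compare-and-swap. The following is a minimal standalone sketch of that pattern, not WiredTiger code: it assumes the field layout from the comment in the patch, uses C11 <stdatomic.h> in place of the internal __wt_atomic_casv64() wrapper, and the toy_* names are invented for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the packed-word ticket lock described above.  The field layout
 * mirrors the comment in the patch; reading .s after writing .v relies on
 * union type punning, which C11 permits.
 */
typedef union {
    uint64_t v;                     /* Whole state as one 64-bit value */
    struct {
        uint8_t  current;           /* Ticket now being served */
        uint8_t  next;              /* Next ticket to hand out */
        uint8_t  reader;            /* Ticket shared by the queued readers */
        uint8_t  notused;           /* Padding */
        uint16_t readers_active;    /* Readers currently holding the lock */
        uint16_t readers_queued;    /* Readers waiting for the next group */
    } s;
} toy_rwlock_state;

typedef struct {
    _Atomic uint64_t v;             /* Updated only by compare-and-swap */
} toy_rwlock;

/* Try to get a shared lock without blocking: fails if a writer is active. */
static bool
toy_try_readlock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    old.v = atomic_load(&l->v);
    if (old.s.current != old.s.next ||          /* writer active or queued */
        old.s.readers_active == UINT16_MAX)     /* counter would overflow */
        return (false);
    new = old;
    new.s.readers_active++;
    return (atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

/* Try to get an exclusive lock without blocking. */
static bool
toy_try_writelock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    old.v = atomic_load(&l->v);
    if (old.s.current != old.s.next || old.s.readers_active != 0)
        return (false);
    /* Our ticket is old.s.next == old.s.current, so we hold the lock. */
    new = old;
    new.s.next++;
    return (atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

/* Release an exclusive lock: serve the next ticket in a single CAS. */
static void
toy_writeunlock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    do {
        new.v = old.v = atomic_load(&l->v);
        if (++new.s.current == new.s.reader) {
            /* The next group is readers: promote the queue to active. */
            new.s.readers_active = new.s.readers_queued;
            new.s.readers_queued = 0;
        }
    } while (!atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

The blocking paths are what the rest of the patch layers on top of this: __wt_readlock() and __wt_writelock() spin, then yield, then sleep on the cond_readers/cond_writers condition variables once their ticket is far from being served, and the unlock paths wake those waiters with __wt_cond_signal(). None of that changes the state transitions sketched here.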