/*-
 * Public Domain 2014-2017 MongoDB, Inc.
 * Public Domain 2008-2014 WiredTiger, Inc.
 *
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
 *	http://locklessinc.com/articles/locks/
 *
 * Dr. Fuerst further credits:
 *	There exists a form of the ticket lock that is designed for read-write
 *	locks. An example written in assembly was posted to the Linux kernel
 *	mailing list in 2002 by David Howells from RedHat. This was a highly
 *	optimized version of a read-write ticket lock developed at IBM in the
 *	early 1990s by Joseph Seigh. Note that a similar (but not identical)
 *	algorithm was published by John Mellor-Crummey and Michael Scott in
 *	their landmark paper "Scalable Reader-Writer Synchronization for
 *	Shared-Memory Multiprocessors".
 *
 * The following is an explanation of our interpretation and implementation.
 * First, the underlying lock structure.
 *
 *	volatile union {
 *		uint64_t v;			// Full 64-bit value
 *		struct {
 *			uint8_t  current;	// Current ticket
 *			uint8_t  next;		// Next available ticket
 *			uint8_t  reader;	// Read queue ticket
 *			uint8_t  __notused;	// Padding
 *			uint16_t readers_active;// Count of active readers
 *			uint16_t readers_queued;// Count of queued readers
 *		} s;
 *	} u;
 *
 * First, imagine a store's 'take a number' ticket algorithm. A customer takes
 * a unique ticket number and customers are served in ticket order. In the
 * data structure, 'next' is the ticket that will be allocated next, and
 * 'current' is the ticket being served.
 *
 * Next, consider exclusive (write) locks. To lock, 'take a number' and wait
 * until that number is being served; more specifically, atomically increment
 * 'next', and then wait until 'current' equals that allocated ticket.
 *
 * Shared (read) locks are similar, except that readers can share a ticket
 * (both with each other and with a single writer). Readers with a given
 * ticket execute before the writer with that ticket. In other words, writers
 * wait for both their ticket to become current and for all readers to exit
 * the lock.
 *
 * If there are no active writers (indicated by 'current' == 'next'), readers
 * can immediately enter the lock by atomically incrementing 'readers_active'.
 * When there are writers active, readers form a new queue by first setting
 * 'reader' to 'next' (i.e. readers are scheduled after any queued writers,
 * avoiding starvation), then atomically incrementing 'readers_queued'.
 *
 * The 'next' field is a 1-byte value, so the available ticket number wraps
 * after 256 requests. If a thread's write lock request would cause the 'next'
 * field to catch up with 'current', it instead waits to avoid the same ticket
 * being allocated to multiple threads.
 */
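/*
 * To make the algorithm above concrete, here is an illustrative sequence of
 * lock-word values; the specific numbers are an example only, not taken from
 * a real trace:
 *
 *	Idle lock:
 *		current=0 next=0 reader=0 readers_active=0 readers_queued=0
 *	A writer allocates ticket 0 by incrementing 'next' and holds the lock:
 *		current=0 next=1 reader=0 readers_active=0 readers_queued=0
 *	Two readers arrive; the first sets 'reader' to 'next' (ticket 1) and
 *	both increment 'readers_queued':
 *		current=0 next=1 reader=1 readers_active=0 readers_queued=2
 *	The writer unlocks: 'current' becomes 1, matches 'reader', and the
 *	queued readers are swapped to active:
 *		current=1 next=1 reader=1 readers_active=2 readers_queued=0
 *	Both readers now run concurrently; the last reader to leave signals
 *	any queued writer.
 */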
#include "wt_internal.h"

/*
 * __wt_rwlock_init --
 *	Initialize a read/write lock.
 */
int
__wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    l->u.v = 0;

    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers));
    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers));
    return (0);
}

/*
 * __wt_rwlock_destroy --
 *	Destroy a read/write lock.
 */
void
__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    l->u.v = 0;

    __wt_cond_destroy(session, &l->cond_readers);
    __wt_cond_destroy(session, &l->cond_writers);
}

/*
 * __wt_try_readlock --
 *	Try to get a shared lock, fail immediately if unavailable.
 */
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;

    WT_STAT_CONN_INCR(session, rwlock_read);

    new.u.v = old.u.v = l->u.v;

    /*
     * This read lock can only be granted if there are no active writers.
     *
     * Also check for overflow in case there are 64K active readers.
     */
    if (old.u.s.current != old.u.s.next ||
        new.u.s.readers_active == UINT16_MAX)
        return (EBUSY);

    /*
     * The replacement lock value is a result of adding an active reader.
     *
     * We rely on this atomic operation to provide a barrier.
     */
    new.u.s.readers_active++;
    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
}

/*
 * __read_blocked --
 *	Check whether the current read lock request should keep waiting.
 */
static bool
__read_blocked(WT_SESSION_IMPL *session)
{
    return (session->current_rwticket !=
        session->current_rwlock->u.s.current);
}

/*
 * __wt_readlock --
 *	Get a shared lock.
 */
void
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;
    int pause_cnt;
    int16_t writers_active;
    uint8_t ticket;

    WT_STAT_CONN_INCR(session, rwlock_read);

    WT_DIAGNOSTIC_YIELD;

    for (;;) {
        /*
         * Fast path: if there is no active writer, join the current
         * group.
         */
        for (old.u.v = l->u.v;
            old.u.s.current == old.u.s.next;
            old.u.v = l->u.v) {
            new.u.v = old.u.v;
            /*
             * Check for overflow: if the maximum number of readers
             * are already active, wait to try again.
             */
            if (++new.u.s.readers_active == 0)
                goto stall;
            if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
                return;
            WT_PAUSE();
        }

        /*
         * There is an active writer: join the next group.
         *
         * Limit how many readers can queue: don't allow more readers
         * to queue than there are active writers (calculated as
         * 'next' - 'current'): otherwise, in write-heavy workloads,
         * readers can keep queuing up in front of writers and
         * throughput is unstable.
         *
         * If the maximum number of readers are already queued, wait
         * until we can get a valid ticket.
         */
        writers_active = old.u.s.next - old.u.s.current;
        if (old.u.s.readers_queued > writers_active) {
stall:          __wt_cond_wait(
                    session, l->cond_readers, WT_THOUSAND, NULL);
            continue;
        }

        /*
         * If we are the first reader to queue, set the next read
         * group. Note: don't re-read from the lock or we could race
         * with a writer unlocking.
         */
        new.u.v = old.u.v;
        if (new.u.s.readers_queued++ == 0)
            new.u.s.reader = new.u.s.next;
        ticket = new.u.s.reader;

        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
            break;
    }

    /* Wait for our group to start. */
    for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) {
        if (pause_cnt < 1000)
            WT_PAUSE();
        else if (pause_cnt < 1200)
            __wt_yield();
        else {
            session->current_rwlock = l;
            session->current_rwticket = ticket;
            __wt_cond_wait(
                session, l->cond_readers, 0, __read_blocked);
        }
    }

    /*
     * Applications depend on a barrier here so that operations holding
     * the lock see consistent data. The atomic operation above isn't
     * sufficient here because we don't own the lock until our ticket
     * comes up and whatever data we are protecting may have changed in
     * the meantime.
     */
    WT_READ_BARRIER();

    WT_ASSERT(session, l->u.s.readers_active > 0);
}
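/*
 * A minimal usage sketch for the shared-lock path; the 'table' structure, its
 * 'rwlock' and 'value' fields, and the surrounding code are hypothetical,
 * shown only to illustrate how the calls pair up:
 *
 *	__wt_readlock(session, &table->rwlock);
 *	value = table->value;			// Read under the shared lock
 *	__wt_readunlock(session, &table->rwlock);
 *
 * Any number of readers (up to the 64K counter limit) can execute this
 * sequence concurrently; a queued writer is not granted the lock until
 * 'readers_active' drops to zero.
 */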
/*
 * __wt_readunlock --
 *	Release a shared lock.
 */
void
__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;

    do {
        old.u.v = l->u.v;
        WT_ASSERT(session, old.u.s.readers_active > 0);

        /*
         * Decrement the active reader count (other readers are doing
         * the same, make sure we don't race).
         */
        new.u.v = old.u.v;
        --new.u.s.readers_active;
    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));

    if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next)
        __wt_cond_signal(session, l->cond_writers);
}

/*
 * __wt_try_writelock --
 *	Try to get an exclusive lock, fail immediately if unavailable.
 */
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;

    WT_STAT_CONN_INCR(session, rwlock_write);

    /*
     * This write lock can only be granted if no readers or writers blocked
     * on the lock, that is, if this thread's ticket would be the next
     * ticket granted. Check if this can possibly succeed (and confirm the
     * lock is in the correct state to grant this write lock).
     */
    old.u.v = l->u.v;
    if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
        return (EBUSY);

    /*
     * We've checked above that there is no writer active (since
     * 'current' == 'next'), so there should be no readers queued.
     */
    WT_ASSERT(session, old.u.s.readers_queued == 0);

    /*
     * The replacement lock value is a result of allocating a new ticket.
     *
     * We rely on this atomic operation to provide a barrier.
     */
    new.u.v = old.u.v;
    new.u.s.next++;
    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
}

/*
 * __write_blocked --
 *	Check whether the current write lock request should keep waiting.
 */
static bool
__write_blocked(WT_SESSION_IMPL *session)
{
    WT_RWLOCK *l;

    l = session->current_rwlock;
    return (session->current_rwticket != l->u.s.current ||
        l->u.s.readers_active != 0);
}

/*
 * __wt_writelock --
 *	Wait to get an exclusive lock.
 */
void
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;
    int pause_cnt;
    uint8_t ticket;

    WT_STAT_CONN_INCR(session, rwlock_write);

    for (;;) {
        new.u.v = old.u.v = l->u.v;
        ticket = new.u.s.next++;

        /*
         * Avoid wrapping: if we allocate more than 256 tickets, two
         * lockers will simultaneously be granted the lock.
         */
        if (new.u.s.current == new.u.s.next) {
            __wt_cond_wait(
                session, l->cond_writers, WT_THOUSAND, NULL);
            continue;
        }

        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
            break;
    }

    /* Wait for our group to start and any readers to drain. */
    for (pause_cnt = 0;
        ticket != l->u.s.current || l->u.s.readers_active != 0;
        pause_cnt++) {
        if (pause_cnt < 1000)
            WT_PAUSE();
        else if (pause_cnt < 1200)
            __wt_yield();
        else {
            session->current_rwlock = l;
            session->current_rwticket = ticket;
            __wt_cond_wait(
                session, l->cond_writers, 0, __write_blocked);
        }
    }

    /*
     * Applications depend on a barrier here so that operations holding
     * the lock see consistent data. The atomic operation above isn't
     * sufficient here because we don't own the lock until our ticket
     * comes up and whatever data we are protecting may have changed in
     * the meantime.
     */
    WT_READ_BARRIER();
}
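/*
 * The matching exclusive-lock usage sketch (again, 'table' and its fields are
 * hypothetical):
 *
 *	__wt_writelock(session, &table->rwlock);
 *	table->value = new_value;		// Modify under the exclusive lock
 *	__wt_writeunlock(session, &table->rwlock);
 *
 * Because tickets are a single byte, at most 255 write requests can be
 * outstanding at once. For example, with current == 10 and next == 9 there
 * are already 255 allocated-but-unserved tickets; a new request would wrap
 * 'next' onto 'current', so it waits instead.
 */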
/*
 * __wt_writeunlock --
 *	Release an exclusive lock.
 */
void
__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_RWLOCK new, old;

    do {
        new.u.v = old.u.v = l->u.v;

        /*
         * We're holding the lock exclusive; there shouldn't be any
         * active readers.
         */
        WT_ASSERT(session, old.u.s.readers_active == 0);

        /*
         * Allow the next batch to start.
         *
         * If there are readers in the next group, swap queued readers
         * to active: this could race with new readlock requests, so we
         * have to spin.
         */
        if (++new.u.s.current == new.u.s.reader) {
            new.u.s.readers_active = new.u.s.readers_queued;
            new.u.s.readers_queued = 0;
        }
    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));

    if (new.u.s.readers_active != 0)
        __wt_cond_signal(session, l->cond_readers);
    else if (new.u.s.current != new.u.s.next)
        __wt_cond_signal(session, l->cond_writers);

    WT_DIAGNOSTIC_YIELD;
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __wt_rwlock_islocked --
 *	Return if a read/write lock is currently locked for reading or writing.
 */
bool
__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
{
    WT_UNUSED(session);

    return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
}
#endif
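/*
 * In diagnostic builds, callers can sanity-check lock state with
 * __wt_rwlock_islocked; a hypothetical example:
 *
 *	WT_ASSERT(session, __wt_rwlock_islocked(session, &table->rwlock));
 *
 * Note the check reports only that some thread holds the lock (or has a
 * ticket outstanding); it cannot distinguish a shared holder from an
 * exclusive one, nor verify that the caller is the holder.
 */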