/*-
 * Public Domain 2014-2016 MongoDB, Inc.
 * Public Domain 2008-2014 WiredTiger, Inc.
 *
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
 *     http://locklessinc.com/articles/locks/
 *
 * Dr. Fuerst further credits:
 *     There exists a form of the ticket lock that is designed for read-write
 * locks. An example written in assembly was posted to the Linux kernel mailing
 * list in 2002 by David Howells from RedHat. This was a highly optimized
 * version of a read-write ticket lock developed at IBM in the early 90's by
 * Joseph Seigh. Note that a similar (but not identical) algorithm was
 * published by John Mellor-Crummey and Michael Scott in their landmark paper
 * "Scalable Reader-Writer Synchronization for Shared-Memory Multiprocessors".
 *
 * The following is an explanation of this code. First, the underlying lock
 * structure:
 *
 *     struct {
 *         uint16_t writers;      Now serving for writers
 *         uint16_t readers;      Now serving for readers
 *         uint16_t next;         Next available ticket number
 *         uint16_t __notused;    Padding
 *     }
 *
 * First, imagine a store's 'take a number' ticket algorithm. A customer takes
 * a unique ticket number and customers are served in ticket order. In the
 * data structure, 'writers' is the next writer to be served, 'readers' is the
 * next reader to be served, and 'next' is the next available ticket number.
 *
 * Next, consider exclusive (write) locks. The 'now serving' number for
 * writers is 'writers'. To lock, 'take a number' and wait until that number
 * is being served; more specifically, atomically copy and increment the
 * current value of 'next', and then wait until 'writers' equals that copied
 * number.
 *
 * Shared (read) locks are similar. Like writers, readers atomically get the
 * next number available. However, instead of waiting for 'writers' to equal
 * their number, they wait for 'readers' to equal their number.
 *
 * This has the effect of queuing lock requests in the order they arrive
 * (incidentally avoiding starvation).
 *
 * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
 * In the case of a reader, the 'readers' increment happens when the reader
 * acquires the lock (to allow read-lock sharing), and the 'writers' increment
 * happens when the reader releases the lock. In the case of a writer, both
 * 'readers' and 'writers' are incremented when the writer releases the lock.
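 *
 * The code below additionally relies on reading and writing the lock as a
 * single 64-bit value (for the compare-and-swap in the "try" paths), and on
 * writing the 'writers' and 'readers' fields together as a single 32-bit
 * value (in __wt_writeunlock). A sketch of a union layout supporting those
 * accesses follows; it is an illustration inferred from this file's use of
 * 'u', 'i.wr' and 's', and the authoritative definition lives in the
 * WiredTiger headers:
 *
 *     typedef union {
 *         uint64_t u;                Whole lock, for the 8-byte CAS
 *         struct {
 *             uint32_t wr;           'writers' and 'readers', one store
 *         } i;
 *         struct {
 *             uint16_t writers;      Now serving for writers
 *             uint16_t readers;      Now serving for readers
 *             uint16_t next;         Next available ticket number
 *             uint16_t __notused;    Padding
 *         } s;
 *     } wt_rwlock_t;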
 *
 * For example, consider the following read (R) and write (W) lock requests:
 *
 *                                          writers  readers  next
 *                                             0        0       0
 *    R: ticket 0, readers match     OK        0        1       1
 *    R: ticket 1, readers match     OK        0        2       2
 *    R: ticket 2, readers match     OK        0        3       3
 *    W: ticket 3, writers no match  block     0        3       4
 *    R: ticket 2, unlock                      1        3       4
 *    R: ticket 0, unlock                      2        3       4
 *    R: ticket 1, unlock                      3        3       4
 *    W: ticket 3, writers match     OK        3        3       4
 *
 * Note the writer blocks until 'writers' equals its ticket number, and it
 * does not matter whether the readers unlock in order or not.
 *
 * Readers or writers entering the system after the write lock is queued
 * block, and the next ticket holder (reader or writer) will unblock when the
 * writer unlocks. An example, continuing from the last line of the above
 * example:
 *
 *                                          writers  readers  next
 *    W: ticket 3, writers match     OK        3        3       4
 *    R: ticket 4, readers no match  block     3        3       5
 *    R: ticket 5, readers no match  block     3        3       6
 *    W: ticket 6, writers no match  block     3        3       7
 *    W: ticket 3, unlock                      4        4       7
 *    R: ticket 4, readers match     OK        4        5       7
 *    R: ticket 5, readers match     OK        4        6       7
 *
 * The 'next' field is a 2-byte value, so the available ticket number wraps
 * after 64K requests. If a thread's lock request is not granted until the
 * 'next' field cycles and the same ticket is taken by another thread, we
 * could grant a lock to two separate threads at the same time, and bad things
 * happen: two writer threads or a reader thread and a writer thread would run
 * in parallel, and lock waiters could be skipped if the unlocks race. This is
 * unlikely: it only happens if a lock request is blocked by 64K other
 * requests. The fix would be to grow the lock structure fields, but the
 * largest atomic instruction we have is 8 bytes, so the structure has no room
 * to grow.
 */

#include "wt_internal.h"

/*
 * __wt_rwlock_alloc --
 *     Allocate and initialize a read/write lock.
 */
int
__wt_rwlock_alloc(
    WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp, const char *name)
{
    WT_RWLOCK *rwlock;

    __wt_verbose(session, WT_VERB_MUTEX, "rwlock: alloc %s", name);

    WT_RET(__wt_calloc_one(session, &rwlock));

    rwlock->name = name;

    *rwlockp = rwlock;
    return (0);
}

/*
 * __wt_try_readlock --
 *     Try to get a shared lock, fail immediately if unavailable.
 */
int
__wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l, new, old;

    WT_STAT_FAST_CONN_INCR(session, rwlock_read);

    l = &rwlock->rwlock;
    new = old = *l;

    /*
     * This read lock can only be granted if the lock was last granted to
     * a reader and there are no readers or writers blocked on the lock,
     * that is, if this thread's ticket would be the next ticket granted.
     * Do the cheap test to see if this can possibly succeed (and confirm
     * the lock is in the correct state to grant this read lock).
     */
    if (old.s.readers != old.s.next)
        return (EBUSY);

    /*
     * The replacement lock value is a result of allocating a new ticket
     * and incrementing the reader value to match it.
     */
    new.s.readers = new.s.next = old.s.next + 1;
    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
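
/*
 * To illustrate __wt_try_readlock: with the lock unlocked (writers 0,
 * readers 0, next 0), the function builds the replacement value (writers 0,
 * readers 1, next 1) and attempts to install it with a single 8-byte
 * compare-and-swap. If any other thread modified the lock between the copy
 * and the CAS, the CAS fails and EBUSY is returned rather than waiting:
 *
 *                         writers  readers  next
 *    copied (old)            0        0       0
 *    replacement (new)       0        1       1
 */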

/*
 * __wt_readlock --
 *     Get a shared lock.
 */
void
__wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l;
    uint16_t ticket;
    int pause_cnt;

    WT_STAT_FAST_CONN_INCR(session, rwlock_read);

    WT_DIAGNOSTIC_YIELD;

    l = &rwlock->rwlock;

    /*
     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
     * value will wrap and two lockers will simultaneously be granted the
     * lock.
     */
    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
    for (pause_cnt = 0; ticket != l->s.readers;) {
        /*
         * We failed to get the lock; pause before retrying, and if
         * we've paused enough, sleep so we don't burn CPU to no
         * purpose. This situation happens if there are more threads
         * than cores in the system and we're thrashing on shared
         * resources.
         *
         * Don't sleep long when waiting on a read lock; hopefully
         * we're waiting on another read thread to increment the
         * reader count.
         */
        if (++pause_cnt < WT_THOUSAND)
            WT_PAUSE();
        else
            __wt_sleep(0, 10);
    }

    /*
     * We're the only writer of the readers field, so the update does not
     * need to be atomic.
     */
    ++l->s.readers;

    /*
     * Applications depend on a barrier here so that operations holding
     * the lock see consistent data.
     */
    WT_READ_BARRIER();
}

/*
 * __wt_readunlock --
 *     Release a shared lock.
 */
void
__wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l;

    WT_UNUSED(session);

    l = &rwlock->rwlock;

    /*
     * Increment the writers value (other readers are doing the same, make
     * sure we don't race).
     */
    (void)__wt_atomic_add16(&l->s.writers, 1);
}

/*
 * __wt_try_writelock --
 *     Try to get an exclusive lock, fail immediately if unavailable.
 */
int
__wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l, new, old;

    WT_STAT_FAST_CONN_INCR(session, rwlock_write);

    l = &rwlock->rwlock;
    old = new = *l;

    /*
     * This write lock can only be granted if the lock was last granted to
     * a writer and there are no readers or writers blocked on the lock,
     * that is, if this thread's ticket would be the next ticket granted.
     * Do the cheap test to see if this can possibly succeed (and confirm
     * the lock is in the correct state to grant this write lock).
     */
    if (old.s.writers != old.s.next)
        return (EBUSY);

    /* The replacement lock value is a result of allocating a new ticket. */
    ++new.s.next;
    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}

/*
 * __wt_writelock --
 *     Wait to get an exclusive lock.
 */
void
__wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l;
    uint16_t ticket;
    int pause_cnt;

    WT_STAT_FAST_CONN_INCR(session, rwlock_write);

    l = &rwlock->rwlock;

    /*
     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
     * value will wrap and two lockers will simultaneously be granted the
     * lock.
     */
    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
    for (pause_cnt = 0; ticket != l->s.writers;) {
        /*
         * We failed to get the lock; pause before retrying, and if
         * we've paused enough, sleep so we don't burn CPU to no
         * purpose. This situation happens if there are more threads
         * than cores in the system and we're thrashing on shared
         * resources.
         */
        if (++pause_cnt < WT_THOUSAND)
            WT_PAUSE();
        else
            __wt_sleep(0, 10);
    }

    /*
     * Applications depend on a barrier here so that operations holding
     * the lock see consistent data.
     */
    WT_READ_BARRIER();
}
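
/*
 * To illustrate the write-lock path, continuing the header comment's first
 * example: the writer holding ticket 3 spins in __wt_writelock until the
 * three readers unlock and 'writers' reaches its ticket. On release,
 * __wt_writeunlock below increments 'writers' and 'readers' together in a
 * single 4-byte store; assuming no new requests arrive while the writer
 * holds the lock (so 'next' is still 4), the transitions are:
 *
 *                                      writers  readers  next
 *    W: ticket 3, writers match  OK       3        3       4
 *    W: ticket 3, unlock                  4        4       4
 *
 * The combined store is what lets the next ticket holder, reader and writer
 * alike, observe its grant as a single update.
 */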

/*
 * __wt_writeunlock --
 *     Release an exclusive lock.
 */
void
__wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l, new;

    WT_UNUSED(session);

    /*
     * Ensure that all updates made while the lock was held are visible to
     * the next thread to acquire the lock.
     */
    WT_WRITE_BARRIER();

    l = &rwlock->rwlock;

    new = *l;

    /*
     * We're the only writer of the writers/readers fields, so the update
     * does not need to be atomic; we have to update both values at the
     * same time though, otherwise we'd potentially race with the thread
     * next granted the lock.
     */
    ++new.s.writers;
    ++new.s.readers;
    l->i.wr = new.i.wr;

    WT_DIAGNOSTIC_YIELD;
}

/*
 * __wt_rwlock_destroy --
 *     Destroy a read/write lock.
 */
void
__wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
{
    WT_RWLOCK *rwlock;

    rwlock = *rwlockp;        /* Clear our caller's reference. */
    if (rwlock == NULL)
        return;
    *rwlockp = NULL;

    __wt_verbose(
        session, WT_VERB_MUTEX, "rwlock: destroy %s", rwlock->name);

    __wt_free(session, rwlock);
}

#ifdef HAVE_DIAGNOSTIC
/*
 * __wt_rwlock_islocked --
 *     Return if a read/write lock is currently locked for reading or writing.
 */
bool
__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
{
    wt_rwlock_t *l;

    WT_UNUSED(session);

    l = &rwlock->rwlock;

    return (l->s.writers != l->s.next || l->s.readers != l->s.next);
}
#endif
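
/*
 * For illustration, a typical caller pairs these functions as follows. This
 * is a sketch, not code from this file; the lock name and the surrounding
 * error handling are hypothetical:
 *
 *    WT_RWLOCK *rwlock;
 *    WT_RET(__wt_rwlock_alloc(session, &rwlock, "example lock"));
 *
 *    __wt_readlock(session, rwlock);       Shared critical section
 *    __wt_readunlock(session, rwlock);
 *
 *    __wt_writelock(session, rwlock);      Exclusive critical section
 *    __wt_writeunlock(session, rwlock);
 *
 *    __wt_rwlock_destroy(session, &rwlock);
 */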