Diffstat (limited to 'src/support/mtx_rw.c')
-rw-r--r--    src/support/mtx_rw.c    405
1 files changed, 234 insertions, 171 deletions
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 35ad5da23f2..b2ab32bdef1 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -27,7 +27,7 @@
  */
 
 /*
- * Based on "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
+ * Inspired by "Spinlocks and Read-Write Locks" by Dr. Steven Fuerst:
  * http://locklessinc.com/articles/locks/
  *
  * Dr. Fuerst further credits:
@@ -39,77 +39,46 @@
  * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable
  * Reader-Writer Synchronization for Shared-Memory Multiprocessors".
  *
- * The following is an explanation of this code. First, the underlying lock
- * structure.
+ * The following is an explanation of our interpretation and implementation.
+ * First, the underlying lock structure.
  *
+ *    volatile union {
+ *        uint64_t v;                         // Full 64-bit value
+ *        struct {
- *    uint16_t writers;        Now serving for writers
- *    uint16_t readers;        Now serving for readers
- *    uint16_t next;           Next available ticket number
- *    uint16_t __notused;      Padding
- *    }
+ *            uint8_t current;                // Current ticket
+ *            uint8_t next;                   // Next available ticket
+ *            uint8_t reader;                 // Read queue ticket
+ *            uint8_t __notused;              // Padding
+ *            uint16_t readers_active;        // Count of active readers
+ *            uint16_t readers_queued;        // Count of queued readers
+ *        } s;
+ *    } u;
  *
  * First, imagine a store's 'take a number' ticket algorithm. A customer takes
  * a unique ticket number and customers are served in ticket order. In the data
- * structure, 'writers' is the next writer to be served, 'readers' is the next
- * reader to be served, and 'next' is the next available ticket number.
+ * structure, 'next' is the ticket that will be allocated next, and 'current'
+ * is the ticket being served.
  *
- * Next, consider exclusive (write) locks. The 'now serving' number for writers
- * is 'writers'. To lock, 'take a number' and wait until that number is being
- * served; more specifically, atomically copy and increment the current value of
- * 'next', and then wait until 'writers' equals that copied number.
+ * Next, consider exclusive (write) locks. To lock, 'take a number' and wait
+ * until that number is being served; more specifically, atomically increment
+ * 'next', and then wait until 'current' equals that allocated ticket.
  *
- * Shared (read) locks are similar. Like writers, readers atomically get the
- * next number available. However, instead of waiting for 'writers' to equal
- * their number, they wait for 'readers' to equal their number.
+ * Shared (read) locks are similar, except that readers can share a ticket
+ * (both with each other and with a single writer). Readers with a given
+ * ticket execute before the writer with that ticket. In other words, writers
+ * wait for both their ticket to become current and for all readers to exit
+ * the lock.
  *
- * This has the effect of queuing lock requests in the order they arrive
- * (incidentally avoiding starvation).
+ * If there are no active writers (indicated by 'current' == 'next'), readers
+ * can immediately enter the lock by atomically incrementing 'readers_active'.
+ * When there are writers active, readers form a new queue by first setting
+ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers,
+ * avoiding starvation), then atomically incrementing 'readers_queued'.
  *
- * Each lock/unlock pair requires incrementing both 'readers' and 'writers'.
- * In the case of a reader, the 'readers' increment happens when the reader
- * acquires the lock (to allow read-lock sharing), and the 'writers' increment
- * happens when the reader releases the lock. In the case of a writer, both
- * 'readers' and 'writers' are incremented when the writer releases the lock.
- *
- * For example, consider the following read (R) and write (W) lock requests:
- *
- *                                              writers readers next
- *                                              0       0       0
- * R: ticket 0, readers match    OK             0       1       1
- * R: ticket 1, readers match    OK             0       2       2
- * R: ticket 2, readers match    OK             0       3       3
- * W: ticket 3, writers no match block          0       3       4
- * R: ticket 2, unlock                          1       3       4
- * R: ticket 0, unlock                          2       3       4
- * R: ticket 1, unlock                          3       3       4
- * W: ticket 3, writers match    OK             3       3       4
- *
- * Note the writer blocks until 'writers' equals its ticket number and it does
- * not matter if readers unlock in order or not.
- *
- * Readers or writers entering the system after the write lock is queued block,
- * and the next ticket holder (reader or writer) will unblock when the writer
- * unlocks. An example, continuing from the last line of the above example:
- *
- *                                              writers readers next
- * W: ticket 3, writers match    OK             3       3       4
- * R: ticket 4, readers no match block          3       3       5
- * R: ticket 5, readers no match block          3       3       6
- * W: ticket 6, writers no match block          3       3       7
- * W: ticket 3, unlock                          4       4       7
- * R: ticket 4, readers match    OK             4       5       7
- * R: ticket 5, readers match    OK             4       6       7
- *
- * The 'next' field is a 2-byte value so the available ticket number wraps at
- * 64K requests. If a thread's lock request is not granted until the 'next'
- * field cycles and the same ticket is taken by another thread, we could grant
- * a lock to two separate threads at the same time, and bad things happen: two
- * writer threads or a reader thread and a writer thread would run in parallel,
- * and lock waiters could be skipped if the unlocks race. This is unlikely, it
- * only happens if a lock request is blocked by 64K other requests. The fix is
- * to grow the lock structure fields, but the largest atomic instruction we have
- * is 8 bytes, the structure has no room to grow.
+ * The 'next' field is a 1-byte value so the available ticket number wraps
+ * after 256 requests. If a thread's write lock request would cause the 'next'
+ * field to catch up with 'current', instead it waits to avoid the same ticket
+ * being allocated to multiple threads.
  */
 
 #include "wt_internal.h"
 
@@ -118,12 +87,14 @@
  * __wt_rwlock_init --
  *    Initialize a read/write lock.
  */
-void
+int
 __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    l->u.v = 0;
 
-    l->u = 0;
+    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_readers));
+    WT_RET(__wt_cond_alloc(session, "rwlock wait", &l->cond_writers));
+    return (0);
 }
 
 /*
@@ -133,9 +104,10 @@ __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    l->u.v = 0;
 
-    l->u = 0;
+    __wt_cond_destroy(session, &l->cond_readers);
+    __wt_cond_destroy(session, &l->cond_writers);
 }
 
 /*
@@ -149,46 +121,35 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 
     WT_STAT_CONN_INCR(session, rwlock_read);
 
-    new = old = *l;
+    new.u.v = old.u.v = l->u.v;
 
     /*
-     * This read lock can only be granted if the lock was last granted to
-     * a reader and there are no readers or writers blocked on the lock,
-     * that is, if this thread's ticket would be the next ticket granted.
-     * Do the cheap test to see if this can possibly succeed (and confirm
-     * the lock is in the correct state to grant this read lock).
+     * This read lock can only be granted if there are no active writers.
+     *
+     * Also check for overflow in case there are 64K active readers.
      */
-    if (old.s.readers != old.s.next)
+    if (old.u.s.current != old.u.s.next ||
+        new.u.s.readers_active == UINT16_MAX)
         return (EBUSY);
 
     /*
-     * The replacement lock value is a result of allocating a new ticket and
-     * incrementing the reader value to match it.
+     * The replacement lock value is a result of adding an active reader.
+     *
+     * We rely on this atomic operation to provide a barrier.
      */
-    new.s.readers = new.s.next = old.s.next + 1;
-    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+    new.u.s.readers_active++;
+    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
 }
 
 /*
- * __wt_readlock_spin --
- *    Spin to get a read lock: only yield the CPU if the lock is held
- * exclusive.
+ * __read_blocked --
+ *    Check whether the current read lock request should keep waiting.
  */
-void
-__wt_readlock_spin(WT_SESSION_IMPL *session, WT_RWLOCK *l)
+static bool
+__read_blocked(WT_SESSION_IMPL *session)
 {
-    /*
-     * Try to get the lock in a single operation if it is available to
-     * readers. This avoids the situation where multiple readers arrive
-     * concurrently and have to line up in order to enter the lock. For
-     * read-heavy workloads it can make a significant difference.
-     */
-    while (__wt_try_readlock(session, l) != 0) {
-        if (l->s.writers_active > 0)
-            __wt_yield();
-        else
-            WT_PAUSE();
-    }
+    return (session->current_rwticket !=
+        session->current_rwlock->u.s.current);
 }
 
 /*
@@ -198,41 +159,90 @@
 void
 __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    uint16_t ticket;
+    WT_RWLOCK new, old;
     int pause_cnt;
+    int16_t writers_active;
+    uint8_t ticket;
 
     WT_STAT_CONN_INCR(session, rwlock_read);
 
     WT_DIAGNOSTIC_YIELD;
 
-    /*
-     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
-     * value will wrap and two lockers will simultaneously be granted the
-     * lock.
-     */
-    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
-    for (pause_cnt = 0; ticket != l->s.readers;) {
+    for (;;) {
         /*
-         * We failed to get the lock; pause before retrying and if we've
-         * paused enough, yield so we don't burn CPU to no purpose. This
-         * situation happens if there are more threads than cores in the
-         * system and we're thrashing on shared resources.
+         * Fast path: if there is no active writer, join the current
+         * group.
          */
-        if (++pause_cnt < WT_THOUSAND)
+        for (old.u.v = l->u.v;
+            old.u.s.current == old.u.s.next;
+            old.u.v = l->u.v) {
+            new.u.v = old.u.v;
+            /*
+             * Check for overflow: if the maximum number of readers
+             * are already active, wait to try again.
+             */
+            if (++new.u.s.readers_active == 0)
+                goto stall;
+            if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+                return;
             WT_PAUSE();
-        else
+        }
+
+        /*
+         * There is an active writer: join the next group.
+         *
+         * Limit how many readers can queue: don't allow more readers
+         * to queue than there are active writers (calculated as
+         * `next - current`): otherwise, in write-heavy workloads,
+         * readers can keep queuing up in front of writers and
+         * throughput is unstable.
+         *
+         * If the maximum number of readers are already queued, wait
+         * until we can get a valid ticket.
+         */
+        writers_active = old.u.s.next - old.u.s.current;
+        if (old.u.s.readers_queued > writers_active) {
+stall:      __wt_cond_wait(
+                session, l->cond_readers, WT_THOUSAND, NULL);
+            continue;
+        }
+
+        /*
+         * If we are the first reader to queue, set the next read
+         * group. Note: don't re-read from the lock or we could race
+         * with a writer unlocking.
+         */
+        new.u.v = old.u.v;
+        if (new.u.s.readers_queued++ == 0)
+            new.u.s.reader = new.u.s.next;
+        ticket = new.u.s.reader;
+
+        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+            break;
+    }
+
+    /* Wait for our group to start. */
+    for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) {
+        if (pause_cnt < 1000)
+            WT_PAUSE();
+        else if (pause_cnt < 1200)
             __wt_yield();
+        else {
+            session->current_rwlock = l;
+            session->current_rwticket = ticket;
+            __wt_cond_wait(
+                session, l->cond_readers, 0, __read_blocked);
+        }
     }
 
-    /*
-     * We're the only writer of the readers field, so the update does not
-     * need to be atomic.
-     */
-    ++l->s.readers;
+    WT_ASSERT(session, l->u.s.readers_active > 0);
 
     /*
      * Applications depend on a barrier here so that operations holding the
-     * lock see consistent data.
+     * lock see consistent data. The atomic operation above isn't
+     * sufficient here because we don't own the lock until our ticket comes
+     * up and whatever data we are protecting may have changed in the
+     * meantime.
      */
     WT_READ_BARRIER();
 }
 
@@ -244,13 +254,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_UNUSED(session);
+    WT_RWLOCK new, old;
 
-    /*
-     * Increment the writers value (other readers are doing the same, make
-     * sure we don't race).
-     */
-    (void)__wt_atomic_add16(&l->s.writers, 1);
+    do {
+        old.u.v = l->u.v;
+        WT_ASSERT(session, old.u.s.readers_active > 0);
+
+        /*
+         * Decrement the active reader count (other readers are doing
+         * the same, make sure we don't race).
+         */
+        new.u.v = old.u.v;
+        --new.u.s.readers_active;
+    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
+
+    if (new.u.s.readers_active == 0 && new.u.s.current != new.u.s.next)
+        __wt_cond_signal(session, l->cond_writers);
 }
 
 /*
@@ -264,22 +283,44 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 
     WT_STAT_CONN_INCR(session, rwlock_write);
 
-    old = new = *l;
-
     /*
-     * This write lock can only be granted if the lock was last granted to
-     * a writer and there are no readers or writers blocked on the lock,
-     * that is, if this thread's ticket would be the next ticket granted.
-     * Do the cheap test to see if this can possibly succeed (and confirm
-     * the lock is in the correct state to grant this write lock).
+     * This write lock can only be granted if no readers or writers blocked
+     * on the lock, that is, if this thread's ticket would be the next
+     * ticket granted. Check if this can possibly succeed (and confirm the
+     * lock is in the correct state to grant this write lock).
      */
-    if (old.s.writers != old.s.next)
+    old.u.v = l->u.v;
+    if (old.u.s.current != old.u.s.next || old.u.s.readers_active != 0)
        return (EBUSY);
 
-    /* The replacement lock value is a result of allocating a new ticket. */
-    ++new.s.next;
-    ++new.s.writers_active;
-    return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
+    /*
+     * We've checked above that there is no writer active (since
+     * `current == next`), so there should be no readers queued.
+     */
+    WT_ASSERT(session, old.u.s.readers_queued == 0);
+
+    /*
+     * The replacement lock value is a result of allocating a new ticket.
+     *
+     * We rely on this atomic operation to provide a barrier.
+     */
+    new.u.v = old.u.v;
+    new.u.s.next++;
+    return (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v) ? 0 : EBUSY);
+}
+
+/*
+ * __write_blocked --
+ *    Check whether the current write lock request should keep waiting.
+ */
+static bool
+__write_blocked(WT_SESSION_IMPL *session)
+{
+    WT_RWLOCK *l;
+
+    l = session->current_rwlock;
+    return (session->current_rwticket != l->u.s.current ||
+        l->u.s.readers_active != 0);
 }
 
 /*
@@ -289,34 +330,51 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    uint16_t ticket;
+    WT_RWLOCK new, old;
     int pause_cnt;
+    uint8_t ticket;
 
     WT_STAT_CONN_INCR(session, rwlock_write);
 
-    /*
-     * Possibly wrap: if we have more than 64K lockers waiting, the ticket
-     * value will wrap and two lockers will simultaneously be granted the
-     * lock.
-     */
-    ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
-    (void)__wt_atomic_add16(&l->s.writers_active, 1);
-    for (pause_cnt = 0; ticket != l->s.writers;) {
+    for (;;) {
+        new.u.v = old.u.v = l->u.v;
+        ticket = new.u.s.next++;
+
        /*
-         * We failed to get the lock; pause before retrying and if we've
-         * paused enough, sleep so we don't burn CPU to no purpose. This
-         * situation happens if there are more threads than cores in the
-         * system and we're thrashing on shared resources.
+         * Avoid wrapping: if we allocate more than 256 tickets, two
+         * lockers will simultaneously be granted the lock.
         */
-        if (++pause_cnt < WT_THOUSAND)
+        if (new.u.s.current == new.u.s.next) {
+            __wt_cond_wait(
+                session, l->cond_writers, WT_THOUSAND, NULL);
+            continue;
+        }
+        if (__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v))
+            break;
+    }
+
+    /* Wait for our group to start and any readers to drain. */
+    for (pause_cnt = 0;
+        ticket != l->u.s.current || l->u.s.readers_active != 0;
+        pause_cnt++) {
+        if (pause_cnt < 1000)
             WT_PAUSE();
-        else
-            __wt_sleep(0, 10);
+        else if (pause_cnt < 1200)
+            __wt_yield();
+        else {
+            session->current_rwlock = l;
+            session->current_rwticket = ticket;
+            __wt_cond_wait(
+                session, l->cond_writers, 0, __write_blocked);
+        }
     }
 
     /*
     * Applications depend on a barrier here so that operations holding the
-     * lock see consistent data.
+     * lock see consistent data. The atomic operation above isn't
+     * sufficient here because we don't own the lock until our ticket comes
+     * up and whatever data we are protecting may have changed in the
+     * meantime.
     */
     WT_READ_BARRIER();
 }
 
@@ -328,29 +386,34 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 void
 __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
-    WT_RWLOCK new;
-
-    WT_UNUSED(session);
+    WT_RWLOCK new, old;
 
-    (void)__wt_atomic_sub16(&l->s.writers_active, 1);
+    do {
+        new.u.v = old.u.v = l->u.v;
 
-    /*
-     * Ensure that all updates made while the lock was held are visible to
-     * the next thread to acquire the lock.
-     */
-    WT_WRITE_BARRIER();
+        /*
+         * We're holding the lock exclusive, there shouldn't be any
+         * active readers.
+         */
+        WT_ASSERT(session, old.u.s.readers_active == 0);
 
-    new = *l;
+        /*
+         * Allow the next batch to start.
+         *
+         * If there are readers in the next group, swap queued readers
+         * to active: this could race with new readlock requests, so we
+         * have to spin.
+         */
+        if (++new.u.s.current == new.u.s.reader) {
+            new.u.s.readers_active = new.u.s.readers_queued;
+            new.u.s.readers_queued = 0;
+        }
+    } while (!__wt_atomic_casv64(&l->u.v, old.u.v, new.u.v));
 
-    /*
-     * We're the only writer of the writers/readers fields, so the update
-     * does not need to be atomic; we have to update both values at the
-     * same time though, otherwise we'd potentially race with the thread
-     * next granted the lock.
-     */
-    ++new.s.writers;
-    ++new.s.readers;
-    l->i.wr = new.i.wr;
+    if (new.u.s.readers_active != 0)
+        __wt_cond_signal(session, l->cond_readers);
+    else if (new.u.s.current != new.u.s.next)
+        __wt_cond_signal(session, l->cond_writers);
 
     WT_DIAGNOSTIC_YIELD;
 }
 
@@ -365,6 +428,6 @@ __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l)
 {
     WT_UNUSED(session);
 
-    return (l->s.writers != l->s.next || l->s.readers != l->s.next);
+    return (l->u.s.current != l->u.s.next || l->u.s.readers_active != 0);
 }
 #endif
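What makes the new design work is that every piece of lock state (the served ticket, the next free ticket, the read-group ticket and the two reader counts) is packed into a single 64-bit union, so both lock attempts and the unlock hand-off reduce to: read the word, edit a private copy, publish it with one compare-and-swap. The following is a minimal standalone sketch of that pattern, not WiredTiger code: it assumes the field layout from the comment in the patch, uses C11 <stdatomic.h> in place of the internal __wt_atomic_casv64() wrapper, and the toy_* names are invented for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch of the packed-word ticket lock described above.  The field layout
 * mirrors the comment in the patch; reading .s after writing .v relies on
 * union type punning, which C11 permits.
 */
typedef union {
    uint64_t v;                     /* Whole state as one 64-bit value */
    struct {
        uint8_t  current;           /* Ticket now being served */
        uint8_t  next;              /* Next ticket to hand out */
        uint8_t  reader;            /* Ticket shared by the queued readers */
        uint8_t  notused;           /* Padding */
        uint16_t readers_active;    /* Readers currently holding the lock */
        uint16_t readers_queued;    /* Readers waiting for the next group */
    } s;
} toy_rwlock_state;

typedef struct {
    _Atomic uint64_t v;             /* Updated only by compare-and-swap */
} toy_rwlock;

/* Try to get a shared lock without blocking: fails if a writer is active. */
static bool
toy_try_readlock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    old.v = atomic_load(&l->v);
    if (old.s.current != old.s.next ||          /* writer active or queued */
        old.s.readers_active == UINT16_MAX)     /* counter would overflow */
        return (false);
    new = old;
    new.s.readers_active++;
    return (atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

/* Try to get an exclusive lock without blocking. */
static bool
toy_try_writelock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    old.v = atomic_load(&l->v);
    if (old.s.current != old.s.next || old.s.readers_active != 0)
        return (false);
    /* Our ticket is old.s.next == old.s.current, so we hold the lock. */
    new = old;
    new.s.next++;
    return (atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

/* Release an exclusive lock: serve the next ticket in a single CAS. */
static void
toy_writeunlock(toy_rwlock *l)
{
    toy_rwlock_state old, new;

    do {
        new.v = old.v = atomic_load(&l->v);
        if (++new.s.current == new.s.reader) {
            /* The next group is readers: promote the queue to active. */
            new.s.readers_active = new.s.readers_queued;
            new.s.readers_queued = 0;
        }
    } while (!atomic_compare_exchange_strong(&l->v, &old.v, new.v));
}

The blocking paths are what the rest of the patch layers on top of this: __wt_readlock() and __wt_writelock() spin, then yield, then sleep on the cond_readers/cond_writers condition variables once their ticket is far from being served, and the unlock paths wake those waiters with __wt_cond_signal(). None of that changes the state transitions sketched here.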