diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c')
-rw-r--r-- | src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c | 197 |
1 file changed, 161 insertions(+), 36 deletions(-)
diff --git a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c index cdd4f8a24e1..df558b12bef 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c +++ b/src/third_party/wiredtiger/src/os_posix/os_mtx_rw.c @@ -38,6 +38,78 @@ * Joseph Seigh. Note that a similar (but not identical) algorithm was published * by John Mellor-Crummey and Michael Scott in their landmark paper "Scalable * Reader-Writer Synchronization for Shared-Memory Multiprocessors". + * + * The following is an explanation of this code. First, the underlying lock + * structure. + * + * struct { + * uint16_t writers; Now serving for writers + * uint16_t readers; Now serving for readers + * uint16_t users; Next available ticket number + * uint16_t __notused; Padding + * } + * + * First, imagine a store's 'take a number' ticket algorithm. A customer takes + * a unique ticket number and customers are served in ticket order. In the data + * structure, 'writers' is the next writer to be served, 'readers' is the next + * reader to be served, and 'users' is the next available ticket number. + * + * Next, consider exclusive (write) locks. The 'now serving' number for writers + * is 'writers'. To lock, 'take a number' and wait until that number is being + * served; more specifically, atomically copy and increment the current value of + * 'users', and then wait until 'writers' equals that copied number. + * + * Shared (read) locks are similar. Like writers, readers atomically get the + * next number available. However, instead of waiting for 'writers' to equal + * their number, they wait for 'readers' to equal their number. + * + * This has the effect of queuing lock requests in the order they arrive + * (incidentally avoiding starvation). + * + * Each lock/unlock pair requires incrementing both 'readers' and 'writers'. 
+ * In the case of a reader, the 'readers' increment happens when the reader + * acquires the lock (to allow read-lock sharing), and the 'writers' increment + * happens when the reader releases the lock. In the case of a writer, both + * 'readers' and 'writers' are incremented when the writer releases the lock. + * + * For example, consider the following read (R) and write (W) lock requests: + * + * writers readers users + * 0 0 0 + * R: ticket 0, readers match OK 0 1 1 + * R: ticket 1, readers match OK 0 2 2 + * R: ticket 2, readers match OK 0 3 3 + * W: ticket 3, writers no match block 0 3 4 + * R: ticket 2, unlock 1 3 4 + * R: ticket 0, unlock 2 3 4 + * R: ticket 1, unlock 3 3 4 + * W: ticket 3, writers match OK 3 3 4 + * + * Note the writer blocks until 'writers' equals its ticket number and it does + * not matter if readers unlock in order or not. + * + * Readers or writers entering the system after the write lock is queued block, + * and the next ticket holder (reader or writer) will unblock when the writer + * unlocks. An example, continuing from the last line of the above example: + * + * writers readers users + * W: ticket 3, writers match OK 3 3 4 + * R: ticket 4, readers no match block 3 3 5 + * R: ticket 5, readers no match block 3 3 6 + * W: ticket 6, writers no match block 3 3 7 + * W: ticket 3, unlock 4 4 7 + * R: ticket 4, readers match OK 4 5 7 + * R: ticket 5, readers match OK 4 6 7 + * + * The 'users' field is a 2-byte value so the available ticket number wraps at + * 64K requests. If a thread's lock request is not granted until the 'users' + * field cycles and the same ticket is taken by another thread, we could grant + * a lock to two separate threads at the same time, and bad things happen: two + * writer threads or a reader thread and a writer thread would run in parallel, + * and lock waiters could be skipped if the unlocks race. This is unlikely, it + * only happens if a lock request is blocked by 64K other requests. 
The fix is + * to grow the lock structure fields, but the largest atomic instruction we have + * is 8 bytes, the structure has no room to grow. */ #include "wt_internal.h" @@ -69,20 +141,31 @@ __wt_rwlock_alloc( int __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, users, writers; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_readlock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - pad = l->s.pad; - users = l->s.users; - writers = l->s.writers; - old = (pad << 48) + (users << 32) + (users << 16) + writers; - new = (pad << 48) + ((users + 1) << 32) + ((users + 1) << 16) + writers; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + new = old = *l; + + /* + * This read lock can only be granted if the lock was last granted to + * a reader and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this read lock). + */ + if (old.s.readers != old.s.users) + return (EBUSY); + + /* + * The replacement lock value is a result of allocating a new ticket and + * incrementing the reader value to match it. + */ + new.s.readers = new.s.users = old.s.users + 1; + return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 
0 : EBUSY); } /* @@ -93,8 +176,7 @@ int __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; int pause_cnt; WT_RET(__wt_verbose( @@ -102,17 +184,22 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_STAT_FAST_CONN_INCR(session, rwlock_read); l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - for (pause_cnt = 0; val != l->s.readers;) { + + /* + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. + */ + ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.readers;) { /* * We failed to get the lock; pause before retrying and if we've * paused enough, sleep so we don't burn CPU to no purpose. This * situation happens if there are more threads than cores in the - * system and we're thrashing on shared resources. Regardless, - * don't sleep long, all we need is to schedule the other reader - * threads to complete a few more instructions and increment the - * reader count. + * system and we're thrashing on shared resources. + * + * Don't sleep long when waiting on a read lock, hopefully we're + * waiting on another read thread to increment the reader count. */ if (++pause_cnt < 1000) WT_PAUSE(); @@ -120,6 +207,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) __wt_sleep(0, 10); } + /* + * We're the only writer of the readers field, so the update does not + * need to be atomic. + */ ++l->s.readers; return (0); @@ -138,6 +229,11 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) session, WT_VERB_MUTEX, "rwlock: read unlock %s", rwlock->name)); l = &rwlock->rwlock; + + /* + * Increment the writers value (other readers are doing the same, make + * sure we don't race). 
+ */ WT_ATOMIC_ADD2(l->s.writers, 1); return (0); @@ -150,20 +246,28 @@ __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { - wt_rwlock_t *l; - uint64_t old, new, pad, readers, users; + wt_rwlock_t *l, new, old; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: try_writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); l = &rwlock->rwlock; - pad = l->s.pad; - readers = l->s.readers; - users = l->s.users; - old = (pad << 48) + (users << 32) + (readers << 16) + users; - new = (pad << 48) + ((users + 1) << 32) + (readers << 16) + users; - return (WT_ATOMIC_CAS8(l->u, old, new) ? 0 : EBUSY); + old = new = *l; + + /* + * This write lock can only be granted if the lock was last granted to + * a writer and there are no readers or writers blocked on the lock, + * that is, if this thread's ticket would be the next ticket granted. + * Do the cheap test to see if this can possibly succeed (and confirm + * the lock is in the correct state to grant this write lock). + */ + if (old.s.writers != old.s.users) + return (EBUSY); + + /* The replacement lock value is a result of allocating a new ticket. */ + ++new.s.users; + return (WT_ATOMIC_CAS8(l->u, old.u, new.u) ? 0 : EBUSY); } /* @@ -174,23 +278,33 @@ int __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) { wt_rwlock_t *l; - uint64_t me; - uint16_t val; + uint16_t ticket; + int pause_cnt; WT_RET(__wt_verbose( session, WT_VERB_MUTEX, "rwlock: writelock %s", rwlock->name)); WT_STAT_FAST_CONN_INCR(session, rwlock_write); + l = &rwlock->rwlock; + /* - * Possibly wrap: if we have more than 64K lockers waiting, the count - * of writers will wrap and two lockers will simultaneously be granted - * the write lock. + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. 
*/ - l = &rwlock->rwlock; - me = WT_ATOMIC_FETCH_ADD8(l->u, (uint64_t)1 << 32); - val = (uint16_t)(me >> 32); - while (val != l->s.writers) - WT_PAUSE(); + ticket = WT_ATOMIC_FETCH_ADD2(l->s.users, 1); + for (pause_cnt = 0; ticket != l->s.writers;) { + /* + * We failed to get the lock; pause before retrying and if we've + * paused enough, sleep so we don't burn CPU to no purpose. This + * situation happens if there are more threads than cores in the + * system and we're thrashing on shared resources. + */ + if (++pause_cnt < 1000) + WT_PAUSE(); + else + __wt_sleep(0, 10); + } return (0); } @@ -211,12 +325,23 @@ __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) copy = *l; + /* + * We're the only writer of the writers/readers fields, so the update + * does not need to be atomic; we have to update both values at the + * same time though, otherwise we'd potentially race with the thread + * next granted the lock. + * + * Use a memory barrier to ensure the compiler doesn't mess with these + * instructions and rework the code in a way that avoids the update as + * a unit. + */ WT_BARRIER(); ++copy.s.writers; ++copy.s.readers; - l->i.us = copy.i.us; + l->i.wr = copy.i.wr; + return (0); } |