author    Keith Bostic <keith.bostic@mongodb.com>    2016-08-17 10:04:02 -0400
committer GitHub <noreply@github.com>                2016-08-17 10:04:02 -0400
commit    36c2663b813216f9e10052ceeb2516fca5586496 (patch)
tree      adaf8f3304d8736b70a8e9040f63947c3e0018f5 /src
parent    b9a98e44bee8692808ebe51ce29d5a774307494c (diff)
download  mongo-36c2663b813216f9e10052ceeb2516fca5586496.tar.gz
WT-2847 Merge fair locks into read/write locks. (#2966)
* WT-2847 Merge fair locks into read/write locks. Merge the fair-lock implementation into the read/write lock implementation (fair locks were the same as the read/write lock's write lock), add a new read/write "is locked?" function for diagnostic tests.
* Rename the WT_RWLOCK "users" field to be "next", it makes the code easier to read (and we already depend on "next" not being a keyword, WT_CURSOR.next).
* Extend a comment.
* aspell does not know amongst!
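For orientation, the scheme the fair lock collapses into is the ticket-based read/write lock documented in src/support/mtx_rw.c below: 'writers' and 'readers' are the "now serving" numbers, 'next' is the next available ticket, and the old fair lock is exactly the write-lock path. The following is a minimal standalone C11 sketch of that write-lock path, not the WiredTiger code; the type and function names are illustrative only.

/*
 * ticket_rwlock --
 *	Standalone sketch of the write path of a ticket-based read/write lock.
 *	Illustrative names only; this is not the WiredTiger implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
	_Atomic uint16_t writers;	/* Now serving for writers */
	_Atomic uint16_t readers;	/* Now serving for readers */
	_Atomic uint16_t next;		/* Next available ticket number */
} ticket_rwlock;

/* Write lock: take the next ticket, then wait to be served as a writer. */
static void
ticket_writelock(ticket_rwlock *l)
{
	uint16_t ticket = atomic_fetch_add(&l->next, 1);

	while (atomic_load(&l->writers) != ticket)
		;	/* The real code pauses, then sleeps, between retries. */
}

/*
 * Write unlock: advance both serving numbers so the next ticket holder,
 * reader or writer, is granted the lock.
 */
static void
ticket_writeunlock(ticket_rwlock *l)
{
	atomic_fetch_add(&l->readers, 1);
	atomic_fetch_add(&l->writers, 1);
}

/* Diagnostic check: the lock is held if any granted ticket is outstanding. */
static bool
ticket_rwlock_islocked(ticket_rwlock *l)
{
	uint16_t next = atomic_load(&l->next);

	return (atomic_load(&l->writers) != next ||
	    atomic_load(&l->readers) != next);
}

At the call sites in this change the conversion is mechanical: __wt_fair_lock, __wt_fair_unlock, __wt_fair_trylock and __wt_fair_islocked on a WT_FAIR_LOCK become __wt_writelock, __wt_writeunlock, __wt_try_writelock and __wt_rwlock_islocked on a WT_RWLOCK, as the hunks below show.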
Diffstat (limited to 'src')
-rw-r--r--    src/btree/bt_compact.c         4
-rw-r--r--    src/btree/bt_debug.c           2
-rw-r--r--    src/btree/bt_discard.c         2
-rw-r--r--    src/btree/bt_split.c          10
-rw-r--r--    src/include/btmem.h            5
-rw-r--r--    src/include/extern.h           1
-rw-r--r--    src/include/mutex.h           20
-rw-r--r--    src/include/mutex.i           96
-rw-r--r--    src/include/serial.i           4
-rw-r--r--    src/include/wt_internal.h      2
-rw-r--r--    src/reconcile/rec_write.c      6
-rw-r--r--    src/support/mtx_rw.c          44
12 files changed, 50 insertions, 146 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index d52afbe4280..bb1261d94b0 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
*/
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_fair_lock(session, &page->page_lock);
+ __wt_writelock(session, &page->page_lock);
if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
@@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_fair_unlock(session, &page->page_lock);
+ __wt_writeunlock(session, &page->page_lock);
return (ret);
}
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index d4b1362fbfe..c1560150435 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -682,7 +682,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
"\t" "disk %p, entries %" PRIu32, (void *)page->dsk, entries));
WT_RET(ds->f(ds,
", %s", __wt_page_is_modified(page) ? "dirty" : "clean"));
- WT_RET(ds->f(ds, ", %s", __wt_fair_islocked(
+ WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked(
session, &page->page_lock) ? "locked" : "unlocked"));
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 4b132e311c5..162bc07a1c2 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -61,7 +61,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));
+ WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));
#ifdef HAVE_DIAGNOSTIC
{
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 5b15a1224ea..700c2f3b192 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1287,12 +1287,12 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
return (EBUSY);
if (trylock)
- WT_RET(__wt_fair_trylock(session, &parent->page_lock));
+ WT_RET(__wt_try_writelock(session, &parent->page_lock));
else
- __wt_fair_lock(session, &parent->page_lock);
+ __wt_writelock(session, &parent->page_lock);
if (parent == ref->home)
break;
- __wt_fair_unlock(session, &parent->page_lock);
+ __wt_writeunlock(session, &parent->page_lock);
}
/*
@@ -1315,7 +1315,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
*parentp = parent;
return (0);
-err: __wt_fair_unlock(session, &parent->page_lock);
+err: __wt_writeunlock(session, &parent->page_lock);
return (ret);
}
@@ -1331,7 +1331,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent);
- __wt_fair_unlock(session, &parent->page_lock);
+ __wt_writeunlock(session, &parent->page_lock);
return (ret);
}
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 817ce892952..a9d190c3b09 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -584,9 +584,10 @@ struct __wt_page {
/*
* Used to protect and co-ordinate splits for internal pages and
- * reconciliation for all pages.
+ * reconciliation for all pages. Only used to co-ordinate among the
+ * uncommon cases that require exclusive access to a page.
*/
- WT_FAIR_LOCK page_lock;
+ WT_RWLOCK page_lock;
/*
* The page's read generation acts as an LRU value for each page in the
diff --git a/src/include/extern.h b/src/include/extern.h
index a3916a31542..796cf9a5f71 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -667,6 +667,7 @@ extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock) WT_GC
extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
extern void __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp);
+extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock);
extern uint32_t __wt_nlpo2_round(uint32_t v);
extern uint32_t __wt_nlpo2(uint32_t v);
extern uint32_t __wt_log2_int(uint32_t n);
diff --git a/src/include/mutex.h b/src/include/mutex.h
index 04679884930..f0f8173bad4 100644
--- a/src/include/mutex.h
+++ b/src/include/mutex.h
@@ -42,7 +42,7 @@ typedef union { /* Read/write lock */
struct {
uint16_t writers; /* Now serving for writers */
uint16_t readers; /* Now serving for readers */
- uint16_t users; /* Next available ticket number */
+ uint16_t next; /* Next available ticket number */
uint16_t __notused; /* Padding */
} s;
} wt_rwlock_t;
@@ -59,24 +59,6 @@ struct __wt_rwlock {
};
/*
- * A light weight lock that can be used to replace spinlocks if fairness is
- * necessary. Implements a ticket-based back off spin lock.
- * The fields are available as a union to allow for atomically setting
- * the state of the entire lock.
- */
-struct __wt_fair_lock {
- union {
- uint32_t lock;
- struct {
- uint16_t owner; /* Ticket for current owner */
- uint16_t waiter; /* Last allocated ticket */
- } s;
- } u;
-#define fair_lock_owner u.s.owner
-#define fair_lock_waiter u.s.waiter
-};
-
-/*
* Spin locks:
*
* WiredTiger uses spinlocks for fast mutual exclusion (where operations done
diff --git a/src/include/mutex.i b/src/include/mutex.i
index e7c78fdb484..cb1847d9991 100644
--- a/src/include/mutex.i
+++ b/src/include/mutex.i
@@ -255,99 +255,3 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t)
#error Unknown spinlock type
#endif
-
-/*
- * __wt_fair_trylock --
- * Try to get a lock - give up if it is not immediately available.
- */
-static inline int
-__wt_fair_trylock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
-{
- WT_FAIR_LOCK new, old;
-
- WT_UNUSED(session);
-
- old = new = *lock;
-
- /* Exit early if there is no chance we can get the lock. */
- if (old.fair_lock_waiter != old.fair_lock_owner)
- return (EBUSY);
-
- /* The replacement lock value is a result of allocating a new ticket. */
- ++new.fair_lock_waiter;
- return (__wt_atomic_cas32(
- &lock->u.lock, old.u.lock, new.u.lock) ? 0 : EBUSY);
-}
-
-/*
- * __wt_fair_lock --
- * Get a lock.
- */
-static inline void
-__wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
-{
- uint16_t ticket;
- int pause_cnt;
-
- WT_UNUSED(session);
-
- /*
- * Possibly wrap: if we have more than 64K lockers waiting, the ticket
- * value will wrap and two lockers will simultaneously be granted the
- * lock.
- */
- ticket = __wt_atomic_fetch_add16(&lock->fair_lock_waiter, 1);
- for (pause_cnt = 0; ticket != lock->fair_lock_owner;) {
- /*
- * We failed to get the lock; pause before retrying and if we've
- * paused enough, sleep so we don't burn CPU to no purpose. This
- * situation happens if there are more threads than cores in the
- * system and we're thrashing on shared resources.
- */
- if (++pause_cnt < WT_THOUSAND)
- WT_PAUSE();
- else
- __wt_sleep(0, 10);
- }
-
- /*
- * Applications depend on a barrier here so that operations holding the
- * lock see consistent data.
- */
- WT_READ_BARRIER();
-}
-
-/*
- * __wt_fair_unlock --
- * Release a shared lock.
- */
-static inline void
-__wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
-{
- WT_UNUSED(session);
-
- /*
- * Ensure that all updates made while the lock was held are visible to
- * the next thread to acquire the lock.
- */
- WT_WRITE_BARRIER();
-
- /*
- * We have exclusive access - the update does not need to be atomic.
- */
- ++lock->fair_lock_owner;
-}
-
-#ifdef HAVE_DIAGNOSTIC
-/*
- * __wt_fair_islocked --
- * Test whether the lock is currently held.
- */
-static inline bool
-__wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock)
-{
- WT_UNUSED(session);
-
- return (lock->fair_lock_waiter != lock->fair_lock_owner);
-}
-#endif
diff --git a/src/include/serial.i b/src/include/serial.i
index ddbf3c17bb8..982f196b0b8 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -316,11 +316,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
}
/* If we can't lock it, don't scan, that's okay. */
- if (__wt_fair_trylock(session, &page->page_lock) != 0)
+ if (__wt_try_writelock(session, &page->page_lock) != 0)
return (0);
obsolete = __wt_update_obsolete_check(session, page, upd->next);
- __wt_fair_unlock(session, &page->page_lock);
+ __wt_writeunlock(session, &page->page_lock);
if (obsolete != NULL)
__wt_update_obsolete_free(session, page, obsolete);
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index c5337967f22..9b3460d0f9e 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -176,8 +176,6 @@ struct __wt_ext;
typedef struct __wt_ext WT_EXT;
struct __wt_extlist;
typedef struct __wt_extlist WT_EXTLIST;
-struct __wt_fair_lock;
- typedef struct __wt_fair_lock WT_FAIR_LOCK;
struct __wt_fh;
typedef struct __wt_fh WT_FH;
struct __wt_file_handle_inmem;
diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c
index 4e5537dbcdf..9c742476910 100644
--- a/src/reconcile/rec_write.c
+++ b/src/reconcile/rec_write.c
@@ -372,7 +372,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
* In-memory splits: reconciliation of an internal page cannot handle
* a child page splitting during the reconciliation.
*/
- __wt_fair_lock(session, &page->page_lock);
+ __wt_writelock(session, &page->page_lock);
/*
* Check that transaction time always moves forward for a given page.
@@ -386,7 +386,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
/* Initialize the reconciliation structure for each new run. */
if ((ret = __rec_write_init(
session, ref, flags, salvage, &session->reconcile)) != 0) {
- __wt_fair_unlock(session, &page->page_lock);
+ __wt_writeunlock(session, &page->page_lock);
return (ret);
}
r = session->reconcile;
@@ -427,7 +427,7 @@ __wt_reconcile(WT_SESSION_IMPL *session,
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the reconciliation lock. */
- __wt_fair_unlock(session, &page->page_lock);
+ __wt_writeunlock(session, &page->page_lock);
/* Update statistics. */
WT_STAT_FAST_CONN_INCR(session, rec_pages);
diff --git a/src/support/mtx_rw.c b/src/support/mtx_rw.c
index 6298a9dc52b..0bdde81c32f 100644
--- a/src/support/mtx_rw.c
+++ b/src/support/mtx_rw.c
@@ -45,19 +45,19 @@
* struct {
* uint16_t writers; Now serving for writers
* uint16_t readers; Now serving for readers
- * uint16_t users; Next available ticket number
+ * uint16_t next; Next available ticket number
* uint16_t __notused; Padding
* }
*
* First, imagine a store's 'take a number' ticket algorithm. A customer takes
* a unique ticket number and customers are served in ticket order. In the data
* structure, 'writers' is the next writer to be served, 'readers' is the next
- * reader to be served, and 'users' is the next available ticket number.
+ * reader to be served, and 'next' is the next available ticket number.
*
* Next, consider exclusive (write) locks. The 'now serving' number for writers
* is 'writers'. To lock, 'take a number' and wait until that number is being
* served; more specifically, atomically copy and increment the current value of
- * 'users', and then wait until 'writers' equals that copied number.
+ * 'next', and then wait until 'writers' equals that copied number.
*
* Shared (read) locks are similar. Like writers, readers atomically get the
* next number available. However, instead of waiting for 'writers' to equal
@@ -74,7 +74,7 @@
*
* For example, consider the following read (R) and write (W) lock requests:
*
- * writers readers users
+ * writers readers next
* 0 0 0
* R: ticket 0, readers match OK 0 1 1
* R: ticket 1, readers match OK 0 2 2
@@ -92,7 +92,7 @@
* and the next ticket holder (reader or writer) will unblock when the writer
* unlocks. An example, continuing from the last line of the above example:
*
- * writers readers users
+ * writers readers next
* W: ticket 3, writers match OK 3 3 4
* R: ticket 4, readers no match block 3 3 5
* R: ticket 5, readers no match block 3 3 6
@@ -101,8 +101,8 @@
* R: ticket 4, readers match OK 4 5 7
* R: ticket 5, readers match OK 4 6 7
*
- * The 'users' field is a 2-byte value so the available ticket number wraps at
- * 64K requests. If a thread's lock request is not granted until the 'users'
+ * The 'next' field is a 2-byte value so the available ticket number wraps at
+ * 64K requests. If a thread's lock request is not granted until the 'next'
* field cycles and the same ticket is taken by another thread, we could grant
* a lock to two separate threads at the same time, and bad things happen: two
* writer threads or a reader thread and a writer thread would run in parallel,
@@ -155,14 +155,14 @@ __wt_try_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* Do the cheap test to see if this can possibly succeed (and confirm
* the lock is in the correct state to grant this read lock).
*/
- if (old.s.readers != old.s.users)
+ if (old.s.readers != old.s.next)
return (EBUSY);
/*
* The replacement lock value is a result of allocating a new ticket and
* incrementing the reader value to match it.
*/
- new.s.readers = new.s.users = old.s.users + 1;
+ new.s.readers = new.s.next = old.s.next + 1;
return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
@@ -188,7 +188,7 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* value will wrap and two lockers will simultaneously be granted the
* lock.
*/
- ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
for (pause_cnt = 0; ticket != l->s.readers;) {
/*
* We failed to get the lock; pause before retrying and if we've
@@ -259,11 +259,11 @@ __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* Do the cheap test to see if this can possibly succeed (and confirm
* the lock is in the correct state to grant this write lock).
*/
- if (old.s.writers != old.s.users)
+ if (old.s.writers != old.s.next)
return (EBUSY);
/* The replacement lock value is a result of allocating a new ticket. */
- ++new.s.users;
+ ++new.s.next;
return (__wt_atomic_cas64(&l->u, old.u, new.u) ? 0 : EBUSY);
}
@@ -287,7 +287,7 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
* value will wrap and two lockers will simultaneously be granted the
* lock.
*/
- ticket = __wt_atomic_fetch_add16(&l->s.users, 1);
+ ticket = __wt_atomic_fetch_add16(&l->s.next, 1);
for (pause_cnt = 0; ticket != l->s.writers;) {
/*
* We failed to get the lock; pause before retrying and if we've
@@ -361,3 +361,21 @@ __wt_rwlock_destroy(WT_SESSION_IMPL *session, WT_RWLOCK **rwlockp)
__wt_free(session, rwlock);
}
+
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __wt_rwlock_islocked --
+ * Return if a read/write lock is currently locked for reading or writing.
+ */
+bool
+__wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *rwlock)
+{
+ wt_rwlock_t *l;
+
+ WT_UNUSED(session);
+
+ l = &rwlock->rwlock;
+
+ return (l->s.writers != l->s.next || l->s.readers != l->s.next);
+}
+#endif
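
The typical call site for the new diagnostic entry point is an assertion that a lock is not held at a point where that would be a bug, as in the bt_discard.c hunk above; a minimal usage sketch, assuming a WT_SESSION_IMPL *session and a WT_PAGE *page are in scope:

	/* A page being discarded must not have its page lock held. */
	WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));

Since __wt_rwlock_islocked is only compiled under HAVE_DIAGNOSTIC (see the #ifdef above), such checks are confined to diagnostic builds.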