author    | Michael Cahill <michael.cahill@mongodb.com> | 2017-06-02 12:08:34 +1000
committer | Michael Cahill <michael.cahill@mongodb.com> | 2017-06-02 13:28:08 +1000
commit    | 30590abdf059e10774c81c6b488f68b3d0afb3ca (patch)
tree      | 1de28aa8da1321bf958ec06fe1fa0962acdd1bdd /src/include/serial.i
parent    | 10da3a325c1ae856c85c25748b3cd342e37ec7f9 (diff)
download  | mongo-30590abdf059e10774c81c6b488f68b3d0afb3ca.tar.gz
WT-3345 Tune WiredTiger's read/write locks. (#3446)
* Add a workload that stresses rwlock performance under various conditions (including `threads >> cores`), tune read and write lock operations to only spin when it is likely to help, and to back off to a condition variable when there is heavy contention.
* New rwlock implementation: queue readers and writers separately; don't enforce fairness among readers or when the lock is overwhelmed.
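As a rough, hypothetical illustration of the spin-then-back-off idea above (not WiredTiger's actual lock, which queues readers and writers separately), the sketch below bounds the busy-wait and then falls back to a blocking acquire. `SPIN_LIMIT` and the function name are made up; a pthread rwlock stands in for the real lock, and the kernel's blocking wait plays the role of the condition-variable backoff.

```c
#include <pthread.h>
#include <sched.h>

#define SPIN_LIMIT 100			/* hypothetical bound on busy-wait attempts */

/*
 * sketch_read_lock --
 *	Spin only while it is likely to help, then block instead of burning CPU.
 */
static void
sketch_read_lock(pthread_rwlock_t *lock)
{
	int i;

	/* A short, bounded spin: cheap if the current holder is about to release. */
	for (i = 0; i < SPIN_LIMIT; i++) {
		if (pthread_rwlock_tryrdlock(lock) == 0)
			return;
		(void)sched_yield();
	}

	/* Heavy contention: let the kernel park this thread until the lock is free. */
	(void)pthread_rwlock_rdlock(lock);
}
```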
* Switch to a spinlock whenever we need to lock a page.
Previously we had a read/write lock in the __wt_page structure that was only ever acquired in write mode, plus a spinlock in the page->modify structure. Switch to using the spinlock for everything.
One slight downside of this change is that we can no longer determine precisely whether a page is locked from the state of the spinlock alone (another page sharing the same lock could be holding it at the places where we used to check). Since that check was only ever used
for diagnostic / debugging purposes, I think the benefit of the change outweighs this issue.
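For context, a hypothetical sketch of what a shared page-lock pool looks like (the names `page_lock_pool`, `pool_page_lock` and the pool size are invented, not the real macros): because two pages can hash to the same slot, seeing that a slot is held no longer proves that a particular page is locked, which is exactly the diagnostic precision given up above.

```c
#include <stdatomic.h>
#include <stdint.h>

#define PAGE_LOCK_SLOTS 128		/* hypothetical pool size */

/* A fixed pool of simple spinlocks shared by all pages (0 = free, 1 = held). */
static _Atomic int page_lock_pool[PAGE_LOCK_SLOTS];

/* Hash a page address to a slot: distinct pages can map to the same lock. */
static inline _Atomic int *
pool_page_slot(const void *page)
{
	return (&page_lock_pool[((uintptr_t)page >> 6) % PAGE_LOCK_SLOTS]);
}

static inline void
pool_page_lock(const void *page)
{
	_Atomic int *slot = pool_page_slot(page);

	while (atomic_exchange(slot, 1) != 0)
		;	/* spin: the current holder may be locking a different page */
}

static inline void
pool_page_unlock(const void *page)
{
	atomic_store(pool_page_slot(page), 0);
}
```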
* Fix a bug where a failure during `__wt_curfile_create` caused a data handle to be released twice. This is caught by the sanity checking assertions in the new read/write lock code.
* A split may be holding a page lock when restoring updates. Tell the restore code we have the page exclusive, so no further locking is required.
* Allocate a spinlock for each modified page.
Using shared page locks for multiple operations that need to lock a page (including inserts and reconciliation) resulted in self-deadlock when the lookaside table was used. That's because reconciliation held a page lock, then caused inserts to the lookaside table, which acquired the page lock for a page in the lookaside table. With a shared set of page locks, they could both be the same lock.
Switch (back?) to allocating a spinlock per modified page. Earlier in this ticket we saved some space in __wt_page, so growing __wt_page_modify is unlikely to be noticeable.
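A hypothetical sketch of the per-page arrangement (type and field names are invented, not the real WT_PAGE_MODIFY layout): each modified page carries its own lock, so reconciliation of a user page and an insert into a lookaside page can never contend for, or self-deadlock on, a single shared slot.

```c
#include <stdatomic.h>
#include <stdlib.h>

/* Modification state, allocated the first time a page is written. */
struct sketch_page_modify {
	_Atomic int page_lock;		/* private to this page: 0 = free, 1 = held */
	/* ... other modification state ... */
};

struct sketch_page {
	struct sketch_page_modify *modify;
	/* ... read-only and hot fields ... */
};

/* Allocate the modify structure, and with it the page's private lock. */
static int
sketch_page_modify_alloc(struct sketch_page *page)
{
	if ((page->modify = calloc(1, sizeof(*page->modify))) == NULL)
		return (-1);
	return (0);			/* page_lock starts out cleared (unlocked) */
}

static inline void
sketch_page_lock(struct sketch_page *page)
{
	while (atomic_exchange(&page->modify->page_lock, 1) != 0)
		;			/* only lockers of this page ever spin here */
}

static inline void
sketch_page_unlock(struct sketch_page *page)
{
	atomic_store(&page->modify->page_lock, 0);
}
```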
* Tweak padding and position of the spinlock in WT_PAGE_MODIFY to claw back some bytes.
Move evict_pass_gen to the end of WT_PAGE: on inspection, it should be a cold field relative to the others, which now fit in one x86 cache line.
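To make the layout point concrete, here is a sketch of the idea, assuming a 64-bit target with 64-byte cache lines; apart from evict_pass_gen, the field names are invented and the real WT_PAGE has different members. Hot fields are kept inside the first line, and the cold, eviction-only counter is pushed past it.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64

struct sketch_page_layout {
	/* Hot fields: touched on ordinary reads and writes. */
	void	*parent_ref;
	void	*modify;
	uint32_t type;
	uint32_t flags;
	uint32_t memory_footprint;
	uint32_t read_gen;
	uint8_t	 pad[CACHE_LINE -
	    2 * sizeof(void *) - 4 * sizeof(uint32_t)];

	/* Cold field: only the eviction server reads it, keep it off the hot line. */
	uint64_t evict_pass_gen;
};

/* Everything before evict_pass_gen fits in one cache line. */
static_assert(offsetof(struct sketch_page_layout, evict_pass_gen) == CACHE_LINE,
    "hot fields must fit in a single cache line");
```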
(cherry picked from commit 42daa132f21c1391ae2b2b3d789df85878aca471)
(cherry picked from commit 1bcb9a0cc4c3f56857188ad42cbe81f57d11c7eb)
Diffstat (limited to 'src/include/serial.i')
-rw-r--r-- | src/include/serial.i | 30 |
1 file changed, 19 insertions, 11 deletions
diff --git a/src/include/serial.i b/src/include/serial.i
index 982f196b0b8..0134e1a9c20 100644
--- a/src/include/serial.i
+++ b/src/include/serial.i
@@ -154,7 +154,7 @@ __col_append_serial_func(WT_SESSION_IMPL *session, WT_INSERT_HEAD *ins_head,
 static inline int
 __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
     WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
-    size_t new_ins_size, uint64_t *recnop, u_int skipdepth)
+    size_t new_ins_size, uint64_t *recnop, u_int skipdepth, bool exclusive)
 {
         WT_INSERT *new_ins = *new_insp;
         WT_DECL_RET;
@@ -165,11 +165,16 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
         /* Clear references to memory we now own and must free on error. */
         *new_insp = NULL;
 
-        /* Acquire the page's spinlock, call the worker function. */
-        WT_PAGE_LOCK(session, page);
+        /*
+         * Acquire the page's spinlock unless we already have exclusive access.
+         * Then call the worker function.
+         */
+        if (!exclusive)
+                WT_PAGE_LOCK(session, page);
         ret = __col_append_serial_func(
             session, ins_head, ins_stack, new_ins, recnop, skipdepth);
-        WT_PAGE_UNLOCK(session, page);
+        if (!exclusive)
+                WT_PAGE_UNLOCK(session, page);
 
         if (ret != 0) {
                 /* Free unused memory on error. */
@@ -198,7 +203,7 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
 static inline int
 __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
     WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **new_insp,
-    size_t new_ins_size, u_int skipdepth)
+    size_t new_ins_size, u_int skipdepth, bool exclusive)
 {
         WT_INSERT *new_ins = *new_insp;
         WT_DECL_RET;
@@ -220,10 +225,12 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
                 ret = __insert_simple_func(
                     session, ins_stack, new_ins, skipdepth);
         else {
-                WT_PAGE_LOCK(session, page);
+                if (!exclusive)
+                        WT_PAGE_LOCK(session, page);
                 ret = __insert_serial_func(
                     session, ins_head, ins_stack, new_ins, skipdepth);
-                WT_PAGE_UNLOCK(session, page);
+                if (!exclusive)
+                        WT_PAGE_UNLOCK(session, page);
         }
 
         if (ret != 0) {
@@ -252,7 +259,8 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
  */
 static inline int
 __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
-    WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size)
+    WT_UPDATE **srch_upd, WT_UPDATE **updp, size_t upd_size,
+    bool exclusive)
 {
         WT_DECL_RET;
         WT_UPDATE *obsolete, *upd = *updp;
@@ -295,7 +303,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
         /*
          * If there are no subsequent WT_UPDATE structures we are done here.
          */
-        if (upd->next == NULL)
+        if (upd->next == NULL || exclusive)
                 return (0);
 
         /*
@@ -316,11 +324,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
         }
 
         /* If we can't lock it, don't scan, that's okay. */
-        if (__wt_try_writelock(session, &page->page_lock) != 0)
+        if (WT_PAGE_TRYLOCK(session, page) != 0)
                 return (0);
 
         obsolete = __wt_update_obsolete_check(session, page, upd->next);
 
-        __wt_writeunlock(session, &page->page_lock);
+        WT_PAGE_UNLOCK(session, page);
         if (obsolete != NULL)
                 __wt_update_obsolete_free(session, page, obsolete);