diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2015-10-29 16:47:19 +1100 |
---|---|---|
committer | Alex Gorrod <alexg@wiredtiger.com> | 2015-11-24 05:05:46 +0000 |
commit | c819d2f9d34d8d701e986da4ea628c08239f8626 (patch) | |
tree | dd1eb79778690a42be6d05483794f4030d1a5345 | |
parent | 00dfebc9b099a80c0ce8bbe69ef97168eda23bfd (diff) | |
download | mongo-c819d2f9d34d8d701e986da4ea628c08239f8626.tar.gz |
Merge pull request #2271 from wiredtiger/reverse-split-fix
SERVER-21027 Fix reverse splits to keep the original child ref locked
(cherry picked from commit f4d20a3)
-rw-r--r-- | src/btree/bt_split.c | 25 | ||||
-rw-r--r-- | src/evict/evict_file.c | 15 | ||||
-rw-r--r-- | src/evict/evict_page.c | 52 | ||||
-rw-r--r-- | src/include/extern.h | 2 |
4 files changed, 61 insertions, 33 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 07c266a07f9..82a4dac226f 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -943,8 +943,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
 	 * reading thread will restart. Include the ref we are splitting in
 	 * the count to be deleted.
 	 */
-	deleted_entries = ref_new != NULL ? 1 : 0;
-	for (i = 0; i < parent_entries; ++i) {
+	for (deleted_entries = 1, i = 0; i < parent_entries; ++i) {
 		next_ref = pindex->index[i];
 		WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
 		if (next_ref->state == WT_REF_DELETED &&
@@ -966,7 +965,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
 	 */
 	if (result_entries == 0) {
 		next_ref = pindex->index[0];
-		WT_ASSERT(session, next_ref->state == WT_REF_SPLIT);
+		WT_ASSERT(session, next_ref->state == WT_REF_SPLIT ||
+		    (next_ref == ref && ref->state == WT_REF_LOCKED));
 		next_ref->state = WT_REF_DELETED;
 		--deleted_entries;
 		result_entries = 1;
@@ -1051,9 +1051,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
 
 	WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
 	    "%s split into parent %" PRIu32 " -> %" PRIu32
-	    " (%" PRIu32 ")",
-	    __wt_page_type_string(ref->page->type), parent_entries,
-	    result_entries, result_entries - parent_entries));
+	    " (%" PRIu32 ")", ref->page == NULL ?
+	    "reverse" : __wt_page_type_string(ref->page->type),
+	    parent_entries, result_entries, result_entries - parent_entries));
 
 	/*
 	 * The new page index is in place, free the WT_REF we were splitting
@@ -1454,23 +1454,18 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
 
 /*
  * __wt_split_reverse --
- *	Lock, then reverse split an internal page (remove deleted refs).
+ *	We have a locked ref that is empty and we want to rewrite the index in
+ *	its parent.
  */
 int
 __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
 {
 	WT_DECL_RET;
 	WT_PAGE *parent;
-	WT_REF dummy_child;
 	bool hazard;
 
-	WT_CLEAR(dummy_child);
-	dummy_child.home = dummy_child.page = ref->page;
-	dummy_child.state = WT_REF_MEM;
-
-	WT_RET(__split_parent_lock(session, &dummy_child, &parent, &hazard));
-	WT_ASSERT(session, parent == ref->page);
-	ret = __split_parent(session, &dummy_child, NULL, 0, 0, 0);
+	WT_RET(__split_parent_lock(session, ref, &parent, &hazard));
+	ret = __split_parent(session, ref, NULL, 0, 0, 0);
 	WT_TRET(__split_parent_unlock(session, parent, hazard));
 	return (ret);
 }
diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c
index bcc5b86ecc2..4cf3840ba8e 100644
--- a/src/evict/evict_file.c
+++ b/src/evict/evict_file.c
@@ -18,8 +18,11 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 	WT_DECL_RET;
 	WT_PAGE *page;
 	WT_REF *next_ref, *ref;
+	WT_TXN *txn;
 	bool evict_reset;
 
+	txn = &session->txn;
+
 	/*
 	 * We need exclusive access to the file -- disable ordinary eviction
 	 * and drain any blocks already queued.
@@ -29,6 +32,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 	/* Make sure the oldest transaction ID is up-to-date. */
 	__wt_txn_update_oldest(session, true);
 
+	if (txn->isolation == WT_ISO_READ_COMMITTED)
+		__wt_txn_get_snapshot(session);
+
 	/* Walk the tree, discarding pages. */
 	next_ref = NULL;
 	WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
@@ -59,6 +65,10 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
 			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));
 
+		/* Update our snapshot for each new page. */
+		if (txn->isolation == WT_ISO_READ_COMMITTED)
+			__wt_txn_get_snapshot(session);
+
 		/*
 		 * We can't evict the page just returned to us (it marks our
 		 * place in the tree), so move the walk to one page ahead of
@@ -81,7 +91,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 		case WT_SYNC_DISCARD:
 			WT_ASSERT(session,
 			    __wt_page_can_evict(session, page, 0, NULL));
-			WT_ERR(__wt_evict_page_clean_update(session, ref));
+			WT_ERR(
+			    __wt_evict_page_clean_update(session, ref, true));
 			break;
 		case WT_SYNC_DISCARD_FORCE:
 			/*
@@ -97,7 +108,7 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop)
 			}
 
 			F_SET(session, WT_SESSION_DISCARD_FORCE);
-			ret = __wt_evict_page_clean_update(session, ref);
+			ret = __wt_evict_page_clean_update(session, ref, true);
 			F_CLR(session, WT_SESSION_DISCARD_FORCE);
 			WT_ERR(ret);
 			break;
diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c
index 641122b119f..9de66922931 100644
--- a/src/evict/evict_page.c
+++ b/src/evict/evict_page.c
@@ -110,7 +110,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
 		if (__wt_ref_is_root(ref))
 			__wt_ref_out(session, ref);
 		else
-			WT_ERR(__wt_evict_page_clean_update(session, ref));
+			WT_ERR(__wt_evict_page_clean_update(
+			    session, ref, closing));
 
 		WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
 		WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
@@ -142,25 +143,47 @@ done: if ((inmem_split || (forced_eviction && ret == EBUSY)) &&
 	return (ret);
 }
 /*
- * __evict_reverse_split_check --
- *	Check if an internal page needs a reverse split.
+ * __evict_delete_ref --
+ *	Mark a page reference deleted and check if the parent can reverse
+ *	split.
  */
 static int
-__evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
+__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
 {
+	WT_DECL_RET;
 	WT_PAGE *parent;
 	WT_PAGE_INDEX *pindex;
-	uint32_t deleted_entries;
+	uint32_t ndeleted;
 
 	if (__wt_ref_is_root(ref))
 		return (0);
 
-	parent = ref->home;
-	WT_INTL_INDEX_GET(session, parent, pindex);
-	deleted_entries = __wt_atomic_addv32(&pindex->deleted_entries, 1);
-	if (deleted_entries > pindex->entries / 10)
-		WT_RET(__wt_split_reverse(session, parent->pg_intl_parent_ref));
+	/*
+	 * Avoid doing reverse splits when closing the file, it is
+	 * wasted work and some structure may already have been freed.
+	 */
+	if (!closing) {
+		parent = ref->home;
+		WT_INTL_INDEX_GET(session, parent, pindex);
+		ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1);
+
+		/*
+		 * If more than 10% of the parent references are deleted, try a
+		 * reverse split. Don't bother if there is a single deleted
+		 * reference: the internal page is empty and we have to wait
+		 * for eviction to notice.
+		 *
+		 * This will consume the deleted ref (and eventually free it).
+		 * If the reverse split can't get the access it needs because
+		 * something is busy, be sure that the page still ends up
+		 * marked deleted.
+		 */
+		if (ndeleted > pindex->entries / 10 && pindex->entries > 1 &&
+		    (ret = __wt_split_reverse(session, ref)) != EBUSY)
+			return (ret);
+	}
 
+	WT_PUBLISH(ref->state, WT_REF_DELETED);
 	return (0);
 }
 
@@ -169,7 +192,8 @@ __evict_reverse_split_check(WT_SESSION_IMPL *session, WT_REF *ref)
  *	Update a clean page's reference on eviction.
  */
 int
-__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
+__wt_evict_page_clean_update(
+	WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
 {
 	WT_DECL_RET;
 
@@ -180,9 +204,8 @@ __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref)
 	 */
 	__wt_ref_out(session, ref);
 	if (ref->addr == NULL) {
-		WT_PUBLISH(ref->state, WT_REF_DELETED);
 		WT_WITH_PAGE_INDEX(session,
-		    ret = __evict_reverse_split_check(session, ref));
+		    ret = __evict_delete_ref(session, ref, closing));
 		WT_RET_BUSY_OK(ret);
 	} else
 		WT_PUBLISH(ref->state, WT_REF_DISK);
@@ -226,9 +249,8 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing)
 		 */
 		__wt_ref_out(session, ref);
 		ref->addr = NULL;
-		WT_PUBLISH(ref->state, WT_REF_DELETED);
 		WT_WITH_PAGE_INDEX(session,
-		    ret = __evict_reverse_split_check(session, ref));
+		    ret = __evict_delete_ref(session, ref, closing));
 		WT_RET_BUSY_OK(ret);
 		break;
 	case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
diff --git a/src/include/extern.h b/src/include/extern.h
index 44e5658bf28..845102ca428 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -316,7 +316,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, bool is_server);
 extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full);
 extern void __wt_cache_dump(WT_SESSION_IMPL *session);
 extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
-extern int __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref);
+extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing);
 extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn);
 extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn);
 extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp);