diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-27 14:27:33 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-27 14:27:33 +1100 |
commit | deb2d8109ca59cc9e223fd4f5be19915b949c628 (patch) | |
tree | b587570148a970aa2dbed1fc798d1f780568864b | |
parent | cb642366f168caadd56bed3c257e4d3e4c5cc4f0 (diff) | |
parent | 66a111ec48da60195a011e2a163bef07f5035bb0 (diff) | |
download | mongo-deb2d8109ca59cc9e223fd4f5be19915b949c628.tar.gz |
Merge pull request #2330 from wiredtiger/reverse-splits-3.0_2
SERVER-21027 Reverse split if there are many deleted pages (3.0)
-rw-r--r-- | dist/flags.py | 1 | ||||
-rw-r--r-- | src/btree/bt_delete.c | 8 | ||||
-rw-r--r-- | src/btree/bt_page.c | 6 | ||||
-rw-r--r-- | src/btree/bt_split.c | 52 | ||||
-rw-r--r-- | src/btree/bt_walk.c | 14 | ||||
-rw-r--r-- | src/evict/evict_file.c | 6 | ||||
-rw-r--r-- | src/evict/evict_page.c | 69 | ||||
-rw-r--r-- | src/include/btmem.h | 1 | ||||
-rw-r--r-- | src/include/btree.i | 12 | ||||
-rw-r--r-- | src/include/extern.h | 5 | ||||
-rw-r--r-- | src/include/flags.h | 15 | ||||
-rw-r--r-- | src/include/txn.i | 7 | ||||
-rw-r--r-- | src/reconcile/rec_write.c | 1 |
13 files changed, 156 insertions, 41 deletions
diff --git a/dist/flags.py b/dist/flags.py index 394154fcb75..bdd4bf45b6f 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -36,6 +36,7 @@ flags = { 'page_read' : [ 'READ_CACHE', 'READ_COMPACT', + 'READ_NO_EMPTY', 'READ_NO_EVICT', 'READ_NO_GEN', 'READ_NO_WAIT', diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c index 1da2923489c..7313e31267f 100644 --- a/src/btree/bt_delete.c +++ b/src/btree/bt_delete.c @@ -214,10 +214,11 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) /* * __wt_delete_page_skip -- - * If iterating a cursor, skip deleted pages that are visible to us. + * If iterating a cursor, skip deleted pages that are either visible to + * us or globally visible. */ bool -__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) +__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) { bool skip; @@ -245,7 +246,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) if (!__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED)) return (false); - skip = (ref->page_del == NULL || + skip = ref->page_del == NULL || (visible_all ? + __wt_txn_visible_all(session, ref->page_del->txnid) : __wt_txn_visible(session, ref->page_del->txnid)); WT_PUBLISH(ref->state, WT_REF_DELETED); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 41cc8f9398c..ad8f0293108 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -76,8 +76,12 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags for (force_attempts = 0, oldgen = false, wait_cnt = 0;;) { switch (ref->state) { - case WT_REF_DISK: case WT_REF_DELETED: + if (LF_ISSET(WT_READ_NO_EMPTY) && + __wt_delete_page_skip(session, ref, false)) + return (WT_NOTFOUND); + /* FALLTHROUGH */ + case WT_REF_DISK: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 739db727fb5..6f31ff89aa7 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -943,11 +943,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * reading thread will restart. Include the ref we are splitting in * the count to be deleted. */ - for (i = 0, deleted_entries = 1; i < parent_entries; ++i) { + for (deleted_entries = 1, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); if (next_ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, next_ref) && + __wt_delete_page_skip(session, next_ref, true) && __wt_atomic_casv32( &next_ref->state, WT_REF_DELETED, WT_REF_SPLIT)) deleted_entries++; @@ -960,6 +960,16 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, result_entries = (parent_entries + new_entries) - deleted_entries; /* + * If the entire (sub)tree is empty, give up: we can't leave an empty + * internal page. Mark it to be evicted soon and clean up any + * references that have changed state. + */ + if (result_entries == 0) { + __wt_page_evict_soon(parent); + goto err; + } + + /* * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. @@ -1003,6 +1013,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, *alloc_refp++ = next_ref; } + /* Check that we filled in all the entries. */ + WT_ASSERT(session, alloc_refp - alloc_index->index == result_entries); + /* * Update the parent page's index: this update makes the split visible * to threads descending the tree. @@ -1038,9 +1051,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%s split into parent %" PRIu32 " -> %" PRIu32 - " (%" PRIu32 ")", - __wt_page_type_string(ref->page->type), parent_entries, - result_entries, result_entries - parent_entries)); + " (%" PRIu32 ")", ref->page == NULL ? + "reverse" : __wt_page_type_string(ref->page->type), + parent_entries, result_entries, result_entries - parent_entries)); /* * The new page index is in place, free the WT_REF we were splitting @@ -1132,14 +1145,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, __split_should_deepen(session, parent_ref)) ret = __split_deepen(session, parent); -err: if (!complete) +err: if (!complete) { for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - __wt_free_ref_index(session, NULL, alloc_index, false); + /* If we gave up on a reverse split, unlock the child. */ + if (ref_new == NULL) { + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + ref->state = WT_REF_DELETED; + } + + __wt_free_ref_index(session, NULL, alloc_index, false); + } /* * A note on error handling: if we completed the split, return success, @@ -1440,6 +1460,24 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } /* + * __wt_split_reverse -- + * We have a locked ref that is empty and we want to rewrite the index in + * its parent. + */ +int +__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + bool hazard; + + WT_RET(__split_parent_lock(session, ref, &parent, &hazard)); + ret = __split_parent(session, ref, NULL, 0, 0, 0); + WT_TRET(__split_parent_unlock(session, parent, hazard)); + return (ret); +} + +/* * __wt_split_rewrite -- * Rewrite an in-memory page with a new version. */ diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c index 6e1d182ed0b..8e0f4036b79 100644 --- a/src/btree/bt_walk.c +++ b/src/btree/bt_walk.c @@ -94,6 +94,9 @@ __wt_tree_walk(WT_SESSION_IMPL *session, */ WT_ENTER_PAGE_INDEX(session); + /* Walk should never instantiate deleted pages. */ + LF_SET(WT_READ_NO_EMPTY); + /* * !!! * Fast-truncate currently only works on row-store trees. @@ -174,9 +177,10 @@ ascend: /* /* * If we got all the way through an internal page and - * all of the child pages were deleted, evict it. + * all of the child pages were deleted, mark it for + * eviction. */ - if (empty_internal) { + if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } @@ -257,7 +261,7 @@ ascend: /* * to delete it again. */ if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page @@ -294,7 +298,7 @@ ascend: /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && - __wt_delete_page_skip(session, ref)) + __wt_delete_page_skip(session, ref, false)) break; } @@ -302,7 +306,7 @@ ascend: /* /* * Not-found is an expected return when only walking - * in-cache pages. + * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; diff --git a/src/evict/evict_file.c b/src/evict/evict_file.c index c5e04806062..ed0ffb5b262 100644 --- a/src/evict/evict_file.c +++ b/src/evict/evict_file.c @@ -81,7 +81,8 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) case WT_SYNC_DISCARD: WT_ASSERT(session, __wt_page_can_evict(session, page, 0, NULL)); - __wt_evict_page_clean_update(session, ref); + WT_ERR( + __wt_evict_page_clean_update(session, ref, true)); break; case WT_SYNC_DISCARD_FORCE: /* @@ -97,8 +98,9 @@ __wt_evict_file(WT_SESSION_IMPL *session, int syncop) } F_SET(session, WT_SESSION_DISCARD_FORCE); - __wt_evict_page_clean_update(session, ref); + ret = __wt_evict_page_clean_update(session, ref, true); F_CLR(session, WT_SESSION_DISCARD_FORCE); + WT_ERR(ret); break; WT_ILLEGAL_VALUE_ERR(session); } diff --git a/src/evict/evict_page.c b/src/evict/evict_page.c index f4693511e11..9de66922931 100644 --- a/src/evict/evict_page.c +++ b/src/evict/evict_page.c @@ -110,7 +110,8 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else - __wt_evict_page_clean_update(session, ref); + WT_ERR(__wt_evict_page_clean_update( + session, ref, closing)); WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean); WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean); @@ -141,22 +142,75 @@ done: if ((inmem_split || (forced_eviction && ret == EBUSY)) && return (ret); } +/* + * __evict_delete_ref -- + * Mark a page reference deleted and check if the parent can reverse + * split. + */ +static int +__evict_delete_ref(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) +{ + WT_DECL_RET; + WT_PAGE *parent; + WT_PAGE_INDEX *pindex; + uint32_t ndeleted; + + if (__wt_ref_is_root(ref)) + return (0); + + /* + * Avoid doing reverse splits when closing the file, it is + * wasted work and some structure may already have been freed. + */ + if (!closing) { + parent = ref->home; + WT_INTL_INDEX_GET(session, parent, pindex); + ndeleted = __wt_atomic_addv32(&pindex->deleted_entries, 1); + + /* + * If more than 10% of the parent references are deleted, try a + * reverse split. Don't bother if there is a single deleted + * reference: the internal page is empty and we have to wait + * for eviction to notice. + * + * This will consume the deleted ref (and eventually free it). + * If the reverse split can't get the access it needs because + * something is busy, be sure that the page still ends up + * marked deleted. + */ + if (ndeleted > pindex->entries / 10 && pindex->entries > 1 && + (ret = __wt_split_reverse(session, ref)) != EBUSY) + return (ret); + } + + WT_PUBLISH(ref->state, WT_REF_DELETED); + return (0); +} /* * __wt_evict_page_clean_update -- * Update a clean page's reference on eviction. */ -void -__wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref) +int +__wt_evict_page_clean_update( + WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { + WT_DECL_RET; + /* * Discard the page and update the reference structure; if the page has * an address, it's a disk page; if it has no address, it's a deleted * page re-instantiated (for example, by searching) and never written. */ __wt_ref_out(session, ref); - WT_PUBLISH(ref->state, - ref->addr == NULL ? WT_REF_DELETED : WT_REF_DISK); + if (ref->addr == NULL) { + WT_WITH_PAGE_INDEX(session, + ret = __evict_delete_ref(session, ref, closing)); + WT_RET_BUSY_OK(ret); + } else + WT_PUBLISH(ref->state, WT_REF_DISK); + + return (0); } /* @@ -167,6 +221,7 @@ static int __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) { WT_ADDR *addr; + WT_DECL_RET; WT_PAGE *parent; WT_PAGE_MODIFY *mod; @@ -194,7 +249,9 @@ __evict_page_dirty_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) */ __wt_ref_out(session, ref); ref->addr = NULL; - WT_PUBLISH(ref->state, WT_REF_DELETED); + WT_WITH_PAGE_INDEX(session, + ret = __evict_delete_ref(session, ref, closing)); + WT_RET_BUSY_OK(ret); break; case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */ /* diff --git a/src/include/btmem.h b/src/include/btmem.h index c5d29bc8106..fb497f64963 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -408,6 +408,7 @@ struct __wt_page { struct __wt_page_index { uint32_t entries; + uint32_t deleted_entries; WT_REF **index; } * volatile __index; /* Collated children */ diff --git a/src/include/btree.i b/src/include/btree.i index edddcdd6fe4..1c416c99e13 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1003,7 +1003,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) if (count > WT_MIN_SPLIT_COUNT && size > (size_t)btree->maxleafpage) return (true); - } + } return (false); } @@ -1208,13 +1208,9 @@ __wt_page_swap_func(WT_SESSION_IMPL *session, WT_REF *held, #endif ); - /* An expected failure: WT_NOTFOUND when doing a cache-only read. */ - if (LF_ISSET(WT_READ_CACHE) && ret == WT_NOTFOUND) - return (WT_NOTFOUND); - - /* An expected failure: WT_RESTART */ - if (ret == WT_RESTART) - return (WT_RESTART); + /* Expected failures: page not found or restart. */ + if (ret == WT_NOTFOUND || ret == WT_RESTART) + return (ret); /* Discard the original held page. */ acquired = ret == 0; diff --git a/src/include/extern.h b/src/include/extern.h index 4b341a6adaa..845102ca428 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -114,7 +114,7 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_PAGE *page, const char * extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp); extern void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref); -extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); @@ -153,6 +153,7 @@ extern void __wt_split_stash_discard(WT_SESSION_IMPL *session); extern void __wt_split_stash_discard_all( WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session); extern int __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi, WT_REF **refp, size_t *incrp); extern int __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing); extern int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst); @@ -315,7 +316,7 @@ extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, bool is_server); extern int __wt_cache_wait(WT_SESSION_IMPL *session, int full); extern void __wt_cache_dump(WT_SESSION_IMPL *session); extern int __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing); -extern void __wt_evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref); +extern int __wt_evict_page_clean_update( WT_SESSION_IMPL *session, WT_REF *ref, bool closing); extern int __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn); extern int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn); extern int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp); diff --git a/src/include/flags.h b/src/include/flags.h index 71fc54f9eac..aad44c22184 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -32,13 +32,14 @@ #define WT_LOG_FSYNC 0x00000004 #define WT_READ_CACHE 0x00000001 #define WT_READ_COMPACT 0x00000002 -#define WT_READ_NO_EVICT 0x00000004 -#define WT_READ_NO_GEN 0x00000008 -#define WT_READ_NO_WAIT 0x00000010 -#define WT_READ_PREV 0x00000020 -#define WT_READ_SKIP_INTL 0x00000040 -#define WT_READ_TRUNCATE 0x00000080 -#define WT_READ_WONT_NEED 0x00000100 +#define WT_READ_NO_EMPTY 0x00000004 +#define WT_READ_NO_EVICT 0x00000008 +#define WT_READ_NO_GEN 0x00000010 +#define WT_READ_NO_WAIT 0x00000020 +#define WT_READ_PREV 0x00000040 +#define WT_READ_SKIP_INTL 0x00000080 +#define WT_READ_TRUNCATE 0x00000100 +#define WT_READ_WONT_NEED 0x00000200 #define WT_SESSION_CAN_WAIT 0x00000001 #define WT_SESSION_CLEAR_EVICT_WALK 0x00000002 #define WT_SESSION_DISCARD_FORCE 0x00000004 diff --git a/src/include/txn.i b/src/include/txn.i index 3152ff6bdd5..73d7f1f0518 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -187,6 +187,13 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id) session->dhandle == session->meta_dhandle) return (true); + /* + * If we don't have a transactional snapshot, only make stable updates + * visible. + */ + if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) + return (__wt_txn_visible_all(session, id)); + /* Transactions see their own changes. */ if (id == txn->id) return (true); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 988b7e0a84f..73b7f4968e9 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -653,6 +653,7 @@ __rec_write_init(WT_SESSION_IMPL *session, r->flags = flags; /* Track if the page can be marked clean. */ + r->max_txn = WT_TXN_NONE; r->leave_dirty = false; /* Raw compression. */ |