diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-03-15 20:27:17 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2013-03-15 20:27:17 +1100 |
commit | 9279d3832a679e73782165150b5ebbdf28052a54 (patch) | |
tree | d754d24464f6b5dfe7a607cc47278e5e1ec80dcc | |
parent | b0577fdb370468ae872e40ad2da1ac8522d1abd0 (diff) | |
download | mongo-9279d3832a679e73782165150b5ebbdf28052a54.tar.gz |
Tweak merge to build better trees with random insert workloads.
-rw-r--r-- | src/btree/bt_evict.c | 51 | ||||
-rw-r--r-- | src/btree/rec_evict.c | 5 | ||||
-rw-r--r-- | src/btree/rec_merge.c | 94 | ||||
-rw-r--r-- | src/include/btmem.h | 5 | ||||
-rw-r--r-- | src/include/btree.i | 15 |
5 files changed, 121 insertions, 49 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index e0f8521887d..7f219364352 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -59,8 +59,7 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry) * that make reads faster, so don't make them too unlikely. */ if ((page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) && - (page->modify == NULL || - !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))) + !__wt_btree_mergeable(page)) read_gen += WT_EVICT_INT_SKEW; return (read_gen); @@ -185,12 +184,11 @@ __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page) * stack instead. */ for (top = page, levels = 0; - !WT_PAGE_IS_ROOT(top) && top->parent->modify != NULL && - F_ISSET(top->parent->modify, WT_PM_REC_SPLIT_MERGE); - ++levels) - top = top->parent; + __wt_btree_mergeable(top->parent); + top = top->parent, ++levels) + ; - if (levels > WT_MERGE_STACK_MIN) + if (levels >= WT_MERGE_STACK_MIN) page = top; /* @@ -1004,15 +1002,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) WT_CSTAT_INCR(session, cache_eviction_walk); -#define WT_IS_SPLIT_MERGE(p) \ - ((p)->modify != NULL && F_ISSET((p)->modify, WT_PM_REC_SPLIT_MERGE)) + /* Ignore root pages entirely. */ + if (WT_PAGE_IS_ROOT(page)) + continue; /* Look for a split-merge (grand)parent page to merge. */ - for (levels = 0; - levels < WT_MERGE_STACK_MIN && page != NULL && - WT_IS_SPLIT_MERGE(page); - page = page->parent, levels++) - ; + levels = 0; + if (__wt_btree_mergeable(page)) + for (levels = 1; + levels < WT_MERGE_STACK_MIN && + __wt_btree_mergeable(page->parent); + page = page->parent, levels++) + ; + else if (page->modify != NULL && + F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) + continue; /* * Only look for a parent at exactly the right height above: if @@ -1020,19 +1024,14 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) * don't want to do too much work on every level. * * !!! - * We don't restrict ourselves to only the top-most page. If - * there are split-merge pages under the root page in a big, - * busy tree, the merge will only happen if we can lock the - * whole tree exclusively. Consider subtrees if locking the + * Don't restrict ourselves to only the top-most page (that is, + * don't require that page->parent is not mergeable). If there + * is a big, busy enough split-merge tree, the top-level merge + * will only happen if we can lock the whole subtree + * exclusively. Consider smaller merges in case locking the * whole tree fails. */ - if (page == NULL || - (levels != 0 && levels != WT_MERGE_STACK_MIN) || - (levels == WT_MERGE_STACK_MIN && !WT_IS_SPLIT_MERGE(page))) - continue; - - /* Ignore root pages entirely. */ - if (WT_PAGE_IS_ROOT(page)) + if (levels != 0 && levels != WT_MERGE_STACK_MIN) continue; /* @@ -1077,7 +1076,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean) * If the oldest transaction hasn't changed since the last time * this page was written, there's no chance to make progress... */ - if (modified && + if (modified && levels == 0 && TXNID_LE(oldest_txn, page->modify->disk_txn)) continue; diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c index b4f17de0b23..317328afe06 100644 --- a/src/btree/rec_evict.c +++ b/src/btree/rec_evict.c @@ -37,10 +37,13 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive) * it. During close, it will be merged into its parent. */ mod = page->modify; - merge = (mod != NULL && F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)); + merge = __wt_btree_mergeable(page); if (merge && exclusive) return (EBUSY); + WT_ASSERT(session, merge || mod == NULL || + !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)); + /* * Get exclusive access to the page and review the page and its subtree * for conditions that would block our eviction of the page. If the diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c index 9034223032f..fb92a194924 100644 --- a/src/btree/rec_merge.c +++ b/src/btree/rec_merge.c @@ -43,12 +43,6 @@ __merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth, WT_REF_FOREACH(page, ref, i) switch (ref->state) { case WT_REF_LOCKED: - /* - * Don't allow eviction until it is properly hooked - * into the tree. - */ - __wt_evict_list_clr_page(session, ref->page); - child = ref->page; /* @@ -57,8 +51,7 @@ __merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth, * have to unlock everything. */ if (child->type == page->type && - child->modify != NULL && - F_ISSET(child->modify, WT_PM_REC_SPLIT_MERGE)) { + __wt_btree_mergeable(child)) { WT_RET(__merge_walk( session, child, depth + 1, visit, state)); break; @@ -138,6 +131,33 @@ __merge_unlock(WT_PAGE *page) } } +#ifdef HAVE_DIAGNOSTIC +/* + * __merge_check_discard -- + * Make sure we are only discarding split-merge pages. + */ +static void +__merge_check_discard(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_REF *ref; + uint32_t i; + + WT_ASSERT(session, page->type == WT_PAGE_ROW_INT || + page->type == WT_PAGE_COL_INT); + WT_ASSERT(session, page->modify != NULL && + F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)); + + WT_REF_FOREACH(page, ref, i) { + if (ref->state == WT_REF_DISK || + ref->state == WT_REF_DELETED) + continue; + + WT_ASSERT(session, ref->state == WT_REF_LOCKED); + __merge_check_discard(session, ref->page); + } +} +#endif + /* * __merge_switch_page -- * Switch a page from the locked tree into the new tree. @@ -272,14 +292,14 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) lchild = newtop = rchild = NULL; page_type = top->type; - WT_ASSERT(session, !WT_PAGE_IS_ROOT(top)); + WT_ASSERT(session, __wt_btree_mergeable(top)); WT_ASSERT(session, top->ref->state == WT_REF_LOCKED); /* * Walk the subtree, count the references at the bottom level and * calculate the maximum depth. */ - WT_RET(__merge_walk(session, top, 0, __merge_count, &visit_state)); + WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state)); /* If there aren't enough useful levels, give up. */ if (visit_state.maxdepth < WT_MERGE_STACK_MIN) @@ -304,13 +324,9 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) * isn't big enough to justify the cost of evicting it. If splits * continue, it will be merged again until it gets over this limit. */ + promote = 0; refcnt = (uint32_t)visit_state.refcnt; - promote = (refcnt > 100); - - if (promote) { - /* Create a new top-level split-merge page with two entries. */ - WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop)); - + if (refcnt >= WT_MERGE_FULL_PAGE) { /* * In the normal case where there are live children spread * through the subtree, create two child pages. @@ -326,10 +342,26 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) if (visit_state.first_live == visit_state.last_live && (visit_state.first_live == 0 || visit_state.first_live == refcnt - 1)) - visit_state.split = split = - (visit_state.first_live == 0) ? 1 : refcnt - 1; + split = (visit_state.first_live == 0) ? 1 : refcnt - 1; else - visit_state.split = split = (refcnt + 1) / 2; + split = (refcnt + 1) / 2; + + /* Only promote if we can create a real page. */ + if (split == 1 || split == refcnt - 1) + promote = 1; + else if (split >= WT_MERGE_FULL_PAGE && + visit_state.first_live >= split) + promote = 1; + else if (refcnt - split >= WT_MERGE_FULL_PAGE && + visit_state.last_live < split) + promote = 1; + } + + if (promote) { + /* Create a new top-level split-merge page with two entries. */ + WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop)); + + visit_state.split = split; /* Left split. */ if (split == 1) @@ -353,8 +385,18 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) &visit_state.second->u.intl.t[0]; } } else { - WT_ERR(__merge_new_page( - session, page_type, refcnt, 1, &newtop)); + /* + * Create a new split-merge page for small merges, or if the + * page above is a split merge page. When we do a big enough + * merge, we create a real page at the top and don't consider + * it as a merge candidate again. Over time with an insert + * workload the tree will grow deeper, but that's inevitable, + * and this keeps individual merges small. + */ + WT_ERR(__merge_new_page(session, page_type, refcnt, + refcnt < WT_MERGE_FULL_PAGE || + __wt_btree_mergeable(top->parent), + &newtop)); visit_state.first = newtop; } @@ -407,12 +449,20 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top) newtop->parent = top->parent; newtop->ref = top->ref; +#ifdef HAVE_DIAGNOSTIC + /* + * Before swapping in the new page, make sure we're going to just discard + * a set of split-merge pages. + */ + __merge_check_discard(session, top); +#endif /* * Set up the new top-level page as a split so that it will be swapped * into place by our caller. */ - top->modify->u.split = newtop; top->modify->flags = WT_PM_REC_SPLIT; + top->modify->u.split = newtop; + WT_VERBOSE_ERR(session, evict, "Successfully %s %" PRIu32 diff --git a/src/include/btmem.h b/src/include/btmem.h index c3e9e702a49..d0ee033202e 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -452,8 +452,13 @@ struct __wt_ref { * WT_MERGE_STACK_MIN -- * When stacks of in-memory pages become this deep, they are considered for * merging. + * + * WT_MERGE_FULL_PAGE -- + * When the result of a merge contains more than this number of keys, it is + * considered "done" and will not be merged again. */ #define WT_MERGE_STACK_MIN 3 +#define WT_MERGE_FULL_PAGE 100 /* * WT_ROW -- diff --git a/src/include/btree.i b/src/include/btree.i index 11ab5c4ad81..52769d837dc 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -579,3 +579,18 @@ __wt_btree_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item) (((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \ (bt)->collator->compare((bt)->collator, &(s)->iface, \ (k1), (k2), &(cmp))) + +/* + * __wt_btree_mergeable -- + * Determines whether the given page is a candidate for merging. + */ +static inline int +__wt_btree_mergeable(WT_PAGE *page) +{ + if (WT_PAGE_IS_ROOT(page) || + page->modify == NULL || + !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)) + return (0); + + return (!WT_PAGE_IS_ROOT(page->parent)); +} |