summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Cahill <michael.cahill@wiredtiger.com>2013-03-15 20:27:17 +1100
committerMichael Cahill <michael.cahill@wiredtiger.com>2013-03-15 20:27:17 +1100
commit9279d3832a679e73782165150b5ebbdf28052a54 (patch)
treed754d24464f6b5dfe7a607cc47278e5e1ec80dcc
parentb0577fdb370468ae872e40ad2da1ac8522d1abd0 (diff)
downloadmongo-9279d3832a679e73782165150b5ebbdf28052a54.tar.gz
Tweak merge to build better trees with random insert workloads.
-rw-r--r--src/btree/bt_evict.c51
-rw-r--r--src/btree/rec_evict.c5
-rw-r--r--src/btree/rec_merge.c94
-rw-r--r--src/include/btmem.h5
-rw-r--r--src/include/btree.i15
5 files changed, 121 insertions, 49 deletions
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index e0f8521887d..7f219364352 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -59,8 +59,7 @@ __evict_read_gen(const WT_EVICT_ENTRY *entry)
* that make reads faster, so don't make them too unlikely.
*/
if ((page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) &&
- (page->modify == NULL ||
- !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE)))
+ !__wt_btree_mergeable(page))
read_gen += WT_EVICT_INT_SKEW;
return (read_gen);
@@ -185,12 +184,11 @@ __wt_evict_forced_page(WT_SESSION_IMPL *session, WT_PAGE *page)
* stack instead.
*/
for (top = page, levels = 0;
- !WT_PAGE_IS_ROOT(top) && top->parent->modify != NULL &&
- F_ISSET(top->parent->modify, WT_PM_REC_SPLIT_MERGE);
- ++levels)
- top = top->parent;
+ __wt_btree_mergeable(top->parent);
+ top = top->parent, ++levels)
+ ;
- if (levels > WT_MERGE_STACK_MIN)
+ if (levels >= WT_MERGE_STACK_MIN)
page = top;
/*
@@ -1004,15 +1002,21 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
WT_CSTAT_INCR(session, cache_eviction_walk);
-#define WT_IS_SPLIT_MERGE(p) \
- ((p)->modify != NULL && F_ISSET((p)->modify, WT_PM_REC_SPLIT_MERGE))
+ /* Ignore root pages entirely. */
+ if (WT_PAGE_IS_ROOT(page))
+ continue;
/* Look for a split-merge (grand)parent page to merge. */
- for (levels = 0;
- levels < WT_MERGE_STACK_MIN && page != NULL &&
- WT_IS_SPLIT_MERGE(page);
- page = page->parent, levels++)
- ;
+ levels = 0;
+ if (__wt_btree_mergeable(page))
+ for (levels = 1;
+ levels < WT_MERGE_STACK_MIN &&
+ __wt_btree_mergeable(page->parent);
+ page = page->parent, levels++)
+ ;
+ else if (page->modify != NULL &&
+ F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
+ continue;
/*
* Only look for a parent at exactly the right height above: if
@@ -1020,19 +1024,14 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
* don't want to do too much work on every level.
*
* !!!
- * We don't restrict ourselves to only the top-most page. If
- * there are split-merge pages under the root page in a big,
- * busy tree, the merge will only happen if we can lock the
- * whole tree exclusively. Consider subtrees if locking the
+ * Don't restrict ourselves to only the top-most page (that is,
+ * don't require that page->parent is not mergeable). If there
+ * is a big, busy enough split-merge tree, the top-level merge
+ * will only happen if we can lock the whole subtree
+ * exclusively. Consider smaller merges in case locking the
* whole tree fails.
*/
- if (page == NULL ||
- (levels != 0 && levels != WT_MERGE_STACK_MIN) ||
- (levels == WT_MERGE_STACK_MIN && !WT_IS_SPLIT_MERGE(page)))
- continue;
-
- /* Ignore root pages entirely. */
- if (WT_PAGE_IS_ROOT(page))
+ if (levels != 0 && levels != WT_MERGE_STACK_MIN)
continue;
/*
@@ -1077,7 +1076,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp, int clean)
* If the oldest transaction hasn't changed since the last time
* this page was written, there's no chance to make progress...
*/
- if (modified &&
+ if (modified && levels == 0 &&
TXNID_LE(oldest_txn, page->modify->disk_txn))
continue;
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index b4f17de0b23..317328afe06 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -37,10 +37,13 @@ __wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
* it. During close, it will be merged into its parent.
*/
mod = page->modify;
- merge = (mod != NULL && F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));
+ merge = __wt_btree_mergeable(page);
if (merge && exclusive)
return (EBUSY);
+ WT_ASSERT(session, merge || mod == NULL ||
+ !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));
+
/*
* Get exclusive access to the page and review the page and its subtree
* for conditions that would block our eviction of the page. If the
diff --git a/src/btree/rec_merge.c b/src/btree/rec_merge.c
index 9034223032f..fb92a194924 100644
--- a/src/btree/rec_merge.c
+++ b/src/btree/rec_merge.c
@@ -43,12 +43,6 @@ __merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth,
WT_REF_FOREACH(page, ref, i)
switch (ref->state) {
case WT_REF_LOCKED:
- /*
- * Don't allow eviction until it is properly hooked
- * into the tree.
- */
- __wt_evict_list_clr_page(session, ref->page);
-
child = ref->page;
/*
@@ -57,8 +51,7 @@ __merge_walk(WT_SESSION_IMPL *session, WT_PAGE *page, u_int depth,
* have to unlock everything.
*/
if (child->type == page->type &&
- child->modify != NULL &&
- F_ISSET(child->modify, WT_PM_REC_SPLIT_MERGE)) {
+ __wt_btree_mergeable(child)) {
WT_RET(__merge_walk(
session, child, depth + 1, visit, state));
break;
@@ -138,6 +131,33 @@ __merge_unlock(WT_PAGE *page)
}
}
+#ifdef HAVE_DIAGNOSTIC
+/*
+ * __merge_check_discard --
+ * Make sure we are only discarding split-merge pages.
+ */
+static void
+__merge_check_discard(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_REF *ref;
+ uint32_t i;
+
+ WT_ASSERT(session, page->type == WT_PAGE_ROW_INT ||
+ page->type == WT_PAGE_COL_INT);
+ WT_ASSERT(session, page->modify != NULL &&
+ F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE));
+
+ WT_REF_FOREACH(page, ref, i) {
+ if (ref->state == WT_REF_DISK ||
+ ref->state == WT_REF_DELETED)
+ continue;
+
+ WT_ASSERT(session, ref->state == WT_REF_LOCKED);
+ __merge_check_discard(session, ref->page);
+ }
+}
+#endif
+
/*
* __merge_switch_page --
* Switch a page from the locked tree into the new tree.
@@ -272,14 +292,14 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
lchild = newtop = rchild = NULL;
page_type = top->type;
- WT_ASSERT(session, !WT_PAGE_IS_ROOT(top));
+ WT_ASSERT(session, __wt_btree_mergeable(top));
WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);
/*
* Walk the subtree, count the references at the bottom level and
* calculate the maximum depth.
*/
- WT_RET(__merge_walk(session, top, 0, __merge_count, &visit_state));
+ WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));
/* If there aren't enough useful levels, give up. */
if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
@@ -304,13 +324,9 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
* isn't big enough to justify the cost of evicting it. If splits
* continue, it will be merged again until it gets over this limit.
*/
+ promote = 0;
refcnt = (uint32_t)visit_state.refcnt;
- promote = (refcnt > 100);
-
- if (promote) {
- /* Create a new top-level split-merge page with two entries. */
- WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));
-
+ if (refcnt >= WT_MERGE_FULL_PAGE) {
/*
* In the normal case where there are live children spread
* through the subtree, create two child pages.
@@ -326,10 +342,26 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
if (visit_state.first_live == visit_state.last_live &&
(visit_state.first_live == 0 ||
visit_state.first_live == refcnt - 1))
- visit_state.split = split =
- (visit_state.first_live == 0) ? 1 : refcnt - 1;
+ split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
else
- visit_state.split = split = (refcnt + 1) / 2;
+ split = (refcnt + 1) / 2;
+
+ /* Only promote if we can create a real page. */
+ if (split == 1 || split == refcnt - 1)
+ promote = 1;
+ else if (split >= WT_MERGE_FULL_PAGE &&
+ visit_state.first_live >= split)
+ promote = 1;
+ else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
+ visit_state.last_live < split)
+ promote = 1;
+ }
+
+ if (promote) {
+ /* Create a new top-level split-merge page with two entries. */
+ WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));
+
+ visit_state.split = split;
/* Left split. */
if (split == 1)
@@ -353,8 +385,18 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
&visit_state.second->u.intl.t[0];
}
} else {
- WT_ERR(__merge_new_page(
- session, page_type, refcnt, 1, &newtop));
+ /*
+ * Create a new split-merge page for small merges, or if the
+ * page above is a split merge page. When we do a big enough
+ * merge, we create a real page at the top and don't consider
+ * it as a merge candidate again. Over time with an insert
+ * workload the tree will grow deeper, but that's inevitable,
+ * and this keeps individual merges small.
+ */
+ WT_ERR(__merge_new_page(session, page_type, refcnt,
+ refcnt < WT_MERGE_FULL_PAGE ||
+ __wt_btree_mergeable(top->parent),
+ &newtop));
visit_state.first = newtop;
}
@@ -407,12 +449,20 @@ __wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
newtop->parent = top->parent;
newtop->ref = top->ref;
+#ifdef HAVE_DIAGNOSTIC
+ /*
+ * Before swapping in the new page, make sure we're going to just discard
+ * a set of split-merge pages.
+ */
+ __merge_check_discard(session, top);
+#endif
/*
* Set up the new top-level page as a split so that it will be swapped
* into place by our caller.
*/
- top->modify->u.split = newtop;
top->modify->flags = WT_PM_REC_SPLIT;
+ top->modify->u.split = newtop;
+
WT_VERBOSE_ERR(session, evict,
"Successfully %s %" PRIu32
diff --git a/src/include/btmem.h b/src/include/btmem.h
index c3e9e702a49..d0ee033202e 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -452,8 +452,13 @@ struct __wt_ref {
* WT_MERGE_STACK_MIN --
* When stacks of in-memory pages become this deep, they are considered for
* merging.
+ *
+ * WT_MERGE_FULL_PAGE --
+ * When the result of a merge contains more than this number of keys, it is
+ * considered "done" and will not be merged again.
*/
#define WT_MERGE_STACK_MIN 3
+#define WT_MERGE_FULL_PAGE 100
/*
* WT_ROW --
diff --git a/src/include/btree.i b/src/include/btree.i
index 11ab5c4ad81..52769d837dc 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -579,3 +579,18 @@ __wt_btree_lex_compare(const WT_ITEM *user_item, const WT_ITEM *tree_item)
(((cmp) = __wt_btree_lex_compare((k1), (k2))), 0) : \
(bt)->collator->compare((bt)->collator, &(s)->iface, \
(k1), (k2), &(cmp)))
+
+/*
+ * __wt_btree_mergeable --
+ * Determines whether the given page is a candidate for merging.
+ */
+static inline int
+__wt_btree_mergeable(WT_PAGE *page)
+{
+ if (WT_PAGE_IS_ROOT(page) ||
+ page->modify == NULL ||
+ !F_ISSET(page->modify, WT_PM_REC_SPLIT_MERGE))
+ return (0);
+
+ return (!WT_PAGE_IS_ROOT(page->parent));
+}