author     Michael Cahill <michael.cahill@mongodb.com>  2015-05-22 17:40:26 +1000
committer  Michael Cahill <michael.cahill@mongodb.com>  2015-10-22 11:23:37 +1100
commit     cb642366f168caadd56bed3c257e4d3e4c5cc4f0 (patch)
tree       1a5600cc3dbb61c89d941f25f7749cef122ec05b
parent     01dbcf110a971b8155ad76ff3b2c8b528c1d9b73 (diff)
download   mongo-cb642366f168caadd56bed3c257e4d3e4c5cc4f0.tar.gz
SERVER-21063 Avoid creating deep trees for append-only workloads.
Merge pull request #1988 from wiredtiger/split-deepen-for-append (cherry picked from commit a98417879da9eacefecd74242fd3924b46e31183)
-rw-r--r--  src/btree/bt_split.c    155
-rw-r--r--  src/include/btmem.h      19
2 files changed, 115 insertions, 59 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 12f86925849..739db727fb5 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -167,15 +167,12 @@ __split_safe_free(WT_SESSION_IMPL *session,
* Return if we should deepen the tree.
*/
static bool
-__split_should_deepen(
- WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp)
+__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_BTREE *btree;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
- *childrenp = 0;
-
btree = S2BT(session);
page = ref->page;
pindex = WT_INTL_INDEX_GET_SAFE(page);
@@ -193,10 +190,8 @@ __split_should_deepen(
* we get a significant payback (in the case of a set of large keys,
* splitting won't help).
*/
- if (pindex->entries > btree->split_deepen_min_child) {
- *childrenp = pindex->entries / btree->split_deepen_per_child;
+ if (pindex->entries > btree->split_deepen_min_child)
return (true);
- }
/*
* Don't allow a single page to put pressure on cache usage. The root
@@ -207,10 +202,8 @@ __split_should_deepen(
*/
if (pindex->entries >= 100 &&
(__wt_ref_is_root(ref) ||
- page->memory_footprint >= S2C(session)->cache_size / 4)) {
- *childrenp = pindex->entries / 10;
+ page->memory_footprint >= S2C(session)->cache_size / 4))
return (true);
- }
return (false);
}
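
The predicate now answers only the yes/no question; the number of children is computed later, inside __split_deepen(). For illustration, a minimal standalone sketch of the same two tests, using hypothetical stand-in constants for the btree and connection fields this diff does not show:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for btree->split_deepen_min_child and conn->cache_size. */
    #define DEEPEN_MIN_CHILD  10000
    #define CACHE_SIZE        ((uint64_t)1 << 30)   /* 1GB, illustration only */

    static bool
    should_deepen(uint32_t entries, uint64_t footprint, bool is_root)
    {
        /* Enough entries that splitting the index pays for itself. */
        if (entries > DEEPEN_MIN_CHILD)
            return (true);

        /* Don't let a root or very large page put pressure on the cache. */
        if (entries >= 100 && (is_root || footprint >= CACHE_SIZE / 4))
            return (true);

        return (false);
    }
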
@@ -377,8 +370,9 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
* Split an internal page in-memory, deepening the tree.
*/
static int
-__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
+__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent)
{
+ WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *child;
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
@@ -386,63 +380,91 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref;
size_t child_incr, parent_decr, parent_incr, size;
uint64_t split_gen;
- uint32_t chunk, i, j, remain, slots;
+ uint32_t children, chunk, i, j, moved_entries, new_entries, remain;
+ uint32_t skip_leading, slots;
bool panic;
void *p;
+ WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
+ WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
+
+ btree = S2BT(session);
alloc_index = NULL;
parent_incr = parent_decr = 0;
panic = false;
pindex = WT_INTL_INDEX_GET_SAFE(parent);
- WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
- WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
- WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
- "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
- parent, pindex->entries, children));
-
/*
- * If the workload is prepending/appending to the tree, we could deepen
- * without bound. Don't let that happen, keep the first/last pages of
- * the tree at their current level.
+ * A prepending/appending workload will repeatedly deepen parts of the
+ * tree that aren't changing, and appending workloads are not uncommon.
+ * First, keep the first/last pages of the tree at their current level,
+ * to catch simple workloads. Second, track the number of entries which
+ * resulted from the last time we deepened this page, and if we refilled
+ * this page without splitting into those slots, ignore them for this
+ * split. It's not exact because an eviction might split into any part
+ * of the page: if 80% of the splits are at the end of the page, assume
+ * an append-style workload. Of course, the plan eventually fails: when
+ * repeatedly deepening this page for an append-only workload, we will
+ * progressively ignore more and more of the slots. When ignoring 90% of
+ * the slots, deepen the entire page again.
*
- * XXX
- * To improve this, we could track which pages were last merged into
- * this page by eviction, and leave those pages alone, to prevent any
- * sustained insert into the tree from deepening a single location.
+ * Figure out how many slots we're leaving at this level and how many
+ * child pages we're creating.
+ */
+#undef skip_trailing
+#define skip_trailing 1
+ skip_leading = 1;
+ new_entries = pindex->entries - parent->pg_intl_deepen_split_last;
+ if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10)
+ skip_leading = parent->pg_intl_deepen_split_last;
+ if (skip_leading > (pindex->entries * 9) / 10)
+ skip_leading = 1;
+
+ /*
+ * In a few (rare) cases we split pages with only a few entries, and in
+ * those cases we keep it simple, 10 children, skip only first and last
+ * entries. Otherwise, split into a lot of child pages.
*/
-#undef SPLIT_CORRECT_1
-#define SPLIT_CORRECT_1 1 /* First page correction */
-#undef SPLIT_CORRECT_2
-#define SPLIT_CORRECT_2 2 /* First/last page correction */
+ moved_entries = pindex->entries - (skip_leading + skip_trailing);
+ children = moved_entries / btree->split_deepen_per_child;
+ if (children < 10) {
+ children = 10;
+ skip_leading = 1;
+ moved_entries =
+ pindex->entries - (skip_leading + skip_trailing);
+ }
+
+ WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
+ "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children",
+ parent, pindex->entries, children));
/*
- * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
- * the first/last slots of the allocated WT_PAGE_INDEX to point to the
- * first/last pages we're keeping at the current level, and the rest of
- * the slots to point to new WT_REF objects.
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize
+ * the slots of the allocated WT_PAGE_INDEX to point to the pages we're
+ * keeping at the current level, and the rest of the slots to point to
+ * new WT_REF objects.
*/
size = sizeof(WT_PAGE_INDEX) +
- (children + SPLIT_CORRECT_2) * sizeof(WT_REF *);
+ (children + skip_leading + skip_trailing) * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
- alloc_index->entries = children + SPLIT_CORRECT_2;
- alloc_index->index[0] = pindex->index[0];
+ alloc_index->entries = children + skip_leading + skip_trailing;
+ for (alloc_refp = alloc_index->index,
+ i = 0; i < skip_leading; ++alloc_refp, ++i)
+ alloc_index->index[i] = pindex->index[i];
+ for (i = 0; i < children; ++alloc_refp, ++i)
+ WT_ERR(__wt_calloc_one(session, alloc_refp));
+ parent_incr += children * sizeof(WT_REF);
alloc_index->index[alloc_index->entries - 1] =
pindex->index[pindex->entries - 1];
- for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
- i = 0; i < children; ++alloc_refp, ++i) {
- WT_ERR(__wt_calloc_one(session, alloc_refp));
- parent_incr += sizeof(WT_REF);
- }
/* Allocate child pages, and connect them into the new page index. */
- chunk = (pindex->entries - SPLIT_CORRECT_2) / children;
- remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1);
- for (parent_refp = pindex->index + SPLIT_CORRECT_1,
- alloc_refp = alloc_index->index + SPLIT_CORRECT_1,
+ chunk = moved_entries / children;
+ remain = moved_entries - chunk * (children - 1);
+ for (parent_refp = pindex->index + skip_leading,
+ alloc_refp = alloc_index->index + skip_leading,
i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
WT_ERR(__wt_page_alloc(
@@ -500,10 +522,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
}
__wt_cache_page_inmem_incr(session, child, child_incr);
}
- WT_ASSERT(session, alloc_refp -
- alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1);
WT_ASSERT(session,
- parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1);
+ alloc_refp - alloc_index->index ==
+ alloc_index->entries - skip_trailing);
+ WT_ASSERT(session,
+ parent_refp - pindex->index == pindex->entries - skip_trailing);
/*
* Update the parent's index; this is the update which splits the page,
@@ -527,6 +550,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children)
#ifdef HAVE_DIAGNOSTIC
__split_verify_intl_key_order(session, parent);
#endif
+ /*
+ * Save the number of entries created by deepening the tree and reset
+ * the count of splits into this page after that point.
+ */
+ parent->pg_intl_deepen_split_append = 0;
+ parent->pg_intl_deepen_split_last = alloc_index->entries;
/*
* The moved reference structures now reference the wrong parent page,
@@ -889,8 +918,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
WT_REF **alloc_refp, *next_ref, *parent_ref;
size_t parent_decr, size;
uint64_t split_gen;
- uint32_t i, j;
- uint32_t children, deleted_entries, parent_entries, result_entries;
+ uint32_t deleted_entries, i, j, parent_entries, result_entries;
bool complete;
parent = ref->home;
@@ -926,8 +954,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
}
/*
- * The final entry count consists of: The original count, plus any
- * new pages, less any refs we are removing.
+ * The final entry count consists of the original count, plus any new
+ * pages, less any WT_REFs we're removing.
*/
result_entries = (parent_entries + new_entries) - deleted_entries;
@@ -943,7 +971,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
alloc_index->entries = result_entries;
for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
- if (next_ref == ref)
+ if (next_ref == ref) {
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
*alloc_refp++ = ref_new[j];
@@ -955,7 +983,22 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
*/
ref_new[j] = NULL;
}
- else if (next_ref->state != WT_REF_SPLIT)
+
+ /*
+ * We detect append-style workloads to avoid repeatedly
+ * deepening parts of the tree where no work is being
+ * done by tracking if we're splitting after the slots
+ * created by the last split to deepen this parent.
+ *
+ * Note the calculation: i is a 0-based array offset and
+ * split-last is a count of entries, also either or both
+ * i and split-last might be unsigned 0, don't decrement
+ * either one.
+ */
+ if (i > parent->pg_intl_deepen_split_last)
+ parent->
+ pg_intl_deepen_split_append += new_entries;
+ } else if (next_ref->state != WT_REF_SPLIT)
/* Skip refs we have marked for deletion. */
*alloc_refp++ = next_ref;
}
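
The two new counters work as a pair: __split_deepen() (above) records how many slots the deepening produced in pg_intl_deepen_split_last and clears pg_intl_deepen_split_append, while this loop in __split_parent() bumps the append counter whenever a later split lands past those original slots. A rough, self-contained simulation of that life cycle, with invented helper names and numbers chosen only for illustration:

    #include <assert.h>
    #include <stdint.h>

    /* Hypothetical counters mirroring pg_intl_deepen_split_last/_append. */
    struct page_counters {
        uint32_t deepen_split_last;    /* index entries created by the last deepen */
        uint32_t deepen_split_append;  /* refs since added past those slots */
    };

    /* After deepening: remember the new index size, reset the append count. */
    static void
    after_deepen(struct page_counters *pc, uint32_t new_index_entries)
    {
        pc->deepen_split_append = 0;
        pc->deepen_split_last = new_index_entries;
    }

    /* After a later split into slot "slot" that adds "new_entries" refs. */
    static void
    after_split(struct page_counters *pc, uint32_t slot, uint32_t new_entries)
    {
        if (slot > pc->deepen_split_last)
            pc->deepen_split_append += new_entries;
    }

    int
    main(void)
    {
        struct page_counters pc;
        uint32_t entries, i;

        after_deepen(&pc, 100);        /* deepening created 100 slots */

        /* Append-only workload: every split replaces the last ref with two. */
        for (entries = 100, i = 0; i < 900; ++i) {
            after_split(&pc, entries - 1, 2);
            ++entries;
        }

        /* The 80% test in __split_deepen() now sees an append-style page. */
        assert(pc.deepen_split_append >
            ((entries - pc.deepen_split_last) * 8) / 10);
        return (0);
    }
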
@@ -1086,8 +1129,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref,
* are holding it locked.
*/
if (ret == 0 && !closing &&
- __split_should_deepen(session, parent_ref, &children))
- ret = __split_deepen(session, parent, children);
+ __split_should_deepen(session, parent_ref))
+ ret = __split_deepen(session, parent);
err: if (!complete)
for (i = 0; i < parent_entries; ++i) {
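
To make the arithmetic in __split_deepen() above concrete, here is a small self-contained program tracing the skip_leading/children calculation and the chunk/remain partition with made-up numbers; none of the values (the entry count, the per-page counters, or split_deepen_per_child) come from this commit:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Hypothetical page state. */
        uint32_t entries = 10000;       /* pindex->entries */
        uint32_t split_last = 1000;     /* pg_intl_deepen_split_last */
        uint32_t split_append = 7500;   /* pg_intl_deepen_split_append */
        uint32_t per_child = 100;       /* btree->split_deepen_per_child */
        uint32_t skip_leading, skip_trailing, new_entries;
        uint32_t moved_entries, children, chunk, remain;

        /* How many leading slots stay at this level? */
        skip_trailing = 1;
        skip_leading = 1;
        new_entries = entries - split_last;          /* 9000 */
        if (split_append > (new_entries * 8) / 10)   /* 7500 > 7200: append-style */
            skip_leading = split_last;               /* skip the 1000 old slots */
        if (skip_leading > (entries * 9) / 10)       /* 1000 > 9000? no */
            skip_leading = 1;

        /* Split the remaining slots among the new children. */
        moved_entries = entries - (skip_leading + skip_trailing);  /* 8999 */
        children = moved_entries / per_child;                      /* 89 */
        assert(children >= 10);      /* otherwise fall back to 10 children */

        /* Each child gets "chunk" slots; the last child absorbs the remainder. */
        chunk = moved_entries / children;                 /* 101 */
        remain = moved_entries - chunk * (children - 1);  /* 111 */
        assert(chunk * (children - 1) + remain == moved_entries);
        return (0);
    }
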
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 9910cad0ae7..c5d29bc8106 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -375,9 +375,8 @@ struct __wt_page {
/*
* Internal pages (both column- and row-store).
*
- * The page record number is only used by column-store, but it
- * makes some things simpler and it doesn't cost us any memory,
- * other structures in this union are still as large.
+ * The page record number is only used by column-store, but it's
+ * simpler having only one kind of internal page.
*
* In-memory internal pages have an array of pointers to child
* structures, maintained in collated order. When a page is
@@ -411,10 +410,24 @@ struct __wt_page {
uint32_t entries;
WT_REF **index;
} * volatile __index; /* Collated children */
+
+ /*
+ * When splitting to deepen the tree, track the number
+ * of entries in the newly created parent, and how many
+ * subsequent splits follow the initial set of entries.
+ * If future splits into the page are generally after
+ * the initial set of items, perform future deepening
+ * splits in this page to optimize for an append-style
+ * workload.
+ */
+ uint32_t deepen_split_append;
+ uint32_t deepen_split_last;
} intl;
#undef pg_intl_recno
#define pg_intl_recno u.intl.recno
#define pg_intl_parent_ref u.intl.parent_ref
+#define pg_intl_deepen_split_append u.intl.deepen_split_append
+#define pg_intl_deepen_split_last u.intl.deepen_split_last
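
The new defines follow the header's existing convention: the union members are reached only through pg_intl_* accessor macros. A toy version of the same pattern, with invented names, showing how the two counters end up being addressed from code such as the bt_split.c hunks above:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy page structure using the same union + accessor-macro pattern. */
    struct toy_page {
        union {
            struct {
                uint32_t deepen_split_append;
                uint32_t deepen_split_last;
            } intl;
        } u;
    };
    #define toy_intl_deepen_split_append  u.intl.deepen_split_append
    #define toy_intl_deepen_split_last    u.intl.deepen_split_last

    int
    main(void)
    {
        struct toy_page page;

        /* Reads like parent->pg_intl_deepen_split_last in bt_split.c. */
        page.toy_intl_deepen_split_append = 0;
        page.toy_intl_deepen_split_last = 12;
        page.toy_intl_deepen_split_append += 3;
        printf("%" PRIu32 " %" PRIu32 "\n",
            page.toy_intl_deepen_split_append, page.toy_intl_deepen_split_last);
        return (0);
    }
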
/*
* Macros to copy/set the index because the name is obscured to ensure