diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-13 17:31:39 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2015-11-13 17:31:39 +1100 |
commit | c0a6993ff9a10fcf21434a395a6ab543e5f38881 (patch) | |
tree | 69f1594f0a69567621622dbca704898134fe3966 | |
parent | eb905b18f7987dc2e67bc532895013f66ca723e8 (diff) | |
download | mongo-c0a6993ff9a10fcf21434a395a6ab543e5f38881.tar.gz |
Import wiredtiger-wiredtiger-mongodb-3.0.7-3-gcb64236.tar.gz from wiredtiger branch mongodb-3.0
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 155 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/config/config.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 19 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btree.i | 32 |
4 files changed, 143 insertions, 70 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 12f86925849..739db727fb5 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -167,15 +167,12 @@ __split_safe_free(WT_SESSION_IMPL *session, * Return if we should deepen the tree. */ static bool -__split_should_deepen( - WT_SESSION_IMPL *session, WT_REF *ref, uint32_t *childrenp) +__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; WT_PAGE_INDEX *pindex; - *childrenp = 0; - btree = S2BT(session); page = ref->page; pindex = WT_INTL_INDEX_GET_SAFE(page); @@ -193,10 +190,8 @@ __split_should_deepen( * we get a significant payback (in the case of a set of large keys, * splitting won't help). */ - if (pindex->entries > btree->split_deepen_min_child) { - *childrenp = pindex->entries / btree->split_deepen_per_child; + if (pindex->entries > btree->split_deepen_min_child) return (true); - } /* * Don't allow a single page to put pressure on cache usage. The root @@ -207,10 +202,8 @@ __split_should_deepen( */ if (pindex->entries >= 100 && (__wt_ref_is_root(ref) || - page->memory_footprint >= S2C(session)->cache_size / 4)) { - *childrenp = pindex->entries / 10; + page->memory_footprint >= S2C(session)->cache_size / 4)) return (true); - } return (false); } @@ -377,8 +370,9 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) * Split an internal page in-memory, deepening the tree. 
*/ static int -__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) +__split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *child; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; @@ -386,63 +380,91 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) WT_REF *child_ref, **child_refp, *parent_ref, **parent_refp, *ref; size_t child_incr, parent_decr, parent_incr, size; uint64_t split_gen; - uint32_t chunk, i, j, remain, slots; + uint32_t children, chunk, i, j, moved_entries, new_entries, remain; + uint32_t skip_leading, slots; bool panic; void *p; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); + WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); + + btree = S2BT(session); alloc_index = NULL; parent_incr = parent_decr = 0; panic = false; pindex = WT_INTL_INDEX_GET_SAFE(parent); - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); - WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); - WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, - "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", - parent, pindex->entries, children)); - /* - * If the workload is prepending/appending to the tree, we could deepen - * without bound. Don't let that happen, keep the first/last pages of - * the tree at their current level. + * A prepending/appending workload will repeatedly deepen parts of the + * tree that aren't changing, and appending workloads are not uncommon. + * First, keep the first/last pages of the tree at their current level, + * to catch simple workloads. Second, track the number of entries which + * resulted from the last time we deepened this page, and if we refilled + * this page without splitting into those slots, ignore them for this + * split. It's not exact because an eviction might split into any part + * of the page: if 80% of the splits are at the end of the page, assume + * an append-style workload. 
Of course, the plan eventually fails: when + repeatedly deepening this page for an append-only workload, we will + progressively ignore more and more of the slots. When ignoring 90% of + the slots, deepen the entire page again. * - * XXX - * To improve this, we could track which pages were last merged into - * this page by eviction, and leave those pages alone, to prevent any - * sustained insert into the tree from deepening a single location. + * Figure out how many slots we're leaving at this level and how many + * child pages we're creating. + */ +#undef skip_trailing +#define skip_trailing 1 + skip_leading = 1; + new_entries = pindex->entries - parent->pg_intl_deepen_split_last; + if (parent->pg_intl_deepen_split_append > (new_entries * 8) / 10) + skip_leading = parent->pg_intl_deepen_split_last; + if (skip_leading > (pindex->entries * 9) / 10) + skip_leading = 1; + + /* + * In a few (rare) cases we split pages with only a few entries, and in + * those cases we keep it simple, 10 children, skip only first and last + * entries. Otherwise, split into a lot of child pages. */ -#undef SPLIT_CORRECT_1 -#define SPLIT_CORRECT_1 1 /* First page correction */ -#undef SPLIT_CORRECT_2 -#define SPLIT_CORRECT_2 2 /* First/last page correction */ + moved_entries = pindex->entries - (skip_leading + skip_trailing); + children = moved_entries / btree->split_deepen_per_child; + if (children < 10) { + children = 10; + skip_leading = 1; + moved_entries = + pindex->entries - (skip_leading + skip_trailing); + } + + WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, + "%p: %" PRIu32 " elements, splitting into %" PRIu32 " children", + parent, pindex->entries, children)); /* - * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize - * the first/last slots of the allocated WT_PAGE_INDEX to point to the - * first/last pages we're keeping at the current level, and the rest of - * the slots to point to new WT_REF objects. 
+ * Allocate a new WT_PAGE_INDEX and set of WT_REF objects. Initialize + * the slots of the allocated WT_PAGE_INDEX to point to the pages we're + * keeping at the current level, and the rest of the slots to point to + * new WT_REF objects. */ size = sizeof(WT_PAGE_INDEX) + - (children + SPLIT_CORRECT_2) * sizeof(WT_REF *); + (children + skip_leading + skip_trailing) * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); - alloc_index->entries = children + SPLIT_CORRECT_2; - alloc_index->index[0] = pindex->index[0]; + alloc_index->entries = children + skip_leading + skip_trailing; + for (alloc_refp = alloc_index->index, + i = 0; i < skip_leading; ++alloc_refp, ++i) + alloc_index->index[i] = pindex->index[i]; + for (i = 0; i < children; ++alloc_refp, ++i) + WT_ERR(__wt_calloc_one(session, alloc_refp)); + parent_incr += children * sizeof(WT_REF); alloc_index->index[alloc_index->entries - 1] = pindex->index[pindex->entries - 1]; - for (alloc_refp = alloc_index->index + SPLIT_CORRECT_1, - i = 0; i < children; ++alloc_refp, ++i) { - WT_ERR(__wt_calloc_one(session, alloc_refp)); - parent_incr += sizeof(WT_REF); - } /* Allocate child pages, and connect them into the new page index. */ - chunk = (pindex->entries - SPLIT_CORRECT_2) / children; - remain = (pindex->entries - SPLIT_CORRECT_2) - chunk * (children - 1); - for (parent_refp = pindex->index + SPLIT_CORRECT_1, - alloc_refp = alloc_index->index + SPLIT_CORRECT_1, + chunk = moved_entries / children; + remain = moved_entries - chunk * (children - 1); + for (parent_refp = pindex->index + skip_leading, + alloc_refp = alloc_index->index + skip_leading, i = 0; i < children; ++i) { slots = i == children - 1 ? 
remain : chunk; WT_ERR(__wt_page_alloc( @@ -500,10 +522,11 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) } __wt_cache_page_inmem_incr(session, child, child_incr); } - WT_ASSERT(session, alloc_refp - - alloc_index->index == alloc_index->entries - SPLIT_CORRECT_1); WT_ASSERT(session, - parent_refp - pindex->index == pindex->entries - SPLIT_CORRECT_1); + alloc_refp - alloc_index->index == + alloc_index->entries - skip_trailing); + WT_ASSERT(session, + parent_refp - pindex->index == pindex->entries - skip_trailing); /* * Update the parent's index; this is the update which splits the page, @@ -527,6 +550,12 @@ __split_deepen(WT_SESSION_IMPL *session, WT_PAGE *parent, uint32_t children) #ifdef HAVE_DIAGNOSTIC __split_verify_intl_key_order(session, parent); #endif + /* + * Save the number of entries created by deepening the tree and reset + * the count of splits into this page after that point. + */ + parent->pg_intl_deepen_split_append = 0; + parent->pg_intl_deepen_split_last = alloc_index->entries; /* * The moved reference structures now reference the wrong parent page, @@ -889,8 +918,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **alloc_refp, *next_ref, *parent_ref; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; - uint32_t children, deleted_entries, parent_entries, result_entries; + uint32_t deleted_entries, i, j, parent_entries, result_entries; bool complete; parent = ref->home; @@ -926,8 +954,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, } /* - * The final entry count consists of: The original count, plus any - * new pages, less any refs we are removing. + * The final entry count consists of the original count, plus any new + * pages, less any WT_REFs we're removing. 
*/ result_entries = (parent_entries + new_entries) - deleted_entries; @@ -943,7 +971,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, alloc_index->entries = result_entries; for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; - if (next_ref == ref) + if (next_ref == ref) { for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; *alloc_refp++ = ref_new[j]; @@ -955,7 +983,22 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ ref_new[j] = NULL; } - else if (next_ref->state != WT_REF_SPLIT) + + /* + * We detect append-style workloads to avoid repeatedly + * deepening parts of the tree where no work is being + * done by tracking if we're splitting after the slots + * created by the last split to deepen this parent. + * + * Note the calculation: i is a 0-based array offset and + * split-last is a count of entries, also either or both + * i and split-last might be unsigned 0, don't decrement + * either one. + */ + if (i > parent->pg_intl_deepen_split_last) + parent-> + pg_intl_deepen_split_append += new_entries; + } else if (next_ref->state != WT_REF_SPLIT) /* Skip refs we have marked for deletion. */ *alloc_refp++ = next_ref; } @@ -1086,8 +1129,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, * are holding it locked. 
*/ if (ret == 0 && !closing && - __split_should_deepen(session, parent_ref, &children)) - ret = __split_deepen(session, parent, children); + __split_should_deepen(session, parent_ref)) + ret = __split_deepen(session, parent); err: if (!complete) for (i = 0; i < parent_entries; ++i) { diff --git a/src/third_party/wiredtiger/src/config/config.c b/src/third_party/wiredtiger/src/config/config.c index 92f917915e1..300b1923b9e 100644 --- a/src/third_party/wiredtiger/src/config/config.c +++ b/src/third_party/wiredtiger/src/config/config.c @@ -735,11 +735,16 @@ __wt_config_gets_def(WT_SESSION_IMPL *session, *value = false_value; value->val = def; + if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL) return (0); - else if (cfg[2] == NULL) + + if (cfg[2] == NULL) { WT_RET_NOTFOUND_OK( __wt_config_getones(session, cfg[1], key, value)); + return (0); + } + return (__wt_config_gets(session, cfg, key, value)); } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 9910cad0ae7..c5d29bc8106 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -375,9 +375,8 @@ struct __wt_page { /* * Internal pages (both column- and row-store). * - * The page record number is only used by column-store, but it - * makes some things simpler and it doesn't cost us any memory, - * other structures in this union are still as large. + * The page record number is only used by column-store, but it's + * simpler having only one kind of internal page. * * In-memory internal pages have an array of pointers to child * structures, maintained in collated order. When a page is @@ -411,10 +410,24 @@ struct __wt_page { uint32_t entries; WT_REF **index; } * volatile __index; /* Collated children */ + + /* + * When splitting to deepen the tree, track the number + * of entries in the newly created parent, and how many + * subsequent splits follow the initial set of entries. 
+ * If future splits into the page are generally after + * the initial set of items, perform future deepening + * splits in this page to optimize for an append-style + * workload. + */ + uint32_t deepen_split_append; + uint32_t deepen_split_last; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref +#define pg_intl_deepen_split_append u.intl.deepen_split_append +#define pg_intl_deepen_split_last u.intl.deepen_split_last /* * Macros to copy/set the index because the name is obscured to ensure diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index dad695f8628..edddcdd6fe4 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -946,7 +946,8 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) WT_BTREE *btree; WT_INSERT_HEAD *ins_head; WT_INSERT *ins; - int i; + size_t size; + int count; btree = S2BT(session); @@ -976,22 +977,33 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * There is no point splitting if the list is small, no deep items is - * our heuristic for that. A 1/4 probability of adding a new skiplist - * level, with level-0 always created, means there will be a 5th level - * entry for roughly every 1024 entries in the list. If there are at - * least 4 5th level entries (4K items), the list is large enough. + * There is no point doing an in-memory split unless there is a lot of + * data in the last skiplist on the page. Split if there are enough + * items and the skiplist does not fit within a single disk page. + * + * Rather than scanning the whole list, walk a higher level, which + * gives a sample of the items -- at level 0 we have all the items, at + * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more + * than 30 items and more data than would fit in a disk page, split. 
*/ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_DEPTH 2 +#define WT_MIN_SPLIT_COUNT 30 +#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ + ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); if (ins_head == NULL) return (false); - for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; - ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - if (++i == 4) + for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) { + count += WT_MIN_SPLIT_MULTIPLIER; + size += WT_MIN_SPLIT_MULTIPLIER * + (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd)); + if (count > WT_MIN_SPLIT_COUNT && + size > (size_t)btree->maxleafpage) return (true); + } return (false); } |