diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_split.c')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 166 |
1 files changed, 111 insertions, 55 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 92be2125b88..355bc09871b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -440,17 +440,49 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* + * __split_ref_final -- + * Finalize the WT_REF move. + */ +static void +__split_ref_final(WT_SESSION_IMPL *session, WT_PAGE ***lockedp) +{ + WT_PAGE **locked; + size_t i; + + /* The parent page's page index has been updated. */ + WT_WRITE_BARRIER(); + + if ((locked = *lockedp) == NULL) + return; + *lockedp = NULL; + + /* + * The moved child pages are locked to prevent them from splitting + * before the parent move completes, unlock them as the final step. + */ + for (i = 0; locked[i] != NULL; ++i) + WT_PAGE_UNLOCK(session, locked[i]); + __wt_free(session, locked); +} + +/* * __split_ref_prepare -- * Prepare a set of WT_REFs for a move. */ -static void +static int __split_ref_prepare(WT_SESSION_IMPL *session, - WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) + WT_PAGE_INDEX *pindex, WT_PAGE ***lockedp, bool skip_first) { - WT_PAGE *child; + WT_DECL_RET; + WT_PAGE *child, **locked; WT_REF *child_ref, *ref; + size_t alloc, cnt; uint32_t i, j; + *lockedp = NULL; + + locked = NULL; + /* The newly created subtree is complete. */ WT_WRITE_BARRIER(); @@ -462,51 +494,44 @@ __split_ref_prepare(WT_SESSION_IMPL *session, * ascend into the created children, but eventually fail as that parent * page won't yet know about the created children pages. That's OK, we * spin there until the parent's page index is updated. - */ + * + * Lock the newly created page to ensure none of its children can split. + * First, to ensure all of the child pages are updated before any pages + * can split. Second, to ensure the original split completes before any + * of the children can split. The latter involves split generations: + * the original split page has references to these children. If they + * split immediately, they could free WT_REF structures based on split + * generations earlier than the split generation we'll eventually choose + * to protect the original split page's previous page index. + */ + alloc = cnt = 0; for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { ref = pindex->index[i]; child = ref->page; - /* - * Block eviction in newly created pages. - * - * Once the split is live, newly created internal pages might be - * evicted and their WT_REF structures freed. If that happened - * before all threads exit the index of the page that previously - * "owned" the WT_REF, a thread might see a freed WT_REF. To - * ensure that doesn't happen, the newly created page contains - * the current split generation and can't be evicted until - * all readers have left the old generation. - * - * Historic, we also blocked splits in newly created pages - * because we didn't update the WT_REF.home field until after - * the split was live, so the WT_REF.home fields being updated - * could split again before the update, there's a race between - * splits as to which would update them first. The current code - * updates the WT_REF.home fields before going live (in this - * function), this isn't an issue. - */ - child->pg_intl_split_gen = split_gen; + WT_PAGE_LOCK(session, child); - /* - * We use a page flag to prevent the child from splitting from - * underneath us, but the split-generation error checks don't - * know about that flag; use the standard macros to ensure that - * reading the child's page index structure is safe. - */ + /* Track the locked pages for cleanup. */ + WT_ERR(__wt_realloc_def(session, &alloc, cnt + 2, &locked)); + locked[cnt++] = child; + + /* Switch the WT_REF's to their new page. */ j = 0; - WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { child_ref->home = child; child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, child)); #endif } + *lockedp = locked; + return (0); + +err: __split_ref_final(session, &locked); + return (ret); } /* @@ -518,10 +543,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *child; + WT_PAGE *child, **locked; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; - WT_REF **alloc_refp; - WT_REF **child_refp, *ref, **root_refp; + WT_REF **alloc_refp, **child_refp, *ref, **root_refp; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; @@ -536,11 +560,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; + locked = NULL; root_decr = root_incr = 0; complete = WT_ERR_RETURN; - /* The root page will be marked dirty, make sure that will succeed. */ + /* Mark the root page dirty. */ WT_RET(__wt_page_modify_init(session, root)); + __wt_page_modify_set(session, root); /* * Our caller is holding the root page locked to single-thread splits, @@ -583,6 +609,17 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_calloc_one(session, alloc_refp)); root_incr += children * sizeof(WT_REF); + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page that previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * created pages are set to the current split generation and so can't be + * evicted until all readers have left the old generation. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + WT_ASSERT(session, root->pg_intl_split_gen < split_gen); + /* Allocate child pages, and connect them into the new page index. */ for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { @@ -609,6 +646,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Initialize the child page. */ child->pg_intl_parent_ref = ref; + child->pg_intl_split_gen = split_gen; /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); @@ -640,12 +678,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* - * Prepare the WT_REFs for the move: this requires a stable split - * generation to block splits in newly created pages, so get one. - */ + /* Prepare the WT_REFs for the move. */ WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, session->split_gen, false); + WT_ERR(__split_ref_prepare(session, alloc_index, &locked, false)); /* * Confirm the root page's index hasn't moved, then update it, which @@ -665,6 +700,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); root->pg_intl_split_gen = split_gen; + /* Finalize the WT_REF move. */ + __split_ref_final(session, &locked); + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, ret = __split_verify_root(session, root)); @@ -688,12 +726,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); root_decr += size; - /* Adjust the root's memory footprint and mark it dirty. */ + /* Adjust the root's memory footprint. */ __wt_cache_page_inmem_incr(session, root, root_incr); __wt_cache_page_inmem_decr(session, root, root_decr); - __wt_page_modify_set(session, root); -err: switch (complete) { +err: __split_ref_final(session, &locked); + + switch (complete) { case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; @@ -743,8 +782,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, empty_parent = false; complete = WT_ERR_RETURN; - /* The parent page will be marked dirty, make sure that will succeed. */ + /* Mark the page dirty. */ WT_RET(__wt_page_modify_init(session, parent)); + __wt_page_modify_set(session, parent); /* * We've locked the parent, which means it cannot split (which is the @@ -972,10 +1012,9 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_TRET(__split_safe_free(session, split_gen, exclusive, pindex, size)); parent_decr += size; - /* Adjust the parent's memory footprint and mark it dirty. */ + /* Adjust the parent's memory footprint. */ __wt_cache_page_inmem_incr(session, parent, parent_incr); __wt_cache_page_inmem_decr(session, parent, parent_decr); - __wt_page_modify_set(session, parent); err: __wt_scr_free(session, &scr); /* @@ -1025,10 +1064,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) { WT_BTREE *btree; WT_DECL_RET; - WT_PAGE *child; + WT_PAGE *child, **locked; WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; - WT_REF **alloc_refp; - WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_REF **alloc_refp, **child_refp, *page_ref, **page_refp, *ref; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; uint64_t split_gen; @@ -1039,12 +1077,14 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_STAT_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_DATA_INCR(session, cache_eviction_split_internal); - /* The page will be marked dirty, make sure that will succeed. */ + /* Mark the page dirty. */ WT_RET(__wt_page_modify_init(session, page)); + __wt_page_modify_set(session, page); btree = S2BT(session); alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; + locked = NULL; page_decr = page_incr = parent_incr = 0; complete = WT_ERR_RETURN; @@ -1111,6 +1151,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_calloc_one(session, alloc_refp)); parent_incr += children * sizeof(WT_REF); + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page that previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * created pages are set to the current split generation and so can't be + * evicted until all readers have left the old generation. + */ + split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); + WT_ASSERT(session, page->pg_intl_split_gen < split_gen); + /* Allocate child pages, and connect them into the new page index. */ WT_ASSERT(session, page_refp == pindex->index + chunk); for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { @@ -1137,6 +1188,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Initialize the child page. */ child->pg_intl_parent_ref = ref; + child->pg_intl_split_gen = split_gen; /* Mark it dirty. */ WT_ERR(__wt_page_modify_init(session, child)); @@ -1173,7 +1225,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * generation to block splits in newly created pages, so get one. */ WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, session->split_gen, true); + WT_ERR(__split_ref_prepare(session, alloc_index, &locked, true)); /* Split into the parent. */ if ((ret = __split_parent(session, page_ref, alloc_index->index, @@ -1197,6 +1249,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1); page->pg_intl_split_gen = split_gen; + /* Finalize the WT_REF move. */ + __split_ref_final(session, &locked); + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, parent)); @@ -1228,12 +1283,13 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_TRET(__split_safe_free(session, split_gen, false, pindex, size)); page_decr += size; - /* Adjust the page's memory footprint, and mark it dirty. */ + /* Adjust the page's memory footprint. */ __wt_cache_page_inmem_incr(session, page, page_incr); __wt_cache_page_inmem_decr(session, page, page_decr); - __wt_page_modify_set(session, page); -err: switch (complete) { +err: __split_ref_final(session, &locked); + + switch (complete) { case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); |