From 3a291f0c096098b2d35fdbce0f97dc3c9b64f6d6 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sat, 21 Nov 2015 11:59:28 -0500 Subject: WT-2231: Update the WT_REF's page-index hint as we split pages. This can race with a thread setting the hint based on an older page-index, and the change isn't backed out in the case of an error, so there ways for the hint to be wrong; OK because it's just a hint. I'm doing upates in both __split_parent and __split_ref_move_final, some of which is duplicated work. The reason is that __split_parent is the only way to catch reverse splits, and __split_ref_move_final is the best way to catch created WT_REF structures, and the additional work is minimal. --- src/btree/bt_split.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index c025ae62bb3..1c0bf9f10f3 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -394,13 +394,13 @@ __split_child_block_evict_and_split(WT_PAGE *child) */ static int __split_ref_move_final( - WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; WT_PAGE *child; WT_REF *ref, *child_ref; uint64_t txn_new_id; - uint32_t i; + uint32_t i, j; /* * When creating new internal pages as part of a split, we set a field @@ -419,8 +419,11 @@ __split_ref_move_final( * happens the thread waits for the reference's home page to be updated, * which we do here: walk the children and fix them up. */ - for (i = 0; i < entries; ++i, ++refp) { - ref = *refp; + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; + + /* Update the WT_REF's page-index hint. */ + ref->pindex_hint = i; /* * We don't hold hazard pointers on created pages, they cannot @@ -450,6 +453,7 @@ __split_ref_move_final( * know about that flag; use the standard macros to ensure that * reading the child's page index structure is safe. */ + j = 0; WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { /* @@ -460,10 +464,12 @@ __split_ref_move_final( */ if (child_ref->home != child) { child_ref->home = child; - child_ref->pindex_hint = 0; - child->modify->mod_split_txn = txn_new_id; } + + /* Update the WT_REF's page-index hint. */ + child_ref->pindex_hint = j++; + } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); @@ -639,8 +645,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __split_verify_intl_key_order(session, root)); #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index, alloc_index->entries)); + WT_ERR(__split_ref_move_final(session, alloc_index, false)); /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -701,7 +706,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_REF **alloc_refp, *next_ref; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; bool complete, empty_parent; @@ -769,22 +774,31 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries. */ @@ -1128,8 +1142,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index + 1, alloc_index->entries - 1)); + WT_ERR(__split_ref_move_final(session, alloc_index, true)); /* * We don't care about the page-index we allocated, all we needed was -- cgit v1.2.1 From bd8889a9634bb7f2212e5f43951e8f09bf98dc79 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sat, 28 Nov 2015 14:43:00 -0500 Subject: WT-2247: variable-length column-store in-memory page splits. --- src/btree/bt_split.c | 169 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 104 insertions(+), 65 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index a0dfbf32cad..77a301a5d36 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1618,6 +1618,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_incr, right_incr; + uint8_t type; int i; WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); @@ -1626,6 +1627,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; + type = page->type; /* * Assert splitting makes sense; specifically assert the page is dirty, @@ -1639,9 +1641,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* Find the last item on the page. */ - ins_head = page->pg_row_entries == 0 ? + ins_head = type == WT_PAGE_ROW_LEAF ? + (page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : + WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); /* @@ -1651,15 +1655,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * The new WT_REF is not quite identical: we have to instantiate a key, * and the new reference is visible to readers once the split completes. - * - * The key-instantiation code checks for races, leave the key fields - * zeroed we don't trigger them. - * - * Don't copy any deleted page state: we may be splitting a page that - * was instantiated after a truncate and that history should not be - * carried onto these new child pages. */ WT_ERR(__wt_calloc_one(session, &split_ref[0])); + parent_incr += sizeof(WT_REF); child = split_ref[0]; child->page = ref->page; child->home = ref->home; @@ -1667,49 +1665,77 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) child->state = WT_REF_MEM; child->addr = ref->addr; - /* - * Copy the first key from the original page into first ref in the new - * parent. Pages created in memory always have a "smallest" insert - * list, so look there first. If we don't find one, get the first key - * from the disk image. - * - * We can't just use the key from the original ref: it may have been - * suffix-compressed, and after the split the truncated key may not be - * valid. - */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { - key->data = WT_INSERT_KEY(ins); - key->size = WT_INSERT_KEY_SIZE(ins); + if (type == WT_PAGE_ROW_LEAF) { + /* + * Copy the first key from the original page into first ref in + * the new parent. Pages created in memory always have a + * "smallest" insert list, so look there first. If we don't + * find one, get the first key from the disk image. + * + * We can't just use the key from the original ref: it may have + * been suffix-compressed, and after the split the truncated key + * may not be valid. + */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + if ((ins = + WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } else + WT_ERR(__wt_row_leaf_key( + session, page, &page->pg_row_d[0], key, true)); + WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); + parent_incr += sizeof(WT_IKEY) + key->size; + __wt_scr_free(session, &key); } else - WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); - WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); - parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; - __wt_scr_free(session, &key); + child->key.recno = ref->key.recno; + + /* + * Don't copy any deleted page state: we may be splitting a page that + * was instantiated after a truncate and that history should not be + * carried onto these new child pages. + */ + child->page_del = NULL; /* * The second page in the split is a new WT_REF/page pair. */ - WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, false, &right)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + if (type == WT_PAGE_ROW_LEAF) + WT_ERR(__wt_page_alloc(session, type, 0, 0, false, &right)); + else + WT_ERR(__wt_page_alloc(session, + type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); + + /* + * The new page is dirty by definition, column-store splits update the + * page-modify structure, so create it now. + */ + WT_ERR(__wt_page_modify_init(session, right)); + __wt_page_modify_set(session, right); + + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + } else { + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append)); + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0])); + } right_incr += sizeof(WT_INSERT_HEAD); right_incr += sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_one(session, &split_ref[1])); + parent_incr += sizeof(WT_REF); child = split_ref[1]; child->page = right; child->state = WT_REF_MEM; - WT_ERR(__wt_row_ikey(session, 0, - WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), - child)); - parent_incr += - sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); - /* The new page is dirty by definition. */ - WT_ERR(__wt_page_modify_init(session, right)); - __wt_page_modify_set(session, right); + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_row_ikey(session, 0, + WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), + child)); + parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); + } else + child->key.recno = WT_INSERT_RECNO(moved_ins); /* * We modified the page above, which will have set the first dirty @@ -1740,8 +1766,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * for simplicity, the previous skip list pointers originally allocated * can be ignored.) */ - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = moved_ins; + if (type == WT_PAGE_ROW_LEAF) + right->pg_row_ins[0]->head[0] = + right->pg_row_ins[0]->tail[0] = moved_ins; + else + right->modify->mod_append[0]->head[0] = + right->modify->mod_append[0]->tail[0] = moved_ins; /* * Remove the entry from the orig page (i.e truncate the skip list). @@ -1826,45 +1856,54 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. After this, the original page is no + * Split into the parent. On successful return, the original page is no * longer locked, so we cannot safely look at it. */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false, true)) != 0) { - /* - * Move the insert list element back to the original page list. - * For simplicity, the previous skip list pointers originally - * allocated can be ignored, just append the entry to the end of - * the level 0 list. As before, we depend on the list having - * multiple elements and ignore the edge cases small lists have. - */ + session, ref, split_ref, 2, parent_incr, false, true)) == 0) + return (0); + + /* + * Failure. + * + * Clear the allocated page's reference to the moved insert list element + * so it's not freed when we discard the page. + * + * Move the element back to the original page list. For simplicity, the + * previous skip list pointers originally allocated can be ignored, just + * append the entry to the end of the level 0 list. As before, we depend + * on the list having multiple elements and ignore the edge cases small + * lists have. + */ + if (type == WT_PAGE_ROW_LEAF) right->pg_row_ins[0]->head[0] = right->pg_row_ins[0]->tail[0] = NULL; - ins_head->tail[0]->next[0] = moved_ins; - ins_head->tail[0] = moved_ins; + else + right->modify->mod_append[0]->head[0] = + right->modify->mod_append[0]->tail[0] = NULL; - /* - * We marked the new page dirty; we're going to discard it, but - * first mark it clean and fix up the cache statistics. - */ - __wt_page_modify_clear(session, right); - - WT_ERR(ret); - } - - return (0); + ins_head->tail[0]->next[0] = moved_ins; + ins_head->tail[0] = moved_ins; err: if (split_ref[0] != NULL) { - __wt_free(session, split_ref[0]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[0]->key.ikey); __wt_free(session, split_ref[0]); } if (split_ref[1] != NULL) { - __wt_free(session, split_ref[1]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[1]->key.ikey); __wt_free(session, split_ref[1]); } - if (right != NULL) + if (right != NULL) { + /* + * We marked the new page dirty; we're going to discard it, + * but first mark it clean and fix up the cache statistics. + */ + __wt_page_modify_clear(session, right); __wt_page_out(session, &right); + } __wt_scr_free(session, &key); return (ret); } -- cgit v1.2.1 From 56bc21d69902d5b1295974e6af0a5a131d4ac461 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 10 Dec 2015 17:03:05 +1100 Subject: Revert "WT-2231: check the leaf page's parent keys before doing the full binary search" --- src/btree/bt_split.c | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8623397d9bc..631aca0d5c0 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -404,12 +404,12 @@ __split_child_block_evict_and_split(WT_PAGE *child) */ static int __split_ref_move_final( - WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) + WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) { WT_DECL_RET; WT_PAGE *child; WT_REF *ref, *child_ref; - uint32_t i, j; + uint32_t i; /* * The WT_REF structures moved to newly allocated child pages reference @@ -420,11 +420,8 @@ __split_ref_move_final( * happens the thread waits for the reference's home page to be updated, * which we do here: walk the children and fix them up. */ - for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { - ref = pindex->index[i]; - - /* Update the WT_REF's page-index hint. */ - ref->pindex_hint = i; + for (i = 0; i < entries; ++i, ++refp) { + ref = *refp; /* * We don't hold hazard pointers on created pages, they cannot @@ -454,7 +451,6 @@ __split_ref_move_final( * know about that flag; use the standard macros to ensure that * reading the child's page index structure is safe. */ - j = 0; WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { /* @@ -463,11 +459,10 @@ __split_ref_move_final( * disk pages may have been read in since then, and * those pages would have correct parent references. */ - if (child_ref->home != child) + if (child_ref->home != child) { child_ref->home = child; - - /* Update the WT_REF's page-index hint. */ - child_ref->pindex_hint = j++; + child_ref->pindex_hint = 0; + } } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); @@ -643,7 +638,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __split_verify_intl_key_order(session, root)); #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final(session, alloc_index, false)); + WT_ERR(__split_ref_move_final( + session, alloc_index->index, alloc_index->entries)); /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -704,7 +700,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_REF **alloc_refp, *next_ref; size_t parent_decr, size; uint64_t split_gen; - uint32_t hint, i, j; + uint32_t i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; bool complete, empty_parent; @@ -772,31 +768,22 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. - * - * Update the WT_REF's page-index hint as we go. This can race with a - * thread setting the hint based on an older page-index, and the change - * isn't backed out in the case of an error, so there ways for the hint - * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, - hint = i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; - ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) { + else if (next_ref->state != WT_REF_SPLIT) /* Skip refs we have marked for deletion. */ - next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; - } } /* Check that we filled in all the entries. */ @@ -1141,7 +1128,8 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final(session, alloc_index, true)); + WT_ERR(__split_ref_move_final( + session, alloc_index->index + 1, alloc_index->entries - 1)); /* * We don't care about the page-index we allocated, all we needed was -- cgit v1.2.1 From b62ee8ae0b4fd27a08bda89fd3ac537abe7f32f0 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 10 Dec 2015 16:41:27 -0500 Subject: WT-2231: this is a replay of the changes for the previous WT-2231 branch (see pull request #2325) for the individual commit details. There's an additional bug fix over and above those commits, fixing a clang sanitizer error. In summary: check the leaf page's parent keys before doing the full binary search of the leaf page, avoiding the binary search entirely when the cursor is being re-positioned, at the cost of two additional searches when the cursor is not being re-positioned. Additionally, do some work to improve WT_REF page hints. --- src/btree/bt_split.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 631aca0d5c0..8623397d9bc 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -404,12 +404,12 @@ __split_child_block_evict_and_split(WT_PAGE *child) */ static int __split_ref_move_final( - WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries) + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; WT_PAGE *child; WT_REF *ref, *child_ref; - uint32_t i; + uint32_t i, j; /* * The WT_REF structures moved to newly allocated child pages reference @@ -420,8 +420,11 @@ __split_ref_move_final( * happens the thread waits for the reference's home page to be updated, * which we do here: walk the children and fix them up. */ - for (i = 0; i < entries; ++i, ++refp) { - ref = *refp; + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; + + /* Update the WT_REF's page-index hint. */ + ref->pindex_hint = i; /* * We don't hold hazard pointers on created pages, they cannot @@ -451,6 +454,7 @@ __split_ref_move_final( * know about that flag; use the standard macros to ensure that * reading the child's page index structure is safe. */ + j = 0; WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { /* @@ -459,10 +463,11 @@ __split_ref_move_final( * disk pages may have been read in since then, and * those pages would have correct parent references. */ - if (child_ref->home != child) { + if (child_ref->home != child) child_ref->home = child; - child_ref->pindex_hint = 0; - } + + /* Update the WT_REF's page-index hint. */ + child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; WT_LEAVE_PAGE_INDEX(session); @@ -638,8 +643,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __split_verify_intl_key_order(session, root)); #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index, alloc_index->entries)); + WT_ERR(__split_ref_move_final(session, alloc_index, false)); /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -700,7 +704,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_REF **alloc_refp, *next_ref; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; bool complete, empty_parent; @@ -768,22 +772,31 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries. */ @@ -1128,8 +1141,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) #endif /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final( - session, alloc_index->index + 1, alloc_index->entries - 1)); + WT_ERR(__split_ref_move_final(session, alloc_index, true)); /* * We don't care about the page-index we allocated, all we needed was -- cgit v1.2.1 From ee072ba7a47bcbd7c813330a6b5ecd0375efbdeb Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Sun, 13 Dec 2015 09:20:44 -0500 Subject: If we split a column-store page, instantiate missing records in the page name space during reconciliation. (Imagine record 998 is transactionally visible, 999 was never created or is not yet not visible, 1000 is visible. Then the page split and record 1000 moves to another page. When we reconcile the original page, we write record 998, then we don't see record 999 for whatever reason. If we've moved record 1000, we don't know to write a deleted record 999 on the page.) The fix is to record the split record in the page-modify structure, and create missing records as deleted records. --- src/btree/bt_split.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 857be591bae..bd1e095cbc0 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1747,6 +1747,17 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } else child->key.recno = WT_INSERT_RECNO(moved_ins); + /* + * Allocation operations completed, we're going to split. + * + * Record the split column-store page record, used in reconciliation. + */ + if (type != WT_PAGE_ROW_LEAF) { + WT_ASSERT(session, + page->modify->mod_split_recno == WT_RECNO_OOB); + page->modify->mod_split_recno = child->key.recno; + } + /* * We modified the page above, which will have set the first dirty * transaction to the last transaction current running. However, the @@ -1769,8 +1780,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd)); /* - * Allocation operations completed, move the last insert list item from - * the original page to the new page. + * Move the last insert list item from the original page to the new + * page. * * First, update the item to the new child page. (Just append the entry * for simplicity, the previous skip list pointers originally allocated @@ -1877,6 +1888,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) /* * Failure. * + * Reset the split column-store page record. + */ + page->modify->mod_split_recno = WT_RECNO_OOB; + + /* * Clear the allocated page's reference to the moved insert list element * so it's not freed when we discard the page. * -- cgit v1.2.1 From 4d6de24df3c73449befbadf8418f11d97d1a4e55 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 14 Dec 2015 06:38:41 -0500 Subject: Alex's review, simplify the expression. --- src/btree/bt_split.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index bd1e095cbc0..55e280f728d 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1645,11 +1645,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* Find the last item on the page. */ - ins_head = type == WT_PAGE_ROW_LEAF ? - (page->pg_row_entries == 0 ? - WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1)) : - WT_COL_APPEND(page); + if (type == WT_PAGE_ROW_LEAF) + ins_head = page->pg_row_entries == 0 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + else + ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); /* -- cgit v1.2.1 From d3bbe226381af870520d8f43f21f8e4b3b53559c Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 22 Dec 2015 16:41:41 -0500 Subject: Fix a comment. --- src/btree/bt_split.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8623397d9bc..887544a7904 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -603,7 +603,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for - * the page that has an page index entry for the WT_REF is about + * the page that has a page index entry for the WT_REF is about * to change. */ child_pindex = WT_INTL_INDEX_GET_SAFE(child); -- cgit v1.2.1 From 8596071f42abe3da7a57c3e170193fdb4c8d5713 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 24 Dec 2015 13:55:11 -0500 Subject: WT-2307: Fix for cursor iteration bug when pages are splitting: add a separate step to update the WT_REF.home fields of the WT_REFs being moved before the split goes live, cursors then surface in the new tree structure instead of the old. --- src/btree/bt_split.c | 319 ++++++++++++++++++++++++++++----------------------- 1 file changed, 176 insertions(+), 143 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 887544a7904..fffd4fecb70 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -190,6 +190,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) case WT_PAGE_COL_INT: recno = 0; /* Less than any valid record number. */ WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + WT_ASSERT(session, ref->key.recno > recno); recno = ref->key.recno; } WT_INTL_FOREACH_END; @@ -202,6 +204,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) first = true; WT_INTL_FOREACH_BEGIN(session, page, ref) { + WT_ASSERT(session, ref->home == page); + __wt_ref_key(page, ref, &next->data, &next->size); if (last->size == 0) { if (first) @@ -328,7 +332,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, /* * If there's no address (the page has never been written), or the * address has been instantiated, there's no work to do. Otherwise, - * get the address from the on-page cell. + * instantiate the address in-memory, from the on-page cell. */ addr = ref->addr; if (addr != NULL && !__wt_off_page(from_home, addr)) { @@ -363,69 +367,102 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, } /* - * __split_child_block_evict_and_split -- - * Ensure the newly created child isn't evicted or split for now. + * __split_ref_step1 -- + * Prepare a set of WT_REFs for a move. */ static void -__split_child_block_evict_and_split(WT_PAGE *child) +__split_ref_step1( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { + WT_PAGE *child; + WT_REF *child_ref, *ref; + uint32_t i, j; + + /* The newly created subtree is complete. */ + WT_WRITE_BARRIER(); + /* - * Once the split is live, newly created internal pages might be evicted - * and their WT_REF structures freed. If that happens before all threads - * exit the index of the page which previously "owned" the WT_REF, a - * thread might see a freed WT_REF. To ensure that doesn't happen, the - * newly created page's modify structure has a field with a transaction - * ID that's checked before any internal page is evicted. Unfortunately, - * we don't know the correct value until we update the original page's - * index (we need a transaction ID from after that update), but the act - * of updating the original page's index is what allows the eviction to - * happen. - * - * Once the split is live, newly created internal pages might themselves - * split. The split itself is not the problem: if a page splits before - * we fix up its WT_REF (in other words, a WT_REF we move is then moved - * again, before we reset the underlying page's parent reference), it's - * OK because the test we use to find a WT_REF and WT_PAGE that require - * fixing up is only that the WT_REF points to the wrong parent, not it - * points to a specific wrong parent. The problem is our fix up of the - * WT_REFs in the created page could race with the subsequent fix of the - * same WT_REFs (in a different created page), we'd have to acquire some - * lock to prevent that race, and that's going to be difficult at best. - * - * For now, block eviction and splits in newly created pages until they - * have been fixed up. + * Update the moved WT_REFs so threads moving through them start looking + * at the created children's page index information. Because we've not + * yet updated the page index of the parent page into which we are going + * to split this subtree, a cursor moving through these WT_REFs will + * ascend into the created children, but eventually fail as that parent + * page won't yet know about the created children pages. That's OK, we + * spin there until the parent's page index is updated. */ - F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { + ref = pindex->index[i]; + child = ref->page; + + /* + * Block eviction and splits in newly created pages. + * + * Once the split is live, newly created internal pages might be + * evicted and their WT_REF structures freed. If that happened + * before all threads exit the index of the page that previously + * "owned" the WT_REF, a thread might see a freed WT_REF. To + * ensure that doesn't happen, the newly created page's modify + * structure has a field with a transaction ID that's checked + * before any internal page is evicted. Unfortunately, we don't + * know the correct value until we update the original page's + * index (we need a transaction ID from after that update), but + * the act of updating the original page's index is what allows + * the eviction to happen. + * + * Split blocking was because historic versions of the split + * code didn't update the WT_REF.home field until after the + * split was live, so the WT_REF.home fields being updated could + * split again before the update, there's a race between splits + * as to which would update them first. The current code updates + * the WT_REF.home fields before going live (in this function), + * this shouldn't be an issue, but for now splits remain turned + * off. + */ + F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + + /* + * We use a page flag to prevent the child from splitting from + * underneath us, but the split-generation error checks don't + * know about that flag; use the standard macros to ensure that + * reading the child's page index structure is safe. + */ + j = 0; + WT_ENTER_PAGE_INDEX(session); + WT_INTL_FOREACH_BEGIN(session, child, child_ref) { + child_ref->home = child; + child_ref->pindex_hint = j++; + } WT_INTL_FOREACH_END; + WT_LEAVE_PAGE_INDEX(session); + +#ifdef HAVE_DIAGNOSTIC + WT_WITH_PAGE_INDEX(session, + __split_verify_intl_key_order(session, child)); +#endif + } } /* - * __split_ref_move_final -- - * Finalize the moved WT_REF structures after the split succeeds. + * __split_ref_step2 -- + * Allow the newly created children to be evicted or split. */ static int -__split_ref_move_final( +__split_ref_step2( WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_DECL_RET; WT_PAGE *child; - WT_REF *ref, *child_ref; - uint32_t i, j; + WT_REF *ref; + uint32_t i; /* - * The WT_REF structures moved to newly allocated child pages reference - * the wrong parent page and we have to fix that up. The problem is - * revealed when a thread of control searches for the child page's - * reference structure slot, and fails to find it because the parent - * page being searched no longer references the child. When that failure - * happens the thread waits for the reference's home page to be updated, - * which we do here: walk the children and fix them up. + * The split has gone live, enable eviction and splits on the newly + * created internal pages. */ + WT_WRITE_BARRIER(); + for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) { ref = pindex->index[i]; - /* Update the WT_REF's page-index hint. */ - ref->pindex_hint = i; - /* * We don't hold hazard pointers on created pages, they cannot * be evicted because the page-modify transaction value set as @@ -444,44 +481,18 @@ __split_ref_move_final( WT_ERR(ret); child = ref->page; + + /* The child can now be evicted or split. */ + F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); + #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, child)); #endif - /* - * We use a page flag to prevent the child from splitting from - * underneath us, but the split-generation error checks don't - * know about that flag; use the standard macros to ensure that - * reading the child's page index structure is safe. - */ - j = 0; - WT_ENTER_PAGE_INDEX(session); - WT_INTL_FOREACH_BEGIN(session, child, child_ref) { - /* - * The page's home reference may not be wrong, as we - * opened up access from the top of the tree already, - * disk pages may have been read in since then, and - * those pages would have correct parent references. - */ - if (child_ref->home != child) - child_ref->home = child; - - /* Update the WT_REF's page-index hint. */ - child_ref->pindex_hint = j++; - } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); - - /* The child can now be evicted or split. */ - F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK); WT_ERR(__wt_hazard_clear(session, child)); } - /* - * Push out the changes: not required for correctness, but don't let - * threads spin on incorrect page references longer than necessary. - */ - WT_FULL_BARRIER(); return (0); err: /* Something really bad just happened. */ @@ -505,9 +516,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; + /* + * A note on error handling: this function first allocates/initializes + * new structures; failures during that period are handled by discarding + * the memory and returning an error code, our caller knows the split + * didn't happen and proceeds accordingly. Second, this function updates + * the tree, and a failure in that period is catastrophic, any partial + * update to the tree requires a panic, we can't recover. Third, once + * the split is complete and the tree has been fully updated, we have to + * ignore most errors because the split is complete and correct, callers + * have to proceed accordingly. + */ + enum { WT_SPLIT_NONE, WT_SPLIT_PANIC, WT_SPLIT_IGNORE } complete; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); @@ -516,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = false; + complete = WT_SPLIT_NONE; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -594,9 +617,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the root. (We cannot move WT_REF structures, @@ -620,30 +640,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ASSERT(session, root_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_SPLIT_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, false); + /* * Confirm the root page's index hasn't moved, then update it, which - * makes the split visible to threads descending the tree. From this - * point on, we're committed to the split. - * - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * root remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the root page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. + * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex); WT_INTL_INDEX_SET(root, alloc_index); - complete = true; #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, __split_verify_intl_key_order(session, root)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final(session, alloc_index, false)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, false)); + + /* The split is complete and correct, ignore benign errors. */ + complete = WT_SPLIT_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -668,24 +686,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_cache_page_inmem_decr(session, root, root_decr); __wt_page_modify_set(session, root); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the root page's new index. There is nothing we - * can do, there are threads potentially active in both versions of - * the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) +err: switch (complete) { + case WT_SPLIT_NONE: __wt_free_ref_index(session, root, alloc_index, true); - - if (ret != 0 && ret != WT_PANIC) + break; + case WT_SPLIT_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during root page split to " - "deepen the tree"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during root page split to deepen the tree"); + ret = WT_PANIC; + break; + case WT_SPLIT_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during root page split " + "to deepen the tree"); + ret = 0; + } + break; + } + return (ret); } /* @@ -977,9 +996,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; - bool complete; void *p; + /* + * A note on error handling: this function first allocates/initializes + * new structures; failures during that period are handled by discarding + * the memory and returning an error code, our caller knows the split + * didn't happen and proceeds accordingly. Second, this function updates + * the tree, and a failure in that period is catastrophic, any partial + * update to the tree requires a panic, we can't recover. Third, once + * the split is complete and the tree has been fully updated, we have to + * ignore most errors because the split is complete and correct, callers + * have to proceed accordingly. + */ + enum { WT_SPLIT_NONE, WT_SPLIT_PANIC, WT_SPLIT_IGNORE } complete; + WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); @@ -990,7 +1021,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = false; + complete = WT_SPLIT_NONE; /* * Our caller is holding the page locked to single-thread splits, which @@ -1087,9 +1118,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); - /* Ensure the page isn't evicted or split for now. */ - __split_child_block_evict_and_split(child); - /* * The newly allocated child's page index references the same * structures as the parent. (We cannot move WT_REF structures, @@ -1113,22 +1141,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp - pindex->index == (ptrdiff_t)pindex->entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_SPLIT_PANIC; + + /* Prepare the WT_REFs for the move. */ + __split_ref_step1(session, alloc_index, true); + /* Split into the parent. */ WT_ERR(__split_parent(session, page_ref, alloc_index->index, alloc_index->entries, parent_incr, false, false)); - /* - * A note on error handling: until this point, there's no problem with - * unwinding on error. We allocated a new page index, a new set of - * WT_REFs and a new set of child pages -- if an error occurred, the - * page remained unchanged, although it may have an incorrect memory - * footprint. From now on we've modified the parent page, attention - * needs to be paid. However, subsequent failures are relatively benign, - * the split is OK and complete. For that reason, we ignore errors past - * this point unless there's a panic. - */ - complete = true; - /* Confirm the page's index hasn't moved, then update it. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); WT_INTL_INDEX_SET(page, replace_index); @@ -1140,8 +1162,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __split_verify_intl_key_order(session, page)); #endif - /* Fix up the moved WT_REF structures. */ - WT_ERR(__split_ref_move_final(session, alloc_index, true)); + /* Finalize the WT_REFs we moved. */ + WT_ERR(__split_ref_step2(session, alloc_index, true)); + + /* The split is complete and correct, ignore benign errors. */ + complete = WT_SPLIT_IGNORE; + + /* + * Push out the changes: not required for correctness, but no reason + * to wait. + */ + WT_FULL_BARRIER(); /* * We don't care about the page-index we allocated, all we needed was @@ -1170,24 +1201,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_cache_page_inmem_decr(session, page, page_decr); __wt_page_modify_set(session, page); -err: /* - * If complete is true, we saw an error after opening up the tree to - * descent through the page's new index. There is nothing we can do, - * there are threads potentially active in both versions of the tree. - * - * A note on error handling: if we completed the split, return success, - * nothing really bad can have happened, and our caller has to proceed - * with the split. - */ - if (!complete) { +err: switch (complete) { + case WT_SPLIT_NONE: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); - } - - if (ret != 0 && ret != WT_PANIC) + break; + case WT_SPLIT_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during internal page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during internal page split"); + ret = WT_PANIC; + break; + case WT_SPLIT_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during internal page " + "split"); + ret = 0; + } + break; + } + return (ret); } /* -- cgit v1.2.1 From 6f2e8037544ce555b16ff2375449175f5b68f5dd Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 28 Dec 2015 11:13:42 -0500 Subject: Review comments, rename error conditions for clarity. --- src/btree/bt_split.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fffd4fecb70..3046316bf8b 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -529,7 +529,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * ignore most errors because the split is complete and correct, callers * have to proceed accordingly. */ - enum { WT_SPLIT_NONE, WT_SPLIT_PANIC, WT_SPLIT_IGNORE } complete; + enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); @@ -539,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = WT_SPLIT_NONE; + complete = ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -641,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = WT_SPLIT_PANIC; + complete = ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, false); @@ -661,7 +661,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__split_ref_step2(session, alloc_index, false)); /* The split is complete and correct, ignore benign errors. */ - complete = WT_SPLIT_IGNORE; + complete = ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -687,15 +687,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_page_modify_set(session, root); err: switch (complete) { - case WT_SPLIT_NONE: + case ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case WT_SPLIT_PANIC: + case ERR_PANIC: __wt_err(session, ret, "fatal error during root page split to deepen the tree"); ret = WT_PANIC; break; - case WT_SPLIT_IGNORE: + case ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during root page split " @@ -1009,7 +1009,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * ignore most errors because the split is complete and correct, callers * have to proceed accordingly. */ - enum { WT_SPLIT_NONE, WT_SPLIT_PANIC, WT_SPLIT_IGNORE } complete; + enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); @@ -1021,7 +1021,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = WT_SPLIT_NONE; + complete = ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1142,7 +1142,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = WT_SPLIT_PANIC; + complete = ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, true); @@ -1166,7 +1166,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__split_ref_step2(session, alloc_index, true)); /* The split is complete and correct, ignore benign errors. */ - complete = WT_SPLIT_IGNORE; + complete = ERR_IGNORE; /* * Push out the changes: not required for correctness, but no reason @@ -1202,16 +1202,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_page_modify_set(session, page); err: switch (complete) { - case WT_SPLIT_NONE: + case ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; - case WT_SPLIT_PANIC: + case ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; - case WT_SPLIT_IGNORE: + case ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " -- cgit v1.2.1 From 2427cd486745f4bac50d0482fa4878e70c8e3950 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Mon, 28 Dec 2015 14:11:10 -0500 Subject: WT-2307 Use single enum for split error handling phases. --- src/btree/bt_split.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 3046316bf8b..684388e47d9 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -512,6 +512,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; @@ -529,7 +530,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * ignore most errors because the split is complete and correct, callers * have to proceed accordingly. */ - enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); @@ -539,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -641,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, false); @@ -661,7 +661,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__split_ref_step2(session, alloc_index, false)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -687,15 +687,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_page_modify_set(session, root); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during root page split to deepen the tree"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during root page split " @@ -992,6 +992,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; uint64_t split_gen; uint32_t children, chunk, i, j, remain; @@ -1009,7 +1010,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) * ignore most errors because the split is complete and correct, callers * have to proceed accordingly. */ - enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete; WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); @@ -1021,7 +1021,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1142,7 +1142,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, true); @@ -1166,7 +1166,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__split_ref_step2(session, alloc_index, true)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* * Push out the changes: not required for correctness, but no reason @@ -1202,16 +1202,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_page_modify_set(session, page); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " -- cgit v1.2.1 From b47941bf443c9c28d3d8ddd80c12ae33fe530f0e Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Mon, 28 Dec 2015 15:53:28 -0500 Subject: WT-2307 Move enum into bt_split.c. Fix split_parent. --- src/btree/bt_split.c | 87 +++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 45 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 684388e47d9..fc9d642689b 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -14,6 +14,23 @@ to_incr += __len; \ } while (0) +/* + * A note on error handling: this function first allocates/initializes + * new structures; failures during that period are handled by discarding + * the memory and returning an error code, our caller knows the split + * didn't happen and proceeds accordingly. Second, this function updates + * the tree, and a failure in that period is catastrophic, any partial + * update to the tree requires a panic, we can't recover. Third, once + * the split is complete and the tree has been fully updated, we have to + * ignore most errors because the split is complete and correct, callers + * have to proceed accordingly. + */ +typedef enum __wt_split_error_phase { + WT_ERR_IGNORE, + WT_ERR_PANIC, + WT_ERR_RETURN +} WT_SPLIT_ERROR_PHASE; + /* * __split_oldest_gen -- * Calculate the oldest active split generation. @@ -519,18 +536,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) uint32_t slots; void *p; - /* - * A note on error handling: this function first allocates/initializes - * new structures; failures during that period are handled by discarding - * the memory and returning an error code, our caller knows the split - * didn't happen and proceeds accordingly. Second, this function updates - * the tree, and a failure in that period is catastrophic, any partial - * update to the tree requires a panic, we can't recover. Third, once - * the split is complete and the tree has been fully updated, we have to - * ignore most errors because the split is complete and correct, callers - * have to proceed accordingly. - */ - WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen); WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen); WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); @@ -721,19 +726,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; + WT_SPLIT_ERROR_PHASE complete; size_t parent_decr, size; uint64_t split_gen; uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; - bool complete, empty_parent; + bool empty_parent; parent = ref->home; alloc_index = pindex = NULL; parent_decr = 0; parent_entries = 0; - complete = empty_parent = false; + empty_parent = false; + complete = WT_ERR_RETURN; /* The parent page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, parent)); @@ -751,7 +758,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * array anyway. Switch them to the special split state, so that any * reading thread will restart. */ - WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); @@ -827,6 +834,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); + complete = WT_ERR_PANIC; WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; @@ -862,16 +870,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; + complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -955,7 +954,8 @@ err: __wt_scr_free(session, &scr); * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) { + switch (complete) { + case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) @@ -963,20 +963,29 @@ err: __wt_scr_free(session, &scr); } __wt_free_ref_index(session, NULL, alloc_index, false); - /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. */ if (empty_parent) - return (EBUSY); - } - - if (ret != 0 && ret != WT_PANIC) + ret = EBUSY; + break; + case WT_ERR_PANIC: __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + "fatal error during root page split to deepen the tree"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during parent page " + "split"); + ret = 0; + } + break; + } + return (ret); } /* @@ -999,18 +1008,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) uint32_t slots; void *p; - /* - * A note on error handling: this function first allocates/initializes - * new structures; failures during that period are handled by discarding - * the memory and returning an error code, our caller knows the split - * didn't happen and proceeds accordingly. Second, this function updates - * the tree, and a failure in that period is catastrophic, any partial - * update to the tree requires a panic, we can't recover. Third, once - * the split is complete and the tree has been fully updated, we have to - * ignore most errors because the split is complete and correct, callers - * have to proceed accordingly. - */ - WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal); WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal); -- cgit v1.2.1 From 0c50635f2d1829d03af6feca730f20560e61d963 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 28 Dec 2015 17:53:24 -0500 Subject: Rework the leading comment for WT_SPLIT_ERROR_PHASE enum, no longer applies to "this function". WT_SPLIT_ERROR_PHASE doesn't need a local name, the typedef is enough. Add comments to the assignments of "complete" in __split_parent, just so the code in all three functions is identical. Fix a cut-and-paste typo in __split_parent. --- src/btree/bt_split.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index fc9d642689b..69c787c9385 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -15,20 +15,19 @@ } while (0) /* - * A note on error handling: this function first allocates/initializes - * new structures; failures during that period are handled by discarding - * the memory and returning an error code, our caller knows the split - * didn't happen and proceeds accordingly. Second, this function updates - * the tree, and a failure in that period is catastrophic, any partial - * update to the tree requires a panic, we can't recover. Third, once - * the split is complete and the tree has been fully updated, we have to - * ignore most errors because the split is complete and correct, callers - * have to proceed accordingly. + * A note on error handling: main split functions first allocate/initialize new + * structures; failures during that period are handled by discarding the memory + * and returning an error code, the caller knows the split didn't happen and + * proceeds accordingly. Second, split functions update the tree, and a failure + * in that period is catastrophic, any partial update to the tree requires a + * panic, we can't recover. Third, once the split is complete and the tree has + * been fully updated, we have to ignore most errors, the split is complete and + * correct, callers have to proceed accordingly. */ -typedef enum __wt_split_error_phase { - WT_ERR_IGNORE, - WT_ERR_PANIC, - WT_ERR_RETURN +typedef enum { + WT_ERR_IGNORE, /* Ignore minor errors */ + WT_ERR_PANIC, /* Panic on all errors */ + WT_ERR_RETURN /* Clean up and return error */ } WT_SPLIT_ERROR_PHASE; /* @@ -829,12 +828,14 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. */ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(parent) == pindex); - complete = WT_ERR_PANIC; WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; @@ -870,6 +871,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); + /* The split is complete and correct, ignore benign errors. */ complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, @@ -972,8 +974,7 @@ err: __wt_scr_free(session, &scr); ret = EBUSY; break; case WT_ERR_PANIC: - __wt_err(session, ret, - "fatal error during root page split to deepen the tree"); + __wt_err(session, ret, "fatal error during parent page split"); ret = WT_PANIC; break; case WT_ERR_IGNORE: -- cgit v1.2.1 From 016d579e93124679f48c93f919e598473f91fac3 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Thu, 31 Dec 2015 09:19:50 -0500 Subject: When splitting, set the newly created child page's record number in the page-allocation function, it's cleaner and more consistent with creating other pages. --- src/btree/bt_split.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index f7e38f260af..669a2e04a84 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -530,7 +530,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_REF **child_refp, *ref, **root_refp; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -593,8 +593,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = root->type == WT_PAGE_COL_INT ? + (*root_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, root->type, 0, slots, false, &child)); + session, root->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -609,12 +612,10 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*root_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (root->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ @@ -1004,7 +1005,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_REF **child_refp, *page_ref, **page_refp, *ref; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -1088,8 +1089,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp == pindex->index + chunk); for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = page->type == WT_PAGE_COL_INT ? + (*page_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, page->type, 0, slots, false, &child)); + session, page->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -1104,12 +1108,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*page_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (page->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*page_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ -- cgit v1.2.1 From 6b74b44079224192dd3bcb81735ccd49f3062277 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 1 Jan 2016 12:02:23 -0500 Subject: Replace another 0 with WT_RECNO_OOB. --- src/btree/bt_split.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 669a2e04a84..186eae80236 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1757,7 +1757,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The second page in the split is a new WT_REF/page pair. */ if (type == WT_PAGE_ROW_LEAF) - WT_ERR(__wt_page_alloc(session, type, 0, 0, false, &right)); + WT_ERR(__wt_page_alloc(session, + type, WT_RECNO_OOB, 0, false, &right)); else WT_ERR(__wt_page_alloc(session, type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); -- cgit v1.2.1 From 344a714c3dd78c5ed86cdb9401d30cb90e40e765 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Fri, 1 Jan 2016 16:37:39 -0500 Subject: Upgrade copyright notices from 2015 to 2016. --- src/btree/bt_split.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 69c787c9385..357880ca1c0 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * -- cgit v1.2.1 From d39d416bf50a5c6f6c12d180465f18a1983f76ab Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 19 Jan 2016 08:21:42 -0500 Subject: WT-2247, WT-2312: variable-length column-store in-memory page splits Review comment, remove an unnecessary set of the WT_REF.page_del field. --- src/btree/bt_split.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8fa13faf8ff..8c8c5824207 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1705,6 +1705,10 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * The new WT_REF is not quite identical: we have to instantiate a key, * and the new reference is visible to readers once the split completes. + * + * Don't copy any deleted page state: we may be splitting a page that + * was instantiated after a truncate and that history should not be + * carried onto these new child pages. */ WT_ERR(__wt_calloc_one(session, &split_ref[0])); parent_incr += sizeof(WT_REF); @@ -1746,13 +1750,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } else child->key.recno = ref->key.recno; - /* - * Don't copy any deleted page state: we may be splitting a page that - * was instantiated after a truncate and that history should not be - * carried onto these new child pages. - */ - child->page_del = NULL; - /* * The second page in the split is a new WT_REF/page pair. */ -- cgit v1.2.1 From 4b67c914e66ca5473d7688de9ac68d326cb47723 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Tue, 19 Jan 2016 08:29:13 -0500 Subject: WT-2247, WT-2312: variable-length column-store in-memory page splits Review comment, simplify an expression with a temporary variable. --- src/btree/bt_split.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'src/btree/bt_split.c') diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8c8c5824207..ac83a21ac6f 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1663,7 +1663,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_DECL_RET; WT_DECL_ITEM(key); WT_INSERT *ins, **insp, *moved_ins, *prev_ins; - WT_INSERT_HEAD *ins_head; + WT_INSERT_HEAD *ins_head, *tmp_ins_head; WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_incr, right_incr; @@ -1831,12 +1831,9 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * for simplicity, the previous skip list pointers originally allocated * can be ignored.) */ - if (type == WT_PAGE_ROW_LEAF) - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = moved_ins; - else - right->modify->mod_append[0]->head[0] = - right->modify->mod_append[0]->tail[0] = moved_ins; + tmp_ins_head = type == WT_PAGE_ROW_LEAF ? + right->pg_row_ins[0] : right->modify->mod_append[0]; + tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins; /* * Remove the entry from the orig page (i.e truncate the skip list). -- cgit v1.2.1