diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-01-28 09:25:16 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-01-28 09:30:32 +1100 |
commit | 67e412d4c578e203505446496c67fc1517931cad (patch) | |
tree | 5d1e780ea1e14b1d32ef34aa33716be9bc284f8c /src/btree/bt_split.c | |
parent | 3c2ad56b507d82d8d5599dc1ebb50d3ccc087bfb (diff) | |
parent | 7fea169e425b2a56d88351f362d746d5b5dd77f5 (diff) | |
download | mongo-67e412d4c578e203505446496c67fc1517931cad.tar.gz |
Merge branch 'develop' into mongodb-3.2
Diffstat (limited to 'src/btree/bt_split.c')
-rw-r--r-- | src/btree/bt_split.c | 312 |
1 files changed, 199 insertions, 113 deletions
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 12f4197e9e7..102265c0a8f 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2014-2015 MongoDB, Inc. + * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * @@ -15,6 +15,22 @@ } while (0) /* + * A note on error handling: main split functions first allocate/initialize new + * structures; failures during that period are handled by discarding the memory + * and returning an error code, the caller knows the split didn't happen and + * proceeds accordingly. Second, split functions update the tree, and a failure + * in that period is catastrophic, any partial update to the tree requires a + * panic, we can't recover. Third, once the split is complete and the tree has + * been fully updated, we have to ignore most errors, the split is complete and + * correct, callers have to proceed accordingly. + */ +typedef enum { + WT_ERR_IGNORE, /* Ignore minor errors */ + WT_ERR_PANIC, /* Panic on all errors */ + WT_ERR_RETURN /* Clean up and return error */ +} WT_SPLIT_ERROR_PHASE; + +/* * __split_oldest_gen -- * Calculate the oldest active split generation. */ @@ -512,8 +528,9 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex; WT_REF **alloc_refp; WT_REF **child_refp, *ref, **root_refp; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -539,7 +556,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) btree = S2BT(session); alloc_index = NULL; root_decr = root_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* The root page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, root)); @@ -589,8 +606,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = root->type == WT_PAGE_COL_INT ? + (*root_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, root->type, 0, slots, false, &child)); + session, root->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -605,12 +625,10 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*root_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (root->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*root_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ @@ -623,7 +641,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) * threads may be underneath us right now changing the structure * state.) However, if the WT_REF structures reference on-page * information, we have to fix that, because the disk image for - * the page that has an page index entry for the WT_REF is about + * the page that has a page index entry for the WT_REF is about * to change. */ child_pindex = WT_INTL_INDEX_GET_SAFE(child); @@ -641,7 +659,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) root_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, false); @@ -661,7 +679,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__split_ref_step2(session, alloc_index, false)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* We've installed the allocated page-index, ensure error handling. */ alloc_index = NULL; @@ -687,15 +705,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) __wt_page_modify_set(session, root); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, root, alloc_index, true); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during root page split to deepen the tree"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during root page split " @@ -721,19 +739,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_PAGE *parent; WT_PAGE_INDEX *alloc_index, *pindex; WT_REF **alloc_refp, *next_ref; + WT_SPLIT_ERROR_PHASE complete; size_t parent_decr, size; uint64_t split_gen; - uint32_t i, j; + uint32_t hint, i, j; uint32_t deleted_entries, parent_entries, result_entries; uint32_t *deleted_refs; - bool complete, empty_parent; + bool empty_parent; parent = ref->home; alloc_index = pindex = NULL; parent_decr = 0; parent_entries = 0; - complete = empty_parent = false; + empty_parent = false; + complete = WT_ERR_RETURN; /* The parent page will be marked dirty, make sure that will succeed. */ WT_RET(__wt_page_modify_init(session, parent)); @@ -751,7 +771,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * array anyway. Switch them to the special split state, so that any * reading thread will restart. */ - WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); + WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr)); for (deleted_entries = 0, i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; WT_ASSERT(session, next_ref->state != WT_REF_SPLIT); @@ -791,28 +811,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Allocate and initialize a new page index array for the parent, then * copy references from the original index array, plus references from * the newly created split array, into place. + * + * Update the WT_REF's page-index hint as we go. This can race with a + * thread setting the hint based on an older page-index, and the change + * isn't backed out in the case of an error, so there ways for the hint + * to be wrong; OK because it's just a hint. */ size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *); WT_ERR(__wt_calloc(session, 1, size, &alloc_index)); parent_incr += size; alloc_index->index = (WT_REF **)(alloc_index + 1); alloc_index->entries = result_entries; - for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) { + for (alloc_refp = alloc_index->index, + hint = i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref == ref) for (j = 0; j < new_entries; ++j) { ref_new[j]->home = parent; + ref_new[j]->pindex_hint = hint++; *alloc_refp++ = ref_new[j]; } - else if (next_ref->state != WT_REF_SPLIT) + else if (next_ref->state != WT_REF_SPLIT) { /* Skip refs we have marked for deletion. */ + next_ref->pindex_hint = hint++; *alloc_refp++ = next_ref; + } } /* Check that we filled in all the entries. */ WT_ASSERT(session, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); + /* Start making real changes to the tree, errors are fatal. */ + complete = WT_ERR_PANIC; + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -853,16 +885,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, */ WT_FULL_BARRIER(); - /* - * A note on error handling: failures before we swapped the new page - * index into the parent can be resolved by freeing allocated memory - * because the original page is unchanged, we can continue to use it - * and we have not yet modified the parent. Failures after we swap - * the new page index into the parent are also relatively benign, the - * split is OK and complete. For those reasons, we ignore errors past - * this point unless there's a panic. - */ - complete = true; + /* The split is complete and correct, ignore benign errors. */ + complete = WT_ERR_IGNORE; WT_ERR(__wt_verbose(session, WT_VERB_SPLIT, "%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -946,7 +970,8 @@ err: __wt_scr_free(session, &scr); * nothing really bad can have happened, and our caller has to proceed * with the split. */ - if (!complete) { + switch (complete) { + case WT_ERR_RETURN: for (i = 0; i < parent_entries; ++i) { next_ref = pindex->index[i]; if (next_ref->state == WT_REF_SPLIT) @@ -954,20 +979,28 @@ err: __wt_scr_free(session, &scr); } __wt_free_ref_index(session, NULL, alloc_index, false); - /* * The split couldn't proceed because the parent would be empty, * return EBUSY so our caller knows to unlock the WT_REF that's * being deleted, but don't be noisy, there's nothing wrong. */ if (empty_parent) - return (EBUSY); + ret = EBUSY; + break; + case WT_ERR_PANIC: + __wt_err(session, ret, "fatal error during parent page split"); + ret = WT_PANIC; + break; + case WT_ERR_IGNORE: + if (ret != 0 && ret != WT_PANIC) { + __wt_err(session, ret, + "ignoring not-fatal error during parent page " + "split"); + ret = 0; + } + break; } - - if (ret != 0 && ret != WT_PANIC) - __wt_err(session, ret, - "ignoring not-fatal error during parent page split"); - return (ret == WT_PANIC || !complete ? ret : 0); + return (ret); } /* @@ -983,8 +1016,9 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index; WT_REF **alloc_refp; WT_REF **child_refp, *page_ref, **page_refp, *ref; + WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; - uint64_t split_gen; + uint64_t recno, split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -1012,7 +1046,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) alloc_index = replace_index = NULL; page_ref = page->pg_intl_parent_ref; page_decr = page_incr = parent_incr = 0; - complete = ERR_RETURN; + complete = WT_ERR_RETURN; /* * Our caller is holding the page locked to single-thread splits, which @@ -1081,8 +1115,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ASSERT(session, page_refp == pindex->index + chunk); for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { slots = i == children - 1 ? remain : chunk; + + recno = page->type == WT_PAGE_COL_INT ? + (*page_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, page->type, 0, slots, false, &child)); + session, page->type, recno, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -1097,12 +1134,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = (*page_refp)->key.recno; + ref->key.recno = recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ - if (page->type == WT_PAGE_COL_INT) - child->pg_intl_recno = (*page_refp)->key.recno; child->pg_intl_parent_ref = ref; /* Mark it dirty. */ @@ -1133,7 +1168,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) page_refp - pindex->index == (ptrdiff_t)pindex->entries); /* Start making real changes to the tree, errors are fatal. */ - complete = ERR_PANIC; + complete = WT_ERR_PANIC; /* Prepare the WT_REFs for the move. */ __split_ref_step1(session, alloc_index, true); @@ -1157,7 +1192,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__split_ref_step2(session, alloc_index, true)); /* The split is complete and correct, ignore benign errors. */ - complete = ERR_IGNORE; + complete = WT_ERR_IGNORE; /* * Push out the changes: not required for correctness, but no reason @@ -1193,16 +1228,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) __wt_page_modify_set(session, page); err: switch (complete) { - case ERR_RETURN: + case WT_ERR_RETURN: __wt_free_ref_index(session, page, alloc_index, true); __wt_free_ref_index(session, page, replace_index, false); break; - case ERR_PANIC: + case WT_ERR_PANIC: __wt_err(session, ret, "fatal error during internal page split"); ret = WT_PANIC; break; - case ERR_IGNORE: + case WT_ERR_IGNORE: if (ret != 0 && ret != WT_PANIC) { __wt_err(session, ret, "ignoring not-fatal error during internal page " @@ -1654,10 +1689,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) WT_DECL_RET; WT_DECL_ITEM(key); WT_INSERT *ins, **insp, *moved_ins, *prev_ins; - WT_INSERT_HEAD *ins_head; + WT_INSERT_HEAD *ins_head, *tmp_ins_head; WT_PAGE *page, *right; WT_REF *child, *split_ref[2] = { NULL, NULL }; size_t page_decr, parent_incr, right_incr; + uint8_t type; int i; WT_STAT_FAST_CONN_INCR(session, cache_inmem_split); @@ -1666,6 +1702,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page = ref->page; right = NULL; page_decr = parent_incr = right_incr = 0; + type = page->type; /* * Assert splitting makes sense; specifically assert the page is dirty, @@ -1679,9 +1716,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT); /* Find the last item on the page. */ - ins_head = page->pg_row_entries == 0 ? - WT_ROW_INSERT_SMALLEST(page) : - WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + if (type == WT_PAGE_ROW_LEAF) + ins_head = page->pg_row_entries == 0 ? + WT_ROW_INSERT_SMALLEST(page) : + WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); + else + ins_head = WT_COL_APPEND(page); moved_ins = WT_SKIP_LAST(ins_head); /* @@ -1692,14 +1732,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * The new WT_REF is not quite identical: we have to instantiate a key, * and the new reference is visible to readers once the split completes. * - * The key-instantiation code checks for races, leave the key fields - * zeroed we don't trigger them. - * * Don't copy any deleted page state: we may be splitting a page that * was instantiated after a truncate and that history should not be * carried onto these new child pages. */ WT_ERR(__wt_calloc_one(session, &split_ref[0])); + parent_incr += sizeof(WT_REF); child = split_ref[0]; child->page = ref->page; child->home = ref->home; @@ -1713,49 +1751,82 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ ref->addr = NULL; - /* - * Copy the first key from the original page into first ref in the new - * parent. Pages created in memory always have a "smallest" insert - * list, so look there first. If we don't find one, get the first key - * from the disk image. - * - * We can't just use the key from the original ref: it may have been - * suffix-compressed, and after the split the truncated key may not be - * valid. - */ - WT_ERR(__wt_scr_alloc(session, 0, &key)); - if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { - key->data = WT_INSERT_KEY(ins); - key->size = WT_INSERT_KEY_SIZE(ins); + if (type == WT_PAGE_ROW_LEAF) { + /* + * Copy the first key from the original page into first ref in + * the new parent. Pages created in memory always have a + * "smallest" insert list, so look there first. If we don't + * find one, get the first key from the disk image. + * + * We can't just use the key from the original ref: it may have + * been suffix-compressed, and after the split the truncated key + * may not be valid. + */ + WT_ERR(__wt_scr_alloc(session, 0, &key)); + if ((ins = + WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } else + WT_ERR(__wt_row_leaf_key( + session, page, &page->pg_row_d[0], key, true)); + WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); + parent_incr += sizeof(WT_IKEY) + key->size; + __wt_scr_free(session, &key); } else - WT_ERR(__wt_row_leaf_key( - session, page, &page->pg_row_d[0], key, true)); - WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child)); - parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size; - __wt_scr_free(session, &key); + child->key.recno = ref->key.recno; /* * The second page in the split is a new WT_REF/page pair. */ - WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, false, &right)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + if (type == WT_PAGE_ROW_LEAF) + WT_ERR(__wt_page_alloc(session, + type, WT_RECNO_OOB, 0, false, &right)); + else + WT_ERR(__wt_page_alloc(session, + type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); + + /* + * The new page is dirty by definition, column-store splits update the + * page-modify structure, so create it now. + */ + WT_ERR(__wt_page_modify_init(session, right)); + __wt_page_modify_set(session, right); + + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); + WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + } else { + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append)); + WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0])); + } right_incr += sizeof(WT_INSERT_HEAD); right_incr += sizeof(WT_INSERT_HEAD *); WT_ERR(__wt_calloc_one(session, &split_ref[1])); + parent_incr += sizeof(WT_REF); child = split_ref[1]; child->page = right; child->state = WT_REF_MEM; - WT_ERR(__wt_row_ikey(session, 0, - WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), - child)); - parent_incr += - sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); - /* The new page is dirty by definition. */ - WT_ERR(__wt_page_modify_init(session, right)); - __wt_page_modify_set(session, right); + if (type == WT_PAGE_ROW_LEAF) { + WT_ERR(__wt_row_ikey(session, 0, + WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins), + child)); + parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); + } else + child->key.recno = WT_INSERT_RECNO(moved_ins); + + /* + * Allocation operations completed, we're going to split. + * + * Record the split column-store page record, used in reconciliation. + */ + if (type != WT_PAGE_ROW_LEAF) { + WT_ASSERT(session, + page->modify->mod_split_recno == WT_RECNO_OOB); + page->modify->mod_split_recno = child->key.recno; + } /* * We modified the page above, which will have set the first dirty @@ -1779,15 +1850,16 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd)); /* - * Allocation operations completed, move the last insert list item from - * the original page to the new page. + * Move the last insert list item from the original page to the new + * page. * * First, update the item to the new child page. (Just append the entry * for simplicity, the previous skip list pointers originally allocated * can be ignored.) */ - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = moved_ins; + tmp_ins_head = type == WT_PAGE_ROW_LEAF ? + right->pg_row_ins[0] : right->modify->mod_append[0]; + tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins; /* * Remove the entry from the orig page (i.e truncate the skip list). @@ -1872,34 +1944,40 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. After this, the original page is no + * Split into the parent. On successful return, the original page is no * longer locked, so we cannot safely look at it. */ page = NULL; if ((ret = __split_parent( - session, ref, split_ref, 2, parent_incr, false, true)) != 0) { - /* - * Move the insert list element back to the original page list. - * For simplicity, the previous skip list pointers originally - * allocated can be ignored, just append the entry to the end of - * the level 0 list. As before, we depend on the list having - * multiple elements and ignore the edge cases small lists have. - */ - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = NULL; - ins_head->tail[0]->next[0] = moved_ins; - ins_head->tail[0] = moved_ins; + session, ref, split_ref, 2, parent_incr, false, true)) == 0) + return (0); - /* - * We marked the new page dirty; we're going to discard it, but - * first mark it clean and fix up the cache statistics. - */ - __wt_page_modify_clear(session, right); + /* + * Failure. + * + * Reset the split column-store page record. + */ + page->modify->mod_split_recno = WT_RECNO_OOB; - WT_ERR(ret); - } + /* + * Clear the allocated page's reference to the moved insert list element + * so it's not freed when we discard the page. + * + * Move the element back to the original page list. For simplicity, the + * previous skip list pointers originally allocated can be ignored, just + * append the entry to the end of the level 0 list. As before, we depend + * on the list having multiple elements and ignore the edge cases small + * lists have. + */ + if (type == WT_PAGE_ROW_LEAF) + right->pg_row_ins[0]->head[0] = + right->pg_row_ins[0]->tail[0] = NULL; + else + right->modify->mod_append[0]->head[0] = + right->modify->mod_append[0]->tail[0] = NULL; - return (0); + ins_head->tail[0]->next[0] = moved_ins; + ins_head->tail[0] = moved_ins; err: if (split_ref[0] != NULL) { /* @@ -1907,15 +1985,23 @@ err: if (split_ref[0] != NULL) { */ ref->addr = split_ref[0]->addr; - __wt_free(session, split_ref[0]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[0]->key.ikey); __wt_free(session, split_ref[0]); } if (split_ref[1] != NULL) { - __wt_free(session, split_ref[1]->key.ikey); + if (type == WT_PAGE_ROW_LEAF) + __wt_free(session, split_ref[1]->key.ikey); __wt_free(session, split_ref[1]); } - if (right != NULL) + if (right != NULL) { + /* + * We marked the new page dirty; we're going to discard it, + * but first mark it clean and fix up the cache statistics. + */ + __wt_page_modify_clear(session, right); __wt_page_out(session, &right); + } __wt_scr_free(session, &key); return (ret); } |