summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_split.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_split.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_split.c338
1 files changed, 199 insertions, 139 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index 12f4197e9e7..ac83a21ac6f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2015 MongoDB, Inc.
+ * Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -15,6 +15,22 @@
} while (0)
/*
+ * A note on error handling: main split functions first allocate/initialize new
+ * structures; failures during that period are handled by discarding the memory
+ * and returning an error code, the caller knows the split didn't happen and
+ * proceeds accordingly. Second, split functions update the tree, and a failure
+ * in that period is catastrophic, any partial update to the tree requires a
+ * panic, we can't recover. Third, once the split is complete and the tree has
+ * been fully updated, we have to ignore most errors, the split is complete and
+ * correct, callers have to proceed accordingly.
+ */
+typedef enum {
+ WT_ERR_IGNORE, /* Ignore minor errors */
+ WT_ERR_PANIC, /* Panic on all errors */
+ WT_ERR_RETURN /* Clean up and return error */
+} WT_SPLIT_ERROR_PHASE;
+
+/*
* __split_oldest_gen --
* Calculate the oldest active split generation.
*/
@@ -512,25 +528,13 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex;
WT_REF **alloc_refp;
WT_REF **child_refp, *ref, **root_refp;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, root_decr, root_incr, size;
- uint64_t split_gen;
+ uint64_t recno, split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
- /*
- * A note on error handling: this function first allocates/initializes
- * new structures; failures during that period are handled by discarding
- * the memory and returning an error code, our caller knows the split
- * didn't happen and proceeds accordingly. Second, this function updates
- * the tree, and a failure in that period is catastrophic, any partial
- * update to the tree requires a panic, we can't recover. Third, once
- * the split is complete and the tree has been fully updated, we have to
- * ignore most errors because the split is complete and correct, callers
- * have to proceed accordingly.
- */
- enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@@ -539,7 +543,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@@ -589,8 +593,11 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
+
+ recno = root->type == WT_PAGE_COL_INT ?
+ (*root_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, root->type, 0, slots, false, &child));
+ session, root->type, recno, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -605,12 +612,10 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
root_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*root_refp)->key.recno;
+ ref->key.recno = recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (root->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*root_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
@@ -623,7 +628,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* threads may be underneath us right now changing the structure
* state.) However, if the WT_REF structures reference on-page
* information, we have to fix that, because the disk image for
- * the page that has an page index entry for the WT_REF is about
+ * the page that has a page index entry for the WT_REF is about
* to change.
*/
child_pindex = WT_INTL_INDEX_GET_SAFE(child);
@@ -641,7 +646,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, false);
@@ -661,7 +666,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__split_ref_step2(session, alloc_index, false));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@@ -687,15 +692,15 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_page_modify_set(session, root);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during root page split to deepen the tree");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during root page split "
@@ -721,19 +726,21 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_PAGE *parent;
WT_PAGE_INDEX *alloc_index, *pindex;
WT_REF **alloc_refp, *next_ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t parent_decr, size;
uint64_t split_gen;
- uint32_t i, j;
+ uint32_t hint, i, j;
uint32_t deleted_entries, parent_entries, result_entries;
uint32_t *deleted_refs;
- bool complete, empty_parent;
+ bool empty_parent;
parent = ref->home;
alloc_index = pindex = NULL;
parent_decr = 0;
parent_entries = 0;
- complete = empty_parent = false;
+ empty_parent = false;
+ complete = WT_ERR_RETURN;
/* The parent page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, parent));
@@ -751,7 +758,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* array anyway. Switch them to the special split state, so that any
* reading thread will restart.
*/
- WT_RET(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
+ WT_ERR(__wt_scr_alloc(session, 10 * sizeof(uint32_t), &scr));
for (deleted_entries = 0, i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
WT_ASSERT(session, next_ref->state != WT_REF_SPLIT);
@@ -791,28 +798,40 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Allocate and initialize a new page index array for the parent, then
* copy references from the original index array, plus references from
* the newly created split array, into place.
+ *
+ * Update the WT_REF's page-index hint as we go. This can race with a
+ * thread setting the hint based on an older page-index, and the change
+ * isn't backed out in the case of an error, so there ways for the hint
+ * to be wrong; OK because it's just a hint.
*/
size = sizeof(WT_PAGE_INDEX) + result_entries * sizeof(WT_REF *);
WT_ERR(__wt_calloc(session, 1, size, &alloc_index));
parent_incr += size;
alloc_index->index = (WT_REF **)(alloc_index + 1);
alloc_index->entries = result_entries;
- for (alloc_refp = alloc_index->index, i = 0; i < parent_entries; ++i) {
+ for (alloc_refp = alloc_index->index,
+ hint = i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref == ref)
for (j = 0; j < new_entries; ++j) {
ref_new[j]->home = parent;
+ ref_new[j]->pindex_hint = hint++;
*alloc_refp++ = ref_new[j];
}
- else if (next_ref->state != WT_REF_SPLIT)
+ else if (next_ref->state != WT_REF_SPLIT) {
/* Skip refs we have marked for deletion. */
+ next_ref->pindex_hint = hint++;
*alloc_refp++ = next_ref;
+ }
}
/* Check that we filled in all the entries. */
WT_ASSERT(session,
alloc_refp - alloc_index->index == (ptrdiff_t)result_entries);
+ /* Start making real changes to the tree, errors are fatal. */
+ complete = WT_ERR_PANIC;
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -853,16 +872,8 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
*/
WT_FULL_BARRIER();
- /*
- * A note on error handling: failures before we swapped the new page
- * index into the parent can be resolved by freeing allocated memory
- * because the original page is unchanged, we can continue to use it
- * and we have not yet modified the parent. Failures after we swap
- * the new page index into the parent are also relatively benign, the
- * split is OK and complete. For those reasons, we ignore errors past
- * this point unless there's a panic.
- */
- complete = true;
+ /* The split is complete and correct, ignore benign errors. */
+ complete = WT_ERR_IGNORE;
WT_ERR(__wt_verbose(session, WT_VERB_SPLIT,
"%p: %s %s" "split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -946,7 +957,8 @@ err: __wt_scr_free(session, &scr);
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
- if (!complete) {
+ switch (complete) {
+ case WT_ERR_RETURN:
for (i = 0; i < parent_entries; ++i) {
next_ref = pindex->index[i];
if (next_ref->state == WT_REF_SPLIT)
@@ -954,20 +966,28 @@ err: __wt_scr_free(session, &scr);
}
__wt_free_ref_index(session, NULL, alloc_index, false);
-
/*
* The split couldn't proceed because the parent would be empty,
* return EBUSY so our caller knows to unlock the WT_REF that's
* being deleted, but don't be noisy, there's nothing wrong.
*/
if (empty_parent)
- return (EBUSY);
+ ret = EBUSY;
+ break;
+ case WT_ERR_PANIC:
+ __wt_err(session, ret, "fatal error during parent page split");
+ ret = WT_PANIC;
+ break;
+ case WT_ERR_IGNORE:
+ if (ret != 0 && ret != WT_PANIC) {
+ __wt_err(session, ret,
+ "ignoring not-fatal error during parent page "
+ "split");
+ ret = 0;
+ }
+ break;
}
-
- if (ret != 0 && ret != WT_PANIC)
- __wt_err(session, ret,
- "ignoring not-fatal error during parent page split");
- return (ret == WT_PANIC || !complete ? ret : 0);
+ return (ret);
}
/*
@@ -983,25 +1003,13 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_PAGE_INDEX *alloc_index, *child_pindex, *pindex, *replace_index;
WT_REF **alloc_refp;
WT_REF **child_refp, *page_ref, **page_refp, *ref;
+ WT_SPLIT_ERROR_PHASE complete;
size_t child_incr, page_decr, page_incr, parent_incr, size;
- uint64_t split_gen;
+ uint64_t recno, split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
void *p;
- /*
- * A note on error handling: this function first allocates/initializes
- * new structures; failures during that period are handled by discarding
- * the memory and returning an error code, our caller knows the split
- * didn't happen and proceeds accordingly. Second, this function updates
- * the tree, and a failure in that period is catastrophic, any partial
- * update to the tree requires a panic, we can't recover. Third, once
- * the split is complete and the tree has been fully updated, we have to
- * ignore most errors because the split is complete and correct, callers
- * have to proceed accordingly.
- */
- enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
-
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
@@ -1012,7 +1020,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
- complete = ERR_RETURN;
+ complete = WT_ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@@ -1081,8 +1089,11 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
slots = i == children - 1 ? remain : chunk;
+
+ recno = page->type == WT_PAGE_COL_INT ?
+ (*page_refp)->key.recno : WT_RECNO_OOB;
WT_ERR(__wt_page_alloc(
- session, page->type, 0, slots, false, &child));
+ session, page->type, recno, slots, false, &child));
/*
* Initialize the page's child reference; we need a copy of the
@@ -1097,12 +1108,10 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_row_ikey(session, 0, p, size, ref));
parent_incr += sizeof(WT_IKEY) + size;
} else
- ref->key.recno = (*page_refp)->key.recno;
+ ref->key.recno = recno;
ref->state = WT_REF_MEM;
/* Initialize the child page. */
- if (page->type == WT_PAGE_COL_INT)
- child->pg_intl_recno = (*page_refp)->key.recno;
child->pg_intl_parent_ref = ref;
/* Mark it dirty. */
@@ -1133,7 +1142,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
- complete = ERR_PANIC;
+ complete = WT_ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, true);
@@ -1157,7 +1166,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__split_ref_step2(session, alloc_index, true));
/* The split is complete and correct, ignore benign errors. */
- complete = ERR_IGNORE;
+ complete = WT_ERR_IGNORE;
/*
* Push out the changes: not required for correctness, but no reason
@@ -1193,16 +1202,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_page_modify_set(session, page);
err: switch (complete) {
- case ERR_RETURN:
+ case WT_ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
break;
- case ERR_PANIC:
+ case WT_ERR_PANIC:
__wt_err(session, ret,
"fatal error during internal page split");
ret = WT_PANIC;
break;
- case ERR_IGNORE:
+ case WT_ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during internal page "
@@ -1654,10 +1663,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
WT_DECL_RET;
WT_DECL_ITEM(key);
WT_INSERT *ins, **insp, *moved_ins, *prev_ins;
- WT_INSERT_HEAD *ins_head;
+ WT_INSERT_HEAD *ins_head, *tmp_ins_head;
WT_PAGE *page, *right;
WT_REF *child, *split_ref[2] = { NULL, NULL };
size_t page_decr, parent_incr, right_incr;
+ uint8_t type;
int i;
WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
@@ -1666,6 +1676,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
page = ref->page;
right = NULL;
page_decr = parent_incr = right_incr = 0;
+ type = page->type;
/*
* Assert splitting makes sense; specifically assert the page is dirty,
@@ -1679,9 +1690,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
F_SET_ATOMIC(page, WT_PAGE_SPLIT_INSERT);
/* Find the last item on the page. */
- ins_head = page->pg_row_entries == 0 ?
- WT_ROW_INSERT_SMALLEST(page) :
- WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ if (type == WT_PAGE_ROW_LEAF)
+ ins_head = page->pg_row_entries == 0 ?
+ WT_ROW_INSERT_SMALLEST(page) :
+ WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
+ else
+ ins_head = WT_COL_APPEND(page);
moved_ins = WT_SKIP_LAST(ins_head);
/*
@@ -1692,14 +1706,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* The new WT_REF is not quite identical: we have to instantiate a key,
* and the new reference is visible to readers once the split completes.
*
- * The key-instantiation code checks for races, leave the key fields
- * zeroed we don't trigger them.
- *
* Don't copy any deleted page state: we may be splitting a page that
* was instantiated after a truncate and that history should not be
* carried onto these new child pages.
*/
WT_ERR(__wt_calloc_one(session, &split_ref[0]));
+ parent_incr += sizeof(WT_REF);
child = split_ref[0];
child->page = ref->page;
child->home = ref->home;
@@ -1713,49 +1725,82 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
*/
ref->addr = NULL;
- /*
- * Copy the first key from the original page into first ref in the new
- * parent. Pages created in memory always have a "smallest" insert
- * list, so look there first. If we don't find one, get the first key
- * from the disk image.
- *
- * We can't just use the key from the original ref: it may have been
- * suffix-compressed, and after the split the truncated key may not be
- * valid.
- */
- WT_ERR(__wt_scr_alloc(session, 0, &key));
- if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
- key->data = WT_INSERT_KEY(ins);
- key->size = WT_INSERT_KEY_SIZE(ins);
+ if (type == WT_PAGE_ROW_LEAF) {
+ /*
+ * Copy the first key from the original page into first ref in
+ * the new parent. Pages created in memory always have a
+ * "smallest" insert list, so look there first. If we don't
+ * find one, get the first key from the disk image.
+ *
+ * We can't just use the key from the original ref: it may have
+ * been suffix-compressed, and after the split the truncated key
+ * may not be valid.
+ */
+ WT_ERR(__wt_scr_alloc(session, 0, &key));
+ if ((ins =
+ WT_SKIP_FIRST(WT_ROW_INSERT_SMALLEST(page))) != NULL) {
+ key->data = WT_INSERT_KEY(ins);
+ key->size = WT_INSERT_KEY_SIZE(ins);
+ } else
+ WT_ERR(__wt_row_leaf_key(
+ session, page, &page->pg_row_d[0], key, true));
+ WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
+ parent_incr += sizeof(WT_IKEY) + key->size;
+ __wt_scr_free(session, &key);
} else
- WT_ERR(__wt_row_leaf_key(
- session, page, &page->pg_row_d[0], key, true));
- WT_ERR(__wt_row_ikey(session, 0, key->data, key->size, child));
- parent_incr += sizeof(WT_REF) + sizeof(WT_IKEY) + key->size;
- __wt_scr_free(session, &key);
+ child->key.recno = ref->key.recno;
/*
* The second page in the split is a new WT_REF/page pair.
*/
- WT_ERR(__wt_page_alloc(session, WT_PAGE_ROW_LEAF, 0, 0, false, &right));
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
- WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
+ if (type == WT_PAGE_ROW_LEAF)
+ WT_ERR(__wt_page_alloc(session,
+ type, WT_RECNO_OOB, 0, false, &right));
+ else
+ WT_ERR(__wt_page_alloc(session,
+ type, WT_INSERT_RECNO(moved_ins), 0, false, &right));
+
+ /*
+ * The new page is dirty by definition, column-store splits update the
+ * page-modify structure, so create it now.
+ */
+ WT_ERR(__wt_page_modify_init(session, right));
+ __wt_page_modify_set(session, right);
+
+ if (type == WT_PAGE_ROW_LEAF) {
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins));
+ WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0]));
+ } else {
+ WT_ERR(__wt_calloc_one(session, &right->modify->mod_append));
+ WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0]));
+ }
right_incr += sizeof(WT_INSERT_HEAD);
right_incr += sizeof(WT_INSERT_HEAD *);
WT_ERR(__wt_calloc_one(session, &split_ref[1]));
+ parent_incr += sizeof(WT_REF);
child = split_ref[1];
child->page = right;
child->state = WT_REF_MEM;
- WT_ERR(__wt_row_ikey(session, 0,
- WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
- child));
- parent_incr +=
- sizeof(WT_REF) + sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
- /* The new page is dirty by definition. */
- WT_ERR(__wt_page_modify_init(session, right));
- __wt_page_modify_set(session, right);
+ if (type == WT_PAGE_ROW_LEAF) {
+ WT_ERR(__wt_row_ikey(session, 0,
+ WT_INSERT_KEY(moved_ins), WT_INSERT_KEY_SIZE(moved_ins),
+ child));
+ parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins);
+ } else
+ child->key.recno = WT_INSERT_RECNO(moved_ins);
+
+ /*
+ * Allocation operations completed, we're going to split.
+ *
+ * Record the split column-store page record, used in reconciliation.
+ */
+ if (type != WT_PAGE_ROW_LEAF) {
+ WT_ASSERT(session,
+ page->modify->mod_split_recno == WT_RECNO_OOB);
+ page->modify->mod_split_recno = child->key.recno;
+ }
/*
* We modified the page above, which will have set the first dirty
@@ -1779,15 +1824,16 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
page_decr, right_incr, __wt_update_list_memsize(moved_ins->upd));
/*
- * Allocation operations completed, move the last insert list item from
- * the original page to the new page.
+ * Move the last insert list item from the original page to the new
+ * page.
*
* First, update the item to the new child page. (Just append the entry
* for simplicity, the previous skip list pointers originally allocated
* can be ignored.)
*/
- right->pg_row_ins[0]->head[0] =
- right->pg_row_ins[0]->tail[0] = moved_ins;
+ tmp_ins_head = type == WT_PAGE_ROW_LEAF ?
+ right->pg_row_ins[0] : right->modify->mod_append[0];
+ tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins;
/*
* Remove the entry from the orig page (i.e truncate the skip list).
@@ -1872,34 +1918,40 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_cache_page_inmem_incr(session, right, right_incr);
/*
- * Split into the parent. After this, the original page is no
+ * Split into the parent. On successful return, the original page is no
* longer locked, so we cannot safely look at it.
*/
page = NULL;
if ((ret = __split_parent(
- session, ref, split_ref, 2, parent_incr, false, true)) != 0) {
- /*
- * Move the insert list element back to the original page list.
- * For simplicity, the previous skip list pointers originally
- * allocated can be ignored, just append the entry to the end of
- * the level 0 list. As before, we depend on the list having
- * multiple elements and ignore the edge cases small lists have.
- */
- right->pg_row_ins[0]->head[0] =
- right->pg_row_ins[0]->tail[0] = NULL;
- ins_head->tail[0]->next[0] = moved_ins;
- ins_head->tail[0] = moved_ins;
+ session, ref, split_ref, 2, parent_incr, false, true)) == 0)
+ return (0);
- /*
- * We marked the new page dirty; we're going to discard it, but
- * first mark it clean and fix up the cache statistics.
- */
- __wt_page_modify_clear(session, right);
+ /*
+ * Failure.
+ *
+ * Reset the split column-store page record.
+ */
+ page->modify->mod_split_recno = WT_RECNO_OOB;
- WT_ERR(ret);
- }
+ /*
+ * Clear the allocated page's reference to the moved insert list element
+ * so it's not freed when we discard the page.
+ *
+ * Move the element back to the original page list. For simplicity, the
+ * previous skip list pointers originally allocated can be ignored, just
+ * append the entry to the end of the level 0 list. As before, we depend
+ * on the list having multiple elements and ignore the edge cases small
+ * lists have.
+ */
+ if (type == WT_PAGE_ROW_LEAF)
+ right->pg_row_ins[0]->head[0] =
+ right->pg_row_ins[0]->tail[0] = NULL;
+ else
+ right->modify->mod_append[0]->head[0] =
+ right->modify->mod_append[0]->tail[0] = NULL;
- return (0);
+ ins_head->tail[0]->next[0] = moved_ins;
+ ins_head->tail[0] = moved_ins;
err: if (split_ref[0] != NULL) {
/*
@@ -1907,15 +1959,23 @@ err: if (split_ref[0] != NULL) {
*/
ref->addr = split_ref[0]->addr;
- __wt_free(session, split_ref[0]->key.ikey);
+ if (type == WT_PAGE_ROW_LEAF)
+ __wt_free(session, split_ref[0]->key.ikey);
__wt_free(session, split_ref[0]);
}
if (split_ref[1] != NULL) {
- __wt_free(session, split_ref[1]->key.ikey);
+ if (type == WT_PAGE_ROW_LEAF)
+ __wt_free(session, split_ref[1]->key.ikey);
__wt_free(session, split_ref[1]);
}
- if (right != NULL)
+ if (right != NULL) {
+ /*
+ * We marked the new page dirty; we're going to discard it,
+ * but first mark it clean and fix up the cache statistics.
+ */
+ __wt_page_modify_clear(session, right);
__wt_page_out(session, &right);
+ }
__wt_scr_free(session, &key);
return (ret);
}