summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
author Keith Bostic <keith@wiredtiger.com> 2014-03-03 14:22:15 -0500
committer Keith Bostic <keith@wiredtiger.com> 2014-03-03 14:22:15 -0500
commit 07fd9661ed3efe8121ff615aa25713763ab19421 (patch)
tree 4116c2c45fb35a73244482869dd22b3c202d977b /src
parent 4a35b129f111896666044ab8f9c08c886270428d (diff)
download mongo-07fd9661ed3efe8121ff615aa25713763ab19421.tar.gz
Support eviction of pages that include unresolved changes.
Split the page being evicted into two groups of blocks: blocks with no unresolved changes, which can be written (cold blocks), and blocks with unresolved changes, which cannot be written (hot blocks). Cold blocks are written by reconciliation as usual, and the resulting block address is copied into the WT_PAGE.modify structure for eviction's use. For hot blocks, the disk image of each block is copied into the WT_PAGE.modify structure, along with a list of WT_UPDATE structures, one for each unresolved change that prevented the page from being written. Eviction then uses the disk image to create a new in-memory page, and uses the list of WT_UPDATE structures to re-create the list of unresolved changes on that page. Both cold and hot blocks are then moved into a WT_REF array, which is split into the parent page.
Diffstat (limited to 'src')
-rw-r--r-- src/btree/bt_cursor.c 86
-rw-r--r-- src/btree/bt_discard.c 7
-rw-r--r-- src/btree/col_modify.c 98
-rw-r--r-- src/btree/col_srch.c 20
-rw-r--r-- src/btree/rec_evict.c 185
-rw-r--r-- src/btree/rec_write.c 243
-rw-r--r-- src/btree/row_modify.c 94
-rw-r--r-- src/btree/row_srch.c 21
-rw-r--r-- src/include/btmem.h 41
-rw-r--r-- src/include/btree.i 4
-rw-r--r-- src/include/extern.h 28
-rw-r--r-- src/include/flags.h 2
-rw-r--r-- src/include/txn.i 40
-rw-r--r-- src/include/wt_internal.h 2
14 files changed, 525 insertions, 346 deletions
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 7e9919f1a1a..aed17e0e8b4 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -139,6 +139,50 @@ __cursor_invalid(WT_CURSOR_BTREE *cbt)
}
/*
+ * __cursor_col_search --
+ * Column-store search from an application cursor.
+ */
+static inline int
+__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ return (__wt_col_search(session, cbt->iface.recno, NULL, cbt));
+}
+
+/*
+ * __cursor_row_search --
+ * Row-store search from an application cursor.
+ */
+static inline int
+__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+{
+ return (__wt_row_search(session, &cbt->iface.key, NULL, cbt));
+}
+
+/*
+ * __cursor_col_modify --
+ * Column-store delete, insert, and update from an application cursor.
+ */
+static inline int
+__cursor_col_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_col_modify(session,
+ cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
+ * __cursor_row_modify --
+ * Row-store insert, update and delete from an application cursor.
+ */
+static inline int
+__cursor_row_modify(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+{
+ return (__wt_row_modify(session,
+ cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
+}
+
+/*
* __wt_btcur_reset --
* Invalidate the cursor position.
*/
@@ -184,8 +228,8 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
WT_RET(__cursor_func_init(cbt, 1));
WT_ERR(btree->type == BTREE_ROW ?
- __wt_row_search(session, cbt) :
- __wt_col_search(session, cbt));
+ __cursor_row_search(session, cbt) :
+ __cursor_col_search(session, cbt));
if (cbt->compare != 0 || __cursor_invalid(cbt)) {
/*
* Creating a record past the end of the tree in a fixed-length
@@ -233,8 +277,8 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
WT_RET(__cursor_func_init(cbt, 1));
WT_ERR(btree->type == BTREE_ROW ?
- __wt_row_search(session, cbt) :
- __wt_col_search(session, cbt));
+ __cursor_row_search(session, cbt) :
+ __cursor_col_search(session, cbt));
/*
* Creating a record past the end of the tree in a fixed-length column-
@@ -263,8 +307,8 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
exact = 1;
else {
WT_ERR(btree->type == BTREE_ROW ?
- __wt_row_search(session, cbt) :
- __wt_col_search(session, cbt));
+ __cursor_row_search(session, cbt) :
+ __cursor_col_search(session, cbt));
if (!__cursor_invalid(cbt)) {
exact = cbt->compare;
ret = __wt_kv_return(session, cbt);
@@ -325,7 +369,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
cbt->iface.recno = UINT64_MAX;
- WT_ERR(__wt_col_search(session, cbt));
+ WT_ERR(__cursor_col_search(session, cbt));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
cbt->iface.recno = 0;
@@ -341,12 +385,12 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
(cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
WT_ERR(WT_DUPLICATE_KEY);
- WT_ERR(__wt_col_modify(session, cbt, 0));
+ WT_ERR(__cursor_col_modify(session, cbt, 0));
if (F_ISSET(cursor, WT_CURSTD_APPEND))
cbt->iface.recno = cbt->recno;
break;
case BTREE_ROW:
- WT_ERR(__wt_row_search(session, cbt));
+ WT_ERR(__cursor_row_search(session, cbt));
/*
* If not overwriting, fail if the key exists, else insert the
* key/value pair.
@@ -355,7 +399,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
cbt->compare == 0 && !__cursor_invalid(cbt))
WT_ERR(WT_DUPLICATE_KEY);
- ret = __wt_row_modify(session, cbt, 0);
+ ret = __cursor_row_modify(session, cbt, 0);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -398,7 +442,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
switch (btree->type) {
case BTREE_COL_FIX:
case BTREE_COL_VAR:
- WT_ERR(__wt_col_search(session, cbt));
+ WT_ERR(__cursor_col_search(session, cbt));
/* Remove the record if it exists. */
if (cbt->compare != 0 || __cursor_invalid(cbt)) {
@@ -416,15 +460,15 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
*/
cbt->recno = cursor->recno;
} else
- ret = __wt_col_modify(session, cbt, 1);
+ ret = __cursor_col_modify(session, cbt, 1);
break;
case BTREE_ROW:
/* Remove the record if it exists. */
- WT_ERR(__wt_row_search(session, cbt));
+ WT_ERR(__cursor_row_search(session, cbt));
if (cbt->compare != 0 || __cursor_invalid(cbt))
WT_ERR(WT_NOTFOUND);
- ret = __wt_row_modify(session, cbt, 1);
+ ret = __cursor_row_modify(session, cbt, 1);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -480,7 +524,7 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
switch (btree->type) {
case BTREE_COL_FIX:
case BTREE_COL_VAR:
- WT_ERR(__wt_col_search(session, cbt));
+ WT_ERR(__cursor_col_search(session, cbt));
/*
* If not overwriting, fail if the key doesn't exist. Update
@@ -493,17 +537,17 @@ retry: WT_RET(__cursor_func_init(cbt, 1));
(cbt->compare != 0 || __cursor_invalid(cbt)) &&
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
- ret = __wt_col_modify(session, cbt, 0);
+ ret = __cursor_col_modify(session, cbt, 0);
break;
case BTREE_ROW:
- WT_ERR(__wt_row_search(session, cbt));
+ WT_ERR(__cursor_row_search(session, cbt));
/*
* If not overwriting, fail if the key does not exist.
*/
if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
(cbt->compare != 0 || __cursor_invalid(cbt)))
WT_ERR(WT_NOTFOUND);
- ret = __wt_row_modify(session, cbt, 0);
+ ret = __cursor_row_modify(session, cbt, 0);
break;
WT_ILLEGAL_VALUE_ERR(session);
}
@@ -744,11 +788,11 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
switch (btree->type) {
case BTREE_COL_FIX:
WT_ERR(__cursor_truncate_fix(
- session, start, stop, __wt_col_modify));
+ session, start, stop, __cursor_col_modify));
break;
case BTREE_COL_VAR:
WT_ERR(__cursor_truncate(
- session, start, stop, __wt_col_modify));
+ session, start, stop, __cursor_col_modify));
break;
case BTREE_ROW:
/*
@@ -766,7 +810,7 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
if (stop != NULL)
WT_ERR(__wt_btcur_search(stop));
WT_ERR(__cursor_truncate(
- session, start, stop, __wt_row_modify));
+ session, start, stop, __cursor_row_modify));
break;
}
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index 5c484293023..22b8ec3c763 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -352,12 +352,13 @@ __free_update_list(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
WT_UPDATE *next;
- do {
- next = upd->next;
+ for (; upd != NULL; upd = next) {
/* Everything we free should be visible to everyone. */
WT_ASSERT(session,
upd->txnid == WT_TXN_ABORTED ||
__wt_txn_visible_all(session, upd->txnid));
+
+ next = upd->next;
__wt_free(session, upd);
- } while ((upd = next) != NULL);
+ }
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index f7cc34b6126..ede35b556dd 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -15,25 +15,26 @@ static int __col_insert_alloc(
* Column-store delete, insert, and update.
*/
int
-__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ uint64_t recno, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head, **ins_headp;
- WT_ITEM *value, _value;
+ WT_ITEM _value;
WT_PAGE *page;
- WT_UPDATE *old_upd, *upd;
+ WT_UPDATE *old_upd;
size_t ins_size, upd_size;
- uint64_t recno;
u_int i, skipdepth;
int append, logged;
btree = cbt->btree;
+ ins = NULL;
page = cbt->page;
- recno = cbt->iface.recno;
append = logged = 0;
+ /* This code expects a remove to have a NULL value. */
if (is_remove) {
if (btree->type == BTREE_COL_FIX) {
value = &_value;
@@ -42,8 +43,6 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
} else
value = NULL;
} else {
- value = &cbt->iface.value;
-
/*
* There's some chance the application specified a record past
* the last record on the page. If that's the case, and we're
@@ -60,9 +59,6 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
/* If we don't yet have a modify structure, we'll need one. */
WT_RET(__wt_page_modify_init(session, page));
- ins = NULL;
- upd = NULL;
-
/*
* Delete, insert or update a column-store entry.
*
@@ -75,13 +71,20 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
* the WT_INSERT structure.
*/
if (cbt->compare == 0 && cbt->ins != NULL) {
- /* Make sure the update can proceed. */
- WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));
-
- /* Allocate the WT_UPDATE structure and transaction ID. */
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, cbt, upd));
- logged = 1;
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = cbt->ins->upd));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
+ } else {
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ old_upd = cbt->ins->upd;
+ }
/*
* Point the new WT_UPDATE item to the next element in the list.
@@ -119,22 +122,23 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
/*
* Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
- * update the cursor to reference it.
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
*/
WT_ERR(__col_insert_alloc(
session, recno, skipdepth, &ins, &ins_size));
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- ins->upd = upd;
- ins_size += upd_size;
-
- /*
- * Update the cursor: the insert head may have been allocated,
- * the ins field was allocated.
- */
cbt->ins_head = ins_head;
cbt->ins = ins;
- WT_ERR(__wt_txn_modify(session, cbt, upd));
- logged = 1;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
+ } else
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ ins->upd = upd;
+ ins_size += upd_size;
/*
* If there was no insert list during the search, or there was
@@ -207,41 +211,3 @@ __col_insert_alloc(WT_SESSION_IMPL *session,
*ins_sizep = ins_size;
return (0);
}
-
-/*
- * __wt_col_leaf_obsolete --
- * Discard all obsolete updates on a column-store leaf page.
- */
-void
-__wt_col_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_COL *cip;
- WT_INSERT *ins;
- WT_UPDATE *upd;
- uint32_t i;
-
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page))
- if ((upd = __wt_update_obsolete_check(
- session, ins->upd)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
- break;
-
- case WT_PAGE_COL_VAR:
- WT_COL_FOREACH(page, cip, i)
- WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip))
- if ((upd = __wt_update_obsolete_check(
- session, ins->upd)) != NULL)
- __wt_update_obsolete_free(
- session, page, upd);
- break;
- }
-
- /* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
- if ((upd =
- __wt_update_obsolete_check(session, ins->upd)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
- }
-}
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index a2c2d804e02..e1a7664a38e 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -12,7 +12,8 @@
* Search a column-store tree for a specific record-based key.
*/
int
-__wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+__wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno, WT_PAGE *leaf_page, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_COL *cip;
@@ -22,15 +23,22 @@ __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *ref;
- uint64_t recno;
uint32_t base, indx, limit;
int depth;
btree = S2BT(session);
- recno = cbt->iface.recno;
__cursor_search_clear(cbt);
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf_page != NULL) {
+ page = leaf_page;
+ goto leaf_only;
+ }
+
restart:
/* Search the internal pages of the tree. */
ref = NULL;
@@ -97,13 +105,11 @@ descend: WT_ASSERT(session, ref != NULL);
return (ret);
}
- /*
- * We want to know how deep the tree gets because excessive depth can
- * happen because of how WiredTiger splits.
- */
+ /* Track how deep the tree gets. */
if (depth > btree->maximum_depth)
btree->maximum_depth = depth;
+leaf_only:
cbt->page = page;
cbt->recno = recno;
cbt->compare = 0;
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index 18125135e48..cf5ba1851b9 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -409,6 +409,153 @@ err: __wt_free(session, alloc_index);
}
/*
+ * __wt_multi_inmem_build --
+ * Instantiate a page in a multi-block set, when an update couldn't be
+ * written.
+ */
+static int
+__wt_multi_inmem_build(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, WT_MULTI *multi)
+{
+ WT_CURSOR_BTREE cbt;
+ WT_ITEM key;
+ WT_PAGE *new;
+ WT_UPDATE **updp;
+ WT_UPD_SKIPPED *skip;
+ uint64_t recno;
+ uint32_t i;
+
+ WT_CLEAR(key);
+
+ /*
+ * When a page is evicted, we can find unresolved updates, which cannot
+ * be written. We simply fail those evictions in most cases, but one
+ * case we must handle is when forcibly evicting a page grown too-large
+ * because the application inserted lots of new records. In that case,
+ * the page is expected to split into many on-disk chunks we write, plus
+ * some on-disk chunks we don't write. This code deals with the latter:
+ * any chunk we didn't write is re-created as a page, and then we apply
+ * the unresolved updates to that page.
+ *
+ * Create an in-memory version of the page, and link it to its parent.
+ */
+ WT_RET(__wt_page_inmem(session, NULL, NULL, multi->skip_dsk, 0, &new));
+ ref->page = new;
+
+ /* Re-create each modification we couldn't write. */
+ for (i = 0, skip = multi->skip; i < multi->skip_entries; ++i, ++skip) {
+ switch (page->type) {
+ case WT_PAGE_COL_FIX:
+ case WT_PAGE_COL_VAR:
+ /* Build a key. */
+ recno = WT_INSERT_RECNO(skip->head);
+
+ /* Search the page. */
+ WT_RET(__wt_col_search(session, recno, new, &cbt));
+
+ /* Apply the modification. */
+ WT_RET(__wt_col_modify(session, &cbt, recno,
+ NULL, skip->upd,
+ WT_UPDATE_DELETED_ISSET(skip->upd)));
+ break;
+ case WT_PAGE_ROW_LEAF:
+ /* Build a key. */
+ if (skip->is_insert) {
+ key.data = WT_INSERT_KEY(skip->head);
+ key.size =
+ WT_INSERT_KEY_SIZE(skip->head);
+ } else
+ WT_RET(__wt_row_leaf_key(session,
+ page, skip->head, &key, 0));
+
+ /* Search the page. */
+ WT_RET(__wt_row_search(session, &key, new, &cbt));
+
+ /* Apply the modification. */
+ WT_RET(__wt_row_modify(session, &cbt, &key,
+ NULL, skip->upd,
+ WT_UPDATE_DELETED_ISSET(skip->upd)));
+ break;
+ WT_ILLEGAL_VALUE(session);
+ }
+
+ /*
+ * XXXKEITH: the WT_UPDATE now appears on two lists. If
+ * we succeed, discarding the original page frees it, if
+ * we fail, discarding the created page will free it.
+ * A session aborting a transaction can modify it at any
+ * time, and this is a problem. For now, I'm removing
+ * it from the original page, but this problem needs to
+ * be revisited once we decide this whole approach is a
+ * viable one.
+ */
+ if (skip->is_insert)
+ updp = &((WT_INSERT *)skip->head)->upd;
+ else
+ updp = &page->pg_row_upd[
+ WT_ROW_SLOT(page, skip->head)];
+ for (; *updp != NULL; updp = &(*updp)->next)
+ if (*updp == skip->upd)
+ break;
+ WT_ASSERT(session, *updp != NULL);
+ *updp = (*updp)->next;
+ }
+
+ WT_LINK_PAGE(page->parent, ref, new);
+
+ return (0);
+}
+
+/*
+ * __wt_multi_to_ref --
+ * Move a multi-block list into an array of WT_REF structures.
+ */
+int
+__wt_multi_to_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page, WT_MULTI *multi, WT_REF *refarg, uint32_t entries)
+{
+ WT_ADDR *addr;
+ WT_DECL_RET;
+ WT_REF *ref;
+ uint32_t i;
+
+ addr = NULL;
+ for (ref = refarg, i = 0; i < entries; ++multi, ++ref, ++i) {
+ if (multi->skip == NULL) {
+ WT_ERR(__wt_calloc_def(session, 1, &addr));
+ ref->addr = addr;
+ addr->size = multi->addr.size;
+ addr->type = multi->addr.type;
+ WT_ERR(__wt_strndup(session,
+ multi->addr.addr, addr->size = multi->addr.size,
+ &addr->addr));
+ } else
+ WT_ERR(
+ __wt_multi_inmem_build(session, page, ref, multi));
+
+ switch (page->type) {
+ case WT_PAGE_ROW_INT:
+ case WT_PAGE_ROW_LEAF:
+ WT_ERR(__wt_strndup(session,
+ multi->key.ikey,
+ multi->key.ikey->size + sizeof(WT_IKEY),
+ &ref->key.ikey));
+ break;
+ default:
+ ref->key.recno = multi->key.recno;
+ break;
+ }
+
+ ref->txnid = 0;
+ ref->state = ref->page == NULL ? WT_REF_DISK : WT_REF_MEM;
+ }
+ return (0);
+
+err: __wt_free_ref_array(session, page, refarg, entries);
+ return (ret);
+}
+
+/*
* __rec_split_evict --
* Resolve a page split, inserting new information into the parent.
*/
@@ -442,7 +589,7 @@ __rec_split_evict(WT_SESSION_IMPL *session, WT_REF *parent_ref, WT_PAGE *page)
__wt_page_only_modify_set(session, parent);
/*
- * Allocate an array of WT_REF structures, and copy the page's multiple
+ * Allocate an array of WT_REF structures, and move the page's multiple
* block reconciliation information into it.
*/
WT_RET(__wt_calloc_def(session, mod->multi_entries, &alloc_ref));
@@ -685,7 +832,6 @@ __rec_review(WT_SESSION_IMPL *session,
WT_REF *ref, WT_PAGE *page, int exclusive, int top, int *istree)
{
WT_BTREE *btree;
- WT_DECL_RET;
WT_PAGE_MODIFY *mod;
WT_PAGE *t;
@@ -830,43 +976,22 @@ ckpt: WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
* know the final state.
*/
if (__wt_page_is_modified(page)) {
- ret = __wt_rec_write(session, page,
- NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);
+ WT_RET(__wt_rec_write(session, page,
+ NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_RESTORE));
/*
* Update the page's modification reference, reconciliation
* might have changed it.
+ *
+ * XXXKEITH: I don't think this is true, I don't think the
+ * page's modify reference ever moves (or can move).
*/
mod = page->modify;
- if (ret == EBUSY) {
- /* Give up if there are unwritten changes */
- WT_VERBOSE_RET(session, evict,
- "eviction failed, reconciled page"
- " contained active updates");
-
- /*
- * We may be able to discard any "update" memory the
- * page no longer needs.
- */
- switch (page->type) {
- case WT_PAGE_COL_FIX:
- case WT_PAGE_COL_VAR:
- __wt_col_leaf_obsolete(session, page);
- break;
- case WT_PAGE_ROW_LEAF:
- __wt_row_leaf_obsolete(session, page);
- break;
- }
- }
- WT_RET(ret);
-
- WT_ASSERT(session, !__wt_page_is_modified(page));
}
/*
- * If the page is clean, but was ever modified, make sure all of the
- * updates on the page are old enough that they can be discarded from
- * cache.
+ * If the page was ever modified, make sure all of the updates on the
+ * page are old enough that they can be discarded from cache.
*/
if (!exclusive && mod != NULL &&
!__wt_txn_visible_all(session, mod->rec_max_txn))
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index eddc482a444..bb0893db9a5 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -139,6 +139,11 @@ typedef struct {
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t cksum; /* Split's checksum */
+ void *dsk; /* Split's disk image */
+
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_next;
+ size_t skip_allocated;
/*
* The key for a row-store page; no column-store key is needed
@@ -394,53 +399,6 @@ __wt_rec_write(WT_SESSION_IMPL *session,
}
/*
- * __wt_multi_to_ref --
- * Copy a list of blocks into an array of WT_REF structures.
- */
-int
-__wt_multi_to_ref(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_MULTI *multi, WT_REF *refarg, uint32_t entries)
-{
- WT_ADDR *addr;
- WT_DECL_RET;
- WT_REF *ref;
- uint32_t i;
-
- addr = NULL;
- for (ref = refarg, i = 0; i < entries; ++multi, ++ref, ++i) {
- ref->page = NULL;
-
- WT_ERR(__wt_calloc_def(session, 1, &addr));
- ref->addr = addr;
- addr->size = multi->addr.size;
- addr->type = multi->addr.type;
- WT_ERR(__wt_strndup(session,
- multi->addr.addr, addr->size = multi->addr.size,
- &addr->addr));
-
- switch (page->type) {
- case WT_PAGE_ROW_INT:
- case WT_PAGE_ROW_LEAF:
- WT_ERR(__wt_strndup(session,
- multi->key.ikey,
- multi->key.ikey->size + sizeof(WT_IKEY),
- &ref->key.ikey));
- break;
- default:
- ref->key.recno = multi->key.recno;
- break;
- }
-
- ref->txnid = 0;
- ref->state = WT_REF_DISK;
- }
- return (0);
-
-err: __wt_free_ref_array(session, page, refarg, entries);
- return (ret);
-}
-
-/*
* __rec_root_write --
* Handle the write of a root page.
*/
@@ -550,8 +508,12 @@ __rec_write_init(
WT_ASSERT(session, bnd->addr.addr == NULL);
bnd->addr.size = 0;
bnd->addr.type = 0;
-
bnd->size = bnd->cksum = 0;
+ __wt_free(session, bnd->dsk);
+
+ __wt_free(session, bnd->skip);
+ bnd->skip_next = 0;
+ bnd->skip_allocated = 0;
/* Leave the key alone, it's space we re-use. */
@@ -696,13 +658,11 @@ __rec_txn_skip_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r)
{
r->upd_skipped = 1;
- switch (F_ISSET(r, WT_SKIP_UPDATE_ERR | WT_SKIP_UPDATE_QUIT)) {
+ switch (F_ISSET(r, WT_SKIP_UPDATE_ERR | WT_SKIP_UPDATE_RESTORE)) {
case WT_SKIP_UPDATE_ERR:
WT_PANIC_RETX(
session, "reconciliation illegally skipped an update");
- case WT_SKIP_UPDATE_QUIT:
- WT_STAT_FAST_CONN_INCR(session, rec_skipped_update);
- WT_STAT_FAST_DATA_INCR(session, rec_skipped_update);
+ case WT_SKIP_UPDATE_RESTORE:
return (EBUSY);
case 0:
default:
@@ -712,33 +672,78 @@ __rec_txn_skip_chk(WT_SESSION_IMPL *session, WT_RECONCILE *r)
}
/*
+ * __rec_upd_skip_save --
+ * Save a key/WT_UPDATE pair for later restoration.
+ */
+static int
+__rec_upd_skip_save(WT_SESSION_IMPL *session,
+ WT_RECONCILE *r, void *head, int is_insert, WT_UPDATE *upd)
+{
+ WT_BOUNDARY *bnd;
+
+ bnd = &r->bnd[r->bnd_next];
+ WT_RET(__wt_realloc_def(
+ session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
+ bnd->skip[bnd->skip_next].upd = upd;
+ bnd->skip[bnd->skip_next].head = head;
+ bnd->skip[bnd->skip_next].is_insert = is_insert;
+ ++bnd->skip_next;
+ return (0);
+}
+
+/*
* __rec_txn_read --
- * Return the update structure that's visible, or fail if there's a change
- * that's not globally visible and we can't skip changes.
+ * Return the first visible update in a list (or NULL if none are visible).
+ * Track the maximum transaction ID in the list and whether updates were skipped
+ * to find the visible update, and optionally save away skipped updates.
*/
static inline int
-__rec_txn_read(
- WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *upd, WT_UPDATE **updp)
+__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
+ void *head, int is_insert, WT_UPDATE *upd, WT_UPDATE **updp)
{
- int skip, retried;
+ *updp = NULL;
+ for (; upd != NULL; upd = upd->next) {
+ if (upd->txnid == WT_TXN_ABORTED)
+ continue;
- retried = 0;
-retry: *updp = __wt_txn_read_skip(session, upd, &r->max_txn, &skip);
- if (!skip)
- return (0);
+ /*
+ * Track the largest transaction ID on this page. We store this
+ * in the page at the end of reconciliation if no updates are
+ * skipped, and use it to avoid evicting clean pages from memory
+ * with changes that are required to satisfy a snapshot read.
+ */
+ if (TXNID_LT(r->max_txn, upd->txnid))
+ r->max_txn = upd->txnid;
- /*
- * If skipping this update will cause reconciliation to quit, update
- * the oldest transaction ID and retry, in case some transactions have
- * committed while we have been working.
- */
- if (F_ISSET(r, WT_SKIP_UPDATE_QUIT) && !retried) {
- __wt_txn_update_oldest(session);
- retried = 1;
- goto retry;
+ if (*updp != NULL)
+ continue;
+ if (__wt_txn_visible(session, upd->txnid)) {
+ *updp = upd;
+ continue;
+ }
+
+ /*
+ * Record whether any updates were skipped on the way to finding
+ * the first visible update. That determines if a future read
+ * with no intervening modifications to the page could see a
+ * different value. If not, the page can safely be marked clean
+ * and does not need to be reconciled until modified again.
+ */
+ r->upd_skipped = 1;
+
+ /*
+ * If evicting and there's an update that's in-flight, save the
+ * information about the update so we can restore it on a newly
+ * instantiated page. It's tricky: a transaction references
+ * the physical WT_UPDATE, so we must move the structure itself,
+ * not a copy of it.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
+ WT_RET(__rec_upd_skip_save(
+ session, r, head, is_insert, upd));
}
- return (__rec_txn_skip_chk(session, r));
+ return (0);
}
/*
@@ -1889,6 +1894,19 @@ __rec_split_finish_std(WT_SESSION_IMPL *session, WT_RECONCILE *r)
WT_ILLEGAL_VALUE(session);
}
+ /*
+ * If we're doing an eviction, and we skipped an update, it only pays
+ * off to continue if writing multiple blocks. This should be unlikely
+ * (why did eviction pick a small block that was recently written), but
+ * it's possible.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_RESTORE) &&
+ r->bnd_next == 0 && r->upd_skipped) {
+ WT_STAT_FAST_CONN_INCR(session, rec_skipped_update);
+ WT_STAT_FAST_DATA_INCR(session, rec_skipped_update);
+ return (EBUSY);
+ }
+
/* Set the boundary reference and increment the count. */
bnd = &r->bnd[r->bnd_next++];
bnd->entries = r->entries;
@@ -2063,6 +2081,17 @@ __rec_split_write(
WT_ILLEGAL_VALUE(session);
}
+ bnd->size = (uint32_t)buf->size;
+ bnd->cksum = 0;
+
+ /*
+ * If we had to skip updates in order to build this disk image, we can't
+ * actually write it. Instead, we will re-instantiate the page using the
+ * disk image and the list of updates we skipped.
+ */
+ if (bnd->skip != NULL)
+ return (__wt_strndup(session, buf->data, buf->size, &bnd->dsk));
+
/*
* If we wrote this block before, re-use it. Pages get written in the
* same block order every time, only check the appropriate slot. The
@@ -2073,8 +2102,6 @@ __rec_split_write(
* time, but that test won't calculate a checksum on the first block
* the first time the page splits.
*/
- bnd->size = (uint32_t)buf->size;
- bnd->cksum = 0;
if (mod->multi != NULL || r->bnd_next > 1) {
/*
* There are page header fields which need to be cleared to get
@@ -2545,7 +2572,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Update any changes to the original on-page data items. */
WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) {
- WT_RET(__rec_txn_read(session, r, ins->upd, &upd));
+ WT_RET(__rec_txn_read(session, r, ins, 1, ins->upd, &upd));
if (upd == NULL)
continue;
__bit_setv_recno(page, WT_INSERT_RECNO(ins),
@@ -2568,7 +2595,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
/* Walk any append list. */
append = WT_COL_APPEND(page);
WT_SKIP_FOREACH(ins, append) {
- WT_RET(__rec_txn_read(session, r, ins->upd, &upd));
+ WT_RET(__rec_txn_read(session, r, ins, 1, ins->upd, &upd));
if (upd == NULL)
continue;
for (;;) {
@@ -2889,8 +2916,8 @@ record_loop: /*
n < nrepeat; n += repeat_count, src_recno += repeat_count) {
upd = NULL;
if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
- WT_ERR(
- __rec_txn_read(session, r, ins->upd, &upd));
+ WT_ERR(__rec_txn_read(
+ session, r, ins, 1, ins->upd, &upd));
ins = WT_SKIP_NEXT(ins);
}
if (upd != NULL) {
@@ -3048,7 +3075,7 @@ compare: /*
/* Walk any append list. */
append = WT_COL_APPEND(page);
WT_SKIP_FOREACH(ins, append) {
- WT_ERR(__rec_txn_read(session, r, ins->upd, &upd));
+ WT_ERR(__rec_txn_read(session, r, ins, 1, ins->upd, &upd));
if (upd == NULL)
continue;
for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) {
@@ -3460,8 +3487,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
dictionary = 0;
if ((val_cell = __wt_row_leaf_value(page, rip)) != NULL)
__wt_cell_unpack(val_cell, unpack);
- WT_ERR(
- __rec_txn_read(session, r, WT_ROW_UPDATE(page, rip), &upd));
+ WT_ERR(__rec_txn_read(
+ session, r, rip, 0, WT_ROW_UPDATE(page, rip), &upd));
if (upd == NULL) {
/*
* When the page was read into memory, there may not
@@ -3723,7 +3750,7 @@ __rec_row_leaf(WT_SESSION_IMPL *session,
leaf_insert: /* Write any K/V pairs inserted into the page after this key. */
if ((ins = WT_SKIP_FIRST(WT_ROW_INSERT(page, rip))) != NULL)
- WT_ERR(__rec_row_leaf_insert(session, r, ins));
+ WT_ERR(__rec_row_leaf_insert(session, r, ins));
}
/* Write the remnant page. */
@@ -3752,7 +3779,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins)
for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) {
/* Build value cell. */
- WT_RET(__rec_txn_read(session, r, ins->upd, &upd));
+ WT_RET(__rec_txn_read(session, r, ins, 1, ins->upd, &upd));
if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd))
continue;
if (upd->size == 0)
@@ -4077,6 +4104,14 @@ err: __wt_scr_free(&tkey);
* there's no risk of that happening).
*/
if (r->upd_skipped) {
+ /*
+ * In some cases (for example, when closing a file), there had
+ * better not be any updates we can't write.
+ */
+ if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
+ WT_PANIC_RETX(session,
+ "reconciliation illegally skipped an update");
+
btree->modified = 1;
WT_FULL_BARRIER();
}
@@ -4160,17 +4195,25 @@ __rec_split_row(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
for (multi = mod->multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
- multi->addr = bnd->addr;
- bnd->addr.addr = NULL;
- incr += sizeof(WT_ADDR) + multi->addr.size;
-
WT_RET(__wt_row_ikey(session, 0,
bnd->key.data, bnd->key.size, &multi->key.ikey));
incr += sizeof(WT_IKEY) + bnd->key.size;
- multi->size = bnd->size;
- multi->cksum = bnd->cksum;
- multi->reuse = 0;
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ bnd->addr.addr = NULL;
+ incr += sizeof(WT_ADDR) + multi->addr.size;
+
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ multi->reuse = 0;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
}
__wt_cache_page_inmem_incr(session, page, incr);
@@ -4202,15 +4245,23 @@ __rec_split_col(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page)
for (multi = mod->multi,
bnd = r->bnd, i = 0; i < r->bnd_next; ++multi, ++bnd, ++i) {
- multi->addr = bnd->addr;
- bnd->addr.addr = NULL;
- incr += sizeof(WT_ADDR) + multi->addr.size;
-
multi->key.recno = bnd->recno;
- multi->size = bnd->size;
- multi->cksum = bnd->cksum;
- multi->reuse = 0;
+ if (bnd->skip == NULL) {
+ multi->addr = bnd->addr;
+ bnd->addr.addr = NULL;
+ incr += sizeof(WT_ADDR) + multi->addr.size;
+
+ multi->size = bnd->size;
+ multi->cksum = bnd->cksum;
+ multi->reuse = 0;
+ } else {
+ multi->skip = bnd->skip;
+ multi->skip_entries = bnd->skip_next;
+ bnd->skip = NULL;
+ multi->skip_dsk = bnd->dsk;
+ bnd->dsk = NULL;
+ }
}
__wt_cache_page_inmem_incr(session, page, incr);
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index 7798144effa..16c4caaea8d 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -12,31 +12,30 @@
* Row-store insert, update and delete.
*/
int
-__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
+__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
{
WT_DECL_RET;
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head, **ins_headp;
- WT_ITEM *key, *value;
WT_PAGE *page;
- WT_UPDATE *old_upd, *upd, **upd_entry;
+ WT_UPDATE *old_upd, **upd_entry;
size_t ins_size, upd_size;
uint32_t ins_slot;
u_int i, skipdepth;
int logged;
- key = &cbt->iface.key;
- value = is_remove ? NULL : &cbt->iface.value;
-
+ ins = NULL;
page = cbt->page;
+ logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove)
+ value = NULL;
/* If we don't yet have a modify structure, we'll need one. */
WT_RET(__wt_page_modify_init(session, page));
- ins = NULL;
- upd = NULL;
- logged = 0;
-
/*
* Modify: allocate an update array as necessary, build a WT_UPDATE
* structure, and call a serialized function to insert the WT_UPDATE
@@ -57,13 +56,20 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
} else
upd_entry = &cbt->ins->upd;
- /* Make sure the update can proceed. */
- WT_ERR(__wt_txn_update_check(session, old_upd = *upd_entry));
-
- /* Allocate the WT_UPDATE structure and transaction ID. */
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- WT_ERR(__wt_txn_modify(session, cbt, upd));
- logged = 1;
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = *upd_entry));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
+ } else {
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ old_upd = *upd_entry;
+ }
/*
* Point the new WT_UPDATE item to the next element in the list.
@@ -103,22 +109,23 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
/*
* Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
- * update the cursor to reference it.
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
*/
WT_ERR(__wt_row_insert_alloc(
session, key, skipdepth, &ins, &ins_size));
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
- ins->upd = upd;
- ins_size += upd_size;
-
- /*
- * Update the cursor: the WT_INSERT_HEAD might be allocated,
- * the WT_INSERT was allocated.
- */
cbt->ins_head = ins_head;
cbt->ins = ins;
- WT_ERR(__wt_txn_modify(session, cbt, upd));
- logged = 1;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, cbt, upd));
+ logged = 1;
+ } else
+ upd_size = sizeof(WT_UPDATE) + upd->size;
+ ins->upd = upd;
+ ins_size += upd_size;
/*
* If there was no insert list during the search, the cursor's
@@ -281,34 +288,3 @@ __wt_update_obsolete_free(
if (size != 0)
__wt_cache_page_inmem_decr(session, page, size);
}
-
-/*
- * __wt_row_leaf_obsolete --
- * Discard all obsolete updates on a row-store leaf page.
- */
-void
-__wt_row_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page)
-{
- WT_INSERT *ins;
- WT_ROW *rip;
- WT_UPDATE *upd;
- uint32_t i;
-
- /* For entries before the first on-page record... */
- WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
- if ((upd =
- __wt_update_obsolete_check(session, ins->upd)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
-
- /* For each entry on the page... */
- WT_ROW_FOREACH(page, rip, i) {
- if ((upd = __wt_update_obsolete_check(
- session, WT_ROW_UPDATE(page, rip))) != NULL)
- __wt_update_obsolete_free(session, page, upd);
-
- WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
- if ((upd = __wt_update_obsolete_check(
- session, ins->upd)) != NULL)
- __wt_update_obsolete_free(session, page, upd);
- }
-}
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 1a4aa9cf78e..53b49ffd10d 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -113,11 +113,12 @@ __wt_search_insert(WT_SESSION_IMPL *session,
* Search a row-store tree for a specific key.
*/
int
-__wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
+__wt_row_search(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key, WT_PAGE *leaf_page, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_DECL_RET;
- WT_ITEM *item, _item, *srch_key;
+ WT_ITEM *item, _item;
WT_PAGE *page;
WT_PAGE_INDEX *pindex;
WT_REF *ref;
@@ -130,7 +131,6 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
rip = NULL;
match = 0; /* -Wuninitialized */
- srch_key = &cbt->iface.key;
__cursor_search_clear(cbt);
item = &_item;
@@ -147,6 +147,15 @@ restart:
*/
skiphigh = skiplow = 0;
+ /*
+ * In the service of eviction splits, we're only searching a single leaf
+ * page, not a full tree.
+ */
+ if (leaf_page != NULL) {
+ page = leaf_page;
+ goto leaf_only;
+ }
+
/* Search the internal pages of the tree. */
cmp = -1;
for (depth = 2,
@@ -324,13 +333,11 @@ descend: WT_ASSERT(session, ref != NULL);
return (ret);
}
- /*
- * We want to know how deep the tree gets because excessive depth can
- * happen because of how WiredTiger splits.
- */
+ /* Track how deep the tree gets. */
if (depth > btree->maximum_depth)
btree->maximum_depth = depth;
+leaf_only:
/*
* Binary search of the leaf page. There are two versions (a default
* loop and an application-specified collation loop), because moving
diff --git a/src/include/btmem.h b/src/include/btmem.h
index 03fdf4e0b35..d5e5f7bc337 100644
--- a/src/include/btmem.h
+++ b/src/include/btmem.h
@@ -184,6 +184,23 @@ struct __wt_ovfl_txnc {
};
/*
+ * WT_UPD_SKIPPED --
+ * When a page is reconciled, there may be updates that cannot be written.
+ * Those updates are copied and then restored when the page is re-instantiated.
+ */
+struct __wt_upd_skipped {
+ WT_UPDATE *upd; /* Skipped update */
+
+ /*
+ * Skipped updates have to be moved to another page, so they come with
+ * either a pointer to the WT_INSERT list, or a pointer to a row-store
+ * leaf page update list.
+ */
+ void *head; /* (WT_UPDATE **) or (WT_INSERT *) */
+ uint8_t is_insert; /* Presumably non-zero if head is a (WT_INSERT *); confirm in rec_write.c */
+};
+
+/*
* WT_PAGE_MODIFY --
* When a page is modified, there's additional information to maintain.
*/
@@ -211,14 +228,25 @@ struct __wt_page_modify {
*/
WT_ADDR replace; /* Single replacement block */
struct __wt_multi { /* Multiple replacement blocks */
- WT_ADDR addr; /* Address */
union {
uint64_t recno; /* Column-store: starting recno */
WT_IKEY *ikey; /* Row-store: variable-length key */
} key;
- uint32_t cksum; /* Checksum */
+
+ /*
+ * XXXKEITH
+ * These two sets of fields should be a union: only one set is ever
+ * filled in, either the block address or the skipped-update fields.
+ */
+ WT_UPD_SKIPPED *skip; /* Skipped updates */
+ uint32_t skip_entries;
+ void *skip_dsk; /* Page's disk image */
+
+ WT_ADDR addr; /* Address */
uint32_t size; /* Size */
- uint8_t reuse; /* Being reused */
+ uint32_t cksum; /* Checksum */
+ uint8_t reuse; /* Being reused */
+
} *multi;
uint32_t multi_entries; /* Multi-block element count */
size_t multi_size; /* Multi-block memory footprint */
@@ -483,6 +511,7 @@ struct __wt_page {
#define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */
#define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */
#define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */
+#define WT_PAGE_EVICT_FORCE 0x10 /* Page being forcibly evicted */
uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */
};
@@ -816,10 +845,10 @@ struct __wt_insert {
} key;
} u;
-#define WT_INSERT_KEY_SIZE(ins) ((ins)->u.key.size)
+#define WT_INSERT_KEY_SIZE(ins) (((WT_INSERT *)(ins))->u.key.size)
#define WT_INSERT_KEY(ins) \
- ((void *)((uint8_t *)(ins) + (ins)->u.key.offset))
-#define WT_INSERT_RECNO(ins) ((ins)->u.recno)
+ ((void *)((uint8_t *)(ins) + ((WT_INSERT *)(ins))->u.key.offset))
+#define WT_INSERT_RECNO(ins) (((WT_INSERT *)(ins))->u.recno)
WT_INSERT *next[0]; /* forward-linked skip list */
};
diff --git a/src/include/btree.i b/src/include/btree.i
index 42e33eecfd5..43cae6e8eb2 100644
--- a/src/include/btree.i
+++ b/src/include/btree.i
@@ -784,7 +784,9 @@ __wt_eviction_force(WT_SESSION_IMPL *session, WT_PAGE *page)
* give other transactions a chance to complete.
*/
__wt_txn_update_oldest(session);
- if (__wt_txn_visible_all(session, page->modify->update_txn)) {
+ if (!F_ISSET_ATOMIC(page, WT_PAGE_EVICT_FORCE) ||
+ __wt_txn_visible_all(session, page->modify->update_txn)) {
+ F_SET_ATOMIC(page, WT_PAGE_EVICT_FORCE);
page->read_gen = WT_READ_GEN_OLDEST;
WT_RET(__wt_page_release(session, page));
WT_RET(__wt_evict_server_wake(session));
diff --git a/src/include/extern.h b/src/include/extern.h
index 29dfd0bf3db..1cb4c5fb795 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -378,12 +378,22 @@ extern int __wt_tree_walk(WT_SESSION_IMPL *session,
uint32_t flags);
extern int __wt_col_modify(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
+ uint64_t recno,
+ WT_ITEM *value,
+ WT_UPDATE *upd,
int is_remove);
-extern void __wt_col_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page);
-extern int __wt_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_col_search(WT_SESSION_IMPL *session,
+ uint64_t recno,
+ WT_PAGE *leaf_page,
+ WT_CURSOR_BTREE *cbt);
extern int __wt_rec_evict(WT_SESSION_IMPL *session,
WT_PAGE **pagep,
int exclusive);
+extern int __wt_multi_to_ref(WT_SESSION_IMPL *session,
+ WT_PAGE *page,
+ WT_MULTI *multi,
+ WT_REF *refarg,
+ uint32_t entries);
extern int __wt_btree_new_modified_page(WT_SESSION_IMPL *session,
uint8_t type,
uint32_t entries,
@@ -429,11 +439,6 @@ extern int __wt_rec_write(WT_SESSION_IMPL *session,
WT_PAGE *page,
WT_SALVAGE_COOKIE *salvage,
uint32_t flags);
-extern int __wt_multi_to_ref(WT_SESSION_IMPL *session,
- WT_PAGE *page,
- WT_MULTI *multi,
- WT_REF *refarg,
- uint32_t entries);
extern int __wt_rec_bulk_init(WT_CURSOR_BULK *cbulk);
extern int __wt_rec_bulk_wrapup(WT_CURSOR_BULK *cbulk);
extern int __wt_rec_row_bulk_insert(WT_CURSOR_BULK *cbulk);
@@ -462,6 +467,9 @@ extern int __wt_row_ikey(WT_SESSION_IMPL *session,
void *ikeyp);
extern int __wt_row_modify(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key,
+ WT_ITEM *value,
+ WT_UPDATE *upd,
int is_remove);
extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session,
WT_ITEM *key,
@@ -477,12 +485,14 @@ extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session,
extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session,
WT_PAGE *page,
WT_UPDATE *upd);
-extern void __wt_row_leaf_obsolete(WT_SESSION_IMPL *session, WT_PAGE *page);
extern int __wt_search_insert(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *cbt,
WT_INSERT_HEAD *inshead,
WT_ITEM *srch_key);
-extern int __wt_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
+extern int __wt_row_search(WT_SESSION_IMPL *session,
+ WT_ITEM *srch_key,
+ WT_PAGE *leaf_page,
+ WT_CURSOR_BTREE *cbt);
extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt);
extern int __wt_config_initn( WT_SESSION_IMPL *session,
WT_CONFIG *conf,
diff --git a/src/include/flags.h b/src/include/flags.h
index 038c1ef03eb..69c87a00230 100644
--- a/src/include/flags.h
+++ b/src/include/flags.h
@@ -28,7 +28,7 @@
#define WT_SESSION_SALVAGE_CORRUPT_OK 0x00000002
#define WT_SESSION_SCHEMA_LOCKED 0x00000001
#define WT_SKIP_UPDATE_ERR 0x00000002
-#define WT_SKIP_UPDATE_QUIT 0x00000001
+#define WT_SKIP_UPDATE_RESTORE 0x00000001
#define WT_SYNC_CHECKPOINT 0x00000008
#define WT_SYNC_DISCARD 0x00000004
#define WT_SYNC_DISCARD_NOWRITE 0x00000002
diff --git a/src/include/txn.i b/src/include/txn.i
index 2543d5ff21f..df4a18f5ba1 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -165,46 +165,6 @@ __wt_txn_visible(WT_SESSION_IMPL *session, uint64_t id)
}
/*
- * __wt_txn_read_skip --
- * Get the first visible update in a list (or NULL if none are visible).
- * Report the maximum transaction ID in the list and whether any updates
- * were skipped to find the visible update.
- */
-static inline WT_UPDATE *
-__wt_txn_read_skip(
- WT_SESSION_IMPL *session, WT_UPDATE *upd, uint64_t *max_txn, int *skipp)
-{
- WT_UPDATE *first_upd;
-
- /*
- * Track the largest transaction ID on this page. We store this in the
- * page at the end of reconciliation if no updates are skipped. It is
- * used to avoid evicting a clean page from memory with changes that
- * are required to satisfy a snapshot read.
- *
- * Record whether any updates were skipped on the way to finding the
- * first visible update. That determines whether a future read with no
- * intervening modifications to the page could see a different value.
- * If not, the page can safely be marked clean, and does not need to be
- * reconciled until it is modified again.
- */
- *skipp = 0;
- for (first_upd = NULL; upd != NULL; upd = upd->next)
- if (upd->txnid != WT_TXN_ABORTED) {
- if (TXNID_LT(*max_txn, upd->txnid))
- *max_txn = upd->txnid;
- if (first_upd == NULL) {
- if (__wt_txn_visible(session, upd->txnid))
- first_upd = upd;
- else
- *skipp = 1;
- }
- }
-
- return (first_upd);
-}
-
-/*
* __wt_txn_read --
* Get the first visible update in a list (or NULL if none are visible).
*/
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index b30ad80d5a3..e50a5509ec9 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -223,6 +223,8 @@ struct __wt_txn_op;
typedef struct __wt_txn_op WT_TXN_OP;
struct __wt_txn_state;
typedef struct __wt_txn_state WT_TXN_STATE;
+struct __wt_upd_skipped;
+ typedef struct __wt_upd_skipped WT_UPD_SKIPPED;
struct __wt_update;
typedef struct __wt_update WT_UPDATE;
/*