summaryrefslogtreecommitdiff
path: root/src/btree
diff options
context:
space:
mode:
Diffstat (limited to 'src/btree')
-rw-r--r--src/btree/bt_compact.c12
-rw-r--r--src/btree/bt_curnext.c10
-rw-r--r--src/btree/bt_curprev.c10
-rw-r--r--src/btree/bt_cursor.c248
-rw-r--r--src/btree/bt_debug.c8
-rw-r--r--src/btree/bt_delete.c4
-rw-r--r--src/btree/bt_discard.c4
-rw-r--r--src/btree/bt_handle.c22
-rw-r--r--src/btree/bt_huffman.c2
-rw-r--r--src/btree/bt_io.c2
-rw-r--r--src/btree/bt_misc.c2
-rw-r--r--src/btree/bt_ovfl.c2
-rw-r--r--src/btree/bt_page.c2
-rw-r--r--src/btree/bt_random.c5
-rw-r--r--src/btree/bt_read.c26
-rw-r--r--src/btree/bt_rebalance.c2
-rw-r--r--src/btree/bt_ret.c12
-rw-r--r--src/btree/bt_slvg.c2
-rw-r--r--src/btree/bt_split.c182
-rw-r--r--src/btree/bt_stat.c25
-rw-r--r--src/btree/bt_sync.c26
-rw-r--r--src/btree/bt_upgrade.c2
-rw-r--r--src/btree/bt_vrfy.c2
-rw-r--r--src/btree/bt_vrfy_dsk.c2
-rw-r--r--src/btree/bt_walk.c101
-rw-r--r--src/btree/col_modify.c44
-rw-r--r--src/btree/col_srch.c2
-rw-r--r--src/btree/row_key.c4
-rw-r--r--src/btree/row_modify.c62
-rw-r--r--src/btree/row_srch.c2
30 files changed, 370 insertions, 459 deletions
diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c
index 2edcac76d0b..c6a412aa84e 100644
--- a/src/btree/bt_compact.c
+++ b/src/btree/bt_compact.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -60,7 +60,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
*/
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writelock(session, &page->page_lock);
+ WT_PAGE_LOCK(session, page);
if (mod->rec_result == WT_PM_REC_REPLACE)
ret = bm->compact_page_skip(bm, session,
@@ -80,7 +80,7 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
if (mod->rec_result == WT_PM_REC_REPLACE ||
mod->rec_result == WT_PM_REC_MULTIBLOCK)
- __wt_writeunlock(session, &page->page_lock);
+ WT_PAGE_UNLOCK(session, page);
return (ret);
}
@@ -228,12 +228,8 @@ __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
bm, session, addr, addr_size, skipp);
}
- /*
- * Reset the WT_REF state and push the change. The full-barrier isn't
- * necessary, but it's better to keep pages in circulation than not.
- */
+ /* Reset the WT_REF state. */
ref->state = WT_REF_DISK;
- WT_FULL_BARRIER();
return (ret);
}
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index 21e575ffca9..7b92a58991d 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -142,7 +142,7 @@ new_page: if (cbt->ins == NULL)
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
continue;
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -205,7 +205,7 @@ new_page: /* Find the matching WT_COL slot. */
upd = cbt->ins == NULL ?
NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd != NULL) {
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -325,7 +325,7 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
new_insert: if ((ins = cbt->ins) != NULL) {
if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
continue;
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -358,7 +358,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
cbt->slot = cbt->row_iteration_slot / 2 - 1;
rip = &page->pg_row[cbt->slot];
upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
- if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd != NULL && upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index bf4bdad6529..55b5095fe91 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -288,7 +288,7 @@ new_page: if (cbt->ins == NULL)
__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
continue;
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -352,7 +352,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
upd = cbt->ins == NULL ?
NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd != NULL) {
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -482,7 +482,7 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
new_insert: if ((ins = cbt->ins) != NULL) {
if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
continue;
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
@@ -517,7 +517,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
cbt->slot = cbt->row_iteration_slot / 2 - 1;
rip = &page->pg_row[cbt->slot];
upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
- if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd != NULL && upd->type == WT_UPDATE_DELETED) {
if (__wt_txn_visible_all(session, upd->txnid))
++cbt->page_deleted_count;
continue;
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 944e276fc01..52435eeefed 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -64,29 +64,6 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt)
}
/*
- * __cursor_copy_int_key --
- * If we're pointing into the tree, save the key into local memory.
- */
-static inline int
-__cursor_copy_int_key(WT_CURSOR *cursor)
-{
- /*
- * We're about to discard the cursor's position and the cursor layer
- * might retry the operation. We discard pinned pages on error, which
- * will invalidate pinned keys. Clear WT_CURSTD_KEY_INT in all cases,
- * the underlying page is gone whether we can allocate memory or not.
- */
- if (F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
- F_CLR(cursor, WT_CURSTD_KEY_INT);
- if (!WT_DATA_IN_ITEM(&cursor->key))
- WT_RET(__wt_buf_set((WT_SESSION_IMPL *)cursor->session,
- &cursor->key, cursor->key.data, cursor->key.size));
- F_SET(cursor, WT_CURSTD_KEY_EXT);
- }
- return (0);
-}
-
-/*
* __cursor_size_chk --
* Return if an inserted item is too large.
*/
@@ -247,7 +224,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
*/
if (cbt->ins != NULL &&
(upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
- if (WT_UPDATE_DELETED_ISSET(upd))
+ if (upd->type == WT_UPDATE_DELETED)
return (false);
if (updp != NULL)
*updp = upd;
@@ -320,7 +297,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
page->modify->mod_row_update != NULL &&
(upd = __wt_txn_read(session,
page->modify->mod_row_update[cbt->slot])) != NULL) {
- if (WT_UPDATE_DELETED_ISSET(upd))
+ if (upd->type == WT_UPDATE_DELETED)
return (false);
if (updp != NULL)
*updp = upd;
@@ -366,10 +343,10 @@ __cursor_row_search(
*/
static inline int
__cursor_col_modify(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove)
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
- return (__wt_col_modify(session,
- cbt, cbt->iface.recno, &cbt->iface.value, NULL, is_remove));
+ return (__wt_col_modify(session, cbt,
+ cbt->iface.recno, &cbt->iface.value, NULL, modify_type, false));
}
/*
@@ -378,10 +355,10 @@ __cursor_col_modify(
*/
static inline int
__cursor_row_modify(
- WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, bool is_remove)
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, u_int modify_type)
{
- return (__wt_row_modify(session,
- cbt, &cbt->iface.key, &cbt->iface.value, NULL, is_remove));
+ return (__wt_row_modify(session, cbt,
+ &cbt->iface.key, &cbt->iface.value, NULL, modify_type, false));
}
/*
@@ -431,10 +408,14 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
__cursor_state_save(cursor, &state);
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key, then re-save the cursor state.
+ * The pinned page goes away if we search the tree, get a local copy of
+ * any pinned key and discard any pinned value, then re-save the cursor
+ * state. Done before searching pinned pages (unlike other cursor
+ * functions), because we don't anticipate applications searching for a
+ * key they currently have pinned.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ __cursor_novalue(cursor);
__cursor_state_save(cursor, &state);
/*
@@ -516,10 +497,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
__cursor_state_save(cursor, &state);
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key, then re-save the cursor state.
+ * The pinned page goes away if we search the tree, get a local copy of
+ * any pinned key and discard any pinned value, then re-save the cursor
+ * state. Done before searching pinned pages (unlike other cursor
+ * functions), because we don't anticipate applications searching for a
+ * key they currently have pinned.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ __cursor_novalue(cursor);
__cursor_state_save(cursor, &state);
/*
@@ -640,8 +625,6 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
WT_STAT_DATA_INCRV(session,
cursor_insert_bytes, cursor->key.size + cursor->value.size);
- __cursor_state_save(cursor, &state);
-
if (btree->type == BTREE_ROW)
WT_RET(__cursor_size_chk(session, &cursor->key));
WT_RET(__cursor_size_chk(session, &cursor->value));
@@ -658,6 +641,9 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
append_key =
F_ISSET(cursor, WT_CURSTD_APPEND) && btree->type != BTREE_ROW;
+ /* Save the cursor state. */
+ __cursor_state_save(cursor, &state);
+
/*
* If inserting with overwrite configured, and positioned to an on-page
* key, the update doesn't require another search. The cursor won't be
@@ -676,28 +662,30 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
*/
cbt->compare = 0;
ret = btree->type == BTREE_ROW ?
- __cursor_row_modify(session, cbt, false) :
- __cursor_col_modify(session, cbt, false);
+ __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD) :
+ __cursor_col_modify(session, cbt, WT_UPDATE_STANDARD);
if (ret == 0)
goto done;
/*
- * The pinned page goes away if we fail for any reason, make
- * sure there's a local copy of any key. (Restart could still
+ * The pinned page goes away if we fail for any reason, get a
+ * local copy of any pinned key or value. (Restart could still
* use the pinned page, but that's an unlikely path.) Re-save
* the cursor state: we may retry but eventually fail.
*/
- WT_TRET(__cursor_copy_int_key(cursor));
+ WT_TRET(__cursor_localkey(cursor));
+ WT_TRET(__cursor_localvalue(cursor));
__cursor_state_save(cursor, &state);
goto err;
}
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key. Re-save the cursor state: we may retry but
+ * The pinned page goes away if we do a search, get a local copy of any
+ * pinned key or value. Re-save the cursor state: we may retry but
* eventually fail.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ WT_ERR(__cursor_localvalue(cursor));
__cursor_state_save(cursor, &state);
retry: WT_ERR(__cursor_func_init(cbt, true));
@@ -712,7 +700,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
cbt->compare == 0 && __wt_cursor_valid(cbt, NULL))
WT_ERR(WT_DUPLICATE_KEY);
- ret = __cursor_row_modify(session, cbt, false);
+ ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD);
} else {
/*
* Optionally insert a new record (ignoring the application's
@@ -735,7 +723,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
(cbt->compare != 0 && __cursor_fix_implicit(btree, cbt))))
WT_ERR(WT_DUPLICATE_KEY);
- WT_ERR(__cursor_col_modify(session, cbt, false));
+ WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD));
if (append_key)
cbt->iface.recno = cbt->recno;
@@ -812,12 +800,13 @@ __wt_btcur_insert_check(WT_CURSOR_BTREE *cbt)
session = (WT_SESSION_IMPL *)cursor->session;
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key. Unlike most of the btree cursor routines,
- * we don't have to save/restore the cursor key state, none of the
- * work done here changes the key state.
+ * The pinned page goes away if we do a search, get a local copy of any
+ * pinned key and discard any pinned value. Unlike most of the btree
+ * cursor routines, we don't have to save/restore the cursor key state,
+ * none of the work done here changes the cursor state.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ __cursor_novalue(cursor);
retry: WT_ERR(__cursor_func_init(cbt, true));
@@ -865,14 +854,15 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
WT_STAT_DATA_INCR(session, cursor_remove);
WT_STAT_DATA_INCRV(session, cursor_remove_bytes, cursor->key.size);
- __cursor_state_save(cursor, &state);
-
/*
* WT_CURSOR.remove has a unique semantic, the cursor stays positioned
* if it starts positioned, otherwise clear the cursor on completion.
*/
positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT);
+ /* Save the cursor state. */
+ __cursor_state_save(cursor, &state);
+
/*
* If remove positioned to an on-page key, the remove doesn't require
* another search. We don't care about the "overwrite" configuration
@@ -891,28 +881,33 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt)
*/
cbt->compare = 0;
ret = btree->type == BTREE_ROW ?
- __cursor_row_modify(session, cbt, true) :
- __cursor_col_modify(session, cbt, true);
+ __cursor_row_modify(session, cbt, WT_UPDATE_DELETED) :
+ __cursor_col_modify(session, cbt, WT_UPDATE_DELETED);
if (ret == 0)
goto done;
/*
- * The pinned page goes away if we fail for any reason, make
- * sure there's a local copy of any key. (Restart could still
- * use the pinned page, but that's an unlikely path.) Re-save
- * the cursor state: we may retry but eventually fail.
+ * The pinned page goes away if we fail for any reason, get a
+ * local copy of any pinned key and discard any value (remove
+ * discards any previous value on success or failure). (Restart
+ * could still use the pinned page, but that's an unlikely
+ * path.) Re-save the cursor state: we may retry but eventually
+ * fail.
*/
- WT_TRET(__cursor_copy_int_key(cursor));
+ WT_TRET(__cursor_localkey(cursor));
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
__cursor_state_save(cursor, &state);
goto err;
}
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key. Re-save the cursor state: we may retry but
- * eventually fail.
+ * The pinned page goes away if we do a search, get a local copy of any
+ * pinned key and discard any value (remove discards any previous
+ * value on success or failure). Re-save the cursor state: we may retry
+ * but eventually fail.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
__cursor_state_save(cursor, &state);
retry: WT_ERR(__cursor_func_init(cbt, true));
@@ -926,7 +921,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
- ret = __cursor_row_modify(session, cbt, true);
+ ret = __cursor_row_modify(session, cbt, WT_UPDATE_DELETED);
} else {
WT_ERR(__cursor_col_search(session, cbt, NULL));
@@ -953,7 +948,8 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
*/
cbt->recno = cursor->recno;
} else
- ret = __cursor_col_modify(session, cbt, true);
+ ret = __cursor_col_modify(
+ session, cbt, WT_UPDATE_DELETED);
}
err: if (ret == WT_RESTART) {
@@ -987,11 +983,11 @@ done: /*
}
/*
- * __wt_btcur_update --
+ * __btcur_update --
* Update a record in the tree.
*/
-int
-__wt_btcur_update(WT_CURSOR_BTREE *cbt)
+static int
+__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
{
WT_BTREE *btree;
WT_CURFILE_STATE state;
@@ -1003,19 +999,12 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
session = (WT_SESSION_IMPL *)cursor->session;
- WT_STAT_CONN_INCR(session, cursor_update);
- WT_STAT_DATA_INCR(session, cursor_update);
- WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);
-
- __cursor_state_save(cursor, &state);
-
- if (btree->type == BTREE_ROW)
- WT_RET(__cursor_size_chk(session, &cursor->key));
- WT_RET(__cursor_size_chk(session, &cursor->value));
-
/* It's no longer possible to bulk-load into the tree. */
__cursor_disable_bulk(session, btree);
+ /* Save the cursor state. */
+ __cursor_state_save(cursor, &state);
+
/*
* If update positioned to an on-page key, the update doesn't require
* another search. We don't care about the "overwrite" configuration
@@ -1033,28 +1022,30 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
*/
cbt->compare = 0;
ret = btree->type == BTREE_ROW ?
- __cursor_row_modify(session, cbt, false) :
- __cursor_col_modify(session, cbt, false);
+ __cursor_row_modify(session, cbt, modify_type) :
+ __cursor_col_modify(session, cbt, modify_type);
if (ret == 0)
goto done;
/*
- * The pinned page goes away if we fail for any reason, make
- * sure there's a local copy of any key. (Restart could still
+ * The pinned page goes away if we fail for any reason, get a
+ * local copy of any pinned key or value. (Restart could still
* use the pinned page, but that's an unlikely path.) Re-save
* the cursor state: we may retry but eventually fail.
*/
- WT_TRET(__cursor_copy_int_key(cursor));
+ WT_TRET(__cursor_localkey(cursor));
+ WT_TRET(__cursor_localvalue(cursor));
__cursor_state_save(cursor, &state);
goto err;
}
/*
- * The pinned page goes away if we do a search, make sure there's a
- * local copy of any key. Re-save the cursor state: we may retry but
+ * The pinned page goes away if we do a search, get a local copy of any
+ * pinned key or value. Re-save the cursor state: we may retry but
* eventually fail.
*/
- WT_ERR(__cursor_copy_int_key(cursor));
+ WT_ERR(__cursor_localkey(cursor));
+ WT_ERR(__cursor_localvalue(cursor));
__cursor_state_save(cursor, &state);
retry: WT_ERR(__cursor_func_init(cbt, true));
@@ -1070,7 +1061,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_row_modify(session, cbt, false);
+ ret = __cursor_row_modify(session, cbt, modify_type);
} else {
WT_ERR(__cursor_col_search(session, cbt, NULL));
@@ -1089,7 +1080,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_col_modify(session, cbt, false);
+ ret = __cursor_col_modify(session, cbt, modify_type);
}
err: if (ret == WT_RESTART) {
@@ -1106,8 +1097,14 @@ err: if (ret == WT_RESTART) {
* To make this work, we add a field to the btree cursor to pass back a
* pointer to the modify function's allocated update structure.
*/
-done: if (ret == 0)
- WT_TRET(__wt_kv_return(session, cbt, cbt->modify_update));
+done: if (ret == 0) {
+ if (modify_type == WT_UPDATE_RESERVED) {
+ F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ WT_TRET(__wt_key_return(session, cbt));
+ } else
+ WT_TRET(
+ __wt_kv_return(session, cbt, cbt->modify_update));
+ }
if (ret != 0) {
WT_TRET(__cursor_reset(cbt));
@@ -1118,6 +1115,59 @@ done: if (ret == 0)
}
/*
+ * __wt_btcur_reserve --
+ * Reserve a record in the tree.
+ */
+int
+__wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ bool overwrite;
+
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_CONN_INCR(session, cursor_reserve);
+ WT_STAT_DATA_INCR(session, cursor_reserve);
+
+ /* WT_CURSOR.reserve is update-without-overwrite and a special value. */
+ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+ ret = __btcur_update(cbt, WT_UPDATE_RESERVED);
+ if (overwrite)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+ return (ret);
+}
+
+/*
+ * __wt_btcur_update --
+ * Update a record in the tree.
+ */
+int
+__wt_btcur_update(WT_CURSOR_BTREE *cbt)
+{
+ WT_BTREE *btree;
+ WT_CURSOR *cursor;
+ WT_SESSION_IMPL *session;
+
+ btree = cbt->btree;
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_CONN_INCR(session, cursor_update);
+ WT_STAT_DATA_INCR(session, cursor_update);
+ WT_STAT_DATA_INCRV(session, cursor_update_bytes, cursor->value.size);
+
+ if (btree->type == BTREE_ROW)
+ WT_RET(__cursor_size_chk(session, &cursor->key));
+ WT_RET(__cursor_size_chk(session, &cursor->value));
+
+ return (__btcur_update(cbt, WT_UPDATE_STANDARD));
+}
+
+/*
* __wt_btcur_compare --
* Return a comparison between two cursors.
*/
@@ -1237,7 +1287,7 @@ __wt_btcur_equals(WT_CURSOR_BTREE *a_arg, WT_CURSOR_BTREE *b_arg, int *equalp)
static int
__cursor_truncate(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
- int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool))
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int))
{
WT_DECL_RET;
@@ -1265,7 +1315,7 @@ retry: WT_RET(__wt_btcur_search(start));
F_MASK((WT_CURSOR *)start, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
for (;;) {
- if ((ret = rmfunc(session, start, 1)) != 0)
+ if ((ret = rmfunc(session, start, WT_UPDATE_DELETED)) != 0)
break;
if (stop != NULL && __cursor_equals(start, stop))
@@ -1292,7 +1342,7 @@ retry: WT_RET(__wt_btcur_search(start));
static int
__cursor_truncate_fix(WT_SESSION_IMPL *session,
WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop,
- int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, bool))
+ int (*rmfunc)(WT_SESSION_IMPL *, WT_CURSOR_BTREE *, u_int))
{
WT_DECL_RET;
const uint8_t *value;
@@ -1323,7 +1373,7 @@ retry: WT_RET(__wt_btcur_search(start));
for (;;) {
value = (const uint8_t *)start->iface.value.data;
if (*value != 0 &&
- (ret = rmfunc(session, start, 1)) != 0)
+ (ret = rmfunc(session, start, WT_UPDATE_DELETED)) != 0)
break;
if (stop != NULL && __cursor_equals(start, stop))
diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c
index d3f02e29b90..394ac6c7b84 100644
--- a/src/btree/bt_debug.c
+++ b/src/btree/bt_debug.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -689,8 +689,6 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
WT_RET(ds->f(ds, ", entries %" PRIu32, entries));
WT_RET(ds->f(ds,
", %s", __wt_page_is_modified(page) ? "dirty" : "clean"));
- WT_RET(ds->f(ds, ", %s", __wt_rwlock_islocked(
- session, &page->page_lock) ? "locked" : "unlocked"));
if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
WT_RET(ds->f(ds, ", keys-built"));
@@ -985,8 +983,10 @@ static int
__debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
{
for (; upd != NULL; upd = upd->next)
- if (WT_UPDATE_DELETED_ISSET(upd))
+ if (upd->type == WT_UPDATE_DELETED)
WT_RET(ds->f(ds, "\tvalue {deleted}\n"));
+ else if (upd->type == WT_UPDATE_RESERVED)
+ WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
else if (hexbyte) {
WT_RET(ds->f(ds, "\t{"));
WT_RET(__debug_hex_byte(ds,
diff --git a/src/btree/bt_delete.c b/src/btree/bt_delete.c
index b55ad291c5e..4a88b672d47 100644
--- a/src/btree/bt_delete.c
+++ b/src/btree/bt_delete.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -333,7 +333,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
*/
for (i = 0, size = 0; i < page->entries; ++i) {
WT_ERR(__wt_calloc_one(session, &upd));
- WT_UPDATE_DELETED_SET(upd);
+ upd->type = WT_UPDATE_DELETED;
if (page_del == NULL)
upd->txnid = WT_TXN_NONE; /* Globally visible */
diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c
index bab7b8145d6..bfa8eb25aac 100644
--- a/src/btree/bt_discard.c
+++ b/src/btree/bt_discard.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -98,7 +98,6 @@ __page_out_int(WT_SESSION_IMPL *session, WT_PAGE **pagep, bool rewrite)
*/
WT_ASSERT(session, !__wt_page_is_modified(page));
WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
- WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));
/*
* If a root page split, there may be one or more pages linked from the
@@ -254,6 +253,7 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
__wt_ovfl_discard_free(session, page);
__wt_free(session, page->modify->ovfl_track);
+ __wt_spin_destroy(session, &page->modify->page_lock);
__wt_free(session, page->modify);
}
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index d76720b19ae..06fbd6b74c7 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -418,15 +418,13 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
WT_RET(__wt_compressor_config(session, &cval, &btree->compressor));
/*
- * We do not use __wt_config_gets_none here because "none"
- * and the empty string have different meanings. The
- * empty string means inherit the system encryption setting
- * and "none" means this table is in the clear even if the
- * database is encrypted. If this is the metadata handle
- * always inherit from the connection.
+ * We do not use __wt_config_gets_none here because "none" and the empty
+ * string have different meanings. The empty string means inherit the
+ * system encryption setting and "none" means this table is in the clear
+ * even if the database is encrypted.
*/
WT_RET(__wt_config_gets(session, cfg, "encryption.name", &cval));
- if (WT_IS_METADATA(btree->dhandle) || cval.len == 0)
+ if (cval.len == 0)
btree->kencryptor = conn->kencryptor;
else if (WT_STRING_MATCH("none", cval.str, cval.len))
btree->kencryptor = NULL;
@@ -444,12 +442,14 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
}
/* Initialize locks. */
- __wt_rwlock_init(session, &btree->ovfl_lock);
+ WT_RET(__wt_rwlock_init(session, &btree->ovfl_lock));
WT_RET(__wt_spin_init(session, &btree->flush_lock, "btree flush"));
- btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */
btree->modified = false; /* Clean */
- btree->write_gen = ckpt->write_gen; /* Write generation */
+
+ btree->checkpointing = WT_CKPT_OFF; /* Not checkpointing */
+ btree->write_gen = ckpt->write_gen; /* Write generation */
+ btree->checkpoint_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
return (0);
}
diff --git a/src/btree/bt_huffman.c b/src/btree/bt_huffman.c
index 918791d9c6e..c5cc9ccf0b0 100644
--- a/src/btree/bt_huffman.c
+++ b/src/btree/bt_huffman.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_io.c b/src/btree/bt_io.c
index b5e4d52394a..262532a4eab 100644
--- a/src/btree/bt_io.c
+++ b/src/btree/bt_io.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_misc.c b/src/btree/bt_misc.c
index 3bec65c2567..04b607082d1 100644
--- a/src/btree/bt_misc.c
+++ b/src/btree/bt_misc.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_ovfl.c b/src/btree/bt_ovfl.c
index ae0da62af57..3d09f655c65 100644
--- a/src/btree/bt_ovfl.c
+++ b/src/btree/bt_ovfl.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index f20f6398e37..ca5f05fe3dc 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_random.c b/src/btree/bt_random.c
index c5948ec4ab5..1bdf0fd1c8b 100644
--- a/src/btree/bt_random.c
+++ b/src/btree/bt_random.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -395,8 +395,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
*/
for (skip = cbt->next_random_leaf_skip; cbt->ref == NULL || skip > 0;) {
n = skip;
- WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
- WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+ WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip));
if (n == skip) {
if (skip == 0)
break;
diff --git a/src/btree/bt_read.c b/src/btree/bt_read.c
index 64874547b9c..de84a711019 100644
--- a/src/btree/bt_read.c
+++ b/src/btree/bt_read.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -90,7 +90,8 @@ __col_instantiate(WT_SESSION_IMPL *session,
{
/* Search the page and add updates. */
WT_RET(__wt_col_search(session, recno, ref, cbt));
- WT_RET(__wt_col_modify(session, cbt, recno, NULL, upd, false));
+ WT_RET(__wt_col_modify(
+ session, cbt, recno, NULL, upd, WT_UPDATE_STANDARD, false));
return (0);
}
@@ -104,7 +105,8 @@ __row_instantiate(WT_SESSION_IMPL *session,
{
/* Search the page and add updates. */
WT_RET(__wt_row_search(session, key, ref, cbt, true));
- WT_RET(__wt_row_modify(session, cbt, key, NULL, upd, false));
+ WT_RET(__wt_row_modify(
+ session, cbt, key, NULL, upd, WT_UPDATE_STANDARD, false));
return (0);
}
@@ -127,7 +129,8 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
WT_UPDATE *first_upd, *last_upd, *upd;
size_t incr, total_incr;
uint64_t current_recno, las_counter, las_txnid, recno, upd_txnid;
- uint32_t las_id, upd_size, session_flags;
+ uint32_t las_id, session_flags;
+ uint8_t upd_type;
int exact;
const uint8_t *p;
@@ -188,10 +191,10 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
/* Allocate the WT_UPDATE structure. */
WT_ERR(cursor->get_value(
- cursor, &upd_txnid, &upd_size, las_value));
- WT_ERR(__wt_update_alloc(session,
- (upd_size == WT_UPDATE_DELETED_VALUE) ? NULL : las_value,
- &upd, &incr));
+ cursor, &upd_txnid, &upd_type, las_value));
+ WT_ERR(__wt_update_alloc(session, las_value, &upd, &incr,
+ upd_type == WT_UPDATE_DELETED ?
+ WT_UPDATE_DELETED : WT_UPDATE_STANDARD));
total_incr += incr;
upd->txnid = upd_txnid;
@@ -586,15 +589,10 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
* if the page qualifies for forced eviction and update
* the page's generation number. If eviction isn't being
* done on this file, we're done.
- * In-memory split of large pages is allowed while
- * no_eviction is set on btree, whereas reconciliation
- * is not allowed.
*/
if (LF_ISSET(WT_READ_NO_EVICT) ||
F_ISSET(session, WT_SESSION_NO_EVICTION) ||
- btree->lsm_primary ||
- (btree->evict_disabled > 0 &&
- !F_ISSET(btree, WT_BTREE_ALLOW_SPLITS)))
+ btree->evict_disabled > 0 || btree->lsm_primary)
goto skip_evict;
/*
diff --git a/src/btree/bt_rebalance.c b/src/btree/bt_rebalance.c
index 68848c7c8f5..47c7888af35 100644
--- a/src/btree/bt_rebalance.c
+++ b/src/btree/bt_rebalance.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_ret.c b/src/btree/bt_ret.c
index f17fa1b85d1..7212de72d6e 100644
--- a/src/btree/bt_ret.c
+++ b/src/btree/bt_ret.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -147,9 +147,13 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
cursor = &cbt->iface;
/*
- * We may already have an internal key, in which case the cursor may
- * not be set up to get another copy (for example, when we rely on a
- * search-function result).
+ * We may already have an internal key and the cursor may not be set up
+ * to get another copy, so we have to leave it alone. Consider a cursor
+ * search followed by an update: the update doesn't repeat the search,
+ * it simply updates the currently referenced key's value. We will end
+ * up here with the correct internal key, but we can't "return" the key
+ * again even if we wanted to do the additional work; the cursor isn't
+ * set up for that because we didn't just complete a search.
*/
F_CLR(cursor, WT_CURSTD_KEY_EXT);
if (!F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
diff --git a/src/btree/bt_slvg.c b/src/btree/bt_slvg.c
index 165f932afb2..eb39301abc7 100644
--- a/src/btree/bt_slvg.c
+++ b/src/btree/bt_slvg.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c
index 49043c8bab4..71346baee2e 100644
--- a/src/btree/bt_split.c
+++ b/src/btree/bt_split.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -31,143 +31,6 @@ typedef enum {
} WT_SPLIT_ERROR_PHASE;
/*
- * __split_oldest_gen --
- * Calculate the oldest active split generation.
- */
-static uint64_t
-__split_oldest_gen(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
- WT_SESSION_IMPL *s;
- uint64_t gen, oldest;
- u_int i, session_cnt;
-
- conn = S2C(session);
- WT_ORDERED_READ(session_cnt, conn->session_cnt);
- for (i = 0, s = conn->sessions, oldest = conn->split_gen + 1;
- i < session_cnt;
- i++, s++)
- if (((gen = s->split_gen) != 0) && gen < oldest)
- oldest = gen;
-
- return (oldest);
-}
-
-/*
- * __wt_split_obsolete --
- * Check if it is safe to free / evict based on split generation.
- */
-bool
-__wt_split_obsolete(WT_SESSION_IMPL *session, uint64_t split_gen)
-{
- return (split_gen < __split_oldest_gen(session));
-}
-
-/*
- * __split_stash_add --
- * Add a new entry into the session's split stash list.
- */
-static int
-__split_stash_add(
- WT_SESSION_IMPL *session, uint64_t split_gen, void *p, size_t len)
-{
- WT_CONNECTION_IMPL *conn;
- WT_SPLIT_STASH *stash;
-
- WT_ASSERT(session, p != NULL);
-
- conn = S2C(session);
-
- /* Grow the list as necessary. */
- WT_RET(__wt_realloc_def(session, &session->split_stash_alloc,
- session->split_stash_cnt + 1, &session->split_stash));
-
- stash = session->split_stash + session->split_stash_cnt++;
- stash->split_gen = split_gen;
- stash->p = p;
- stash->len = len;
-
- (void)__wt_atomic_add64(&conn->split_stashed_bytes, len);
- (void)__wt_atomic_add64(&conn->split_stashed_objects, 1);
-
- /* See if we can free any previous entries. */
- if (session->split_stash_cnt > 1)
- __wt_split_stash_discard(session);
-
- return (0);
-}
-
-/*
- * __wt_split_stash_discard --
- * Discard any memory from a session's split stash that we can.
- */
-void
-__wt_split_stash_discard(WT_SESSION_IMPL *session)
-{
- WT_CONNECTION_IMPL *conn;
- WT_SPLIT_STASH *stash;
- uint64_t oldest;
- size_t i;
-
- conn = S2C(session);
-
- /* Get the oldest split generation. */
- oldest = __split_oldest_gen(session);
-
- for (i = 0, stash = session->split_stash;
- i < session->split_stash_cnt;
- ++i, ++stash) {
- if (stash->p == NULL)
- continue;
- if (stash->split_gen >= oldest)
- break;
- /*
- * It's a bad thing if another thread is in this memory after
- * we free it, make sure nothing good happens to that thread.
- */
- (void)__wt_atomic_sub64(&conn->split_stashed_bytes, stash->len);
- (void)__wt_atomic_sub64(&conn->split_stashed_objects, 1);
- __wt_overwrite_and_free_len(session, stash->p, stash->len);
- }
-
- /*
- * If there are enough free slots at the beginning of the list, shuffle
- * everything down.
- */
- if (i > 100 || i == session->split_stash_cnt)
- if ((session->split_stash_cnt -= i) > 0)
- memmove(session->split_stash, stash,
- session->split_stash_cnt * sizeof(*stash));
-}
-
-/*
- * __wt_split_stash_discard_all --
- * Discard all memory from a session's split stash.
- */
-void
-__wt_split_stash_discard_all(
- WT_SESSION_IMPL *session_safe, WT_SESSION_IMPL *session)
-{
- WT_SPLIT_STASH *stash;
- size_t i;
-
- /*
- * This function is called during WT_CONNECTION.close to discard any
- * memory that remains. For that reason, we take two WT_SESSION_IMPL
- * arguments: session_safe is still linked to the WT_CONNECTION and
- * can be safely used for calls to other WiredTiger functions, while
- * session is the WT_SESSION_IMPL we're cleaning up.
- */
- for (i = 0, stash = session->split_stash;
- i < session->split_stash_cnt;
- ++i, ++stash)
- __wt_free(session_safe, stash->p);
-
- __wt_free(session_safe, session->split_stash);
- session->split_stash_cnt = session->split_stash_alloc = 0;
-}
-
-/*
* __split_safe_free --
* Free a buffer if we can be sure no thread is accessing it, or schedule
* it to be freed otherwise.
@@ -177,13 +40,14 @@ __split_safe_free(WT_SESSION_IMPL *session,
uint64_t split_gen, bool exclusive, void *p, size_t s)
{
/* We should only call safe free if we aren't pinning the memory. */
- WT_ASSERT(session, session->split_gen != split_gen);
+ WT_ASSERT(session,
+ __wt_session_gen(session, WT_GEN_SPLIT) != split_gen);
/*
* We have swapped something in a page: if we don't have exclusive
* access, check whether there are other threads in the same tree.
*/
- if (!exclusive && __split_oldest_gen(session) > split_gen)
+ if (!exclusive && __wt_gen_oldest(session, WT_GEN_SPLIT) > split_gen)
exclusive = true;
if (exclusive) {
@@ -191,7 +55,7 @@ __split_safe_free(WT_SESSION_IMPL *session,
return (0);
}
- return (__split_stash_add(session, split_gen, p, s));
+ return (__wt_stash_add(session, WT_GEN_SPLIT, split_gen, p, s));
}
#ifdef HAVE_DIAGNOSTIC
@@ -645,7 +509,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* generation to block splits in newly created pages, so get one.
*/
WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index, session->split_gen, false);
+ __split_ref_prepare(session, alloc_index,
+ __wt_session_gen(session, WT_GEN_SPLIT), false);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -662,7 +527,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
root->pg_intl_split_gen = split_gen;
#ifdef HAVE_DIAGNOSTIC
@@ -848,7 +713,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* the new index is swapped into place in order to know that no readers
* are looking at the old index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
/*
@@ -1173,7 +1038,8 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* generation to block splits in newly created pages, so get one.
*/
WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index, session->split_gen, true);
+ __split_ref_prepare(session, alloc_index,
+ __wt_session_gen(session, WT_GEN_SPLIT), true);
/* Split into the parent. */
if ((ret = __split_parent(session, page_ref, alloc_index->index,
@@ -1194,7 +1060,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
*/
- split_gen = __wt_atomic_addv64(&S2C(session)->split_gen, 1);
+ split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
page->pg_intl_split_gen = split_gen;
#ifdef HAVE_DIAGNOSTIC
@@ -1300,13 +1166,19 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
for (;;) {
parent = ref->home;
+ /*
+ * The page will be marked dirty, and we can only lock a page
+ * with a modify structure.
+ */
+ WT_RET(__wt_page_modify_init(session, parent));
+
if (trylock)
- WT_RET(__wt_try_writelock(session, &parent->page_lock));
+ WT_RET(WT_PAGE_TRYLOCK(session, parent));
else
- __wt_writelock(session, &parent->page_lock);
+ WT_PAGE_LOCK(session, parent);
if (parent == ref->home)
break;
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
}
/*
@@ -1329,7 +1201,7 @@ __split_internal_lock(WT_SESSION_IMPL *session, WT_REF *ref, bool trylock,
*parentp = parent;
return (0);
-err: __wt_writeunlock(session, &parent->page_lock);
+err: WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1345,7 +1217,7 @@ __split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
if (hazard)
ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
- __wt_writeunlock(session, &parent->page_lock);
+ WT_PAGE_UNLOCK(session, parent);
return (ret);
}
@@ -1558,8 +1430,8 @@ __split_multi_inmem(
WT_ERR(__wt_col_search(session, recno, ref, &cbt));
/* Apply the modification. */
- WT_ERR(__wt_col_modify(
- session, &cbt, recno, NULL, upd, false));
+ WT_ERR(__wt_col_modify(session,
+ &cbt, recno, NULL, upd, WT_UPDATE_STANDARD, true));
break;
case WT_PAGE_ROW_LEAF:
/* Build a key. */
@@ -1580,8 +1452,8 @@ __split_multi_inmem(
WT_ERR(__wt_row_search(session, key, ref, &cbt, true));
/* Apply the modification. */
- WT_ERR(__wt_row_modify(
- session, &cbt, key, NULL, upd, false));
+ WT_ERR(__wt_row_modify(session, &cbt,
+ key, NULL, upd, WT_UPDATE_STANDARD, true));
break;
WT_ILLEGAL_VALUE_ERR(session);
}
diff --git a/src/btree/bt_stat.c b/src/btree/bt_stat.c
index 0da0e0807bd..e3b9bbced48 100644
--- a/src/btree/bt_stat.c
+++ b/src/btree/bt_stat.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -178,7 +178,9 @@ __stat_page_col_var(
*/
WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
upd = ins->upd;
- if (WT_UPDATE_DELETED_ISSET(upd)) {
+ if (upd->type == WT_UPDATE_RESERVED)
+ continue;
+ if (upd->type == WT_UPDATE_DELETED) {
if (!orig_deleted) {
++deleted_cnt;
--entry_cnt;
@@ -192,11 +194,14 @@ __stat_page_col_var(
}
/* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page))
- if (WT_UPDATE_DELETED_ISSET(ins->upd))
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
+ if (ins->upd->type == WT_UPDATE_RESERVED)
+ continue;
+ if (ins->upd->type == WT_UPDATE_DELETED)
++deleted_cnt;
else
++entry_cnt;
+ }
WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
@@ -263,7 +268,8 @@ __stat_page_row_leaf(
* key on the page.
*/
WT_SKIP_FOREACH(ins, WT_ROW_INSERT_SMALLEST(page))
- if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ if (ins->upd->type != WT_UPDATE_DELETED &&
+ ins->upd->type != WT_UPDATE_RESERVED)
++entry_cnt;
/*
@@ -272,16 +278,19 @@ __stat_page_row_leaf(
*/
WT_ROW_FOREACH(page, rip, i) {
upd = WT_ROW_UPDATE(page, rip);
- if (upd == NULL || !WT_UPDATE_DELETED_ISSET(upd))
+ if (upd == NULL ||
+ (upd->type != WT_UPDATE_DELETED &&
+ upd->type != WT_UPDATE_RESERVED))
++entry_cnt;
if (upd == NULL && (cell =
__wt_row_leaf_value_cell(page, rip, NULL)) != NULL &&
__wt_cell_type(cell) == WT_CELL_VALUE_OVFL)
- ++ovfl_cnt;
+ ++ovfl_cnt;
/* Walk K/V pairs inserted after the on-page K/V pair. */
WT_SKIP_FOREACH(ins, WT_ROW_INSERT(page, rip))
- if (!WT_UPDATE_DELETED_ISSET(ins->upd))
+ if (ins->upd->type != WT_UPDATE_DELETED &&
+ ins->upd->type != WT_UPDATE_RESERVED)
++entry_cnt;
}
diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c
index ead6ccc4ac0..5b0bf53dc6c 100644
--- a/src/btree/bt_sync.c
+++ b/src/btree/bt_sync.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -179,22 +179,9 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
* Set the checkpointing flag to block such actions and wait for
* any problematic eviction or page splits to complete.
*/
- WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);
-
- /*
- * Sync for checkpoint allows splits to happen while the queue
- * is being drained, but not reconciliation. We need to do this,
- * since draining the queue can take long enough for hot pages
- * to grow significantly larger than the configured maximum
- * size.
- */
- F_SET(btree, WT_BTREE_ALLOW_SPLITS);
- ret = __wt_evict_file_exclusive_on(session);
- F_CLR(btree, WT_BTREE_ALLOW_SPLITS);
- WT_ERR(ret);
- __wt_evict_file_exclusive_off(session);
-
- WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);
+ btree->checkpointing = WT_CKPT_PREPARE;
+ (void)__wt_gen_next_drain(session, WT_GEN_EVICT);
+ btree->checkpointing = WT_CKPT_RUNNING;
/* Write all dirty in-cache pages. */
flags |= WT_READ_NO_EVICT;
@@ -268,9 +255,8 @@ err: /* On error, clear any left-over tree walk. */
saved_pinned_id == WT_TXN_NONE)
__wt_txn_release_snapshot(session);
- /* Clear the checkpoint flag and push the change. */
- if (btree->checkpointing != WT_CKPT_OFF)
- WT_PUBLISH(btree->checkpointing, WT_CKPT_OFF);
+ /* Clear the checkpoint flag. */
+ btree->checkpointing = WT_CKPT_OFF;
__wt_spin_unlock(session, &btree->flush_lock);
diff --git a/src/btree/bt_upgrade.c b/src/btree/bt_upgrade.c
index a9ff16ad496..a7fe3283218 100644
--- a/src/btree/bt_upgrade.c
+++ b/src/btree/bt_upgrade.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_vrfy.c b/src/btree/bt_vrfy.c
index 7475811adc5..21ba2d7a715 100644
--- a/src/btree/bt_vrfy.c
+++ b/src/btree/bt_vrfy.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_vrfy_dsk.c b/src/btree/bt_vrfy_dsk.c
index a4071c44aee..55c96bbed55 100644
--- a/src/btree/bt_vrfy_dsk.c
+++ b/src/btree/bt_vrfy_dsk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/bt_walk.c b/src/btree/bt_walk.c
index 86484feb7c9..225e6812aa1 100644
--- a/src/btree/bt_walk.c
+++ b/src/btree/bt_walk.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -497,29 +497,21 @@ restart: /*
}
/*
- * Optionally skip leaf pages: skip all leaf pages if
- * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
- * variable is non-zero, skip some count of leaf pages.
- * If this page is disk-based, crack the cell to figure
- * out it's a leaf page without reading it.
+ * Optionally skip leaf pages: when the skip-leaf-count
+ * variable is non-zero, skip some count of leaf pages,
+ * then take the next leaf page we can.
*
- * If skipping some number of leaf pages, decrement the
- * count of pages to zero, and then take the next leaf
- * page we can. Be cautious around the page decrement,
- * if for some reason don't take this particular page,
- * we can take the next one, and, there are additional
- * tests/decrements when we're about to return a leaf
- * page.
+ * We do some of this work here (rather than in our
+ * caller) because we can look at the cell and know
+ * it's a leaf page without reading it into memory.
+ * If this page is disk-based, crack the cell to figure
+ * out whether it's a leaf page without reading it.
*/
- if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
- if (__ref_is_leaf(ref)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
+ if (skipleafcntp != NULL &&
+ *skipleafcntp > 0 && __ref_is_leaf(ref)) {
+ --*skipleafcntp;
+ break;
+ }
ret = __wt_page_swap(session, couple, ref,
WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);
@@ -626,34 +618,18 @@ descend: empty_internal = true;
session, ref, &pindex);
slot = pindex->entries - 1;
}
- } else {
- /*
- * At the lowest tree level (considering a leaf
- * page), turn off the initial-descent state.
- * Descent race tests are different when moving
- * through the tree vs. the initial descent.
- */
- initial_descent = false;
-
- /*
- * Optionally skip leaf pages, the second half.
- * We didn't have an on-page cell to figure out
- * if it was a leaf page, we had to acquire the
- * hazard pointer and look at the page.
- */
- if (skipleafcntp != NULL ||
- LF_ISSET(WT_READ_SKIP_LEAF)) {
- if (LF_ISSET(WT_READ_SKIP_LEAF))
- break;
- if (*skipleafcntp > 0) {
- --*skipleafcntp;
- break;
- }
- }
-
- *refp = ref;
- goto done;
+ continue;
}
+
+ /*
+ * The tree-walk restart code knows we return any leaf
+ * page we acquire (never hazard-pointer coupling on
+ * after acquiring a leaf page), and asserts no restart
+ * happens while holding a leaf page. This page must be
+ * returned to our caller.
+ */
+ *refp = ref;
+ goto done;
}
}
@@ -690,8 +666,29 @@ __wt_tree_walk_count(WT_SESSION_IMPL *session,
* of leaf pages before returning.
*/
int
-__wt_tree_walk_skip(WT_SESSION_IMPL *session,
- WT_REF **refp, uint64_t *skipleafcntp, uint32_t flags)
+__wt_tree_walk_skip(
+ WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *skipleafcntp)
{
- return (__tree_walk_internal(session, refp, NULL, skipleafcntp, flags));
+ /*
+ * Optionally skip leaf pages, the second half. The tree-walk function
+ * didn't have an on-page cell it could use to figure out whether the
+ * page was a leaf page, so it had to acquire the hazard pointer and
+ * look at the page. The tree-walk code never acquires a hazard pointer
+ * on a leaf page without returning it, and it's not trivial to change
+ * that. So, the tree-walk code returns all leaf pages here and we deal
+ * with decrementing the count.
+ */
+ do {
+ WT_RET(__tree_walk_internal(session, refp, NULL, skipleafcntp,
+ WT_READ_NO_GEN | WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
+
+ /*
+ * The walk skipped internal pages, any page returned must be a
+ * leaf page.
+ */
+ if (*skipleafcntp > 0)
+ --*skipleafcntp;
+ } while (*skipleafcntp > 0);
+
+ return (0);
}
diff --git a/src/btree/col_modify.c b/src/btree/col_modify.c
index 9ccb9728189..2a64ec03952 100644
--- a/src/btree/col_modify.c
+++ b/src/btree/col_modify.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -17,13 +17,14 @@ static int __col_insert_alloc(
*/
int
__wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- uint64_t recno, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove)
+ uint64_t recno, const WT_ITEM *value,
+ WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
+ static const WT_ITEM col_fix_remove = { "", 1, NULL, 0, 0 };
WT_BTREE *btree;
WT_DECL_RET;
WT_INSERT *ins;
WT_INSERT_HEAD *ins_head, **ins_headp;
- WT_ITEM _value;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_UPDATE *old_upd, *upd;
@@ -37,14 +38,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
upd = upd_arg;
append = logged = false;
- /* This code expects a remove to have a NULL value. */
- if (is_remove) {
- if (btree->type == BTREE_COL_FIX) {
- value = &_value;
- value->data = "";
- value->size = 1;
- } else
- value = NULL;
+ if (modify_type == WT_UPDATE_DELETED ||
+ modify_type == WT_UPDATE_RESERVED) {
+ /*
+ * Fixed-size column-store doesn't have on-page deleted values;
+ * a remove is stored as a standard update of a single nul byte.
+ */
+ if (modify_type == WT_UPDATE_DELETED &&
+ btree->type == BTREE_COL_FIX) {
+ modify_type = WT_UPDATE_STANDARD;
+ value = &col_fix_remove;
+ }
} else {
/*
* There's some chance the application specified a record past
@@ -83,11 +87,11 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
WT_ASSERT(session, upd_arg == NULL);
/* Make sure the update can proceed. */
- WT_ERR(__wt_txn_update_check(
- session, old_upd = cbt->ins->upd));
+ WT_ERR(__wt_txn_update_check(session, old_upd = cbt->ins->upd));
/* Allocate a WT_UPDATE structure and transaction ID. */
- WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_update_alloc(session,
+ value, &upd, &upd_size, modify_type));
WT_ERR(__wt_txn_modify(session, upd));
logged = true;
@@ -103,7 +107,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, &cbt->ins->upd, &upd, upd_size));
+ session, page, &cbt->ins->upd, &upd, upd_size, false));
} else {
/* Allocate the append/update list reference as necessary. */
if (append) {
@@ -147,8 +151,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
mod->mod_col_split_recno > recno));
if (upd_arg == NULL) {
- WT_ERR(
- __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_update_alloc(session,
+ value, &upd, &upd_size, modify_type));
WT_ERR(__wt_txn_modify(session, upd));
logged = true;
@@ -185,15 +189,15 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
if (append)
WT_ERR(__wt_col_append_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, &cbt->recno, skipdepth));
+ &ins, ins_size, &cbt->recno, skipdepth, exclusive));
else
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
/* If the update was successful, add it to the in-memory log. */
- if (logged)
+ if (logged && modify_type != WT_UPDATE_RESERVED)
WT_ERR(__wt_txn_log_op(session, cbt));
if (0) {
diff --git a/src/btree/col_srch.c b/src/btree/col_srch.c
index c72d66f8796..78ee367dc69 100644
--- a/src/btree/col_srch.c
+++ b/src/btree/col_srch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
diff --git a/src/btree/row_key.c b/src/btree/row_key.c
index 032fdf7d897..a016568898f 100644
--- a/src/btree/row_key.c
+++ b/src/btree/row_key.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -471,6 +471,8 @@ __wt_row_ikey_alloc(WT_SESSION_IMPL *session,
{
WT_IKEY *ikey;
+ WT_ASSERT(session, key != NULL); /* quiet clang scan-build */
+
/*
* Allocate memory for the WT_IKEY structure and the key, then copy
* the key into place.
diff --git a/src/btree/row_modify.c b/src/btree/row_modify.c
index b1a81ca3d9f..cab07341a1c 100644
--- a/src/btree/row_modify.c
+++ b/src/btree/row_modify.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
@@ -15,18 +15,13 @@
int
__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
{
- WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
WT_PAGE_MODIFY *modify;
- conn = S2C(session);
-
WT_RET(__wt_calloc_one(session, &modify));
- /*
- * Select a spinlock for the page; let the barrier immediately below
- * keep things from racing too badly.
- */
- modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS;
+ /* Initialize the spinlock for the page. */
+ WT_ERR(__wt_spin_init(session, &modify->page_lock, "btree page"));
/*
* Multiple threads of control may be searching and deciding to modify
@@ -37,8 +32,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
if (__wt_atomic_cas_ptr(&page->modify, NULL, modify))
__wt_cache_page_inmem_incr(session, page, sizeof(*modify));
else
- __wt_free(session, modify);
- return (0);
+err: __wt_free(session, modify);
+ return (ret);
}
/*
@@ -47,7 +42,8 @@ __wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
*/
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
- WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd_arg, bool is_remove)
+ const WT_ITEM *key, const WT_ITEM *value,
+ WT_UPDATE *upd_arg, u_int modify_type, bool exclusive)
{
WT_DECL_RET;
WT_INSERT *ins;
@@ -65,10 +61,6 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
upd = upd_arg;
logged = false;
- /* This code expects a remove to have a NULL value. */
- if (is_remove)
- value = NULL;
-
/* If we don't yet have a modify structure, we'll need one. */
WT_RET(__wt_page_modify_init(session, page));
mod = page->modify;
@@ -99,8 +91,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
session, old_upd = *upd_entry));
/* Allocate a WT_UPDATE structure and transaction ID. */
- WT_ERR(
- __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_update_alloc(session,
+ value, &upd, &upd_size, modify_type));
WT_ERR(__wt_txn_modify(session, upd));
logged = true;
@@ -132,7 +124,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Serialize the update. */
WT_ERR(__wt_update_serial(
- session, page, upd_entry, &upd, upd_size));
+ session, page, upd_entry, &upd, upd_size, exclusive));
} else {
/*
* Allocate the insert array as necessary.
@@ -170,8 +162,8 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
cbt->ins = ins;
if (upd_arg == NULL) {
- WT_ERR(
- __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_update_alloc(session,
+ value, &upd, &upd_size, modify_type));
WT_ERR(__wt_txn_modify(session, upd));
logged = true;
@@ -207,10 +199,10 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
/* Insert the WT_INSERT structure. */
WT_ERR(__wt_insert_serial(
session, page, cbt->ins_head, cbt->ins_stack,
- &ins, ins_size, skipdepth));
+ &ins, ins_size, skipdepth, exclusive));
}
- if (logged)
+ if (logged && modify_type != WT_UPDATE_RESERVED)
WT_ERR(__wt_txn_log_op(session, cbt));
if (0) {
@@ -235,7 +227,7 @@ err: /*
*/
int
__wt_row_insert_alloc(WT_SESSION_IMPL *session,
- WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+ const WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
WT_INSERT *ins;
size_t ins_size;
@@ -263,11 +255,10 @@ __wt_row_insert_alloc(WT_SESSION_IMPL *session,
* Allocate a WT_UPDATE structure and associated value and fill it in.
*/
int
-__wt_update_alloc(
- WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
+__wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value,
+ WT_UPDATE **updp, size_t *sizep, u_int modify_type)
{
WT_UPDATE *upd;
- size_t size;
*updp = NULL;
@@ -275,15 +266,18 @@ __wt_update_alloc(
* Allocate the WT_UPDATE structure and room for the value, then copy
* the value into place.
*/
- size = value == NULL ? 0 : value->size;
- WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
- if (value == NULL)
- WT_UPDATE_DELETED_SET(upd);
+ if (modify_type == WT_UPDATE_DELETED ||
+ modify_type == WT_UPDATE_RESERVED)
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd));
else {
- upd->size = WT_STORE_SIZE(size);
- if (size != 0)
- memcpy(WT_UPDATE_DATA(upd), value->data, size);
+ WT_RET(__wt_calloc(
+ session, 1, sizeof(WT_UPDATE) + value->size, &upd));
+ if (value->size != 0) {
+ upd->size = WT_STORE_SIZE(value->size);
+ memcpy(WT_UPDATE_DATA(upd), value->data, value->size);
+ }
}
+ upd->type = (uint8_t)modify_type;
*updp = upd;
*sizep = WT_UPDATE_MEMSIZE(upd);
diff --git a/src/btree/row_srch.c b/src/btree/row_srch.c
index 9c3d467340e..76bebde7de7 100644
--- a/src/btree/row_srch.c
+++ b/src/btree/row_srch.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 2014-2016 MongoDB, Inc.
+ * Copyright (c) 2014-2017 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*