author     Alex Gorrod <alexander.gorrod@mongodb.com>  2017-08-01 16:42:49 +1000
committer  Alex Gorrod <alexander.gorrod@mongodb.com>  2017-08-01 16:42:49 +1000
commit     835bfb21d8e67663d84a40aa4f7370a4403725a9 (patch)
tree       4f5edb231524f95272f834e31461ba4e17e52903 /src/third_party/wiredtiger/src/btree
parent     6300b3bd4ad9cd238a02bdb8ca681a447913f1af (diff)
download   mongo-835bfb21d8e67663d84a40aa4f7370a4403725a9.tar.gz
Import wiredtiger: 2e9744d11a65c63ba7445060dc78371250f04051 from branch mongodb-3.6
ref: 6173a98979..2e9744d11a for: 3.5.11

WT-2309 Add yields and/or sleeps in #DIAGNOSTIC mode
WT-3047 Add mode aimed at uncovering race conditions in split code
WT-3308 Add statistics tracking around yield loops
WT-3316 Add new engineering section to reference guide documentation
WT-3338 Optimize cursor modify
WT-3380 Special case 8-byte timestamps
WT-3387 Add support for a stable timestamp
WT-3389 Restructure split code to hold a split generation for the entire operation.
WT-3406 Reconciliation is choosing reserved records for writing.
WT-3410 Add developer documentation for table rename
WT-3412 Add backoff logic to the btree delete and walk yield loops
WT-3418 block manager object race
WT-3422 WiredTiger upgrading documents out of date
WT-3432 workgen needs braces around an "if" body
WT-3433 session->alter method should not be supported in read-only mode
WT-3439 lint/cleanup
WT-3440 Add a log record when starting a checkpoint
WT-3442 Coverity 1378213: false positive on diagnostic assignment.
WT-3446 Temporarily disable timestamp testing in test/checkpoint
WT-3447 test_stat_log02 can assert before table stats are printed
WT-3461 Avoid long sleeps when the system clock is adjusted
WT-3463 Add recovery of backup to test_timestamp03.py
WT-3466 Track the first commit timestamp for each transaction
WT-3467 Minor lint/cleanup
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curnext.c    41
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_curprev.c    41
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_cursor.c    214
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_debug.c      82
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_delete.c     24
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ovfl.c       24
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_random.c      7
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_read.c        7
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_ret.c       119
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_split.c     383
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_stat.c       31
-rw-r--r--  src/third_party/wiredtiger/src/btree/bt_walk.c       25
-rw-r--r--  src/third_party/wiredtiger/src/btree/row_modify.c    15
13 files changed, 685 insertions, 328 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c
index cb59bff8f75..eb8a258d475 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curnext.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c
@@ -15,12 +15,10 @@
static inline int
__cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
@@ -59,10 +57,10 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
if (cbt->recno < WT_INSERT_RECNO(cbt->ins) ||
(upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
cbt->v = 0;
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -74,7 +72,6 @@ static inline int
__cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_BTREE *btree;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -82,7 +79,6 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
btree = S2BT(session);
page = cbt->ref->page;
- val = &cbt->iface.value;
/* Initialize for each new page. */
if (newpage) {
@@ -108,10 +104,10 @@ new_page:
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -122,12 +118,10 @@ new_page:
static inline int
__cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
@@ -147,9 +141,7 @@ new_page: if (cbt->ins == NULL)
++cbt->page_deleted_count;
continue;
}
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* NOTREACHED */
}
@@ -164,7 +156,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
WT_CELL *cell;
WT_CELL_UNPACK unpack;
WT_COL *cip;
- WT_ITEM *val;
WT_INSERT *ins;
WT_PAGE *page;
WT_SESSION_IMPL *session;
@@ -173,7 +164,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
- val = &cbt->iface.value;
rle_start = 0; /* -Werror=maybe-uninitialized */
@@ -210,10 +200,7 @@ new_page: /* Find the matching WT_COL slot. */
++cbt->page_deleted_count;
continue;
}
-
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/*
@@ -267,8 +254,8 @@ new_page: /* Find the matching WT_COL slot. */
cbt->cip_saved = cip;
}
- val->data = cbt->tmp->data;
- val->size = cbt->tmp->size;
+ cbt->iface.value.data = cbt->tmp->data;
+ cbt->iface.value.size = cbt->tmp->size;
return (0);
}
/* NOTREACHED */
@@ -282,7 +269,7 @@ static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_INSERT *ins;
- WT_ITEM *key, *val;
+ WT_ITEM *key;
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
@@ -291,7 +278,6 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
key = &cbt->iface.key;
- val = &cbt->iface.value;
/*
* For row-store pages, we need a single item that tells us the part
@@ -332,9 +318,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
}
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* Check for the end of the page. */
@@ -363,7 +347,6 @@ new_insert: if ((ins = cbt->ins) != NULL) {
++cbt->page_deleted_count;
continue;
}
-
return (__cursor_row_slot_return(cbt, rip, upd));
}
/* NOTREACHED */
diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c
index 6e49f4df68c..c1395ea9008 100644
--- a/src/third_party/wiredtiger/src/btree/bt_curprev.c
+++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c
@@ -127,12 +127,10 @@ restart:
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
@@ -205,10 +203,10 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
(upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
cbt->v = 0;
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -220,7 +218,6 @@ static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_BTREE *btree;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -228,7 +225,6 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
btree = S2BT(session);
- val = &cbt->iface.value;
/* Initialize for each new page. */
if (newpage) {
@@ -254,10 +250,10 @@ new_page:
upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
if (upd == NULL) {
cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt);
- val->data = &cbt->v;
+ cbt->iface.value.data = &cbt->v;
} else
- val->data = WT_UPDATE_DATA(upd);
- val->size = 1;
+ cbt->iface.value.data = upd->data;
+ cbt->iface.value.size = 1;
return (0);
}
@@ -268,12 +264,10 @@ new_page:
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
- WT_ITEM *val;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
session = (WT_SESSION_IMPL *)cbt->iface.session;
- val = &cbt->iface.value;
if (newpage) {
cbt->ins = WT_SKIP_LAST(cbt->ins_head);
@@ -293,9 +287,7 @@ new_page: if (cbt->ins == NULL)
++cbt->page_deleted_count;
continue;
}
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* NOTREACHED */
}
@@ -311,7 +303,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
WT_CELL_UNPACK unpack;
WT_COL *cip;
WT_INSERT *ins;
- WT_ITEM *val;
WT_PAGE *page;
WT_SESSION_IMPL *session;
WT_UPDATE *upd;
@@ -319,7 +310,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
- val = &cbt->iface.value;
rle_start = 0; /* -Werror=maybe-uninitialized */
@@ -357,10 +347,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
++cbt->page_deleted_count;
continue;
}
-
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/*
@@ -413,8 +400,8 @@ new_page: if (cbt->recno < cbt->ref->ref_recno)
cbt->cip_saved = cip;
}
- val->data = cbt->tmp->data;
- val->size = cbt->tmp->size;
+ cbt->iface.value.data = cbt->tmp->data;
+ cbt->iface.value.size = cbt->tmp->size;
return (0);
}
/* NOTREACHED */
@@ -428,7 +415,7 @@ static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
WT_INSERT *ins;
- WT_ITEM *key, *val;
+ WT_ITEM *key;
WT_PAGE *page;
WT_ROW *rip;
WT_SESSION_IMPL *session;
@@ -437,7 +424,6 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
session = (WT_SESSION_IMPL *)cbt->iface.session;
page = cbt->ref->page;
key = &cbt->iface.key;
- val = &cbt->iface.value;
/*
* For row-store pages, we need a single item that tells us the part
@@ -489,9 +475,7 @@ new_insert: if ((ins = cbt->ins) != NULL) {
}
key->data = WT_INSERT_KEY(ins);
key->size = WT_INSERT_KEY_SIZE(ins);
- val->data = WT_UPDATE_DATA(upd);
- val->size = upd->size;
- return (0);
+ return (__wt_value_return(session, cbt, upd));
}
/* Check for the beginning of the page. */
@@ -522,7 +506,6 @@ new_insert: if ((ins = cbt->ins) != NULL) {
++cbt->page_deleted_count;
continue;
}
-
return (__cursor_row_slot_return(cbt, rip, upd));
}
/* NOTREACHED */
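
In both bt_curnext.c and bt_curprev.c, the per-call-site copies of the update payload (val->data = WT_UPDATE_DATA(upd)) are replaced by a single call to __wt_value_return(), so the WT_UPDATE_MODIFIED entries introduced by WT-3338 can be materialized in one place. A minimal standalone sketch of that refactoring pattern, using illustrative names rather than WiredTiger's real structures:

    /*
     * Sketch: centralize value materialization behind one helper so a new
     * update type (e.g. a delta) needs only one decoding path, not one per
     * cursor-traversal call site. All names here are illustrative.
     */
    #include <stddef.h>
    #include <stdio.h>

    enum upd_type { UPD_STANDARD, UPD_MODIFIED };

    struct upd {
        enum upd_type type;
        const void *data;
        size_t size;
    };

    struct cursor {
        const void *value_data;
        size_t value_size;
    };

    /* The one place that knows how to turn an update into a cursor value. */
    static int
    value_return(struct cursor *c, const struct upd *upd)
    {
        if (upd->type == UPD_STANDARD) {    /* fast path: point at payload */
            c->value_data = upd->data;
            c->value_size = upd->size;
            return (0);
        }
        /* A UPD_MODIFIED delta would be decoded here, once, for every caller. */
        return (-1);
    }

    int
    main(void)
    {
        struct upd u = { UPD_STANDARD, "hello", 5 };
        struct cursor c;

        if (value_return(&c, &u) == 0)
            printf("%.*s\n", (int)c.value_size, (const char *)c.value_data);
        return (0);
    }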
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index 52435eeefed..d58dc78fbed 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -308,8 +308,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp)
}
/*
+ * __cursor_kv_return --
+ * Return a page referenced key/value pair to the application.
+ */
+static inline int
+__cursor_kv_return(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_RET(__wt_key_return(session, cbt));
+ WT_RET(__wt_value_return(session, cbt, upd));
+
+ return (0);
+}
+
+/*
* __cursor_col_search --
- * Column-store search from an application cursor.
+ * Column-store search from a cursor.
*/
static inline int
__cursor_col_search(
@@ -324,7 +338,7 @@ __cursor_col_search(
/*
* __cursor_row_search --
- * Row-store search from an application cursor.
+ * Row-store search from a cursor.
*/
static inline int
__cursor_row_search(
@@ -338,8 +352,32 @@ __cursor_row_search(
}
/*
+ * __cursor_col_modify_v --
+ * Column-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_col_modify_v(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+ return (__wt_col_modify(session, cbt,
+ cbt->iface.recno, value, NULL, modify_type, false));
+}
+
+/*
+ * __cursor_row_modify_v --
+ * Row-store modify from a cursor, with a separate value.
+ */
+static inline int
+__cursor_row_modify_v(WT_SESSION_IMPL *session,
+ WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
+{
+ return (__wt_row_modify(session, cbt,
+ &cbt->iface.key, value, NULL, modify_type, false));
+}
+
+/*
* __cursor_col_modify --
- * Column-store delete, insert, and update from an application cursor.
+ * Column-store modify from a cursor.
*/
static inline int
__cursor_col_modify(
@@ -351,7 +389,7 @@ __cursor_col_modify(
/*
* __cursor_row_modify --
- * Row-store insert, update and delete from an application cursor.
+ * Row-store modify from a cursor.
*/
static inline int
__cursor_row_modify(
@@ -442,7 +480,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
}
if (valid)
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
else if (__cursor_fix_implicit(btree, cbt)) {
/*
* Creating a record past the end of the tree in a fixed-length
@@ -564,7 +602,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
*/
if (valid) {
exact = cbt->compare;
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
} else if (__cursor_fix_implicit(btree, cbt)) {
cbt->recno = cursor->recno;
cbt->v = 0;
@@ -582,7 +620,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
__cursor_col_search(session, cbt, NULL));
if (__wt_cursor_valid(cbt, &upd)) {
exact = cbt->compare;
- ret = __wt_kv_return(session, cbt, upd);
+ ret = __cursor_kv_return(session, cbt, upd);
} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
exact = -1;
}
@@ -987,7 +1025,7 @@ done: /*
* Update a record in the tree.
*/
static int
-__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
+__btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
{
WT_BTREE *btree;
WT_CURFILE_STATE state;
@@ -1015,6 +1053,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
*/
if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) {
WT_ERR(__wt_txn_autocommit_check(session));
+
/*
* The cursor position may not be exact (the cursor's comparison
* value not equal to zero). Correct to an exact match so we can
@@ -1022,8 +1061,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type)
*/
cbt->compare = 0;
ret = btree->type == BTREE_ROW ?
- __cursor_row_modify(session, cbt, modify_type) :
- __cursor_col_modify(session, cbt, modify_type);
+ __cursor_row_modify_v(session, cbt, value, modify_type) :
+ __cursor_col_modify_v(session, cbt, value, modify_type);
if (ret == 0)
goto done;
@@ -1052,6 +1091,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (btree->type == BTREE_ROW) {
WT_ERR(__cursor_row_search(session, cbt, NULL, true));
+
/*
* If not overwriting, check for conflicts and fail if the key
* does not exist.
@@ -1061,7 +1101,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_row_modify(session, cbt, modify_type);
+ ret = __cursor_row_modify_v(session, cbt, value, modify_type);
} else {
WT_ERR(__cursor_col_search(session, cbt, NULL));
@@ -1080,7 +1120,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true));
!__cursor_fix_implicit(btree, cbt))
WT_ERR(WT_NOTFOUND);
}
- ret = __cursor_col_modify(session, cbt, modify_type);
+ ret = __cursor_col_modify_v(session, cbt, value, modify_type);
}
err: if (ret == WT_RESTART) {
@@ -1097,14 +1137,33 @@ err: if (ret == WT_RESTART) {
* To make this work, we add a field to the btree cursor to pass back a
* pointer to the modify function's allocated update structure.
*/
-done: if (ret == 0) {
- if (modify_type == WT_UPDATE_RESERVED) {
+done: if (ret == 0)
+ switch (modify_type) {
+ case WT_UPDATE_STANDARD:
+ /*
+ * WT_CURSOR.update returns a key and a value.
+ */
+ WT_TRET(__cursor_kv_return(
+ session, cbt, cbt->modify_update));
+ break;
+ case WT_UPDATE_RESERVED:
+ /*
+ * WT_CURSOR.reserve doesn't return any value.
+ */
F_CLR(cursor, WT_CURSTD_VALUE_SET);
+ /* FALLTHROUGH */
+ case WT_UPDATE_MODIFIED:
+ /*
+ * WT_CURSOR.modify has already created the return value
+ * and our job is to leave it untouched.
+ */
WT_TRET(__wt_key_return(session, cbt));
- } else
- WT_TRET(
- __wt_kv_return(session, cbt, cbt->modify_update));
- }
+ break;
+ case WT_UPDATE_DELETED:
+ default:
+ WT_TRET(__wt_illegal_value(session, NULL));
+ break;
+ }
if (ret != 0) {
WT_TRET(__cursor_reset(cbt));
@@ -1115,6 +1174,121 @@ done: if (ret == 0) {
}
/*
+ * __cursor_chain_exceeded --
+ * Return if the update chain has exceeded the limit. Deleted or standard
+ * updates are anticipated to be sufficient to base the modify (although that's
+ * not guaranteed, they may not be visible or might abort before we read them).
+ * Also, this is not a hard limit, threads can race modifying updates.
+ */
+static bool
+__cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
+{
+ WT_PAGE *page;
+ WT_UPDATE *upd;
+ int i;
+
+ page = cbt->ref->page;
+
+ upd = NULL;
+ if (cbt->ins != NULL)
+ upd = cbt->ins->upd;
+ else if (cbt->btree->type == BTREE_ROW &&
+ page->modify != NULL && page->modify->mod_row_update != NULL)
+ upd = page->modify->mod_row_update[cbt->slot];
+
+ for (i = 0; upd != NULL; ++i, upd = upd->next) {
+ if (upd->type == WT_UPDATE_DELETED ||
+ upd->type == WT_UPDATE_STANDARD)
+ return (false);
+ if (i >= WT_MAX_MODIFY_UPDATE)
+ return (true);
+ }
+ return (false);
+}
+
+/*
+ * __wt_btcur_modify --
+ * Modify a record in the tree.
+ */
+int
+__wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
+{
+ WT_CURFILE_STATE state;
+ WT_CURSOR *cursor;
+ WT_DECL_ITEM(modify);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ size_t orig, new;
+ bool chain_exceeded, overwrite;
+
+ cursor = &cbt->iface;
+ session = (WT_SESSION_IMPL *)cursor->session;
+
+ WT_STAT_CONN_INCR(session, cursor_modify);
+ WT_STAT_DATA_INCR(session, cursor_modify);
+
+ /* Save the cursor state. */
+ __cursor_state_save(cursor, &state);
+
+ /*
+ * Get the current value and apply the modification to it, for a few
+ * reasons: first, we set the updated value so the application can
+ * retrieve the cursor's value; second, we use the updated value as
+ * the update if the update chain is too long; third, there's a check
+ * if the updated value is too large to store; fourth, to simplify the
+ * count of bytes being added/removed; fifth, we can get into serious
+ * trouble if we attempt to modify a value that doesn't exist. For the
+ * fifth reason, verify we're not in a read-uncommitted transaction,
+ * which implies a value that might disappear out from under us.
+ */
+ if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
+ WT_ERR_MSG(session, ENOTSUP,
+ "not supported in read-uncommitted transactions");
+
+ WT_ERR(__wt_btcur_search(cbt));
+ orig = cursor->value.size;
+ WT_ERR(__wt_modify_apply_api(
+ session, &cursor->value, entries, nentries));
+ new = cursor->value.size;
+ WT_ERR(__cursor_size_chk(session, &cursor->value));
+ if (new > orig)
+ WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig);
+ else
+ WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new);
+
+ /*
+ * WT_CURSOR.modify is update-without-overwrite.
+ *
+ * Use the modify buffer as the update if under the limit, else use the
+ * complete value.
+ */
+ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
+ F_CLR(cursor, WT_CURSTD_OVERWRITE);
+ chain_exceeded = __cursor_chain_exceeded(cbt);
+ if (chain_exceeded)
+ ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD);
+ else if ((ret =
+ __wt_modify_pack(session, &modify, entries, nentries)) == 0)
+ ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFIED);
+ if (overwrite)
+ F_SET(cursor, WT_CURSTD_OVERWRITE);
+
+ /*
+ * We have our own cursor state restoration because we've modified the
+ * cursor before calling the underlying cursor update function and we
+ * need to restore it to its original state. This means multiple calls
+ * to reset the cursor, but that shouldn't be a problem.
+ */
+ if (ret != 0) {
+err: WT_TRET(__cursor_reset(cbt));
+ __cursor_state_restore(cursor, &state);
+ }
+
+ __wt_scr_free(session, &modify);
+ return (ret);
+}
+
+/*
* __wt_btcur_reserve --
* Reserve a record in the tree.
*/
@@ -1135,7 +1309,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt)
/* WT_CURSOR.reserve is update-without-overwrite and a special value. */
overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
F_CLR(cursor, WT_CURSTD_OVERWRITE);
- ret = __btcur_update(cbt, WT_UPDATE_RESERVED);
+ ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVED);
if (overwrite)
F_SET(cursor, WT_CURSTD_OVERWRITE);
return (ret);
@@ -1164,7 +1338,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt)
WT_RET(__cursor_size_chk(session, &cursor->key));
WT_RET(__cursor_size_chk(session, &cursor->value));
- return (__btcur_update(cbt, WT_UPDATE_STANDARD));
+ return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD));
}
/*
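
The new __wt_btcur_modify() path searches for the record, applies the WT_MODIFY entries to the current value, and then decides what to store: the packed modify if the update chain is short, or the complete value once __cursor_chain_exceeded() reports that WT_MAX_MODIFY_UPDATE modify entries have stacked up without reaching a standard or deleted update. That bounds the work any reader must do to reconstruct a value. A standalone sketch of the chain-limit policy (the struct and the limit's value are assumptions, not WiredTiger's definitions):

    /*
     * Sketch of the chain-limit policy: once too many delta ("modified")
     * updates accumulate without an intervening self-contained update,
     * fall back to storing the full value. Types and limit illustrative.
     */
    #include <stdbool.h>
    #include <stddef.h>

    #define MAX_MODIFY_UPDATE 10            /* assumed limit */

    enum upd_type { UPD_STANDARD, UPD_MODIFIED, UPD_DELETED, UPD_RESERVED };

    struct upd {
        enum upd_type type;
        struct upd *next;                   /* chain is newest-first */
    };

    static bool
    chain_exceeded(const struct upd *upd)
    {
        int i;

        for (i = 0; upd != NULL; ++i, upd = upd->next) {
            /* A self-contained update can serve as the base of a modify. */
            if (upd->type == UPD_DELETED || upd->type == UPD_STANDARD)
                return (false);
            if (i >= MAX_MODIFY_UPDATE)
                return (true);
        }
        return (false);
    }

    int
    main(void)
    {
        struct upd base = { UPD_STANDARD, NULL };
        struct upd delta = { UPD_MODIFIED, &base };

        return (chain_exceeded(&delta) ? 1 : 0);    /* short chain: fine */
    }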
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index c0aaf3f42d9..b8d11be7b3e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -986,6 +986,35 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head)
}
/*
+ * __debug_modified --
+ * Dump a modified update.
+ */
+static int
+__debug_modified(WT_DBG *ds, WT_UPDATE *upd)
+{
+ const size_t *p;
+ int nentries;
+ const uint8_t *data;
+ void *modify;
+
+ modify = upd->data;
+
+ p = modify;
+ nentries = (int)*p++;
+ data = (uint8_t *)modify +
+ sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t));
+
+ WT_RET(ds->f(ds, "%d: ", nentries));
+ for (; nentries-- > 0; data += p[0], p += 3)
+ WT_RET(ds->f(ds,
+ "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT
+ ", %.*s}%s", p[0], p[1], p[2],
+ (int)p[2], data, nentries == 0 ? "" : ", "));
+
+ return (0);
+}
+
+/*
* __debug_update --
* Dump an update list.
*/
@@ -993,37 +1022,46 @@ static int
__debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte)
{
for (; upd != NULL; upd = upd->next) {
- if (upd->type == WT_UPDATE_DELETED)
+ switch (upd->type) {
+ case WT_UPDATE_DELETED:
WT_RET(ds->f(ds, "\tvalue {deleted}\n"));
- else if (upd->type == WT_UPDATE_RESERVED)
- WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
- else if (hexbyte) {
- WT_RET(ds->f(ds, "\t{"));
- WT_RET(__debug_hex_byte(ds,
- *(uint8_t *)WT_UPDATE_DATA(upd)));
+ break;
+ case WT_UPDATE_MODIFIED:
+ WT_RET(ds->f(ds, "\tvalue {modified: "));
+ WT_RET(__debug_modified(ds, upd));
WT_RET(ds->f(ds, "}\n"));
- } else
- WT_RET(__debug_item(ds,
- "value", WT_UPDATE_DATA(upd), upd->size));
- WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
+ break;
+ case WT_UPDATE_RESERVED:
+ WT_RET(ds->f(ds, "\tvalue {reserved}\n"));
+ break;
+ case WT_UPDATE_STANDARD:
+ if (hexbyte) {
+ WT_RET(ds->f(ds, "\t{"));
+ WT_RET(__debug_hex_byte(ds, *upd->data));
+ WT_RET(ds->f(ds, "}\n"));
+ } else
+ WT_RET(__debug_item(ds,
+ "value", upd->data, upd->size));
+ break;
+ }
+ if (upd->txnid == WT_TXN_ABORTED)
+ WT_RET(ds->f(ds, "\t" "txn aborted"));
+ else
+ WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid));
#ifdef HAVE_TIMESTAMPS
- if (!__wt_timestamp_iszero(upd->timestamp)) {
+ if (!__wt_timestamp_iszero(
+ WT_TIMESTAMP_NULL(&upd->timestamp))) {
#if WT_TIMESTAMP_SIZE == 8
- {
- uint64_t ts;
- __wt_timestamp_set(
- (uint8_t *)&ts, (uint8_t *)&upd->timestamp[0]);
- ts = __wt_bswap64(ts);
- WT_RET(ds->f(ds, ", stamp %" PRIu64, ts));
- }
+ WT_RET(ds->f(ds,
+ ", stamp %" PRIu64, upd->timestamp.val));
#else
- {
int i;
+
WT_RET(ds->f(ds, ", stamp 0x"));
for (i = 0; i < WT_TIMESTAMP_SIZE; ++i)
- WT_RET(ds->f(ds, "%" PRIx8, upd->timestamp[i]));
- }
+ WT_RET(ds->f(ds,
+ "%" PRIx8, upd->timestamp.ts[i]));
#endif
}
#endif
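
__debug_modified() implies the packed layout of a modified update: a leading size_t entry count, three size_t values per entry, then the concatenated replacement bytes. A sketch that packs and dumps one entry in that layout, assuming the per-entry triple is {data length, offset, replaced length} as in the public WT_MODIFY structure (the field order here is an assumption):

    /*
     * Sketch of the packed-modify layout implied by __debug_modified:
     *   [nentries][3 x size_t per entry][concatenated data bytes]
     * Assumes the triple is {data length, offset, replaced length}.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static void *
    pack_one(size_t off, size_t repl, const char *bytes, size_t len)
    {
        size_t *p;
        void *buf;

        if ((buf = malloc(sizeof(size_t) * 4 + len)) == NULL)
            return (NULL);
        p = buf;
        p[0] = 1;                           /* entry count */
        p[1] = len;                         /* data length */
        p[2] = off;                         /* offset in the value */
        p[3] = repl;                        /* bytes replaced */
        memcpy((char *)buf + sizeof(size_t) * 4, bytes, len);
        return (buf);
    }

    static void
    dump(const void *modify)
    {
        const size_t *p = modify;
        size_t nentries = *p++;
        const char *data = (const char *)modify +
            sizeof(size_t) + nentries * 3 * sizeof(size_t);

        printf("%zu: ", nentries);
        for (; nentries-- > 0; data += p[0], p += 3)
            printf("{%zu, %zu, %zu, %.*s}%s", p[0], p[1], p[2],
                (int)p[0], data, nentries == 0 ? "" : ", ");
        printf("\n");
    }

    int
    main(void)
    {
        void *m = pack_one(6, 5, "world", 5);

        if (m == NULL)
            return (1);
        dump(m);                            /* prints: 1: {5, 6, 5, world} */
        free(m);
        return (0);
    }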
diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c
index eac8994a5a4..093192dbaa0 100644
--- a/src/third_party/wiredtiger/src/btree/bt_delete.c
+++ b/src/third_party/wiredtiger/src/btree/bt_delete.c
@@ -153,6 +153,7 @@ void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_UPDATE **upd;
+ uint64_t sleep_count, yield_count;
/*
* If the page is still "deleted", it's as we left it, reset the state
@@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
* instantiated or being instantiated. Loop because it's possible for
* the page to return to the deleted state if instantiation fails.
*/
- for (;; __wt_yield())
+ for (sleep_count = yield_count = 0;;) {
switch (ref->state) {
case WT_REF_DISK:
case WT_REF_READING:
@@ -205,6 +206,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
__wt_free(session, ref->page_del);
return;
}
+ /*
+ * We wait for the change in page state, yield before retrying,
+ * and if we've yielded enough times, start sleeping so we don't
+ * burn CPU to no purpose.
+ */
+ __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+ WT_STAT_CONN_INCRV(session, page_del_rollback_blocked,
+ sleep_count);
+ }
}
/*
@@ -242,10 +252,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
return (false);
skip = ref->page_del == NULL || (visible_all ?
- __wt_txn_visible_all(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)):
- __wt_txn_visible(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)));
+ __wt_txn_visible_all(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)):
+ __wt_txn_visible(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)));
/*
* The page_del structure can be freed as soon as the delete is stable:
@@ -254,8 +264,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
* no longer need synchronization to check the ref.
*/
if (skip && ref->page_del != NULL && (visible_all ||
- __wt_txn_visible_all(session,
- ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)))) {
+ __wt_txn_visible_all(session, ref->page_del->txnid,
+ WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) {
__wt_free(session, ref->page_del->update_list);
__wt_free(session, ref->page_del);
}
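
WT-3412's pattern here: the bare for (;; __wt_yield()) retry loop becomes a loop over __wt_ref_state_yield_sleep(), which yields for a while and then starts sleeping, and the slept time feeds the new page_del_rollback_blocked statistic (bt_read.c and bt_walk.c below adopt the same helper). The helper itself isn't part of this diff; a minimal sketch of the backoff it implies, with assumed thresholds:

    /*
     * Sketch of a yield-then-sleep backoff: spin politely at first, then
     * sleep so long waits don't burn CPU. Thresholds are illustrative;
     * the real __wt_ref_state_yield_sleep is not shown in this diff.
     */
    #include <sched.h>
    #include <stdint.h>
    #include <unistd.h>

    static void
    yield_sleep(uint64_t *yield_count, uint64_t *sleep_usecs)
    {
        if (++*yield_count < 1000) {        /* assumed spin threshold */
            sched_yield();
            return;
        }
        /* Ramp the sleep toward a cap; callers can add this to statistics. */
        *sleep_usecs += 100;
        if (*sleep_usecs > 10000)
            *sleep_usecs = 10000;
        usleep((useconds_t)*sleep_usecs);
    }

    int
    main(void)
    {
        uint64_t sleep_usecs = 0, yields = 0;
        int i;

        for (i = 0; i < 5; ++i)             /* stands in for a retry loop */
            yield_sleep(&yields, &sleep_usecs);
        return (0);
    }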
diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
index a0b1ff65006..f933245eaef 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c
@@ -45,13 +45,15 @@ __ovfl_read(WT_SESSION_IMPL *session,
*/
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
- WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
+ WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
WT_DECL_RET;
WT_OVFL_TRACK *track;
WT_UPDATE *upd;
size_t i;
+ *decoded = false;
+
/*
* If no page specified, there's no need to lock and there's no cache
* to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
@@ -78,8 +80,9 @@ __wt_ovfl_read(WT_SESSION_IMPL *session,
break;
}
WT_ASSERT(session, i < track->remove_next);
- store->data = WT_UPDATE_DATA(upd);
+ store->data = upd->data;
store->size = upd->size;
+ *decoded = true;
} else
ret = __ovfl_read(session, unpack->data, unpack->size, store);
__wt_readunlock(session, &S2BT(session)->ovfl_lock);
@@ -147,7 +150,7 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
/* Read the overflow value. */
WT_RET(__wt_scr_alloc(session, 1024, &tmp));
- WT_ERR(__ovfl_read(session, unpack->data, unpack->size, tmp));
+ WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp));
/*
* Create an update entry with no transaction ID to ensure global
@@ -159,10 +162,23 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page,
* involves atomic operations which will act as our barrier. Regardless,
* we update the page footprint as part of this operation, which acts as
* a barrier as well.
+ *
+ * The update transaction ID choice is tricky, to work around an issue
+ * in variable-length column store. Imagine an overflow value with an
+ * RLE greater than 1. We append a copy to the end of an update chain,
+ * but it's possible it's the overflow value for more than one record,
+ * and appending it to the end of one record's update chain means a
+ * subsequent insert of a globally visible value to one of the records
+ * would allow the truncation of the overflow chain that leaves other
+ * records without a value. If appending such an overflow record, set
+ * the transaction ID to the first possible transaction ID. That ID is
+ * old enough to be globally visible, but we can use it as a flag if an
+ * update record cannot be discarded when truncating an update chain.
*/
WT_ERR(__wt_update_alloc(
session, tmp, &append, &size, WT_UPDATE_STANDARD));
- append->txnid = WT_TXN_NONE;
+ append->txnid = page->type == WT_PAGE_COL_VAR &&
+ __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE;
for (upd = upd_list; upd->next != NULL; upd = upd->next)
;
WT_PUBLISH(upd->next, append);
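
The transaction-ID trick in __ovfl_cache_append_update() pairs with the __wt_update_obsolete_check() change at the end of this commit (row_modify.c): cached overflow values for RLE > 1 column-store cells are tagged WT_TXN_FIRST, an ID old enough to be visible to everyone, yet distinguishable so chain truncation can refuse to discard the shared copy. A compact sketch of a reserved ID doing double duty as a "never discard" flag (the IDs and visibility rule are illustrative):

    /*
     * Sketch: a reserved transaction ID doubles as a "never discard"
     * flag. ID values and the visibility rule are illustrative.
     */
    #include <stdbool.h>
    #include <stdint.h>

    #define TXN_NONE  0                 /* no transaction: always visible */
    #define TXN_FIRST 1                 /* oldest possible ID: also visible */

    struct upd {
        uint64_t txnid;
    };

    static bool
    globally_visible(uint64_t txnid, uint64_t oldest_id)
    {
        return (txnid < oldest_id);
    }

    /* An update may anchor chain truncation only if it can be discarded. */
    static bool
    can_terminate_chain(const struct upd *upd, uint64_t oldest_id)
    {
        return (globally_visible(upd->txnid, oldest_id) &&
            upd->txnid != TXN_FIRST);   /* shared overflow copy: keep it */
    }

    int
    main(void)
    {
        struct upd shared = { TXN_FIRST }, normal = { 5 };

        /* With oldest running ID 10, only the normal update may anchor. */
        return (can_terminate_chain(&normal, 10) &&
            !can_terminate_chain(&shared, 10) ? 0 : 1);
    }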
diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c
index 1bdf0fd1c8b..f28c4e10594 100644
--- a/src/third_party/wiredtiger/src/btree/bt_random.c
+++ b/src/third_party/wiredtiger/src/btree/bt_random.c
@@ -417,9 +417,10 @@ random_page_entry:
* the next entry, if that doesn't work, move to the previous entry.
*/
WT_ERR(__wt_row_random_leaf(session, cbt));
- if (__wt_cursor_valid(cbt, &upd))
- WT_ERR(__wt_kv_return(session, cbt, upd));
- else {
+ if (__wt_cursor_valid(cbt, &upd)) {
+ WT_ERR(__wt_key_return(session, cbt));
+ WT_ERR(__wt_value_return(session, cbt, upd));
+ } else {
if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
ret = __wt_btcur_prev(cbt, false);
WT_ERR(ret);
diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c
index 6a89f505c31..91c1499840e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_read.c
+++ b/src/third_party/wiredtiger/src/btree/bt_read.c
@@ -194,7 +194,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session,
upd->txnid = upd_txnid;
#ifdef HAVE_TIMESTAMPS
WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE);
- __wt_timestamp_set(upd->timestamp, las_timestamp.data);
+ __wt_timestamp_set(&upd->timestamp, las_timestamp.data);
#endif
switch (page->type) {
@@ -487,7 +487,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *page;
- u_int sleep_cnt, wait_cnt;
+ uint64_t sleep_cnt, wait_cnt;
bool busy, cache_work, evict_soon, stalled;
int force_attempts;
@@ -672,9 +672,8 @@ skip_evict:
if (cache_work)
continue;
}
- sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
+ __wt_ref_state_yield_sleep(&wait_cnt, &sleep_cnt);
WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt);
- __wt_sleep(0, sleep_cnt);
}
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c
index 7212de72d6e..4452e6eb0c6 100644
--- a/src/third_party/wiredtiger/src/btree/bt_ret.c
+++ b/src/third_party/wiredtiger/src/btree/bt_ret.c
@@ -75,10 +75,10 @@ __key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
/*
* __value_return --
- * Change the cursor to reference an internal return value.
+ * Change the cursor to reference an internal original-page return value.
*/
static inline int
-__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
WT_BTREE *btree;
WT_CELL *cell;
@@ -93,13 +93,6 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
page = cbt->ref->page;
cursor = &cbt->iface;
- /* If the cursor references a WT_UPDATE item, return it. */
- if (upd != NULL) {
- cursor->value.data = WT_UPDATE_DATA(upd);
- cursor->value.size = upd->size;
- return (0);
- }
-
if (page->type == WT_PAGE_ROW_LEAF) {
rip = &page->pg_row[cbt->slot];
@@ -136,6 +129,99 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
}
/*
+ * __value_return_upd --
+ * Change the cursor to reference an internal update structure return
+ * value.
+ */
+static inline int
+__value_return_upd(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ WT_UPDATE **listp, *list[WT_MAX_MODIFY_UPDATE];
+ u_int i;
+ size_t allocated_bytes;
+
+ cursor = &cbt->iface;
+ allocated_bytes = 0;
+
+ /*
+ * We're passed a "standard" or "modified" update that's visible to us.
+ * Our caller should have already checked for deleted items (we're too
+ * far down the call stack to return not-found).
+ *
+ * Fast path if it's a standard item, assert our caller's behavior.
+ */
+ if (upd->type == WT_UPDATE_STANDARD) {
+ cursor->value.data = upd->data;
+ cursor->value.size = upd->size;
+ return (0);
+ }
+ WT_ASSERT(session, upd->type == WT_UPDATE_MODIFIED);
+
+ /*
+ * Find a complete update that's visible to us, tracking modifications
+ * that are visible to us.
+ */
+ for (i = 0, listp = list; upd != NULL; upd = upd->next) {
+ if (!__wt_txn_upd_visible(session, upd))
+ continue;
+
+ if (WT_UPDATE_DATA_VALUE(upd))
+ break;
+
+ if (upd->type == WT_UPDATE_MODIFIED) {
+ /*
+ * Update lists are expected to be short, but it's not
+ * guaranteed. There's sufficient room on the stack to
+ * avoid memory allocation in normal cases, but we have
+ * to handle the edge cases too.
+ */
+ if (i >= WT_MAX_MODIFY_UPDATE) {
+ if (i == WT_MAX_MODIFY_UPDATE)
+ listp = NULL;
+ WT_ERR(__wt_realloc_def(
+ session, &allocated_bytes, i + 1, &listp));
+ if (i == WT_MAX_MODIFY_UPDATE)
+ memcpy(listp, list, sizeof(list));
+ }
+ listp[i++] = upd;
+ }
+ }
+
+ /*
+ * If we hit the end of the chain, roll forward from the update item we
+ * found, otherwise, from the original page's value.
+ */
+ if (upd == NULL) {
+ /*
+ * Callers of this function set the cursor slot to an impossible
+ * value to check we're not trying to return on-page values when
+ * the update list should have been sufficient (which happens,
+ * for example, if an update list was truncated, deleting some
+ * standard update required by a previous modify update). Assert
+ * the case.
+ */
+ WT_ASSERT(session, cbt->slot != UINT32_MAX);
+
+ WT_ERR(__value_return(session, cbt));
+ } else if (upd->type == WT_UPDATE_DELETED)
+ WT_ERR(__wt_buf_set(session, &cursor->value, "", 0));
+ else
+ WT_ERR(__wt_buf_set(session,
+ &cursor->value, upd->data, upd->size));
+
+ while (i > 0)
+ WT_ERR(__wt_modify_apply(
+ session, &cursor->value, listp[--i]->data));
+
+err: if (allocated_bytes)
+ __wt_free(session, listp);
+ return (ret);
+}
+
+/*
* __wt_key_return --
* Change the cursor to reference an internal return key.
*/
@@ -164,21 +250,22 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
}
/*
- * __wt_kv_return --
- * Return a page referenced key/value pair to the application.
+ * __wt_value_return --
+ * Change the cursor to reference an internal return value.
*/
int
-__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
+__wt_value_return(
+ WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
WT_CURSOR *cursor;
cursor = &cbt->iface;
- WT_RET(__wt_key_return(session, cbt));
-
F_CLR(cursor, WT_CURSTD_VALUE_EXT);
- WT_RET(__value_return(session, cbt, upd));
+ if (upd == NULL)
+ WT_RET(__value_return(session, cbt));
+ else
+ WT_RET(__value_return_upd(session, cbt, upd));
F_SET(cursor, WT_CURSTD_VALUE_INT);
-
return (0);
}
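
__value_return_upd() collects the visible modify updates on a fixed-size stack array, spilling to the heap only for unexpectedly long chains, then replays them over the base value. A sketch of the stack-array-with-heap-spill idiom in isolation (the bound and element type are placeholders):

    /*
     * Sketch of the stack-buffer-with-heap-spill idiom used to collect
     * modify updates: no allocation in the common (short-chain) case.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define STACK_MAX 10                    /* assumed common-case bound */

    int
    main(void)
    {
        int *listp, list[STACK_MAX];
        int i, n;

        listp = list;
        for (i = 0, n = 0; i < 25; ++i) {   /* pretend 25 entries qualify */
            if (n >= STACK_MAX) {
                if (n == STACK_MAX) {       /* first spill: move to the heap */
                    if ((listp = malloc((size_t)(n + 1) * sizeof(int))) == NULL)
                        return (1);
                    memcpy(listp, list, sizeof(list));
                } else if ((listp =
                    realloc(listp, (size_t)(n + 1) * sizeof(int))) == NULL)
                    return (1);
            }
            listp[n++] = i;
        }

        /*
         * Replay in reverse collection order (last collected first), as
         * __value_return_upd does: the chain is walked newest-first, so
         * this applies the oldest visible modify first.
         */
        while (n > 0)
            printf("%d ", listp[--n]);
        printf("\n");

        if (listp != list)
            free(listp);
        return (0);
    }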
diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c
index c1b7b6c4001..2862c7fb6d7 100644
--- a/src/third_party/wiredtiger/src/btree/bt_split.c
+++ b/src/third_party/wiredtiger/src/btree/bt_split.c
@@ -31,6 +31,24 @@ typedef enum {
} WT_SPLIT_ERROR_PHASE;
/*
+ * __page_split_timing_stress --
+ * For debug purposes, optionally add delay in the page-split code to
+ * make latent race conditions in page splits easier to uncover.
+ */
+static void
+__page_split_timing_stress(WT_SESSION_IMPL *session,
+ uint32_t flag, uint64_t micro_seconds)
+{
+ WT_CONNECTION_IMPL *conn;
+
+ conn = S2C(session);
+
+ /* We only want to sleep when page split race flag is set. */
+ if (FLD_ISSET(conn->timing_stress_flags, flag))
+ __wt_sleep(0, micro_seconds);
+}
+
+/*
* __split_safe_free --
* Free a buffer if we can be sure no thread is accessing it, or schedule
* it to be freed otherwise.
@@ -308,8 +326,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
* Prepare a set of WT_REFs for a move.
*/
static void
-__split_ref_prepare(WT_SESSION_IMPL *session,
- WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first)
+__split_ref_prepare(
+ WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
@@ -331,40 +349,12 @@ __split_ref_prepare(WT_SESSION_IMPL *session,
ref = pindex->index[i];
child = ref->page;
- /*
- * Block eviction in newly created pages.
- *
- * Once the split is live, newly created internal pages might be
- * evicted and their WT_REF structures freed. If that happened
- * before all threads exit the index of the page that previously
- * "owned" the WT_REF, a thread might see a freed WT_REF. To
- * ensure that doesn't happen, the newly created page contains
- * the current split generation and can't be evicted until
- * all readers have left the old generation.
- *
- * Historic, we also blocked splits in newly created pages
- * because we didn't update the WT_REF.home field until after
- * the split was live, so the WT_REF.home fields being updated
- * could split again before the update, there's a race between
- * splits as to which would update them first. The current code
- * updates the WT_REF.home fields before going live (in this
- * function), this isn't an issue.
- */
- child->pg_intl_split_gen = split_gen;
-
- /*
- * We use a page flag to prevent the child from splitting from
- * underneath us, but the split-generation error checks don't
- * know about that flag; use the standard macros to ensure that
- * reading the child's page index structure is safe.
- */
+ /* Switch the WT_REF's to their new page. */
j = 0;
- WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
- WT_LEAVE_PAGE_INDEX(session);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
@@ -447,6 +437,18 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_calloc_one(session, alloc_refp));
root_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
for (root_refp = pindex->index,
alloc_refp = alloc_index->index, i = 0; i < children; ++i) {
@@ -471,10 +473,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
ref->ref_recno = (*root_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -504,13 +508,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), false);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, false);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Confirm the root page's index hasn't moved, then update it, which
@@ -520,12 +523,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_INTL_INDEX_SET(root, alloc_index);
alloc_index = NULL;
- WT_LEAVE_PAGE_INDEX(session);
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Get a generation for this split, mark the root page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Note: as the root page cannot currently be evicted, the root split
+ * generation isn't ever used. That said, it future proofs eviction
+ * and isn't expensive enough to special-case.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
root->pg_intl_split_gen = split_gen;
@@ -700,6 +712,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
/*
* Confirm the parent page's index hasn't moved then update it, which
* makes the split visible to threads descending the tree.
@@ -708,10 +724,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
WT_INTL_INDEX_SET(parent, alloc_index);
alloc_index = NULL;
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
+
/*
* Get a generation for this split, mark the page. This must be after
* the new index is swapped into place in order to know that no readers
* are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
parent->pg_intl_split_gen = split_gen;
@@ -760,7 +783,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
* Swapping in the new page index released the page for eviction, we can
* no longer look inside the page.
*/
-
if (ref->page == NULL)
__wt_verbose(session, WT_VERB_SPLIT,
"%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32
@@ -779,8 +801,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new,
/*
* The new page index is in place, free the WT_REF we were splitting and
* any deleted WT_REFs we found, modulo the usual safe free semantics.
- *
- * Acquire a new split generation.
*/
for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) {
next_ref = pindex->index[deleted_refs[i]];
@@ -976,6 +996,18 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_calloc_one(session, alloc_refp));
parent_incr += children * sizeof(WT_REF);
+ /*
+ * Once the split is live, newly created internal pages might be evicted
+ * and their WT_REF structures freed. If that happens before all threads
+ * exit the index of the page that previously "owned" the WT_REF, a
+ * thread might see a freed WT_REF. To ensure that doesn't happen, the
+ * created pages are set to the current split generation and so can't be
+ * evicted until all readers have left the old generation.
+ *
+ * Our thread has a stable split generation, get a copy.
+ */
+ split_gen = __wt_session_gen(session, WT_GEN_SPLIT);
+
/* Allocate child pages, and connect them into the new page index. */
WT_ASSERT(session, page_refp == pindex->index + chunk);
for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) {
@@ -1000,10 +1032,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
ref->ref_recno = (*page_refp)->ref_recno;
ref->state = WT_REF_MEM;
- /* Initialize the child page. */
+ /*
+ * Initialize the child page.
+ * Block eviction in newly created pages and mark them dirty.
+ */
child->pg_intl_parent_ref = ref;
-
- /* Mark it dirty. */
+ child->pg_intl_split_gen = split_gen;
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
@@ -1033,32 +1067,35 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
/* Start making real changes to the tree, errors are fatal. */
complete = WT_ERR_PANIC;
- /*
- * Prepare the WT_REFs for the move: this requires a stable split
- * generation to block splits in newly created pages, so get one.
- */
- WT_ENTER_PAGE_INDEX(session);
- __split_ref_prepare(session, alloc_index,
- __wt_session_gen(session, WT_GEN_SPLIT), true);
+ /* Prepare the WT_REFs for the move. */
+ __split_ref_prepare(session, alloc_index, true);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/* Split into the parent. */
- if ((ret = __split_parent(session, page_ref, alloc_index->index,
- alloc_index->entries, parent_incr, false, false)) == 0) {
- /*
- * Confirm the page's index hasn't moved, then update it, which
- * makes the split visible to threads descending the tree.
- */
- WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
- WT_INTL_INDEX_SET(page, replace_index);
- }
+ WT_ERR(__split_parent(session, page_ref, alloc_index->index,
+ alloc_index->entries, parent_incr, false, false));
- WT_LEAVE_PAGE_INDEX(session);
- WT_ERR(ret);
+ /*
+ * Confirm the page's index hasn't moved, then update it, which
+ * makes the split visible to threads descending the tree.
+ */
+ WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
+ WT_INTL_INDEX_SET(page, replace_index);
+
+ /* Encourage a race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND);
/*
* Get a generation for this split, mark the parent page. This must be
* after the new index is swapped into place in order to know that no
* readers are looking at the old index.
+ *
+ * Getting a new split generation implies a full barrier, no additional
+ * barrier is needed.
*/
split_gen = __wt_gen_next(session, WT_GEN_SPLIT);
page->pg_intl_split_gen = split_gen;
@@ -1122,18 +1159,15 @@ err: switch (complete) {
}
/*
- * __split_internal_lock_worker --
+ * __split_internal_lock --
* Lock an internal page.
*/
static int
-__split_internal_lock_worker(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
+__split_internal_lock(
+ WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp)
{
- WT_DECL_RET;
WT_PAGE *parent;
- WT_REF *parent_ref;
- *hazardp = false;
*parentp = NULL;
/*
@@ -1166,10 +1200,11 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
for (;;) {
parent = ref->home;
- /*
- * The page will be marked dirty, and we can only lock a page
- * with a modify structure.
- */
+ /* Encourage race */
+ __page_split_timing_stress(session,
+ WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND);
+
+ /* Page locks live in the modify structure. */
WT_RET(__wt_page_modify_init(session, parent));
if (trylock)
@@ -1182,69 +1217,28 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session,
}
/*
- * We have exclusive access to split the parent, and at this point, the
- * child prevents the parent from being evicted. However, once we
+ * This child has exclusive access to split its parent and the child's
+ * existence prevents the parent from being evicted. However, once we
* update the parent's index, it may no longer refer to the child, and
- * could conceivably be evicted. Get a hazard pointer on the parent
- * now, so that we can safely access it after updating the index.
- *
- * Take care getting the page doesn't trigger eviction work: we could
- * block trying to split a different child of our parent and deadlock
- * or we could be the eviction server relied upon by other threads to
- * populate the eviction queue.
- */
- if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) {
- WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT));
- *hazardp = true;
- }
+ * could conceivably be evicted. If the parent page is dirty, our page
+ * lock prevents eviction because reconciliation is blocked. However,
+ * if the page were clean, it could be evicted without encountering our
+ * page lock. That isn't possible because you cannot move a child page
+ * and still leave the parent page clean.
+ */
*parentp = parent;
return (0);
-
-err: WT_PAGE_UNLOCK(session, parent);
- return (ret);
-}
-
-/*
- * __split_internal_lock --
- * Lock an internal page.
- */
-static int
-__split_internal_lock(WT_SESSION_IMPL *session,
- WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp)
-{
- WT_DECL_RET;
-
- /*
- * There's no lock on our parent page and we're about to acquire one,
- * which implies using the WT_REF.home field to reference our parent
- * page. As a child of the parent page, we prevent its eviction, but
- * that's a weak guarantee. If the parent page splits, and our WT_REF
- * were to move with the split, the WT_REF.home field might change
- * underneath us and we could race, and end up attempting to access
- * an evicted page. Set the session page-index generation so if the
- * parent splits, it still can't be evicted.
- */
- WT_WITH_PAGE_INDEX(session,
- ret = __split_internal_lock_worker(
- session, ref, trylock, parentp, hazardp));
- return (ret);
}
/*
* __split_internal_unlock --
* Unlock the parent page.
*/
-static int
-__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard)
+static void
+__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent)
{
- WT_DECL_RET;
-
- if (hazard)
- ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref);
-
WT_PAGE_UNLOCK(session, parent);
- return (ret);
}
/*
@@ -1297,13 +1291,12 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref)
* Check if we should split up the tree.
*/
static int
-__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
+__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page)
{
WT_BTREE *btree;
WT_DECL_RET;
WT_PAGE *parent;
WT_REF *ref;
- bool parent_hazard;
btree = S2BT(session);
@@ -1317,8 +1310,10 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* split chunk, but we'll write it upon finding it in a different part
* of the tree.
*/
- if (btree->checkpointing != WT_CKPT_OFF)
- return (__split_internal_unlock(session, page, page_hazard));
+ if (btree->checkpointing != WT_CKPT_OFF) {
+ __split_internal_unlock(session, page);
+ return (0);
+ }
/*
* Page splits trickle up the tree, that is, as leaf pages grow large
@@ -1340,7 +1335,6 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
*/
for (;;) {
parent = NULL;
- parent_hazard = false;
ref = page->pg_intl_parent_ref;
/* If we don't need to split the page, we're done. */
@@ -1360,22 +1354,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard)
* Lock the parent and split into it, then swap the parent/page
* locks, lock-coupling up the tree.
*/
- WT_ERR(__split_internal_lock(
- session, ref, true, &parent, &parent_hazard));
+ WT_ERR(__split_internal_lock(session, ref, true, &parent));
ret = __split_internal(session, parent, page);
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, page);
page = parent;
- page_hazard = parent_hazard;
parent = NULL;
- parent_hazard = false;
WT_ERR(ret);
}
err: if (parent != NULL)
- WT_TRET(
- __split_internal_unlock(session, parent, parent_hazard));
- WT_TRET(__split_internal_unlock(session, page, page_hazard));
+ __split_internal_unlock(session, parent);
+ __split_internal_unlock(session, page);
/* A page may have been busy, in which case return without error. */
WT_RET_BUSY_OK(ret);
@@ -1462,11 +1452,11 @@ __split_multi_inmem(
case WT_PAGE_ROW_LEAF:
/* Build a key. */
if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->rip);
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
upd = orig->modify->mod_row_update[slot];
WT_ERR(__wt_row_leaf_key(
- session, orig, supd->rip, key, false));
+ session, orig, supd->ripcip, key, false));
} else {
upd = supd->ins->upd;
@@ -1530,7 +1520,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi)
break;
case WT_PAGE_ROW_LEAF:
if (supd->ins == NULL) {
- slot = WT_ROW_SLOT(orig, supd->rip);
+ slot = WT_ROW_SLOT(orig, supd->ripcip);
orig->modify->mod_row_update[slot] = NULL;
} else
supd->ins->upd = NULL;
@@ -1986,21 +1976,19 @@ err: if (split_ref[0] != NULL) {
}
/*
- * __wt_split_insert --
- * Lock, then split.
+ * __split_insert_lock --
+ * Split a page's last insert list entries into a separate page.
*/
-int
-__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+static int
+__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
-
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard));
+ /* Lock the parent page, then proceed with the insert split. */
+ WT_RET(__split_internal_lock(session, ref, true, &parent));
if ((ret = __split_insert(session, ref)) != 0) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2009,7 +1997,27 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_insert --
+ * Split a page's last insert list entries into a separate page.
+ */
+int
+__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the insert
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_insert_lock(session, ref));
+ return (ret);
}
/*
@@ -2077,21 +2085,19 @@ err: for (i = 0; i < new_entries; ++i)
}
/*
- * __wt_split_multi --
- * Lock, then split.
+ * __split_multi_lock --
+ * Split a page into multiple pages.
*/
-int
-__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+static int
+__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
{
WT_DECL_RET;
WT_PAGE *parent;
- bool hazard;
- __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
-
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
+ /* Lock the parent page, then proceed with the split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
if ((ret = __split_multi(session, ref, closing)) != 0 || closing) {
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ __split_internal_unlock(session, parent);
return (ret);
}
@@ -2100,26 +2106,63 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
* parent page locked, note the functions we call are responsible for
* releasing that lock.
*/
- return (__split_parent_climb(session, parent, hazard));
+ return (__split_parent_climb(session, parent));
+}
+
+/*
+ * __wt_split_multi --
+ * Split a page into multiple pages.
+ */
+int
+__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing)
+{
+ WT_DECL_RET;
+
+ __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref);
+
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the split.
+ */
+ WT_WITH_PAGE_INDEX(session,
+ ret = __split_multi_lock(session, ref, closing));
+ return (ret);
+}
+
+/*
+ * __split_reverse --
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
+ */
+static int
+__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
+{
+ WT_DECL_RET;
+ WT_PAGE *parent;
+
+ /* Lock the parent page, then proceed with the reverse split. */
+ WT_RET(__split_internal_lock(session, ref, false, &parent));
+ ret = __split_parent(session, ref, NULL, 0, 0, false, true);
+ __split_internal_unlock(session, parent);
+ return (ret);
}
/*
* __wt_split_reverse --
- * We have a locked ref that is empty and we want to rewrite the index in
- * its parent.
+ * Reverse split (rewrite a parent page's index to reflect an empty page).
*/
int
__wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref)
{
WT_DECL_RET;
- WT_PAGE *parent;
- bool hazard;
__wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref);
- WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard));
- ret = __split_parent(session, ref, NULL, 0, 0, false, true);
- WT_TRET(__split_internal_unlock(session, parent, hazard));
+ /*
+ * Set the session split generation to ensure underlying code isn't
+ * surprised by internal page eviction, then proceed with the reverse
+ * split.
+ */
+ WT_WITH_PAGE_INDEX(session, ret = __split_reverse(session, ref));
return (ret);
}
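
The WT-3047 mechanism visible throughout this file: sleep at the exact windows where another thread could interleave (the "Encourage a race" call sites), gated on connection-wide timing-stress flags so non-diagnostic runs pay only a flag test. A sketch of flag-gated delay injection (the flag names and values are made up):

    /*
     * Sketch of timing-stress injection: sleep at a known race window
     * only when the matching debug flag is set. Flags illustrative.
     */
    #include <stdint.h>
    #include <unistd.h>

    #define STRESS_INTERNAL_SPLIT  0x1u
    #define STRESS_PAGE_SPLIT      0x2u

    static uint32_t timing_stress_flags;    /* set from configuration */

    static void
    timing_stress(uint32_t flag, useconds_t usecs)
    {
        if ((timing_stress_flags & flag) != 0)
            usleep(usecs);
    }

    int
    main(void)
    {
        timing_stress_flags = STRESS_PAGE_SPLIT;
        timing_stress(STRESS_PAGE_SPLIT, 1000);     /* sleeps 1ms */
        timing_stress(STRESS_INTERNAL_SPLIT, 1000); /* flag clear: no-op */
        return (0);
    }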
diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c
index e3b9bbced48..d7150859e8f 100644
--- a/src/third_party/wiredtiger/src/btree/bt_stat.c
+++ b/src/third_party/wiredtiger/src/btree/bt_stat.c
@@ -137,7 +137,6 @@ __stat_page_col_var(
WT_CELL_UNPACK *unpack, _unpack;
WT_COL *cip;
WT_INSERT *ins;
- WT_UPDATE *upd;
uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
uint32_t i;
bool orig_deleted;
@@ -177,31 +176,39 @@ __stat_page_col_var(
* we find, correct the original count based on its state.
*/
WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
- upd = ins->upd;
- if (upd->type == WT_UPDATE_RESERVED)
- continue;
- if (upd->type == WT_UPDATE_DELETED) {
+ switch (ins->upd->type) {
+ case WT_UPDATE_DELETED:
if (!orig_deleted) {
++deleted_cnt;
--entry_cnt;
}
- } else
+ break;
+ case WT_UPDATE_MODIFIED:
+ case WT_UPDATE_STANDARD:
if (orig_deleted) {
--deleted_cnt;
++entry_cnt;
}
+ break;
+ case WT_UPDATE_RESERVED:
+ break;
+ }
}
}
/* Walk any append list. */
- WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) {
- if (ins->upd->type == WT_UPDATE_RESERVED)
- continue;
- if (ins->upd->type == WT_UPDATE_DELETED)
+ WT_SKIP_FOREACH(ins, WT_COL_APPEND(page))
+ switch (ins->upd->type) {
+ case WT_UPDATE_DELETED:
++deleted_cnt;
- else
+ break;
+ case WT_UPDATE_MODIFIED:
+ case WT_UPDATE_STANDARD:
++entry_cnt;
- }
+ break;
+ case WT_UPDATE_RESERVED:
+ break;
+ }
WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c
index 225e6812aa1..d783f8f6e71 100644
--- a/src/third_party/wiredtiger/src/btree/bt_walk.c
+++ b/src/third_party/wiredtiger/src/btree/bt_walk.c
@@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session,
{
WT_PAGE_INDEX *pindex;
WT_REF **start, **stop, **p, **t;
+ uint64_t sleep_count, yield_count;
uint32_t entries, slot;
- for (;;) {
+ /*
+ * If we don't find our reference, the page split and our home
+ * pointer references the wrong page. When internal pages
+ * split, their WT_REF structure home values are updated; yield
+ * and wait for that to happen.
+ */
+ for (sleep_count = yield_count = 0;;) {
/*
* Copy the parent page's index value: the page can split at
* any time, but the index's value is always valid, even if
@@ -58,14 +65,14 @@ __ref_index_slot(WT_SESSION_IMPL *session,
goto found;
}
}
-
/*
- * If we don't find our reference, the page split and our home
- * pointer references the wrong page. When internal pages
- * split, their WT_REF structure home values are updated; yield
- * and wait for that to happen.
+ * We failed to get the page index and slot reference, yield
+ * before retrying, and if we've yielded enough times, start
+ * sleeping so we don't burn CPU to no purpose.
*/
- __wt_yield();
+ __wt_ref_state_yield_sleep(&yield_count, &sleep_count);
+ WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked,
+ sleep_count);
}
found: WT_ASSERT(session, pindex->index[slot] == ref);
@@ -177,12 +184,13 @@ __ref_descend_prev(
WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp)
{
WT_PAGE_INDEX *pindex;
+ uint64_t yield_count;
/*
* We're passed a child page into which we're descending, and on which
* we have a hazard pointer.
*/
- for (;; __wt_yield()) {
+ for (yield_count = 0;; yield_count++, __wt_yield()) {
/*
* There's a split race when a cursor moving backwards through
* the tree descends the tree. If we're splitting an internal
@@ -242,6 +250,7 @@ __ref_descend_prev(
break;
}
*pindexp = pindex;
+ WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count);
}
/*
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
index e2d19bf705b..a57a9c17edb 100644
--- a/src/third_party/wiredtiger/src/btree/row_modify.c
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -268,13 +268,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value,
*/
if (modify_type == WT_UPDATE_DELETED ||
modify_type == WT_UPDATE_RESERVED)
- WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd));
+ WT_RET(__wt_calloc(session, 1, WT_UPDATE_SIZE, &upd));
else {
WT_RET(__wt_calloc(
- session, 1, sizeof(WT_UPDATE) + value->size, &upd));
+ session, 1, WT_UPDATE_SIZE + value->size, &upd));
if (value->size != 0) {
upd->size = WT_STORE_SIZE(value->size);
- memcpy(WT_UPDATE_DATA(upd), value->data, value->size);
+ memcpy(upd->data, value->data, value->size);
}
}
upd->type = (uint8_t)modify_type;
@@ -302,9 +302,16 @@ __wt_update_obsolete_check(
* freeing the memory.
*
* Walk the list of updates, looking for obsolete updates at the end.
+ *
+ * Only updates with globally visible, self-contained data can terminate
+ * update chains, ignore modified and reserved updates. Special case the
+ * first transaction ID, it flags column-store overflow values which can
+ * never be discarded.
*/
for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++)
- if (__wt_txn_upd_visible_all(session, upd)) {
+ if (WT_UPDATE_DATA_VALUE(upd) &&
+ __wt_txn_upd_visible_all(session, upd) &&
+ upd->txnid != WT_TXN_FIRST) {
if (first == NULL)
first = upd;
} else if (upd->txnid != WT_TXN_ABORTED)
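
Replacing sizeof(WT_UPDATE) with WT_UPDATE_SIZE and WT_UPDATE_DATA(upd) with upd->data throughout this commit suggests WT_UPDATE now ends in a flexible array member, with WT_UPDATE_SIZE the offset of that member, so one allocation holds header and payload. The standard C idiom, sketched with an illustrative struct (not WiredTiger's actual layout):

    /*
     * Sketch of the flexible-array-member allocation the new upd->data
     * usage implies: a single calloc holds the header and the payload.
     * The struct layout is illustrative, not WiredTiger's.
     */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct update {
        uint64_t txnid;
        uint32_t size;
        uint8_t type;
        uint8_t data[];                 /* payload follows the header */
    };

    #define UPDATE_SIZE offsetof(struct update, data)

    static struct update *
    update_alloc(const void *value, uint32_t size)
    {
        struct update *upd;

        if ((upd = calloc(1, UPDATE_SIZE + size)) == NULL)
            return (NULL);
        upd->size = size;
        memcpy(upd->data, value, size);
        return (upd);
    }

    int
    main(void)
    {
        struct update *upd = update_alloc("payload", 7);

        if (upd == NULL)
            return (1);
        printf("%.*s\n", (int)upd->size, (const char *)upd->data);
        free(upd);
        return (0);
    }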