diff options
author | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-08-01 16:42:49 +1000 |
---|---|---|
committer | Alex Gorrod <alexander.gorrod@mongodb.com> | 2017-08-01 16:42:49 +1000 |
commit | 835bfb21d8e67663d84a40aa4f7370a4403725a9 (patch) | |
tree | 4f5edb231524f95272f834e31461ba4e17e52903 /src/third_party/wiredtiger/src/btree | |
parent | 6300b3bd4ad9cd238a02bdb8ca681a447913f1af (diff) | |
download | mongo-835bfb21d8e67663d84a40aa4f7370a4403725a9.tar.gz |
Import wiredtiger: 2e9744d11a65c63ba7445060dc78371250f04051 from branch mongodb-3.6
ref: 6173a98979..2e9744d11a
for: 3.5.11
WT-2309 Add yields and/or sleeps in #DIAGNOSTIC mode
WT-3047 Add mode aimed at uncovering race conditions in split code
WT-3308 Add statistics tracking around yield loops
WT-3316 Add new engineering section to reference guide documentation
WT-3338 Optimize cursor modify
WT-3380 Special case 8-byte timestamps
WT-3387 Add support for a stable timestamp
WT-3389 Restructure split code to hold a split generation for the entire operation.
WT-3406 Reconciliation is choosing reserved records for writing.
WT-3410 Add developer documentation for table rename
WT-3412 Add backoff logic to the btree delete and walk yield loops
WT-3418 block manager object race
WT-3422 WiredTiger upgrading documents out of date
WT-3432 workgen needs braces around an "if" body
WT-3433 session->alter method should not be supported in read-only mode
WT-3439 lint/cleanup
WT-3440 Add a log record when starting a checkpoint
WT-3442 Coverity 1378213: false positive on diagnostic assignment.
WT-3446 Temporarily disable timestamp testing in test/checkpoint
WT-3447 test_stat_log02 can assert before table stats are printed
WT-3461 Avoid long sleeps when the system clock is adjusted
WT-3463 Add recovery of backup to test_timestamp03.py
WT-3466 Track the first commit timestamp for each transaction
WT-3467 Minor lint/cleanup
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_curnext.c | 41 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_curprev.c | 41 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_cursor.c | 214 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_debug.c | 82 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_delete.c | 24 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_ovfl.c | 24 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_random.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_read.c | 7 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_ret.c | 119 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_split.c | 383 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_stat.c | 31 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/bt_walk.c | 25 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/btree/row_modify.c | 15 |
13 files changed, 685 insertions, 328 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index cb59bff8f75..eb8a258d475 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -15,12 +15,10 @@ static inline int __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage) { - WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; - val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL) @@ -59,10 +57,10 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage) if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; - val->data = &cbt->v; + cbt->iface.value.data = &cbt->v; } else - val->data = WT_UPDATE_DATA(upd); - val->size = 1; + cbt->iface.value.data = upd->data; + cbt->iface.value.size = 1; return (0); } @@ -74,7 +72,6 @@ static inline int __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage) { WT_BTREE *btree; - WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; @@ -82,7 +79,6 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); page = cbt->ref->page; - val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { @@ -108,10 +104,10 @@ new_page: upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); - val->data = &cbt->v; + cbt->iface.value.data = &cbt->v; } else - val->data = WT_UPDATE_DATA(upd); - val->size = 1; + cbt->iface.value.data = upd->data; + cbt->iface.value.size = 1; return (0); } @@ -122,12 +118,10 @@ new_page: static inline int __cursor_var_append_next(WT_CURSOR_BTREE *cbt, bool newpage) { - WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; - val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_FIRST(cbt->ins_head); @@ -147,9 +141,7 @@ new_page: if (cbt->ins == NULL) ++cbt->page_deleted_count; continue; } - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* NOTREACHED */ } @@ -164,7 +156,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage) WT_CELL *cell; WT_CELL_UNPACK unpack; WT_COL *cip; - WT_ITEM *val; WT_INSERT *ins; WT_PAGE *page; WT_SESSION_IMPL *session; @@ -173,7 +164,6 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; - val = &cbt->iface.value; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -210,10 +200,7 @@ new_page: /* Find the matching WT_COL slot. */ ++cbt->page_deleted_count; continue; } - - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* @@ -267,8 +254,8 @@ new_page: /* Find the matching WT_COL slot. */ cbt->cip_saved = cip; } - val->data = cbt->tmp->data; - val->size = cbt->tmp->size; + cbt->iface.value.data = cbt->tmp->data; + cbt->iface.value.size = cbt->tmp->size; return (0); } /* NOTREACHED */ @@ -282,7 +269,7 @@ static inline int __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) { WT_INSERT *ins; - WT_ITEM *key, *val; + WT_ITEM *key; WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; @@ -291,7 +278,6 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; key = &cbt->iface.key; - val = &cbt->iface.value; /* * For row-store pages, we need a single item that tells us the part @@ -332,9 +318,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* Check for the end of the page. */ @@ -363,7 +347,6 @@ new_insert: if ((ins = cbt->ins) != NULL) { ++cbt->page_deleted_count; continue; } - return (__cursor_row_slot_return(cbt, rip, upd)); } /* NOTREACHED */ diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 6e49f4df68c..c1395ea9008 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -127,12 +127,10 @@ restart: static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { - WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; - val = &cbt->iface.value; if (newpage) { if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL) @@ -205,10 +203,10 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { cbt->v = 0; - val->data = &cbt->v; + cbt->iface.value.data = &cbt->v; } else - val->data = WT_UPDATE_DATA(upd); - val->size = 1; + cbt->iface.value.data = upd->data; + cbt->iface.value.size = 1; return (0); } @@ -220,7 +218,6 @@ static inline int __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_BTREE *btree; - WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; @@ -228,7 +225,6 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; btree = S2BT(session); - val = &cbt->iface.value; /* Initialize for each new page. */ if (newpage) { @@ -254,10 +250,10 @@ new_page: upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); - val->data = &cbt->v; + cbt->iface.value.data = &cbt->v; } else - val->data = WT_UPDATE_DATA(upd); - val->size = 1; + cbt->iface.value.data = upd->data; + cbt->iface.value.size = 1; return (0); } @@ -268,12 +264,10 @@ new_page: static inline int __cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { - WT_ITEM *val; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; - val = &cbt->iface.value; if (newpage) { cbt->ins = WT_SKIP_LAST(cbt->ins_head); @@ -293,9 +287,7 @@ new_page: if (cbt->ins == NULL) ++cbt->page_deleted_count; continue; } - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* NOTREACHED */ } @@ -311,7 +303,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) WT_CELL_UNPACK unpack; WT_COL *cip; WT_INSERT *ins; - WT_ITEM *val; WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; @@ -319,7 +310,6 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; - val = &cbt->iface.value; rle_start = 0; /* -Werror=maybe-uninitialized */ @@ -357,10 +347,7 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) ++cbt->page_deleted_count; continue; } - - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* @@ -413,8 +400,8 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) cbt->cip_saved = cip; } - val->data = cbt->tmp->data; - val->size = cbt->tmp->size; + cbt->iface.value.data = cbt->tmp->data; + cbt->iface.value.size = cbt->tmp->size; return (0); } /* NOTREACHED */ @@ -428,7 +415,7 @@ static inline int __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_INSERT *ins; - WT_ITEM *key, *val; + WT_ITEM *key; WT_PAGE *page; WT_ROW *rip; WT_SESSION_IMPL *session; @@ -437,7 +424,6 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; page = cbt->ref->page; key = &cbt->iface.key; - val = &cbt->iface.value; /* * For row-store pages, we need a single item that tells us the part @@ -489,9 +475,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { } key->data = WT_INSERT_KEY(ins); key->size = WT_INSERT_KEY_SIZE(ins); - val->data = WT_UPDATE_DATA(upd); - val->size = upd->size; - return (0); + return (__wt_value_return(session, cbt, upd)); } /* Check for the beginning of the page. */ @@ -522,7 +506,6 @@ new_insert: if ((ins = cbt->ins) != NULL) { ++cbt->page_deleted_count; continue; } - return (__cursor_row_slot_return(cbt, rip, upd)); } /* NOTREACHED */ diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 52435eeefed..d58dc78fbed 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -308,8 +308,22 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) } /* + * __cursor_kv_return -- + * Return a page referenced key/value pair to the application. + */ +static inline int +__cursor_kv_return( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_RET(__wt_key_return(session, cbt)); + WT_RET(__wt_value_return(session, cbt, upd)); + + return (0); +} + +/* * __cursor_col_search -- - * Column-store search from an application cursor. + * Column-store search from a cursor. */ static inline int __cursor_col_search( @@ -324,7 +338,7 @@ __cursor_col_search( /* * __cursor_row_search -- - * Row-store search from an application cursor. + * Row-store search from a cursor. */ static inline int __cursor_row_search( @@ -338,8 +352,32 @@ __cursor_row_search( } /* + * __cursor_col_modify_v -- + * Column-store modify from a cursor, with a separate value. + */ +static inline int +__cursor_col_modify_v(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) +{ + return (__wt_col_modify(session, cbt, + cbt->iface.recno, value, NULL, modify_type, false)); +} + +/* + * __cursor_row_modify_v -- + * Row-store modify from a cursor, with a separate value. + */ +static inline int +__cursor_row_modify_v(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) +{ + return (__wt_row_modify(session, cbt, + &cbt->iface.key, value, NULL, modify_type, false)); +} + +/* * __cursor_col_modify -- - * Column-store delete, insert, and update from an application cursor. + * Column-store modify from a cursor. */ static inline int __cursor_col_modify( @@ -351,7 +389,7 @@ __cursor_col_modify( /* * __cursor_row_modify -- - * Row-store insert, update and delete from an application cursor. + * Row-store modify from a cursor. */ static inline int __cursor_row_modify( @@ -442,7 +480,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) } if (valid) - ret = __wt_kv_return(session, cbt, upd); + ret = __cursor_kv_return(session, cbt, upd); else if (__cursor_fix_implicit(btree, cbt)) { /* * Creating a record past the end of the tree in a fixed-length @@ -564,7 +602,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) */ if (valid) { exact = cbt->compare; - ret = __wt_kv_return(session, cbt, upd); + ret = __cursor_kv_return(session, cbt, upd); } else if (__cursor_fix_implicit(btree, cbt)) { cbt->recno = cursor->recno; cbt->v = 0; @@ -582,7 +620,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) __cursor_col_search(session, cbt, NULL)); if (__wt_cursor_valid(cbt, &upd)) { exact = cbt->compare; - ret = __wt_kv_return(session, cbt, upd); + ret = __cursor_kv_return(session, cbt, upd); } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) exact = -1; } @@ -987,7 +1025,7 @@ done: /* * Update a record in the tree. */ static int -__btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type) +__btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) { WT_BTREE *btree; WT_CURFILE_STATE state; @@ -1015,6 +1053,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type) */ if (__cursor_page_pinned(cbt) && F_ISSET(cursor, WT_CURSTD_KEY_INT)) { WT_ERR(__wt_txn_autocommit_check(session)); + /* * The cursor position may not be exact (the cursor's comparison * value not equal to zero). Correct to an exact match so we can @@ -1022,8 +1061,8 @@ __btcur_update(WT_CURSOR_BTREE *cbt, u_int modify_type) */ cbt->compare = 0; ret = btree->type == BTREE_ROW ? - __cursor_row_modify(session, cbt, modify_type) : - __cursor_col_modify(session, cbt, modify_type); + __cursor_row_modify_v(session, cbt, value, modify_type) : + __cursor_col_modify_v(session, cbt, value, modify_type); if (ret == 0) goto done; @@ -1052,6 +1091,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); if (btree->type == BTREE_ROW) { WT_ERR(__cursor_row_search(session, cbt, NULL, true)); + /* * If not overwriting, check for conflicts and fail if the key * does not exist. @@ -1061,7 +1101,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) WT_ERR(WT_NOTFOUND); } - ret = __cursor_row_modify(session, cbt, modify_type); + ret = __cursor_row_modify_v(session, cbt, value, modify_type); } else { WT_ERR(__cursor_col_search(session, cbt, NULL)); @@ -1080,7 +1120,7 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } - ret = __cursor_col_modify(session, cbt, modify_type); + ret = __cursor_col_modify_v(session, cbt, value, modify_type); } err: if (ret == WT_RESTART) { @@ -1097,14 +1137,33 @@ err: if (ret == WT_RESTART) { * To make this work, we add a field to the btree cursor to pass back a * pointer to the modify function's allocated update structure. */ -done: if (ret == 0) { - if (modify_type == WT_UPDATE_RESERVED) { +done: if (ret == 0) + switch (modify_type) { + case WT_UPDATE_STANDARD: + /* + * WT_CURSOR.update returns a key and a value. + */ + WT_TRET(__cursor_kv_return( + session, cbt, cbt->modify_update)); + break; + case WT_UPDATE_RESERVED: + /* + * WT_CURSOR.reserve doesn't return any value. + */ F_CLR(cursor, WT_CURSTD_VALUE_SET); + /* FALLTHROUGH */ + case WT_UPDATE_MODIFIED: + /* + * WT_CURSOR.modify has already created the return value + * and our job is to leave it untouched. + */ WT_TRET(__wt_key_return(session, cbt)); - } else - WT_TRET( - __wt_kv_return(session, cbt, cbt->modify_update)); - } + break; + case WT_UPDATE_DELETED: + default: + WT_TRET(__wt_illegal_value(session, NULL)); + break; + } if (ret != 0) { WT_TRET(__cursor_reset(cbt)); @@ -1115,6 +1174,121 @@ done: if (ret == 0) { } /* + * __cursor_chain_exceeded -- + * Return if the update chain has exceeded the limit. Deleted or standard + * updates are anticipated to be sufficient to base the modify (although that's + * not guaranteed, they may not be visible or might abort before we read them). + * Also, this is not a hard limit, threads can race modifying updates. + */ +static bool +__cursor_chain_exceeded(WT_CURSOR_BTREE *cbt) +{ + WT_PAGE *page; + WT_UPDATE *upd; + int i; + + page = cbt->ref->page; + + upd = NULL; + if (cbt->ins != NULL) + upd = cbt->ins->upd; + else if (cbt->btree->type == BTREE_ROW && + page->modify != NULL && page->modify->mod_row_update != NULL) + upd = page->modify->mod_row_update[cbt->slot]; + + for (i = 0; upd != NULL; ++i, upd = upd->next) { + if (upd->type == WT_UPDATE_DELETED || + upd->type == WT_UPDATE_STANDARD) + return (false); + if (i >= WT_MAX_MODIFY_UPDATE) + return (true); + } + return (false); +} + +/* + * __wt_btcur_modify -- + * Modify a record in the tree. + */ +int +__wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries) +{ + WT_CURFILE_STATE state; + WT_CURSOR *cursor; + WT_DECL_ITEM(modify); + WT_DECL_RET; + WT_SESSION_IMPL *session; + size_t orig, new; + bool chain_exceeded, overwrite; + + cursor = &cbt->iface; + session = (WT_SESSION_IMPL *)cursor->session; + + WT_STAT_CONN_INCR(session, cursor_modify); + WT_STAT_DATA_INCR(session, cursor_modify); + + /* Save the cursor state. */ + __cursor_state_save(cursor, &state); + + /* + * Get the current value and apply the modification to it, for a few + * reasons: first, we set the updated value so the application can + * retrieve the cursor's value; second, we use the updated value as + * the update if the update chain is too long; third, there's a check + * if the updated value is too large to store; fourth, to simplify the + * count of bytes being added/removed; fifth, we can get into serious + * trouble if we attempt to modify a value that doesn't exist. For the + * fifth reason, verify we're not in a read-uncommitted transaction, + * that implies a value that might disappear out from under us. + */ + if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED) + WT_ERR_MSG(session, ENOTSUP, + "not supported in read-uncommitted transactions"); + + WT_ERR(__wt_btcur_search(cbt)); + orig = cursor->value.size; + WT_ERR(__wt_modify_apply_api( + session, &cursor->value, entries, nentries)); + new = cursor->value.size; + WT_ERR(__cursor_size_chk(session, &cursor->value)); + if (new > orig) + WT_STAT_DATA_INCRV(session, cursor_update_bytes, new - orig); + else + WT_STAT_DATA_DECRV(session, cursor_update_bytes, orig - new); + + /* + * WT_CURSOR.modify is update-without-overwrite. + * + * Use the modify buffer as the update if under the limit, else use the + * complete value. + */ + overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); + F_CLR(cursor, WT_CURSTD_OVERWRITE); + chain_exceeded = __cursor_chain_exceeded(cbt); + if (chain_exceeded) + ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD); + else if ((ret = + __wt_modify_pack(session, &modify, entries, nentries)) == 0) + ret = __btcur_update(cbt, modify, WT_UPDATE_MODIFIED); + if (overwrite) + F_SET(cursor, WT_CURSTD_OVERWRITE); + + /* + * We have our own cursor state restoration because we've modified the + * cursor before calling the underlying cursor update function and we + * need to restore it to its original state. This means multiple calls + * to reset the cursor, but that shouldn't be a problem. + */ + if (ret != 0) { +err: WT_TRET(__cursor_reset(cbt)); + __cursor_state_restore(cursor, &state); + } + + __wt_scr_free(session, &modify); + return (ret); +} + +/* * __wt_btcur_reserve -- * Reserve a record in the tree. */ @@ -1135,7 +1309,7 @@ __wt_btcur_reserve(WT_CURSOR_BTREE *cbt) /* WT_CURSOR.reserve is update-without-overwrite and a special value. */ overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE); F_CLR(cursor, WT_CURSTD_OVERWRITE); - ret = __btcur_update(cbt, WT_UPDATE_RESERVED); + ret = __btcur_update(cbt, &cursor->value, WT_UPDATE_RESERVED); if (overwrite) F_SET(cursor, WT_CURSTD_OVERWRITE); return (ret); @@ -1164,7 +1338,7 @@ __wt_btcur_update(WT_CURSOR_BTREE *cbt) WT_RET(__cursor_size_chk(session, &cursor->key)); WT_RET(__cursor_size_chk(session, &cursor->value)); - return (__btcur_update(cbt, WT_UPDATE_STANDARD)); + return (__btcur_update(cbt, &cursor->value, WT_UPDATE_STANDARD)); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index c0aaf3f42d9..b8d11be7b3e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -986,6 +986,35 @@ __debug_row_skip(WT_DBG *ds, WT_INSERT_HEAD *head) } /* + * __debug_modified -- + * Dump a modified update. + */ +static int +__debug_modified(WT_DBG *ds, WT_UPDATE *upd) +{ + const size_t *p; + int nentries; + const uint8_t *data; + void *modify; + + modify = upd->data; + + p = modify; + nentries = (int)*p++; + data = (uint8_t *)modify + + sizeof(size_t) + ((size_t)nentries * 3 * sizeof(size_t)); + + WT_RET(ds->f(ds, "%d: ", nentries)); + for (; nentries-- > 0; data += p[0], p += 3) + WT_RET(ds->f(ds, + "{%" WT_SIZET_FMT ", %" WT_SIZET_FMT ", %" WT_SIZET_FMT + ", %.*s}%s", p[0], p[1], p[2], + (int)p[2], data, nentries == 0 ? "" : ", ")); + + return (0); +} + +/* * __debug_update -- * Dump an update list. */ @@ -993,37 +1022,46 @@ static int __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) { for (; upd != NULL; upd = upd->next) { - if (upd->type == WT_UPDATE_DELETED) + switch (upd->type) { + case WT_UPDATE_DELETED: WT_RET(ds->f(ds, "\tvalue {deleted}\n")); - else if (upd->type == WT_UPDATE_RESERVED) - WT_RET(ds->f(ds, "\tvalue {reserved}\n")); - else if (hexbyte) { - WT_RET(ds->f(ds, "\t{")); - WT_RET(__debug_hex_byte(ds, - *(uint8_t *)WT_UPDATE_DATA(upd))); + break; + case WT_UPDATE_MODIFIED: + WT_RET(ds->f(ds, "\tvalue {modified: ")); + WT_RET(__debug_modified(ds, upd)); WT_RET(ds->f(ds, "}\n")); - } else - WT_RET(__debug_item(ds, - "value", WT_UPDATE_DATA(upd), upd->size)); - WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid)); + break; + case WT_UPDATE_RESERVED: + WT_RET(ds->f(ds, "\tvalue {reserved}\n")); + break; + case WT_UPDATE_STANDARD: + if (hexbyte) { + WT_RET(ds->f(ds, "\t{")); + WT_RET(__debug_hex_byte(ds, *upd->data)); + WT_RET(ds->f(ds, "}\n")); + } else + WT_RET(__debug_item(ds, + "value", upd->data, upd->size)); + break; + } + if (upd->txnid == WT_TXN_ABORTED) + WT_RET(ds->f(ds, "\t" "txn aborted")); + else + WT_RET(ds->f(ds, "\t" "txn id %" PRIu64, upd->txnid)); #ifdef HAVE_TIMESTAMPS - if (!__wt_timestamp_iszero(upd->timestamp)) { + if (!__wt_timestamp_iszero( + WT_TIMESTAMP_NULL(&upd->timestamp))) { #if WT_TIMESTAMP_SIZE == 8 - { - uint64_t ts; - __wt_timestamp_set( - (uint8_t *)&ts, (uint8_t *)&upd->timestamp[0]); - ts = __wt_bswap64(ts); - WT_RET(ds->f(ds, ", stamp %" PRIu64, ts)); - } + WT_RET(ds->f(ds, + ", stamp %" PRIu64, upd->timestamp.val)); #else - { int i; + WT_RET(ds->f(ds, ", stamp 0x")); for (i = 0; i < WT_TIMESTAMP_SIZE; ++i) - WT_RET(ds->f(ds, "%" PRIx8, upd->timestamp[i])); - } + WT_RET(ds->f(ds, + "%" PRIx8, upd->timestamp.ts[i])); #endif } #endif diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index eac8994a5a4..093192dbaa0 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -153,6 +153,7 @@ void __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { WT_UPDATE **upd; + uint64_t sleep_count, yield_count; /* * If the page is still "deleted", it's as we left it, reset the state @@ -160,7 +161,7 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * instantiated or being instantiated. Loop because it's possible for * the page to return to the deleted state if instantiation fails. */ - for (;; __wt_yield()) + for (sleep_count = yield_count = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_READING: @@ -205,6 +206,15 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) __wt_free(session, ref->page_del); return; } + /* + * We wait for the change in page state, yield before retrying, + * and if we've yielded enough times, start sleeping so we don't + * burn CPU to no purpose. + */ + __wt_ref_state_yield_sleep(&yield_count, &sleep_count); + WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, + sleep_count); + } } /* @@ -242,10 +252,10 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) return (false); skip = ref->page_del == NULL || (visible_all ? - __wt_txn_visible_all(session, - ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)): - __wt_txn_visible(session, - ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del))); + __wt_txn_visible_all(session, ref->page_del->txnid, + WT_TIMESTAMP_NULL(&ref->page_del->timestamp)): + __wt_txn_visible(session, ref->page_del->txnid, + WT_TIMESTAMP_NULL(&ref->page_del->timestamp))); /* * The page_del structure can be freed as soon as the delete is stable: @@ -254,8 +264,8 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) * no longer need synchronization to check the ref. */ if (skip && ref->page_del != NULL && (visible_all || - __wt_txn_visible_all(session, - ref->page_del->txnid, WT_GET_TIMESTAMP(ref->page_del)))) { + __wt_txn_visible_all(session, ref->page_del->txnid, + WT_TIMESTAMP_NULL(&ref->page_del->timestamp)))) { __wt_free(session, ref->page_del->update_list); __wt_free(session, ref->page_del); } diff --git a/src/third_party/wiredtiger/src/btree/bt_ovfl.c b/src/third_party/wiredtiger/src/btree/bt_ovfl.c index a0b1ff65006..f933245eaef 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ovfl.c +++ b/src/third_party/wiredtiger/src/btree/bt_ovfl.c @@ -45,13 +45,15 @@ __ovfl_read(WT_SESSION_IMPL *session, */ int __wt_ovfl_read(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store) + WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded) { WT_DECL_RET; WT_OVFL_TRACK *track; WT_UPDATE *upd; size_t i; + *decoded = false; + /* * If no page specified, there's no need to lock and there's no cache * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells. @@ -78,8 +80,9 @@ __wt_ovfl_read(WT_SESSION_IMPL *session, break; } WT_ASSERT(session, i < track->remove_next); - store->data = WT_UPDATE_DATA(upd); + store->data = upd->data; store->size = upd->size; + *decoded = true; } else ret = __ovfl_read(session, unpack->data, unpack->size, store); __wt_readunlock(session, &S2BT(session)->ovfl_lock); @@ -147,7 +150,7 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page, /* Read the overflow value. */ WT_RET(__wt_scr_alloc(session, 1024, &tmp)); - WT_ERR(__ovfl_read(session, unpack->data, unpack->size, tmp)); + WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, tmp)); /* * Create an update entry with no transaction ID to ensure global @@ -159,10 +162,23 @@ __ovfl_cache_append_update(WT_SESSION_IMPL *session, WT_PAGE *page, * involves atomic operations which will act as our barrier. Regardless, * we update the page footprint as part of this operation, which acts as * a barrier as well. + * + * The update transaction ID choice is tricky, to work around an issue + * in variable-length column store. Imagine an overflow value with an + * RLE greater than 1. We append a copy to the end of an update chain, + * but it's possible it's the overflow value for more than one record, + * and appending it to the end of one record's update chain means a + * subsequent enter of a globally visible value to one of the records + * would allow the truncation of the overflow chain that leaves other + * records without a value. If appending such an overflow record, set + * the transaction ID to the first possible transaction ID. That ID is + * old enough to be globally visible, but we can use it as a flag if an + * update record cannot be discarded when truncating an update chain. */ WT_ERR(__wt_update_alloc( session, tmp, &append, &size, WT_UPDATE_STANDARD)); - append->txnid = WT_TXN_NONE; + append->txnid = page->type == WT_PAGE_COL_VAR && + __wt_cell_rle(unpack) > 1 ? WT_TXN_FIRST : WT_TXN_NONE; for (upd = upd_list; upd->next != NULL; upd = upd->next) ; WT_PUBLISH(upd->next, append); diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 1bdf0fd1c8b..f28c4e10594 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -417,9 +417,10 @@ random_page_entry: * the next entry, if that doesn't work, move to the previous entry. */ WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__wt_cursor_valid(cbt, &upd)) - WT_ERR(__wt_kv_return(session, cbt, upd)); - else { + if (__wt_cursor_valid(cbt, &upd)) { + WT_ERR(__wt_key_return(session, cbt)); + WT_ERR(__wt_value_return(session, cbt, upd)); + } else { if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND) ret = __wt_btcur_prev(cbt, false); WT_ERR(ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 6a89f505c31..91c1499840e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -194,7 +194,7 @@ __las_page_instantiate(WT_SESSION_IMPL *session, upd->txnid = upd_txnid; #ifdef HAVE_TIMESTAMPS WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); - __wt_timestamp_set(upd->timestamp, las_timestamp.data); + __wt_timestamp_set(&upd->timestamp, las_timestamp.data); #endif switch (page->type) { @@ -487,7 +487,7 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; - u_int sleep_cnt, wait_cnt; + uint64_t sleep_cnt, wait_cnt; bool busy, cache_work, evict_soon, stalled; int force_attempts; @@ -672,9 +672,8 @@ skip_evict: if (cache_work) continue; } - sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); + __wt_ref_state_yield_sleep(&wait_cnt, &sleep_cnt); WT_STAT_CONN_INCRV(session, page_sleep, sleep_cnt); - __wt_sleep(0, sleep_cnt); } } diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index 7212de72d6e..4452e6eb0c6 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -75,10 +75,10 @@ __key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * __value_return -- - * Change the cursor to reference an internal return value. + * Change the cursor to reference an internal original-page return value. */ static inline int -__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) { WT_BTREE *btree; WT_CELL *cell; @@ -93,13 +93,6 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) page = cbt->ref->page; cursor = &cbt->iface; - /* If the cursor references a WT_UPDATE item, return it. */ - if (upd != NULL) { - cursor->value.data = WT_UPDATE_DATA(upd); - cursor->value.size = upd->size; - return (0); - } - if (page->type == WT_PAGE_ROW_LEAF) { rip = &page->pg_row[cbt->slot]; @@ -136,6 +129,99 @@ __value_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) } /* + * __value_return_upd -- + * Change the cursor to reference an internal update structure return + * value. + */ +static inline int +__value_return_upd( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + WT_UPDATE **listp, *list[WT_MAX_MODIFY_UPDATE]; + u_int i; + size_t allocated_bytes; + + cursor = &cbt->iface; + allocated_bytes = 0; + + /* + * We're passed a "standard" or "modified" update that's visible to us. + * Our caller should have already checked for deleted items (we're too + * far down the call stack to return not-found). + * + * Fast path if it's a standard item, assert our caller's behavior. + */ + if (upd->type == WT_UPDATE_STANDARD) { + cursor->value.data = upd->data; + cursor->value.size = upd->size; + return (0); + } + WT_ASSERT(session, upd->type == WT_UPDATE_MODIFIED); + + /* + * Find a complete update that's visible to us, tracking modifications + * that are visible to us. + */ + for (i = 0, listp = list; upd != NULL; upd = upd->next) { + if (!__wt_txn_upd_visible(session, upd)) + continue; + + if (WT_UPDATE_DATA_VALUE(upd)) + break; + + if (upd->type == WT_UPDATE_MODIFIED) { + /* + * Update lists are expected to be short, but it's not + * guaranteed. There's sufficient room on the stack to + * avoid memory allocation in normal cases, but we have + * to handle the edge cases too. + */ + if (i >= WT_MAX_MODIFY_UPDATE) { + if (i == WT_MAX_MODIFY_UPDATE) + listp = NULL; + WT_ERR(__wt_realloc_def( + session, &allocated_bytes, i + 1, &listp)); + if (i == WT_MAX_MODIFY_UPDATE) + memcpy(listp, list, sizeof(list)); + } + listp[i++] = upd; + } + } + + /* + * If we hit the end of the chain, roll forward from the update item we + * found, otherwise, from the original page's value. + */ + if (upd == NULL) { + /* + * Callers of this function set the cursor slot to an impossible + * value to check we're not trying to return on-page values when + * the update list should have been sufficient (which happens, + * for example, if an update list was truncated, deleting some + * standard update required by a previous modify update). Assert + * the case. + */ + WT_ASSERT(session, cbt->slot != UINT32_MAX); + + WT_ERR(__value_return(session, cbt)); + } else if (upd->type == WT_UPDATE_DELETED) + WT_ERR(__wt_buf_set(session, &cursor->value, "", 0)); + else + WT_ERR(__wt_buf_set(session, + &cursor->value, upd->data, upd->size)); + + while (i > 0) + WT_ERR(__wt_modify_apply( + session, &cursor->value, listp[--i]->data)); + +err: if (allocated_bytes) + __wt_free(session, listp); + return (ret); +} + +/* * __wt_key_return -- * Change the cursor to reference an internal return key. */ @@ -164,21 +250,22 @@ __wt_key_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) } /* - * __wt_kv_return -- - * Return a page referenced key/value pair to the application. + * __wt_value_return -- + * Change the cursor to reference an internal return value. */ int -__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +__wt_value_return( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { WT_CURSOR *cursor; cursor = &cbt->iface; - WT_RET(__wt_key_return(session, cbt)); - F_CLR(cursor, WT_CURSTD_VALUE_EXT); - WT_RET(__value_return(session, cbt, upd)); + if (upd == NULL) + WT_RET(__value_return(session, cbt)); + else + WT_RET(__value_return_upd(session, cbt, upd)); F_SET(cursor, WT_CURSTD_VALUE_INT); - return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index c1b7b6c4001..2862c7fb6d7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -31,6 +31,24 @@ typedef enum { } WT_SPLIT_ERROR_PHASE; /* + * __page_split_timing_stress -- + * Optionally add delay to simulate the race conditions in page split for + * debug purposes. The purpose is to uncover the race conditions in page split. + */ +static void +__page_split_timing_stress(WT_SESSION_IMPL *session, + uint32_t flag, uint64_t micro_seconds) +{ + WT_CONNECTION_IMPL *conn; + + conn = S2C(session); + + /* We only want to sleep when page split race flag is set. */ + if (FLD_ISSET(conn->timing_stress_flags, flag)) + __wt_sleep(0, micro_seconds); +} + +/* * __split_safe_free -- * Free a buffer if we can be sure no thread is accessing it, or schedule * it to be freed otherwise. @@ -308,8 +326,8 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, * Prepare a set of WT_REFs for a move. */ static void -__split_ref_prepare(WT_SESSION_IMPL *session, - WT_PAGE_INDEX *pindex, uint64_t split_gen, bool skip_first) +__split_ref_prepare( + WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first) { WT_PAGE *child; WT_REF *child_ref, *ref; @@ -331,40 +349,12 @@ __split_ref_prepare(WT_SESSION_IMPL *session, ref = pindex->index[i]; child = ref->page; - /* - * Block eviction in newly created pages. - * - * Once the split is live, newly created internal pages might be - * evicted and their WT_REF structures freed. If that happened - * before all threads exit the index of the page that previously - * "owned" the WT_REF, a thread might see a freed WT_REF. To - * ensure that doesn't happen, the newly created page contains - * the current split generation and can't be evicted until - * all readers have left the old generation. - * - * Historic, we also blocked splits in newly created pages - * because we didn't update the WT_REF.home field until after - * the split was live, so the WT_REF.home fields being updated - * could split again before the update, there's a race between - * splits as to which would update them first. The current code - * updates the WT_REF.home fields before going live (in this - * function), this isn't an issue. - */ - child->pg_intl_split_gen = split_gen; - - /* - * We use a page flag to prevent the child from splitting from - * underneath us, but the split-generation error checks don't - * know about that flag; use the standard macros to ensure that - * reading the child's page index structure is safe. - */ + /* Switch the WT_REF's to their new page. */ j = 0; - WT_ENTER_PAGE_INDEX(session); WT_INTL_FOREACH_BEGIN(session, child, child_ref) { child_ref->home = child; child_ref->pindex_hint = j++; } WT_INTL_FOREACH_END; - WT_LEAVE_PAGE_INDEX(session); #ifdef HAVE_DIAGNOSTIC WT_WITH_PAGE_INDEX(session, @@ -447,6 +437,18 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_calloc_one(session, alloc_refp)); root_incr += children * sizeof(WT_REF); + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page that previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * created pages are set to the current split generation and so can't be + * evicted until all readers have left the old generation. + * + * Our thread has a stable split generation, get a copy. + */ + split_gen = __wt_session_gen(session, WT_GEN_SPLIT); + /* Allocate child pages, and connect them into the new page index. */ for (root_refp = pindex->index, alloc_refp = alloc_index->index, i = 0; i < children; ++i) { @@ -471,10 +473,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) ref->ref_recno = (*root_refp)->ref_recno; ref->state = WT_REF_MEM; - /* Initialize the child page. */ + /* + * Initialize the child page. + * Block eviction in newly created pages and mark them dirty. + */ child->pg_intl_parent_ref = ref; - - /* Mark it dirty. */ + child->pg_intl_split_gen = split_gen; WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); @@ -504,13 +508,12 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* - * Prepare the WT_REFs for the move: this requires a stable split - * generation to block splits in newly created pages, so get one. - */ - WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, - __wt_session_gen(session, WT_GEN_SPLIT), false); + /* Prepare the WT_REFs for the move. */ + __split_ref_prepare(session, alloc_index, false); + + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* * Confirm the root page's index hasn't moved, then update it, which @@ -520,12 +523,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_INTL_INDEX_SET(root, alloc_index); alloc_index = NULL; - WT_LEAVE_PAGE_INDEX(session); + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* * Get a generation for this split, mark the root page. This must be * after the new index is swapped into place in order to know that no * readers are looking at the old index. + * + * Note: as the root page cannot currently be evicted, the root split + * generation isn't ever used. That said, it future proofs eviction + * and isn't expensive enough to special-case. + * + * Getting a new split generation implies a full barrier, no additional + * barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); root->pg_intl_split_gen = split_gen; @@ -700,6 +712,10 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); + /* * Confirm the parent page's index hasn't moved then update it, which * makes the split visible to threads descending the tree. @@ -708,10 +724,17 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, WT_INTL_INDEX_SET(parent, alloc_index); alloc_index = NULL; + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); + /* * Get a generation for this split, mark the page. This must be after * the new index is swapped into place in order to know that no readers * are looking at the old index. + * + * Getting a new split generation implies a full barrier, no additional + * barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); parent->pg_intl_split_gen = split_gen; @@ -760,7 +783,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, * Swapping in the new page index released the page for eviction, we can * no longer look inside the page. */ - if (ref->page == NULL) __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse split into parent %p, %" PRIu32 " -> %" PRIu32 @@ -779,8 +801,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, /* * The new page index is in place, free the WT_REF we were splitting and * any deleted WT_REFs we found, modulo the usual safe free semantics. - * - * Acquire a new split generation. */ for (i = 0, deleted_refs = scr->mem; i < deleted_entries; ++i) { next_ref = pindex->index[deleted_refs[i]]; @@ -976,6 +996,18 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_calloc_one(session, alloc_refp)); parent_incr += children * sizeof(WT_REF); + /* + * Once the split is live, newly created internal pages might be evicted + * and their WT_REF structures freed. If that happens before all threads + * exit the index of the page that previously "owned" the WT_REF, a + * thread might see a freed WT_REF. To ensure that doesn't happen, the + * created pages are set to the current split generation and so can't be + * evicted until all readers have left the old generation. + * + * Our thread has a stable split generation, get a copy. + */ + split_gen = __wt_session_gen(session, WT_GEN_SPLIT); + /* Allocate child pages, and connect them into the new page index. */ WT_ASSERT(session, page_refp == pindex->index + chunk); for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { @@ -1000,10 +1032,12 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) ref->ref_recno = (*page_refp)->ref_recno; ref->state = WT_REF_MEM; - /* Initialize the child page. */ + /* + * Initialize the child page. + * Block eviction in newly created pages and mark them dirty. + */ child->pg_intl_parent_ref = ref; - - /* Mark it dirty. */ + child->pg_intl_split_gen = split_gen; WT_ERR(__wt_page_modify_init(session, child)); __wt_page_modify_set(session, child); @@ -1033,32 +1067,35 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) /* Start making real changes to the tree, errors are fatal. */ complete = WT_ERR_PANIC; - /* - * Prepare the WT_REFs for the move: this requires a stable split - * generation to block splits in newly created pages, so get one. - */ - WT_ENTER_PAGE_INDEX(session); - __split_ref_prepare(session, alloc_index, - __wt_session_gen(session, WT_GEN_SPLIT), true); + /* Prepare the WT_REFs for the move. */ + __split_ref_prepare(session, alloc_index, true); + + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* Split into the parent. */ - if ((ret = __split_parent(session, page_ref, alloc_index->index, - alloc_index->entries, parent_incr, false, false)) == 0) { - /* - * Confirm the page's index hasn't moved, then update it, which - * makes the split visible to threads descending the tree. - */ - WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); - WT_INTL_INDEX_SET(page, replace_index); - } + WT_ERR(__split_parent(session, page_ref, alloc_index->index, + alloc_index->entries, parent_incr, false, false)); - WT_LEAVE_PAGE_INDEX(session); - WT_ERR(ret); + /* + * Confirm the page's index hasn't moved, then update it, which + * makes the split visible to threads descending the tree. + */ + WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex); + WT_INTL_INDEX_SET(page, replace_index); + + /* Encourage a race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_INTERNAL_PAGE_SPLIT_RACE, 100 * WT_THOUSAND); /* * Get a generation for this split, mark the parent page. This must be * after the new index is swapped into place in order to know that no * readers are looking at the old index. + * + * Getting a new split generation implies a full barrier, no additional + * barrier is needed. */ split_gen = __wt_gen_next(session, WT_GEN_SPLIT); page->pg_intl_split_gen = split_gen; @@ -1122,18 +1159,15 @@ err: switch (complete) { } /* - * __split_internal_lock_worker -- + * __split_internal_lock -- * Lock an internal page. */ static int -__split_internal_lock_worker(WT_SESSION_IMPL *session, - WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp) +__split_internal_lock( + WT_SESSION_IMPL *session, WT_REF *ref, bool trylock, WT_PAGE **parentp) { - WT_DECL_RET; WT_PAGE *parent; - WT_REF *parent_ref; - *hazardp = false; *parentp = NULL; /* @@ -1166,10 +1200,11 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session, for (;;) { parent = ref->home; - /* - * The page will be marked dirty, and we can only lock a page - * with a modify structure. - */ + /* Encourage race */ + __page_split_timing_stress(session, + WT_TIMING_STRESS_PAGE_SPLIT_RACE, WT_THOUSAND); + + /* Page locks live in the modify structure. */ WT_RET(__wt_page_modify_init(session, parent)); if (trylock) @@ -1182,69 +1217,28 @@ __split_internal_lock_worker(WT_SESSION_IMPL *session, } /* - * We have exclusive access to split the parent, and at this point, the - * child prevents the parent from being evicted. However, once we + * This child has exclusive access to split its parent and the child's + * existence prevents the parent from being evicted. However, once we * update the parent's index, it may no longer refer to the child, and - * could conceivably be evicted. Get a hazard pointer on the parent - * now, so that we can safely access it after updating the index. - * - * Take care getting the page doesn't trigger eviction work: we could - * block trying to split a different child of our parent and deadlock - * or we could be the eviction server relied upon by other threads to - * populate the eviction queue. - */ - if (!__wt_ref_is_root(parent_ref = parent->pg_intl_parent_ref)) { - WT_ERR(__wt_page_in(session, parent_ref, WT_READ_NO_EVICT)); - *hazardp = true; - } + * could conceivably be evicted. If the parent page is dirty, our page + * lock prevents eviction because reconciliation is blocked. However, + * if the page were clean, it could be evicted without encountering our + * page lock. That isn't possible because you cannot move a child page + * and still leave the parent page clean. + */ *parentp = parent; return (0); - -err: WT_PAGE_UNLOCK(session, parent); - return (ret); -} - -/* - * __split_internal_lock -- - * Lock an internal page. - */ -static int -__split_internal_lock(WT_SESSION_IMPL *session, - WT_REF *ref, bool trylock, WT_PAGE **parentp, bool *hazardp) -{ - WT_DECL_RET; - - /* - * There's no lock on our parent page and we're about to acquire one, - * which implies using the WT_REF.home field to reference our parent - * page. As a child of the parent page, we prevent its eviction, but - * that's a weak guarantee. If the parent page splits, and our WT_REF - * were to move with the split, the WT_REF.home field might change - * underneath us and we could race, and end up attempting to access - * an evicted page. Set the session page-index generation so if the - * parent splits, it still can't be evicted. - */ - WT_WITH_PAGE_INDEX(session, - ret = __split_internal_lock_worker( - session, ref, trylock, parentp, hazardp)); - return (ret); } /* * __split_internal_unlock -- * Unlock the parent page. */ -static int -__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) +static void +__split_internal_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent) { - WT_DECL_RET; - - if (hazard) - ret = __wt_hazard_clear(session, parent->pg_intl_parent_ref); - WT_PAGE_UNLOCK(session, parent); - return (ret); } /* @@ -1297,13 +1291,12 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) * Check if we should split up the tree. */ static int -__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) +__split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; - bool parent_hazard; btree = S2BT(session); @@ -1317,8 +1310,10 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) * split chunk, but we'll write it upon finding it in a different part * of the tree. */ - if (btree->checkpointing != WT_CKPT_OFF) - return (__split_internal_unlock(session, page, page_hazard)); + if (btree->checkpointing != WT_CKPT_OFF) { + __split_internal_unlock(session, page); + return (0); + } /* * Page splits trickle up the tree, that is, as leaf pages grow large @@ -1340,7 +1335,6 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) */ for (;;) { parent = NULL; - parent_hazard = false; ref = page->pg_intl_parent_ref; /* If we don't need to split the page, we're done. */ @@ -1360,22 +1354,18 @@ __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) * Lock the parent and split into it, then swap the parent/page * locks, lock-coupling up the tree. */ - WT_ERR(__split_internal_lock( - session, ref, true, &parent, &parent_hazard)); + WT_ERR(__split_internal_lock(session, ref, true, &parent)); ret = __split_internal(session, parent, page); - WT_TRET(__split_internal_unlock(session, page, page_hazard)); + __split_internal_unlock(session, page); page = parent; - page_hazard = parent_hazard; parent = NULL; - parent_hazard = false; WT_ERR(ret); } err: if (parent != NULL) - WT_TRET( - __split_internal_unlock(session, parent, parent_hazard)); - WT_TRET(__split_internal_unlock(session, page, page_hazard)); + __split_internal_unlock(session, parent); + __split_internal_unlock(session, page); /* A page may have been busy, in which case return without error. */ WT_RET_BUSY_OK(ret); @@ -1462,11 +1452,11 @@ __split_multi_inmem( case WT_PAGE_ROW_LEAF: /* Build a key. */ if (supd->ins == NULL) { - slot = WT_ROW_SLOT(orig, supd->rip); + slot = WT_ROW_SLOT(orig, supd->ripcip); upd = orig->modify->mod_row_update[slot]; WT_ERR(__wt_row_leaf_key( - session, orig, supd->rip, key, false)); + session, orig, supd->ripcip, key, false)); } else { upd = supd->ins->upd; @@ -1530,7 +1520,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi) break; case WT_PAGE_ROW_LEAF: if (supd->ins == NULL) { - slot = WT_ROW_SLOT(orig, supd->rip); + slot = WT_ROW_SLOT(orig, supd->ripcip); orig->modify->mod_row_update[slot] = NULL; } else supd->ins->upd = NULL; @@ -1986,21 +1976,19 @@ err: if (split_ref[0] != NULL) { } /* - * __wt_split_insert -- - * Lock, then split. + * __split_insert_lock -- + * Split a page's last insert list entries into a separate page. */ -int -__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) +static int +__split_insert_lock(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_RET; WT_PAGE *parent; - bool hazard; - - __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); - WT_RET(__split_internal_lock(session, ref, true, &parent, &hazard)); + /* Lock the parent page, then proceed with the insert split. */ + WT_RET(__split_internal_lock(session, ref, true, &parent)); if ((ret = __split_insert(session, ref)) != 0) { - WT_TRET(__split_internal_unlock(session, parent, hazard)); + __split_internal_unlock(session, parent); return (ret); } @@ -2009,7 +1997,27 @@ __wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * parent page locked, note the functions we call are responsible for * releasing that lock. */ - return (__split_parent_climb(session, parent, hazard)); + return (__split_parent_climb(session, parent)); +} + +/* + * __wt_split_insert -- + * Split a page's last insert list entries into a separate page. + */ +int +__wt_split_insert(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-insert", (void *)ref); + + /* + * Set the session split generation to ensure underlying code isn't + * surprised by internal page eviction, then proceed with the insert + * split. + */ + WT_WITH_PAGE_INDEX(session, ret = __split_insert_lock(session, ref)); + return (ret); } /* @@ -2077,21 +2085,19 @@ err: for (i = 0; i < new_entries; ++i) } /* - * __wt_split_multi -- - * Lock, then split. + * __split_multi_lock -- + * Split a page into multiple pages. */ -int -__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) +static int +__split_multi_lock(WT_SESSION_IMPL *session, WT_REF *ref, int closing) { WT_DECL_RET; WT_PAGE *parent; - bool hazard; - __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); - - WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); + /* Lock the parent page, then proceed with the split. */ + WT_RET(__split_internal_lock(session, ref, false, &parent)); if ((ret = __split_multi(session, ref, closing)) != 0 || closing) { - WT_TRET(__split_internal_unlock(session, parent, hazard)); + __split_internal_unlock(session, parent); return (ret); } @@ -2100,26 +2106,63 @@ __wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) * parent page locked, note the functions we call are responsible for * releasing that lock. */ - return (__split_parent_climb(session, parent, hazard)); + return (__split_parent_climb(session, parent)); +} + +/* + * __wt_split_multi -- + * Split a page into multiple pages. + */ +int +__wt_split_multi(WT_SESSION_IMPL *session, WT_REF *ref, int closing) +{ + WT_DECL_RET; + + __wt_verbose(session, WT_VERB_SPLIT, "%p: split-multi", (void *)ref); + + /* + * Set the session split generation to ensure underlying code isn't + * surprised by internal page eviction, then proceed with the split. + */ + WT_WITH_PAGE_INDEX(session, + ret = __split_multi_lock(session, ref, closing)); + return (ret); +} + +/* + * __split_reverse -- + * Reverse split (rewrite a parent page's index to reflect an empty page). + */ +static int +__split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_DECL_RET; + WT_PAGE *parent; + + /* Lock the parent page, then proceed with the reverse split. */ + WT_RET(__split_internal_lock(session, ref, false, &parent)); + ret = __split_parent(session, ref, NULL, 0, 0, false, true); + __split_internal_unlock(session, parent); + return (ret); } /* * __wt_split_reverse -- - * We have a locked ref that is empty and we want to rewrite the index in - * its parent. + * Reverse split (rewrite a parent page's index to reflect an empty page). */ int __wt_split_reverse(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_RET; - WT_PAGE *parent; - bool hazard; __wt_verbose(session, WT_VERB_SPLIT, "%p: reverse-split", (void *)ref); - WT_RET(__split_internal_lock(session, ref, false, &parent, &hazard)); - ret = __split_parent(session, ref, NULL, 0, 0, false, true); - WT_TRET(__split_internal_unlock(session, parent, hazard)); + /* + * Set the session split generation to ensure underlying code isn't + * surprised by internal page eviction, then proceed with the reverse + * split. + */ + WT_WITH_PAGE_INDEX(session, ret = __split_reverse(session, ref)); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_stat.c b/src/third_party/wiredtiger/src/btree/bt_stat.c index e3b9bbced48..d7150859e8f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_stat.c +++ b/src/third_party/wiredtiger/src/btree/bt_stat.c @@ -137,7 +137,6 @@ __stat_page_col_var( WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT *ins; - WT_UPDATE *upd; uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt; uint32_t i; bool orig_deleted; @@ -177,31 +176,39 @@ __stat_page_col_var( * we find, correct the original count based on its state. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) { - upd = ins->upd; - if (upd->type == WT_UPDATE_RESERVED) - continue; - if (upd->type == WT_UPDATE_DELETED) { + switch (ins->upd->type) { + case WT_UPDATE_DELETED: if (!orig_deleted) { ++deleted_cnt; --entry_cnt; } - } else + break; + case WT_UPDATE_MODIFIED: + case WT_UPDATE_STANDARD: if (orig_deleted) { --deleted_cnt; ++entry_cnt; } + break; + case WT_UPDATE_RESERVED: + break; + } } } /* Walk any append list. */ - WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) { - if (ins->upd->type == WT_UPDATE_RESERVED) - continue; - if (ins->upd->type == WT_UPDATE_DELETED) + WT_SKIP_FOREACH(ins, WT_COL_APPEND(page)) + switch (ins->upd->type) { + case WT_UPDATE_DELETED: ++deleted_cnt; - else + break; + case WT_UPDATE_MODIFIED: + case WT_UPDATE_STANDARD: ++entry_cnt; - } + break; + case WT_UPDATE_RESERVED: + break; + } WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt); WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 225e6812aa1..d783f8f6e71 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -18,9 +18,16 @@ __ref_index_slot(WT_SESSION_IMPL *session, { WT_PAGE_INDEX *pindex; WT_REF **start, **stop, **p, **t; + uint64_t sleep_count, yield_count; uint32_t entries, slot; - for (;;) { + /* + * If we don't find our reference, the page split and our home + * pointer references the wrong page. When internal pages + * split, their WT_REF structure home values are updated; yield + * and wait for that to happen. + */ + for (sleep_count = yield_count = 0;;) { /* * Copy the parent page's index value: the page can split at * any time, but the index's value is always valid, even if @@ -58,14 +65,14 @@ __ref_index_slot(WT_SESSION_IMPL *session, goto found; } } - /* - * If we don't find our reference, the page split and our home - * pointer references the wrong page. When internal pages - * split, their WT_REF structure home values are updated; yield - * and wait for that to happen. + * We failed to get the page index and slot reference, yield + * before retrying, and if we've yielded enough times, start + * sleeping so we don't burn CPU to no purpose. */ - __wt_yield(); + __wt_ref_state_yield_sleep(&yield_count, &sleep_count); + WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked, + sleep_count); } found: WT_ASSERT(session, pindex->index[slot] == ref); @@ -177,12 +184,13 @@ __ref_descend_prev( WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; + uint64_t yield_count; /* * We're passed a child page into which we're descending, and on which * we have a hazard pointer. */ - for (;; __wt_yield()) { + for (yield_count = 0;; yield_count++, __wt_yield()) { /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal @@ -242,6 +250,7 @@ __ref_descend_prev( break; } *pindexp = pindex; + WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count); } /* diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index e2d19bf705b..a57a9c17edb 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -268,13 +268,13 @@ __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, */ if (modify_type == WT_UPDATE_DELETED || modify_type == WT_UPDATE_RESERVED) - WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE), &upd)); + WT_RET(__wt_calloc(session, 1, WT_UPDATE_SIZE, &upd)); else { WT_RET(__wt_calloc( - session, 1, sizeof(WT_UPDATE) + value->size, &upd)); + session, 1, WT_UPDATE_SIZE + value->size, &upd)); if (value->size != 0) { upd->size = WT_STORE_SIZE(value->size); - memcpy(WT_UPDATE_DATA(upd), value->data, value->size); + memcpy(upd->data, value->data, value->size); } } upd->type = (uint8_t)modify_type; @@ -302,9 +302,16 @@ __wt_update_obsolete_check( * freeing the memory. * * Walk the list of updates, looking for obsolete updates at the end. + * + * Only updates with globally visible, self-contained data can terminate + * update chains, ignore modified and reserved updates. Special case the + * first transaction ID, it flags column-store overflow values which can + * never be discarded. */ for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) - if (__wt_txn_upd_visible_all(session, upd)) { + if (WT_UPDATE_DATA_VALUE(upd) && + __wt_txn_upd_visible_all(session, upd) && + upd->txnid != WT_TXN_FIRST) { if (first == NULL) first = upd; } else if (upd->txnid != WT_TXN_ABORTED) |