diff options
author | Luke Chen <luke.chen@mongodb.com> | 2018-03-29 09:36:51 +1100 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2018-03-29 09:36:51 +1100 |
commit | 55b2be7e2d7c5262670b4375e34dd49b95dc63ef (patch) | |
tree | 0d085a9e469df598faefef57924205d7cff9aa51 /src/third_party | |
parent | f511a790ffe197de7cedc9c6c7c16aca8054b11b (diff) | |
download | mongo-55b2be7e2d7c5262670b4375e34dd49b95dc63ef.tar.gz |
Import wiredtiger: dc58dd84dba0bdd358f8cc09b17f90c7659ac429 from branch mongodb-3.6
ref: 4d5794b937..dc58dd84db
for: 3.6.4
WT-3869 Bi-weekly WT codebase lint
WT-3913 Enhance cursor operations to account for prepare state
WT-3950 Add some rollback_to_stable statistics
WT-3958 Add query API to get most recent checkpoint's stable timestamp
WT-3969 Enhance format tester to account for prepare state
WT-3972 Allow more than 64K cursors to be open on a data source simultaneously
WT-3975 Arg format mismatch after rwlock changes
WT-3977 Print out actual checkpoint stable timestamp in timestamp_abort
WT-3979 Fix warnings generated with newer Doxygen releases
WT-3980 Failure returning a modified update without a backing "real" update
WT-3982 Fix transaction visibility bugs related to lookaside usage.
WT-3985 Pre-allocated log files accumulate on Windows
WT-3987 Avoid reading lookaside pages in truncate fast path
WT-3990 Fix Coverity warnings mostly in test programs
Diffstat (limited to 'src/third_party')
62 files changed, 1263 insertions, 750 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 17fe0d97735..f5e0b4a67a3 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -1371,7 +1371,8 @@ methods = { \c oldest_timestamp and the read timestamps of all active readers, and \c stable returns the most recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. See @ref transaction_timestamps''', - choices=['all_committed','oldest','pinned','recovery','stable']), + choices=['all_committed','last_checkpoint', + 'oldest','pinned','recovery','stable']), ]), 'WT_CONNECTION.set_timestamp' : Method([ diff --git a/src/third_party/wiredtiger/dist/s_prototypes b/src/third_party/wiredtiger/dist/s_prototypes index 20e08eb4c54..75863cf8f87 100755 --- a/src/third_party/wiredtiger/dist/s_prototypes +++ b/src/third_party/wiredtiger/dist/s_prototypes @@ -42,7 +42,8 @@ proto() -e x \ -e '}' \ -e '# Add the warn_unused_result attribute to any external' \ - -e '# functions that return an int.' \ + -e '# functions that return a boolean or an int.' 
\ + -e '/^extern bool /s/$/ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result))/' \ -e '/^extern int /s/$/ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result))/' \ -e 's/$/;/' \ -e p < $1 diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index aa68e584376..7330f560eb6 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -138,6 +138,7 @@ FULLFSYNC Facebook FindClose FindFirstFile +FindNextFileW Fixup Fk FlushFileBuffers @@ -523,7 +524,7 @@ ccr cd centric cfg -cfkos +cfko change's changelog chdir diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index a630ebe3fa9..1441187812e 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -518,6 +518,9 @@ connection_stats = [ TxnStat('txn_read_queue_inserts', 'read timestamp queue inserts total'), TxnStat('txn_read_queue_len', 'read timestamp queue length'), TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_rollback_las_removed', 'rollback to stable updates removed from lookaside'), + TxnStat('txn_rollback_to_stable', 'rollback to stable calls'), + TxnStat('txn_rollback_upd_aborted', 'rollback to stable updates aborted'), TxnStat('txn_set_ts', 'set timestamp calls'), TxnStat('txn_set_ts_commit', 'set timestamp commit calls'), TxnStat('txn_set_ts_commit_upd', 'set timestamp commit updates'), diff --git a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c index bde1bfc48bf..bdb4669a637 100644 --- a/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c +++ b/src/third_party/wiredtiger/ext/test/fail_fs/fail_fs.c @@ -452,16 +452,14 @@ fail_fs_directory_list(WT_FILE_SYSTEM *file_system, * matter if the list is a bit longer than necessary. 
*/ if (count >= allocated) { - p = realloc( - entries, (allocated + 10) * sizeof(*entries)); - if (p == NULL) { + allocated += 10; + if ((p = realloc( + entries, allocated * sizeof(*entries))) == NULL) { ret = ENOMEM; goto err; } entries = p; - memset(entries + allocated * sizeof(*entries), - 0, 10 * sizeof(*entries)); - allocated += 10; + memset(entries + count, 0, 10 * sizeof(*entries)); } entries[count++] = strdup(name); } diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 17424bdbfda..5b74711461d 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "4d5794b93795d52af97dc150d81b637442b89c5d", + "commit": "dc58dd84dba0bdd358f8cc09b17f90c7659ac429", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/block/block_open.c b/src/third_party/wiredtiger/src/block/block_open.c index 7c7e0e5c525..32e13acfa83 100644 --- a/src/third_party/wiredtiger/src/block/block_open.c +++ b/src/third_party/wiredtiger/src/block/block_open.c @@ -256,7 +256,7 @@ __wt_block_close(WT_SESSION_IMPL *session, WT_BLOCK *block) conn = S2C(session); __wt_verbose(session, WT_VERB_BLOCK, - "close: %s", block->name == NULL ? "" : block->name ); + "close: %s", block->name == NULL ? "" : block->name); __wt_spin_lock(session, &conn->block_lock); diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 6737af9996b..63d2cda4714 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -54,12 +54,17 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage) * insert is aborted, we simply return zero (empty), regardless of * whether we are at the end of the data. 
*/ - if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || - (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { + if (cbt->recno < WT_INSERT_RECNO(cbt->ins)) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else - cbt->iface.value.data = upd->data; + } else { + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) { + cbt->v = 0; + cbt->iface.value.data = &cbt->v; + } else + cbt->iface.value.data = upd->data; + } cbt->iface.value.size = 1; return (0); } @@ -79,6 +84,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); page = cbt->ref->page; + upd = NULL; /* Initialize for each new page. */ if (newpage) { @@ -101,7 +107,8 @@ new_page: cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; @@ -134,7 +141,8 @@ new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); - if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -193,8 +201,9 @@ new_page: /* Find the matching WT_COL slot. */ /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = cbt->ins == NULL ? 
- NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd != NULL) { if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -311,7 +320,8 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) cbt->ins = WT_SKIP_NEXT(cbt->ins); new_insert: if ((ins = cbt->ins) != NULL) { - if ((upd = __wt_txn_read(session, ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -344,7 +354,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; - upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd)); if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) @@ -571,8 +581,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; + WT_UPDATE *upd; uint32_t flags; - bool newpage; + bool newpage, valid; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -582,6 +593,26 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* + * In case of retrying a next operation due to a prepare conflict, + * cursor would have been already positioned at an update structure + * which resulted in conflict. So, now when retrying we should examine + * the same update again instead of starting from the next one in the + * update chain. + */ + F_CLR(cbt, WT_CBT_RETRY_PREV); + if (F_ISSET(cbt, WT_CBT_RETRY_NEXT)) { + WT_RET(__wt_cursor_valid(cbt, &upd, &valid)); + F_CLR(cbt, WT_CBT_RETRY_NEXT); + if (valid) { + /* + * If the update, which returned prepared conflict is + * visible, return the value. 
+ */ + return (__cursor_kv_return(session, cbt, upd)); + } + } + WT_RET(__cursor_func_init(cbt, false)); /* @@ -663,15 +694,24 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } - #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif - if (ret == 0) +err: switch (ret) { + case 0: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - -err: if (ret != 0) + break; + case WT_PREPARE_CONFLICT: + /* + * If prepare conflict occurs, cursor should not be reset, + * as current cursor position will be reused in case of a + * retry from user. + */ + F_SET(cbt, WT_CBT_RETRY_NEXT); + break; + default: WT_TRET(__cursor_reset(cbt)); + } return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 068a9915ab9..3356baeb24a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -199,13 +199,18 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) * created records written by reconciliation are deleted and so can be * never seen by a read. */ - if (cbt->ins == NULL || - cbt->recno > WT_INSERT_RECNO(cbt->ins) || - (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { + if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins)) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else - cbt->iface.value.data = upd->data; + } else { + upd = NULL; + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) { + cbt->v = 0; + cbt->iface.value.data = &cbt->v; + } else + cbt->iface.value.data = upd->data; + } cbt->iface.value.size = 1; return (0); } @@ -247,7 +252,9 @@ new_page: cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = cbt->ins == NULL ? 
NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; @@ -280,7 +287,8 @@ new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); - if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -340,8 +348,9 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = cbt->ins == NULL ? - NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd != NULL) { if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -468,7 +477,8 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) WT_RET(__cursor_skip_prev(cbt)); new_insert: if ((ins = cbt->ins) != NULL) { - if ((upd = __wt_txn_read(session, ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -503,7 +513,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; - upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd)); if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) @@ -526,8 +536,9 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; + WT_UPDATE *upd; 
uint32_t flags; - bool newpage; + bool newpage, valid; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -537,6 +548,26 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* + * In case of retrying a prev operation due to a prepare conflict, + * cursor would have been already positioned at an update structure + * which resulted in conflict. So, now when retrying we should examine + * the same update again instead of starting from the next one in the + * update chain. + */ + F_CLR(cbt, WT_CBT_RETRY_NEXT); + if (F_ISSET(cbt, WT_CBT_RETRY_PREV)) { + WT_RET(__wt_cursor_valid(cbt, &upd, &valid)); + F_CLR(cbt, WT_CBT_RETRY_PREV); + if (valid) { + /* + * If the update, which returned prepared conflict is + * visible, return the value. + */ + return (__cursor_kv_return(session, cbt, upd)); + } + } + WT_RET(__cursor_func_init(cbt, false)); /* @@ -622,10 +653,20 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif - if (ret == 0) +err: switch (ret) { + case 0: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - -err: if (ret != 0) + break; + case WT_PREPARE_CONFLICT: + /* + * If prepare conflict occurs, cursor should not be reset, + * as current cursor position will be reused in case of a + * retry from user. + */ + F_SET(cbt, WT_CBT_RETRY_PREV); + break; + default: WT_TRET(__cursor_reset(cbt)); + } return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 29725e22b2c..9a30ee2c1a4 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -205,8 +205,8 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. 
*/ -bool -__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +int +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid) { WT_BTREE *btree; WT_CELL *cell; @@ -215,11 +215,12 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_SESSION_IMPL *session; WT_UPDATE *upd; + if (updp != NULL) + *updp = NULL; + *valid = false; btree = cbt->btree; page = cbt->ref->page; session = (WT_SESSION_IMPL *)cbt->iface.session; - if (updp != NULL) - *updp = NULL; /* * We may be pointing to an insert object, and we may have a page with @@ -265,13 +266,16 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * First, check for an insert object with a visible update (a visible * update that's been deleted is not a valid key/value pair). */ - if (cbt->ins != NULL && - (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) - return (false); - if (updp != NULL) - *updp = upd; - return (true); + if (cbt->ins != NULL) { + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd != NULL) { + if (upd->type == WT_UPDATE_TOMBSTONE) + return (0); + if (updp != NULL) + *updp = upd; + *valid = true; + return (0); + } } /* @@ -290,7 +294,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * keys, check for retrieval past the end of the page. */ if (cbt->recno >= cbt->ref->ref_recno + page->entries) - return (false); + return (0); /* * An update would have appeared as an "insert" object; no @@ -300,7 +304,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ if (page->entries == 0) - return (false); + return (0); WT_ASSERT(session, cbt->slot < page->entries); /* @@ -309,7 +313,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * returned on-page object must be checked for a match. 
*/ if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) - return (false); + return (0); /* * Although updates would have appeared as an "insert" objects, @@ -320,12 +324,12 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) cip = &page->pg_var[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) - return (false); + return (0); break; case BTREE_ROW: /* The search function doesn't check for empty pages. */ if (page->entries == 0) - return (false); + return (0); WT_ASSERT(session, cbt->slot < page->entries); /* @@ -333,34 +337,23 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * key as an on-page object, we're done. */ if (cbt->ins != NULL) - return (false); + return (0); /* Check for an update. */ if (page->modify != NULL && - page->modify->mod_row_update != NULL && - (upd = __wt_txn_read(session, - page->modify->mod_row_update[cbt->slot])) != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) - return (false); - if (updp != NULL) - *updp = upd; + page->modify->mod_row_update != NULL) { + WT_RET(__wt_txn_read(session, + page->modify->mod_row_update[cbt->slot], &upd)); + if (upd != NULL) { + if (upd->type == WT_UPDATE_TOMBSTONE) + return (0); + if (updp != NULL) + *updp = upd; + } } break; } - return (true); -} - -/* - * __cursor_kv_return -- - * Return a page referenced key/value pair to the application. - */ -static inline int -__cursor_kv_return( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) -{ - WT_RET(__wt_key_return(session, cbt)); - WT_RET(__wt_value_return(session, cbt, upd)); - + *valid = true; return (0); } @@ -512,7 +505,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); + + /* Return, if prepare conflict encountered. 
*/ + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -520,7 +516,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); + + /* Return, if prepare conflict encountered. */ + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (valid) @@ -618,14 +617,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) - valid = __wt_cursor_valid(cbt, &upd); + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __wt_cursor_valid(cbt, &upd); + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } /* @@ -656,26 +655,43 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) exact = 0; F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) - exact = 1; - else { + } else { /* - * The cursor next call may have overwritten our caller's key, - * restore it to its original value. + * We didn't find an exact match: try after the search key, + * then before. We have to loop here because at low isolation + * levels, new records could appear as we are stepping through + * the tree. */ - __cursor_state_restore(cursor, &state); + while ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) { + WT_ERR(ret); + if (btree->type == BTREE_ROW) + WT_ERR(__wt_compare(session, btree->collator, + &cursor->key, &state.key, &exact)); + else + exact = cbt->recno < state.recno ? 
-1 : + cbt->recno == state.recno ? 0 : 1; + if (exact >= 0) + goto done; + } - WT_ERR(__cursor_func_init(cbt, true)); - WT_ERR(btree->type == BTREE_ROW ? - __cursor_row_search(session, cbt, NULL, true) : - __cursor_col_search(session, cbt, NULL)); - if (__wt_cursor_valid(cbt, &upd)) { - exact = cbt->compare; - ret = __cursor_kv_return(session, cbt, upd); - } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) - exact = -1; + /* + * We walked to the end of the tree without finding a match. + * Walk backwards instead. + */ + while ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) { + WT_ERR(ret); + if (btree->type == BTREE_ROW) + WT_ERR(__wt_compare(session, btree->collator, + &cursor->key, &state.key, &exact)); + else + exact = cbt->recno < state.recno ? -1 : + cbt->recno == state.recno ? 0 : 1; + if (exact <= 0) + goto done; + } } +done: err: if (ret == 0 && exactp != NULL) *exactp = exact; @@ -703,7 +719,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - bool append_key; + bool append_key, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -784,8 +800,11 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) - WT_ERR(WT_DUPLICATE_KEY); + cbt->compare == 0) { + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (valid) + WT_ERR(WT_DUPLICATE_KEY); + } ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD); } else { @@ -805,10 +824,14 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); * column-store implicitly fills the gap with empty records. * Fail in that case, the record exists. 
*/ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || - (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) - WT_ERR(WT_DUPLICATE_KEY); + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + if (cbt->compare == 0) { + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (valid) + WT_ERR(WT_DUPLICATE_KEY); + } else if (__cursor_fix_implicit(btree, cbt)) + WT_ERR(WT_DUPLICATE_KEY); + } WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD)); @@ -932,7 +955,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - bool iterating; + bool iterating, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -1028,7 +1051,10 @@ retry: if (positioned == POSITIONED) /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + if (cbt->compare != 0) + WT_ERR(WT_NOTFOUND); + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (!valid) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, WT_UPDATE_TOMBSTONE); @@ -1043,7 +1069,10 @@ retry: if (positioned == POSITIONED) WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. 
*/ - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { + valid = false; + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (cbt->compare != 0 || !valid) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -1143,6 +1172,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool valid; btree = cbt->btree; cursor = &cbt->iface; @@ -1207,7 +1237,10 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + if (cbt->compare != 0) + WT_ERR(WT_NOTFOUND); + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (!valid) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify_v(session, cbt, value, modify_type); @@ -1224,8 +1257,10 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || - !__wt_cursor_valid(cbt, NULL)) && + valid = false; + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if ((cbt->compare != 0 || !valid) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index e9ac0bca66a..cb50bfbcf61 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -65,17 +65,18 @@ int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { + WT_ADDR *ref_addr; WT_DECL_RET; - WT_PAGE *parent; uint32_t previous_state; *skipp = false; /* If we have a clean page in memory, attempt to evict it. 
*/ - if (ref->state == WT_REF_MEM && - __wt_atomic_casv32(&ref->state, WT_REF_MEM, WT_REF_LOCKED)) { + previous_state = ref->state; + if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) && + __wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) { if (__wt_page_is_modified(ref->page)) { - ref->state = WT_REF_MEM; + ref->state = previous_state; return (0); } @@ -93,7 +94,6 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) previous_state = ref->state; switch (previous_state) { case WT_REF_DISK: - case WT_REF_LIMBO: case WT_REF_LOOKASIDE: break; default: @@ -101,21 +101,9 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } if (!__wt_atomic_casv32(&ref->state, previous_state, WT_REF_LOCKED)) return (0); - switch (previous_state) { - case WT_REF_DISK: - break; - case WT_REF_LIMBO: - case WT_REF_LOOKASIDE: - if (__wt_las_page_skip_locked(session, ref)) - break; - /* FALLTHROUGH */ - default: - ref->state = previous_state; - return (0); - } /* - * If this WT_REF was previously part of a fast-delete operation, there + * If this WT_REF was previously part of a truncate operation, there * may be existing page-delete information. The structure is only read * while the state is locked, free the previous version. * @@ -129,21 +117,24 @@ __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) } /* - * We cannot fast-delete pages that have overflow key/value items as - * the overflow blocks have to be discarded. The way we figure that - * out is to check the page's cell type, cells for leaf pages without - * overflow items are special. + * We cannot truncate pages that have overflow key/value items as the + * overflow blocks have to be discarded. The way we figure that out is + * to check the page's cell type, cells for leaf pages without overflow + * items are special. 
* * To look at an on-page cell, we need to look at the parent page, and * that's dangerous, our parent page could change without warning if - * the parent page were to split, deepening the tree. It's safe: the - * page's reference will always point to some valid page, and if we find - * any problems we simply fail the fast-delete optimization. + * the parent page were to split, deepening the tree. We can look at + * the parent page itself because the page can't change underneath us. + * However, if the parent page splits, our reference address can change; + * we don't care what version of it we read, as long as we don't read + * it twice. */ - parent = ref->home; - if (__wt_off_page(parent, ref->addr) ? - ((WT_ADDR *)ref->addr)->type != WT_ADDR_LEAF_NO : - __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO) + WT_ORDERED_READ(ref_addr, ref->addr); + if (ref_addr != NULL && + (__wt_off_page(ref->home, ref_addr) ? + ref_addr->type != WT_ADDR_LEAF_NO : + __wt_cell_type_raw((WT_CELL *)ref_addr) != WT_CELL_ADDR_LEAF_NO)) goto err; /* @@ -181,8 +172,10 @@ err: __wt_free(session, ref->page_del); int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) { - WT_UPDATE **upd; + WT_UPDATE **updp; uint64_t sleep_count, yield_count; + uint32_t current_state; + bool locked; /* * If the page is still "deleted", it's as we left it, reset the state @@ -190,17 +183,17 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) * instantiated or being instantiated. Loop because it's possible for * the page to return to the deleted state if instantiation fails. */ - for (sleep_count = yield_count = 0;;) { - switch (ref->state) { + for (locked = false, sleep_count = yield_count = 0;;) { + switch (current_state = ref->state) { case WT_REF_DELETED: /* * If the page is still "deleted", it's as we left it, * reset the state. 
*/ - if (!__wt_atomic_casv32(&ref->state, + if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, ref->page_del->previous_state)) - break; - goto done; + goto done; + break; case WT_REF_LOCKED: /* * A possible state, the page is being instantiated. @@ -208,22 +201,10 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) break; case WT_REF_MEM: case WT_REF_SPLIT: - /* - * We can't use the normal read path to get a copy of - * the page because the session may have closed the - * cursor, we no longer have the reference to the tree - * required for a hazard pointer. We're safe because - * with unresolved transactions, the page isn't going - * anywhere. - * - * The page is in an in-memory state, which means it - * was instantiated at some point. Walk the list of - * update structures and abort them. - */ - for (upd = - ref->page_del->update_list; *upd != NULL; ++upd) - (*upd)->txnid = WT_TXN_ABORTED; - goto done; + if (__wt_atomic_casv32( + &ref->state, current_state, WT_REF_LOCKED)) + locked = true; + break; case WT_REF_DISK: case WT_REF_LIMBO: case WT_REF_LOOKASIDE: @@ -232,16 +213,38 @@ __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) return (__wt_illegal_value(session, "illegal WT_REF.state rolling back deleted page")); } + + if (locked) + break; + /* * We wait for the change in page state, yield before retrying, - * and if we've yielded enough times, start sleeping so we don't - * burn CPU to no purpose. + * and if we've yielded enough times, start sleeping so we + * don't burn CPU to no purpose. */ __wt_ref_state_yield_sleep(&yield_count, &sleep_count); - WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, - sleep_count); + WT_STAT_CONN_INCRV(session, + page_del_rollback_blocked, sleep_count); } + /* + * We can't use the normal read path to get a copy of the page + * because the session may have closed the cursor, we no longer + * have the reference to the tree required for a hazard + * pointer. 
We're safe because with unresolved transactions, + * the page isn't going anywhere. + * + * The page is in an in-memory state, which means it + * was instantiated at some point. Walk any list of + * update structures and abort them. + */ + WT_ASSERT(session, locked); + if ((updp = ref->page_del->update_list) != NULL) + for (; *updp != NULL; ++updp) + (*updp)->txnid = WT_TXN_ABORTED; + + ref->state = current_state; + done: /* * Now mark the truncate aborted: this must come last because after * this point there is nothing preventing the page from being evicted. @@ -261,12 +264,12 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) bool skip; /* - * Deleted pages come from two sources: either it's a fast-delete as + * Deleted pages come from two sources: either it's a truncate as * described above, or the page has been emptied by other operations * and eviction deleted it. * * In both cases, the WT_REF state will be WT_REF_DELETED. In the case - * of a fast-delete page, there will be a WT_PAGE_DELETED structure with + * of a truncated page, there will be a WT_PAGE_DELETED structure with * the transaction ID of the transaction that deleted the page, and the * page is visible if that transaction ID is visible. In the case of an * empty page, there will be no WT_PAGE_DELETED structure and the delete @@ -308,6 +311,31 @@ __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) } /* + * __tombstone_update_alloc -- + * Allocate and initialize a page-deleted tombstone update structure. + */ +static int +__tombstone_update_alloc(WT_SESSION_IMPL *session, + WT_PAGE_DELETED *page_del, WT_UPDATE **updp, size_t *sizep) +{ + WT_UPDATE *upd; + + WT_RET( + __wt_update_alloc(session, NULL, &upd, sizep, WT_UPDATE_TOMBSTONE)); + + /* + * Cleared memory matches the lowest possible transaction ID and + * timestamp, do nothing. 
+ */ + if (page_del != NULL) { + upd->txnid = page_del->txnid; + __wt_timestamp_set(&upd->timestamp, &page_del->timestamp); + } + *updp = upd; + return (0); +} + +/* * __wt_delete_page_instantiate -- * Instantiate an entirely deleted row-store leaf page. */ @@ -316,11 +344,14 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_DECL_RET; + WT_INSERT *ins; + WT_INSERT_HEAD *insert; WT_PAGE *page; WT_PAGE_DELETED *page_del; + WT_ROW *rip; WT_UPDATE **upd_array, *upd; size_t size; - uint32_t i; + uint32_t count, i; btree = S2BT(session); page = ref->page; @@ -355,52 +386,75 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * running inside a checkpoint, and now we're being forced to read that * page. * - * In the first case, we have a page reference structure, in the second, - * we don't. - * - * Allocate the per-reference update array; in the case of instantiating - * a page, deleted by a running transaction that might eventually abort, - * we need a list of the update structures so we can do that abort. The - * hard case is if a page splits: the update structures might be moved - * to different pages, and we still have to find them all for an abort. + * Expect a page-deleted structure if there's a running transaction that + * needs to be resolved, otherwise, there may not be one (and, if the + * transaction has resolved, we can ignore the page-deleted structure). */ - page_del = ref->page_del; - if (page_del != NULL) - WT_RET(__wt_calloc_def( - session, page->entries + 1, &page_del->update_list)); + page_del = + __wt_btree_truncate_active(session, ref) ? ref->page_del : NULL; /* - * Allocate the per-page update array if one doesn't already exist. - * Because deletes may be instantiated after lookaside table updates, - * the update array may already exist. + * Allocate the per-page update array if one doesn't already exist. 
(It + * might already exist because deletes are instantiated after lookaside + * table updates.) */ - if (page->modify->mod_row_update == NULL) - WT_ERR(__wt_calloc_def( + if (page->entries != 0 && page->modify->mod_row_update == NULL) + WT_RET(__wt_calloc_def( session, page->entries, &page->modify->mod_row_update)); /* - * Fill in the per-reference update array with references to update - * structures, fill in the per-page update array with references to - * deleted items. + * Allocate the per-reference update array; in the case of instantiating + * a page deleted in a running transaction, we need a list of the update + * structures for the eventual commit or abort. */ - upd_array = page->modify->mod_row_update; - for (i = 0, size = 0; i < page->entries; ++i) { - WT_ERR(__wt_calloc_one(session, &upd)); - upd->type = WT_UPDATE_TOMBSTONE; - - if (page_del == NULL) - upd->txnid = WT_TXN_NONE; /* Globally visible */ - else { - upd->txnid = page_del->txnid; - __wt_timestamp_set( - &upd->timestamp, &page_del->timestamp); - page_del->update_list[i] = upd; + if (page_del != NULL) { + count = 0; + if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) + WT_SKIP_FOREACH(ins, insert) + ++count; + WT_ROW_FOREACH(page, rip, i) { + ++count; + if ((insert = WT_ROW_INSERT(page, rip)) != NULL) + WT_SKIP_FOREACH(ins, insert) + ++count; } + WT_RET(__wt_calloc_def( + session, count + 1, &page_del->update_list)); + } - upd->next = upd_array[i]; - upd_array[i] = upd; - - size += sizeof(WT_UPDATE *) + WT_UPDATE_MEMSIZE(upd); + /* Walk the page entries, giving each one a tombstone. 
*/ + size = 0; + count = 0; + upd_array = page->modify->mod_row_update; + if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL) + WT_SKIP_FOREACH(ins, insert) { + WT_ERR(__tombstone_update_alloc( + session, page_del, &upd, &size)); + upd->next = ins->upd; + ins->upd = upd; + + if (page_del != NULL) + page_del->update_list[count++] = upd; + } + WT_ROW_FOREACH(page, rip, i) { + WT_ERR(__tombstone_update_alloc( + session, page_del, &upd, &size)); + upd->next = upd_array[WT_ROW_SLOT(page, rip)]; + upd_array[WT_ROW_SLOT(page, rip)] = upd; + + if (page_del != NULL) + page_del->update_list[count++] = upd; + + if ((insert = WT_ROW_INSERT(page, rip)) != NULL) + WT_SKIP_FOREACH(ins, insert) { + WT_ERR(__tombstone_update_alloc( + session, page_del, &upd, &size)); + upd->next = ins->upd; + ins->upd = upd; + + if (page_del != NULL) + page_del->update_list[count++] = upd; + } } __wt_cache_page_inmem_incr(session, page, size); diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c index 4c108114438..1379553c211 100644 --- a/src/third_party/wiredtiger/src/btree/bt_io.c +++ b/src/third_party/wiredtiger/src/btree/bt_io.c @@ -355,8 +355,7 @@ __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, * Checksum the data if the buffer isn't compressed or checksums are * configured. 
*/ - data_checksum = true; /* -Werror=maybe-uninitialized */ - WT_NOT_READ(data_checksum); + WT_NOT_READ(data_checksum, true); switch (btree->checksum) { case CKSUM_ON: data_checksum = true; diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index d191fec8502..612540956b7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -71,6 +71,7 @@ __wt_page_alloc(WT_SESSION_IMPL *session, break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: + WT_ASSERT(session, alloc_entries != 0); /* * Internal pages have an array of references to objects so they * can split. Allocate the array of references and optionally, @@ -102,11 +103,13 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { } break; case WT_PAGE_COL_VAR: - page->pg_var = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); + page->pg_var = alloc_entries == 0 ? + NULL : (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); page->entries = alloc_entries; break; case WT_PAGE_ROW_LEAF: - page->pg_row = (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); + page->pg_row = alloc_entries == 0 ? + NULL : (WT_ROW *)((uint8_t *)page + sizeof(WT_PAGE)); page->entries = alloc_entries; break; WT_ILLEGAL_VALUE(session); diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 03b5039b00b..8eb120f06ec 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -302,6 +302,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_UPDATE *upd; wt_off_t size; uint64_t n, skip; + bool valid; btree = cbt->btree; cursor = &cbt->iface; @@ -421,7 +422,8 @@ random_page_entry: * the next entry, if that doesn't work, move to the previous entry. 
*/ WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__wt_cursor_valid(cbt, &upd)) { + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); + if (valid) { WT_ERR(__wt_key_return(session, cbt)); WT_ERR(__wt_value_return(session, cbt, upd)); } else { diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 4ac0cb2da9b..450fd6cf563 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -370,7 +370,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_BTREE *btree; WT_DECL_RET; WT_ITEM tmp; - WT_PAGE *page; + WT_PAGE *notused; size_t addr_size; uint64_t time_start, time_stop; uint32_t page_flags, final_state, new_state, previous_state; @@ -378,7 +378,6 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) bool timer; btree = S2BT(session); - page = NULL; time_start = time_stop = 0; /* @@ -427,11 +426,8 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) if (addr == NULL) { WT_ASSERT(session, previous_state != WT_REF_DISK); - WT_ERR(__wt_btree_new_leaf_page(session, &page)); - ref->page = page; - if (previous_state == WT_REF_LOOKASIDE) - goto skip_read; - goto done; + WT_ERR(__wt_btree_new_leaf_page(session, &ref->page)); + goto skip_read; } /* @@ -464,7 +460,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED; if (LF_ISSET(WT_READ_IGNORE_CACHE_SIZE)) FLD_SET(page_flags, WT_PAGE_EVICT_NO_PROGRESS); - WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &page)); + WT_ERR(__wt_page_inmem(session, ref, tmp.data, page_flags, &notused)); tmp.mem = NULL; /* @@ -481,7 +477,7 @@ skip_read: switch (previous_state) { case WT_REF_DELETED: /* - * A fast-deleted page may also have lookaside information. The + * A truncated page may also have lookaside information.
The * delete happened after page eviction (writing the lookaside * information), first update based on the lookaside table and * then apply the delete. @@ -491,6 +487,7 @@ skip_read: ref->page_las->eviction_to_lookaside = false; } + /* Move all records to a deleted state. */ WT_ERR(__wt_delete_page_instantiate(session, ref)); break; case WT_REF_LOOKASIDE: @@ -523,7 +520,7 @@ skip_read: WT_IGNORE_RET(__wt_las_remove_block( session, btree->id, ref->page_las->las_pageid)); -done: WT_PUBLISH(ref->state, final_state); + WT_PUBLISH(ref->state, final_state); return (ret); err: /* @@ -719,8 +716,7 @@ read: /* ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { - ret = 0; - WT_NOT_READ(ret); + WT_NOT_READ(ret, 0); WT_STAT_CONN_INCR(session, page_forcible_evict_blocked); stalled = true; diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 36bbe48b407..3596f5a72b7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -719,8 +719,7 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, alloc_refp - alloc_index->index == (ptrdiff_t)result_entries); /* Start making real changes to the tree, errors are fatal. */ - complete = WT_ERR_PANIC; - WT_NOT_READ(complete); + WT_NOT_READ(complete, WT_ERR_PANIC); /* Encourage a race */ __page_split_timing_stress(session, diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index 22921d7d378..535e804d6a8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -517,8 +517,7 @@ restart: /* * in-cache pages, or if we see a deleted page. 
*/ if (ret == WT_NOTFOUND) { - ret = 0; - WT_NOT_READ(ret); + WT_NOT_READ(ret, 0); break; } diff --git a/src/third_party/wiredtiger/src/cache/cache_las.c b/src/third_party/wiredtiger/src/cache/cache_las.c index 569a0247e7b..7ccc325523e 100644 --- a/src/third_party/wiredtiger/src/cache/cache_las.c +++ b/src/third_party/wiredtiger/src/cache/cache_las.c @@ -993,7 +993,8 @@ __wt_las_sweep(WT_SESSION_IMPL *session) #else wt_timestamp_t *val_ts; #endif - uint64_t cnt, decrement_cnt, las_counter, las_pageid, txnid; + uint64_t cnt, decrement_cnt, las_counter, las_pageid, saved_pageid; + uint64_t las_txnid; uint32_t las_id, session_flags; uint8_t upd_type; int notused; @@ -1007,6 +1008,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session) local_txn = locked = false; WT_RET(__wt_scr_alloc(session, 0, &saved_key)); + saved_pageid = 0; /* * Allocate a cursor and wrap all the updates in a transaction. @@ -1059,6 +1061,20 @@ __wt_las_sweep(WT_SESSION_IMPL *session) /* Walk the file. */ while ((ret = cursor->next(cursor)) == 0) { + WT_ERR(cursor->get_key(cursor, + &las_pageid, &las_id, &las_counter, &las_key)); + + /* + * If we have switched to a different page, clear the saved key. + * Otherwise, sweep could incorrectly remove records after + * seeing a birthmark for a key in one block if the same key is + * at the beginning of the next block. See WT-3982 for details. + */ + if (las_pageid != saved_pageid) { + saved_key->size = 0; + saved_pageid = las_pageid; + } + /* * Stop if the cache is stuck: we are ignoring the cache size * while scanning the lookaside table, so we're making things @@ -1076,9 +1092,6 @@ __wt_las_sweep(WT_SESSION_IMPL *session) else if (saved_key->size == 0) break; - WT_ERR(cursor->get_key(cursor, - &las_pageid, &las_id, &las_counter, &las_key)); - /* * If the entry belongs to a dropped tree, discard it. * @@ -1102,7 +1115,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session) * now no longer needed. 
*/ WT_ERR(cursor->get_value(cursor, - &txnid, &las_timestamp, &upd_type, &las_value)); + &las_txnid, &las_timestamp, &upd_type, &las_value)); #ifdef HAVE_TIMESTAMPS WT_ASSERT(session, las_timestamp.size == WT_TIMESTAMP_SIZE); memcpy(&timestamp, las_timestamp.data, las_timestamp.size); @@ -1116,7 +1129,7 @@ __wt_las_sweep(WT_SESSION_IMPL *session) * If it is visible then perform additional checks to see * whether it has aged out of a live file. */ - if (!__wt_txn_visible_all(session, txnid, val_ts)) { + if (!__wt_txn_visible_all(session, las_txnid, val_ts)) { saved_key->size = 0; continue; } diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index f473cfe3e8d..ffcb2139330 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -46,8 +46,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = { static const WT_CONFIG_CHECK confchk_WT_CONNECTION_query_timestamp[] = { { "get", "string", - NULL, "choices=[\"all_committed\",\"oldest\",\"pinned\"," - "\"recovery\",\"stable\"]", + NULL, "choices=[\"all_committed\",\"last_checkpoint\",\"oldest\"" + ",\"pinned\",\"recovery\",\"stable\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; diff --git a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c index 720df3c465d..f1043ee7546 100644 --- a/src/third_party/wiredtiger/src/conn/conn_cache_pool.c +++ b/src/third_party/wiredtiger/src/conn/conn_cache_pool.c @@ -303,8 +303,8 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) conn = S2C(session); cache = conn->cache; - cp_locked = found = false; - WT_NOT_READ(cp_locked); + WT_NOT_READ(cp_locked, false); + found = false; cp = __wt_process.cache_pool; if (!F_ISSET(conn, WT_CONN_CACHE_POOL)) @@ -338,8 +338,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) * operation.
*/ __wt_spin_unlock(session, &cp->cache_pool_lock); - cp_locked = false; - WT_NOT_READ(cp_locked); + WT_NOT_READ(cp_locked, false); FLD_CLR(cache->pool_flags, WT_CACHE_POOL_RUN); __wt_cond_signal(session, cp->cache_pool_cond); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 6e27d0f98d6..fed45dbf4c4 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -598,8 +598,7 @@ __log_file_server(void *arg) continue; WT_ERR(__wt_fsync(session, log->log_fh, true)); __wt_spin_lock(session, &log->log_sync_lock); - locked = true; - WT_NOT_READ(locked); + WT_NOT_READ(locked, true); /* * The sync LSN could have advanced while we * were writing to disk. @@ -950,7 +949,7 @@ __log_server(void *arg) if (ret == EACCES && retry < WT_RETRY_MAX) { retry++; - ret = 0; + WT_NOT_READ(ret, 0); } else { /* * Return the error if there is diff --git a/src/third_party/wiredtiger/src/docs/Doxyfile b/src/third_party/wiredtiger/src/docs/Doxyfile index 178655bf6ed..60b6c4690b0 100644 --- a/src/third_party/wiredtiger/src/docs/Doxyfile +++ b/src/third_party/wiredtiger/src/docs/Doxyfile @@ -206,39 +206,39 @@ TAB_SIZE = 8 # You can put \n's in the value part of an alias to insert newlines. ALIASES = \ - "config{3}= @row{<tt>\1</tt>,\2,\3}" \ - "configempty{2}=@param config\n Configuration string, see @ref config_strings. No values currently permitted." \ - "configend= </table>" \ - "configstart{2}=@param config\n Configuration string, see @ref config_strings. Permitted values:\n <table>@hrow{Name,Effect,Values}" \ + config{3}=" @row{<tt>\1</tt>,\2,\3}" \ + configempty{2}="@param config configuration string, see @ref config_strings. No values currently permitted." \ + configend=" </table>" \ + configstart{2}="@param config configuration string, see @ref config_strings. 
Permitted values:\n <table>@hrow{Name,Effect,Values}" \ "ebusy_errors=@returns zero on success, EBUSY if the object is not available for exclusive access, and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ - "errors=@returns zero on success and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ - "exclusive=This method requires exclusive access to the specified data source(s). If any cursors are open with the specified name(s) or a data source is otherwise in use, the call will fail and return \c EBUSY.\n\n" \ - "ex_ref{1}=@ref \1 \"\1\"" \ - "hrow{1}=<tr><th>\1</th></tr>" \ - "hrow{2}=<tr><th>\1</th><th>\2</th></tr>" \ - "hrow{3}=<tr><th>\1</th><th>\2</th><th>\3</th></tr>" \ - "hrow{4}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th></tr>" \ - "hrow{5}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th></tr>" \ - "hrow{6}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th></tr>" \ - "hrow{7}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th></tr>" \ - "hrow{8}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th></tr>" \ - "hrow{9}=<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th><th>\9</th></tr>" \ - "notyet{1}=Note: <b>"\1"</b> not yet supported in WiredTiger.\n@todo fix when \1 supported\n\n" \ - "plantuml_end=PlantUML template end -->" \ - "plantuml_start{1}=\image html \1\n\image latex \1\n<!-- PlantUML template begins" \ - "requires_notransaction=This method must not be called on a session with an active transaction.\n\n" \ - "requires_transaction=This method must be called on a session with an active transaction.\n\n" \ - "ref_single=@ref" \ - "row{1}=<tr><td>\1</td></tr>" \ - "row{2}=<tr><td>\1</td><td>\2</td></tr>" \ - "row{3}=<tr><td>\1</td><td>\2</td><td>\3</td></tr>" \ - 
"row{4}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td></tr>" \ - "row{5}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td></tr>" \ - "row{6}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td></tr>" \ - "row{7}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td></tr>" \ - "row{8}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td></tr>" \ - "row{9}=<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td><td>\9</td></tr>" \ - "subpage_single=@subpage" \ + errors="@returns zero on success and a non-zero error code on failure. See @ref error_handling \"Error handling\" for details." \ + exclusive="This method requires exclusive access to the specified data source(s). If any cursors are open with the specified name(s) or a data source is otherwise in use, the call will fail and return \c EBUSY.\n\n" \ + ex_ref{1}="@ref \1 \"\1\"" \ + hrow{1}="<tr><th>\1</th></tr>" \ + hrow{2}="<tr><th>\1</th><th>\2</th></tr>" \ + hrow{3}="<tr><th>\1</th><th>\2</th><th>\3</th></tr>" \ + hrow{4}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th></tr>" \ + hrow{5}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th></tr>" \ + hrow{6}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th></tr>" \ + hrow{7}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th></tr>" \ + hrow{8}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th></tr>" \ + hrow{9}="<tr><th>\1</th><th>\2</th><th>\3</th><th>\4</th><th>\5</th><th>\6</th><th>\7</th><th>\8</th><th>\9</th></tr>" \ + notyet{1}="Note: <b>"\1"</b> not yet supported in WiredTiger.\n@todo fix when \1 supported\n\n" \ + plantuml_end="PlantUML template end -->" \ + plantuml_start{1}="\image html \1\n\image latex \1\n<!-- PlantUML template begins" \ + requires_notransaction="This method must not be called on a session with an 
active transaction.\n\n" \ + requires_transaction="This method must be called on a session with an active transaction.\n\n" \ + ref_single="@ref" \ + row{1}="<tr><td>\1</td></tr>" \ + row{2}="<tr><td>\1</td><td>\2</td></tr>" \ + row{3}="<tr><td>\1</td><td>\2</td><td>\3</td></tr>" \ + row{4}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td></tr>" \ + row{5}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td></tr>" \ + row{6}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td></tr>" \ + row{7}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td></tr>" \ + row{8}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td></tr>" \ + row{9}="<tr><td>\1</td><td>\2</td><td>\3</td><td>\4</td><td>\5</td><td>\6</td><td>\7</td><td>\8</td><td>\9</td></tr>" \ + subpage_single="@subpage" \ # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding diff --git a/src/third_party/wiredtiger/src/evict/evict_page.c b/src/third_party/wiredtiger/src/evict/evict_page.c index 90b71659015..5c478654585 100644 --- a/src/third_party/wiredtiger/src/evict/evict_page.c +++ b/src/third_party/wiredtiger/src/evict/evict_page.c @@ -124,22 +124,36 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) bool clean_page, inmem_split, tree_dead; conn = S2C(session); + page = ref->page; + + __wt_verbose(session, WT_VERB_EVICT, + "page %p (%s)", (void *)page, __wt_page_type_string(page->type)); /* Enter the eviction generation. */ __wt_session_gen_enter(session, WT_GEN_EVICT); - page = ref->page; - tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD); + /* + * Get exclusive access to the page if our caller doesn't have the tree + * locked down. 
+ */ + if (!closing) { + WT_ERR(__evict_exclusive(session, ref)); - __wt_verbose(session, WT_VERB_EVICT, - "page %p (%s)", (void *)page, __wt_page_type_string(page->type)); + /* + * Now the page is locked, remove it from the LRU eviction + * queue. We have to do this before freeing the page memory or + * otherwise touching the reference because eviction paths + * assume a non-NULL reference on the queue is pointing at + * valid memory. + */ + __wt_evict_list_clear_page(session, ref); + } /* - * Get exclusive access to the page and review it for conditions that - * would block our eviction of the page. If the check fails (for - * example, we find a page with active children), we're done. We have - * to make this check for clean pages, too: while unlikely eviction - * would choose an internal page with children, it's not disallowed. + * Review the page for conditions that would block its eviction. If the + * check fails (for example, we find a page with active children), quit. + * Make this check for clean pages, too: while unlikely eviction would + * choose an internal page with children, it's not disallowed. */ WT_ERR(__evict_review(session, ref, closing, &inmem_split)); @@ -178,6 +192,7 @@ __wt_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) } /* Update the reference and discard the page. */ + tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD); if (__wt_ref_is_root(ref)) __wt_ref_out(session, ref); else if ((clean_page && !F_ISSET(conn, WT_CONN_IN_MEMORY)) || tree_dead) @@ -275,12 +290,11 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, bool closing) WT_DECL_RET; /* - * Discard the page and update the reference structure; if the page has - * an address, it's a disk page; if it has no address, it's a deleted - * page re-instantiated (for example, by searching) and never written. - * - * If evicting a WT_REF_LIMBO reference, we get to here and transition - * back to WT_REF_LOOKASIDE. 
+ * Discard the page and update the reference structure. If evicting a + * WT_REF_LIMBO page, transition back to WT_REF_LOOKASIDE. Otherwise, + * a page with a disk address is an on-disk page, and a page without + * a disk address is a re-instantiated deleted page (for example, by + * searching), that was never subsequently written. */ __wt_ref_out(session, ref); if (!closing && ref->page_las != NULL && @@ -417,7 +431,18 @@ __evict_child_check(WT_SESSION_IMPL *session, WT_REF *parent) WT_INTL_FOREACH_BEGIN(session, parent->page, child) { switch (child->state) { case WT_REF_DISK: /* On-disk */ - case WT_REF_DELETED: /* On-disk, deleted */ + break; + case WT_REF_DELETED: /* Deleted */ + /* + * If the page was part of a truncate, transaction + * rollback might switch this page into its previous + * state at any time, so the delete must be resolved. + * We don't have to lock the page, as no thread of + * control can be running below our locked internal + * page. + */ + if (__wt_btree_truncate_active(session, child)) + return (EBUSY); break; default: return (EBUSY); @@ -446,31 +471,12 @@ __evict_review( *inmem_splitp = false; conn = S2C(session); + page = ref->page; flags = WT_REC_EVICT; if (!WT_SESSION_IS_CHECKPOINT(session)) LF_SET(WT_REC_VISIBLE_ALL); /* - * Get exclusive access to the page if our caller doesn't have the tree - * locked down. - */ - if (!closing) { - WT_RET(__evict_exclusive(session, ref)); - - /* - * Now the page is locked, remove it from the LRU eviction - * queue. We have to do this before freeing the page memory or - * otherwise touching the reference because eviction paths - * assume a non-NULL reference on the queue is pointing at - * valid memory. - */ - __wt_evict_list_clear_page(session, ref); - } - - /* Now that we have exclusive access, review the page. */ - page = ref->page; - - /* * Fail if an internal has active children, the children must be evicted * first. 
The test is necessary but shouldn't fire much: the eviction * code is biased for leaf pages, an internal page shouldn't be selected diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index afefbe8ad5c..ca2176fcf0e 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -68,6 +68,7 @@ if ((ret) != 0 && \ (ret) != WT_NOTFOUND && \ (ret) != WT_DUPLICATE_KEY && \ + (ret) != WT_PREPARE_CONFLICT && \ F_ISSET(&(s)->txn, WT_TXN_RUNNING)) \ F_SET(&(s)->txn, WT_TXN_ERROR); \ /* \ @@ -237,6 +238,8 @@ JOINABLE_CURSOR_CALL_CHECK(cur) #define CURSOR_UPDATE_API_END(s, ret) \ + if ((ret) == WT_PREPARE_CONFLICT) \ + (ret) = WT_ROLLBACK; \ TXN_API_END(s, ret) #define ASYNCOP_API_CALL(conn, s, n) \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 7ba73d1b94f..893f51aa022 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -773,7 +773,7 @@ struct __wt_page { /* * WT_PAGE_DELETED -- - * Related information for fast-delete, on-disk pages. + * Related information for truncated pages. */ struct __wt_page_deleted { volatile uint64_t txnid; /* Transaction ID */ @@ -997,7 +997,7 @@ struct __wt_update { finalized prepare */ #define WT_UPDATE_STATE_LOCKED 1 /* locked */ #define WT_UPDATE_STATE_PREPARED 2 /* prepared */ - uint8_t state; /* state (one byte : conserve memory) */ + volatile uint8_t state; /* If the update includes a complete value. 
*/ #define WT_UPDATE_DATA_VALUE(upd) \ diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 808c8f7ee7f..de28eb7232f 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1151,6 +1151,23 @@ __wt_ref_block_free(WT_SESSION_IMPL *session, WT_REF *ref) } /* + * __wt_btree_truncate_active -- + * Return if a truncate operation is active. + */ +static inline bool +__wt_btree_truncate_active(WT_SESSION_IMPL *session, WT_REF *ref) +{ + WT_PAGE_DELETED *page_del; + + if ((page_del = ref->page_del) == NULL) + return (false); + if (page_del->txnid == WT_TXN_ABORTED) + return (false); + return (!__wt_txn_visible_all(session, + page_del->txnid, WT_TIMESTAMP_NULL(&page_del->timestamp))); +} + +/* * __wt_btree_can_evict_dirty -- * Check whether eviction of dirty pages or splits are permitted in the * current tree. @@ -1336,7 +1353,11 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) page = ref->page; mod = page->modify; - /* Pages that have never been modified can always be evicted. */ + /* A truncated page can't be evicted until the truncate completes. */ + if (__wt_btree_truncate_active(session, ref)) + return (false); + + /* Otherwise, never modified pages can always be evicted. */ if (mod == NULL) return (true); @@ -1350,12 +1371,6 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); - /* A truncated page can't be evicted until the truncate completes. */ - if (ref->page_del != NULL && ref->page_del->txnid != WT_TXN_ABORTED && - !__wt_txn_visible_all(session, - ref->page_del->txnid, WT_TIMESTAMP_NULL(&ref->page_del->timestamp))) - return (false); - /* * Check for in-memory splits before other eviction tests. 
If the page * should split in-memory, return success immediately and skip more @@ -1458,7 +1473,8 @@ __wt_page_release(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags) (LF_ISSET(WT_READ_NO_SPLIT) || (!inmem_split && F_ISSET(session, WT_SESSION_NO_RECONCILE)))) { if (!WT_SESSION_IS_CHECKPOINT(session)) - (void)__wt_page_evict_urgent(session, ref); + WT_IGNORE_RET( + __wt_page_evict_urgent(session, ref)); } else { WT_RET_BUSY_OK(__wt_page_release_evict(session, ref)); return (0); diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 70f9318f6d7..ec5c6689c3f 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -217,20 +217,23 @@ struct __wt_cursor_btree { #endif /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CBT_ACTIVE 0x01u /* Active in the tree */ -#define WT_CBT_ITERATE_APPEND 0x02u /* Col-store: iterating append list */ -#define WT_CBT_ITERATE_NEXT 0x04u /* Next iteration configuration */ -#define WT_CBT_ITERATE_PREV 0x08u /* Prev iteration configuration */ -#define WT_CBT_NO_TXN 0x10u /* Non-txn cursor (e.g. a checkpoint) */ -#define WT_CBT_SEARCH_SMALLEST 0x20u /* Row-store: small-key insert list */ -#define WT_CBT_VAR_ONPAGE_MATCH 0x40u /* Var-store: on-page recno match */ +#define WT_CBT_ACTIVE 0x001u /* Active in the tree */ +#define WT_CBT_ITERATE_APPEND 0x002u /* Col-store: iterating append list */ +#define WT_CBT_ITERATE_NEXT 0x004u /* Next iteration configuration */ +#define WT_CBT_ITERATE_PREV 0x008u /* Prev iteration configuration */ +#define WT_CBT_NO_TXN 0x010u /* Non-txn cursor (e.g. 
a checkpoint) */ +#define WT_CBT_RETRY_NEXT 0x020u /* Next, resulted in prepare conflict */ +#define WT_CBT_RETRY_PREV 0x040u /* Prev, resulted in prepare conflict */ +#define WT_CBT_SEARCH_SMALLEST 0x080u /* Row-store: small-key insert list */ +#define WT_CBT_VAR_ONPAGE_MATCH 0x100u /* Var-store: on-page recno match */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH) + WT_CBT_RETRY_NEXT | WT_CBT_RETRY_PREV | WT_CBT_SEARCH_SMALLEST | \ + WT_CBT_VAR_ONPAGE_MATCH) - uint8_t flags; + uint32_t flags; }; struct __wt_cursor_bulk { diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index a4e986c4325..d338c47dfae 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -311,6 +311,20 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) } /* + * __cursor_kv_return -- + * Return a page referenced key/value pair to the application. + */ +static inline int +__cursor_kv_return( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_RET(__wt_key_return(session, cbt)); + WT_RET(__wt_value_return(session, cbt, upd)); + + return (0); +} + +/* * __cursor_func_init -- * Cursor call setup. 
*/ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 8b69f9ef244..d884401feb2 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -68,7 +68,7 @@ extern int __wt_block_ext_prealloc(WT_SESSION_IMPL *session, u_int max) WT_GCC_F extern int __wt_block_ext_discard(WT_SESSION_IMPL *session, u_int max) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_salvage_start(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_salvage_end(WT_SESSION_IMPL *session, WT_BLOCK *block) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size); +extern bool __wt_block_offset_invalid(WT_BLOCK *block, wt_off_t offset, uint32_t size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t *addr_sizep, bool *eofp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_salvage_valid(WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t *addr, size_t addr_size, bool valid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_block_verify_start(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, const char *cfg[]) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -100,7 +100,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp); +extern int __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool 
*valid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -128,7 +128,7 @@ extern int __wt_debug_tree(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *re extern int __wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all); +extern bool __wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep); @@ -141,7 +141,7 @@ extern int __wt_btree_discard(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBU extern void __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno); extern int __wt_btree_tree_open(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session); +extern bool __wt_btree_immediately_durable(WT_SESSION_IMPL *session) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btree_huffman_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_btree_huffman_close(WT_SESSION_IMPL *session); extern int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -200,15 +200,15 @@ extern int __wt_update_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, WT_ extern WT_UPDATE *__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert, bool restore) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_las_nonempty(WT_SESSION_IMPL *session); +extern bool __wt_las_nonempty(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); extern int __wt_las_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_las_cursor(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close(WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref); -extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_las_page_skip_locked(WT_SESSION_IMPL *session, WT_REF *ref) 
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern bool __wt_las_page_skip(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_insert_block(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_PAGE *page, WT_MULTI *multi, WT_ITEM *key) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_cursor_position(WT_CURSOR *cursor, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_las_remove_block(WT_SESSION_IMPL *session, uint32_t btree_id, uint64_t pageid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -369,7 +369,7 @@ extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, WT_CURS extern int __wt_evict_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern void __wt_evict_server_wake(WT_SESSION_IMPL *session); -extern bool __wt_evict_thread_chk(WT_SESSION_IMPL *session); +extern bool __wt_evict_thread_chk(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict_thread_run(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict_thread_stop(WT_SESSION_IMPL *session, WT_THREAD *thread) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_evict_create(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -377,7 +377,7 @@ extern int __wt_evict_destroy(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBU extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); extern int __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, bool readonly, double pct_full) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool 
__wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref); +extern bool __wt_page_evict_urgent(WT_SESSION_IMPL *session, WT_REF *ref) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_evict_priority_set(WT_SESSION_IMPL *session, uint64_t v); extern void __wt_evict_priority_clear(WT_SESSION_IMPL *session); extern int __wt_verbose_dump_cache(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -495,7 +495,7 @@ extern int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int ( extern int __wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_switch(WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, bool *ran) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_lsm_chunk_visible_all(WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk); +extern bool __wt_lsm_chunk_visible_all(WT_SESSION_IMPL *session, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_lsm_worker_start(WT_SESSION_IMPL *session, WT_LSM_WORKER_ARGS *args) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -557,7 +557,7 @@ extern void __wt_free_int(WT_SESSION_IMPL *session, const void *p_arg) WT_GCC_FU extern int __wt_errno(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_strerror(WT_SESSION_IMPL *session, int error, char *errbuf, size_t errlen); extern int __wt_ext_map_windows_error(WT_EXTENSION_API *wt_api, WT_SESSION 
*wt_session, uint32_t windows_error) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_handle_is_open(WT_SESSION_IMPL *session, const char *name); +extern bool __wt_handle_is_open(WT_SESSION_IMPL *session, const char *name) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_open(WT_SESSION_IMPL *session, const char *name, WT_FS_OPEN_FILE_TYPE file_type, u_int flags, WT_FH **fhp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_close(WT_SESSION_IMPL *session, WT_FH **fhp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_close_connection_close(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -742,11 +742,11 @@ extern void __wt_readunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern int __wt_try_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l); extern void __wt_writeunlock(WT_SESSION_IMPL *session, WT_RWLOCK *l); -extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l); +extern bool __wt_rwlock_islocked(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_nlpo2_round(uint32_t v); extern uint32_t __wt_nlpo2(uint32_t v); extern uint32_t __wt_log2_int(uint32_t n); -extern bool __wt_ispo2(uint32_t v); +extern bool __wt_ispo2(uint32_t v) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uint32_t __wt_rduppo2(uint32_t n, uint32_t po2); extern void __wt_random_init(WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_random_init_seed(WT_SESSION_IMPL *session, WT_RAND_STATE volatile *rnd_state) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/third_party/wiredtiger/src/include/extern_posix.h b/src/third_party/wiredtiger/src/include/extern_posix.h index bc71b77d0f6..8b92d99d4f1 100644 --- 
a/src/third_party/wiredtiger/src/include/extern_posix.h +++ b/src/third_party/wiredtiger/src/include/extern_posix.h @@ -19,9 +19,9 @@ extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_absolute_path(const char *path); +extern bool __wt_absolute_path(const char *path) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_path_separator(void); -extern bool __wt_has_priv(void); +extern bool __wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_stream_set_line_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_stream_set_no_buffer(FILE *fp) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds) WT_GCC_FUNC_DECL_ATTRIBUTE((visibility("default"))); diff --git a/src/third_party/wiredtiger/src/include/extern_win.h b/src/third_party/wiredtiger/src/include/extern_win.h index bdd54b7954a..50808750c56 100644 --- a/src/third_party/wiredtiger/src/include/extern_win.h +++ b/src/third_party/wiredtiger/src/include/extern_win.h @@ -17,9 +17,9 @@ extern void __wt_cond_signal(WT_SESSION_IMPL *session, WT_CONDVAR *cond); extern void __wt_cond_destroy(WT_SESSION_IMPL *session, WT_CONDVAR **condp); extern int __wt_once(void (*init_routine)(void)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_get_vm_pagesize(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_absolute_path(const char *path); +extern bool __wt_absolute_path(const char *path) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_path_separator(void); -extern bool __wt_has_priv(void); +extern bool 
__wt_has_priv(void) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern void __wt_stream_set_line_buffer(FILE *fp); extern void __wt_stream_set_no_buffer(FILE *fp); extern void __wt_sleep(uint64_t seconds, uint64_t micro_seconds); @@ -30,8 +30,8 @@ extern void __wt_thread_id(uintmax_t *id); extern int __wt_thread_str(char *buf, size_t buflen) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern uintmax_t __wt_process_id(void); extern void __wt_epoch_raw(WT_SESSION_IMPL *session, struct timespec *tsp); -extern int __wt_to_utf16_string(WT_SESSION_IMPL *session, const char*utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern int __wt_to_utf8_string(WT_SESSION_IMPL *session, const wchar_t*wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_to_utf16_string(WT_SESSION_IMPL *session, const char *utf8, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_to_utf8_string(WT_SESSION_IMPL *session, const wchar_t *wide, WT_ITEM **outbuf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern DWORD __wt_getlasterror(void); extern int __wt_map_windows_error(DWORD windows_error) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern const char *__wt_formatmessage(WT_SESSION_IMPL *session, DWORD windows_error); diff --git a/src/third_party/wiredtiger/src/include/misc.h b/src/third_party/wiredtiger/src/include/misc.h index a3a81c21569..c4d7def85c0 100644 --- a/src/third_party/wiredtiger/src/include/misc.h +++ b/src/third_party/wiredtiger/src/include/misc.h @@ -11,7 +11,10 @@ * and unused function return values. 
*/ #define WT_UNUSED(var) (void)(var) -#define WT_NOT_READ(var) (void)(var) +#define WT_NOT_READ(v, val) do { \ + (v) = (val); \ + (void)(v); \ +} while (0); #define WT_IGNORE_RET(call) do { \ int __ignored_ret; \ __ignored_ret = (call); \ diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 2a3fc7448f8..ba32d166f03 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -44,9 +44,8 @@ struct __wt_rwlock { /* Read/write lock */ uint8_t current; /* Current ticket */ uint8_t next; /* Next available ticket */ uint8_t reader; /* Read queue ticket */ - uint8_t __notused; /* Padding */ - uint16_t readers_active;/* Count of active readers */ - uint16_t readers_queued;/* Count of queued readers */ + uint8_t readers_queued; /* Count of queued readers */ + uint32_t readers_active;/* Count of active readers */ } s; } u; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 7ef63cb0eaf..01a982b8602 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -627,6 +627,9 @@ struct __wt_connection_stats { int64_t txn_read_queue_head; int64_t txn_read_queue_inserts; int64_t txn_read_queue_len; + int64_t txn_rollback_to_stable; + int64_t txn_rollback_upd_aborted; + int64_t txn_rollback_las_removed; int64_t txn_set_ts; int64_t txn_set_ts_commit; int64_t txn_set_ts_commit_upd; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index dd7f5d4a8bc..19e0be2d695 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -102,6 +102,7 @@ struct __wt_txn_global { volatile uint64_t oldest_id; WT_DECL_TIMESTAMP(commit_timestamp) + WT_DECL_TIMESTAMP(last_ckpt_timestamp) WT_DECL_TIMESTAMP(oldest_timestamp) WT_DECL_TIMESTAMP(pinned_timestamp) 
WT_DECL_TIMESTAMP(recovery_timestamp) diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 5fcf8ee11c9..9061157ff5a 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -9,6 +9,11 @@ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session); static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); +typedef enum { + WT_VISIBLE_FALSE=0, /* Not a visible update */ + WT_VISIBLE_PREPARE=1, /* Prepared update */ + WT_VISIBLE_TRUE=2 /* A visible update */ +} WT_VISIBLE_TYPE; #ifdef HAVE_TIMESTAMPS /* * __wt_txn_timestamp_flags -- @@ -291,7 +296,7 @@ __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) /* * __wt_txn_modify_page_delete -- - * Remember a page fast-deleted by the current transaction. + * Remember a page truncated by the current transaction. */ static inline int __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) @@ -539,33 +544,74 @@ __wt_txn_visible( } /* + * __wt_txn_upd_visible_type -- + * Visible type of given update for the current transaction. + */ +static inline WT_VISIBLE_TYPE +__wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + uint8_t upd_state; + bool upd_visible; + + for (;;__wt_yield()) { + /* Commit is in progress, yield and try again. */ + if ((upd_state = upd->state) == WT_UPDATE_STATE_LOCKED) + continue; + + upd_visible = __wt_txn_visible( + session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)); + + /* + * The visibility check is only valid if the update does not + * change state. If the state does change, recheck visibility. + */ + if (upd->state == upd_state) + break; + } + + if (!upd_visible) + return (WT_VISIBLE_FALSE); + + if (upd_state == WT_UPDATE_STATE_PREPARED) + return (F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ? 
+ WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE); + + return (WT_VISIBLE_TRUE); +} + +/* * __wt_txn_upd_visible -- * Can the current transaction see the given update. */ static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) { - return (__wt_txn_visible(session, - upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp))); + return (__wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE); } /* * __wt_txn_read -- * Get the first visible update in a list (or NULL if none are visible). */ -static inline WT_UPDATE * -__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) +static inline int +__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp) { static WT_UPDATE tombstone = { .txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE }; + WT_VISIBLE_TYPE upd_visible; bool skipped_birthmark; + *updp = NULL; for (skipped_birthmark = false; upd != NULL; upd = upd->next) { /* Skip reserved place-holders, they're never visible. */ - if (upd->type != WT_UPDATE_RESERVE && - __wt_txn_upd_visible(session, upd)) - break; + if (upd->type != WT_UPDATE_RESERVE) { + upd_visible = __wt_txn_upd_visible_type(session, upd); + if (upd_visible == WT_VISIBLE_TRUE) + break; + if (upd_visible == WT_VISIBLE_PREPARE) + return (WT_PREPARE_CONFLICT); + } /* An invisible birthmark is equivalent to a tombstone. */ if (upd->type == WT_UPDATE_BIRTHMARK) skipped_birthmark = true; @@ -574,7 +620,8 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) if (upd == NULL && skipped_birthmark) upd = &tombstone; - return (upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? NULL : upd); + *updp = (upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? 
NULL : upd); + return (0); } /* @@ -786,21 +833,32 @@ static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_TXN *txn; + bool ignore_prepare_set; txn = &session->txn; - if (txn->isolation == WT_ISO_SNAPSHOT) - while (upd != NULL && !__wt_txn_upd_visible(session, upd)) { - if (upd->txnid != WT_TXN_ABORTED) { - WT_STAT_CONN_INCR( - session, txn_update_conflict); - WT_STAT_DATA_INCR( - session, txn_update_conflict); - return (__wt_txn_rollback_required(session, + if (txn->isolation != WT_ISO_SNAPSHOT) + return (0); + + /* + * Clear the ignore prepare setting of txn, as it is not supposed, to + * affect the visibility for update operations. + */ + ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE); + F_CLR(txn, WT_TXN_IGNORE_PREPARE); + for (;upd != NULL && !__wt_txn_upd_visible(session, upd); + upd = upd->next) { + if (upd->txnid != WT_TXN_ABORTED) { + if (ignore_prepare_set) + F_SET(txn, WT_TXN_IGNORE_PREPARE); + WT_STAT_CONN_INCR(session, txn_update_conflict); + WT_STAT_DATA_INCR(session, txn_update_conflict); + return (__wt_txn_rollback_required(session, "conflict between concurrent operations")); - } - upd = upd->next; } + } + if (ignore_prepare_set) + F_SET(txn, WT_TXN_IGNORE_PREPARE); return (0); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a75c22497ce..1f2a438b8e9 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2431,8 +2431,9 @@ struct __wt_connection { * timestamps of all active readers\, and \c stable returns the most * recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. 
* See @ref transaction_timestamps., a string\, chosen from the - * following options: \c "all_committed"\, \c "oldest"\, \c "pinned"\, - * \c "recovery"\, \c "stable"; default \c all_committed.} + * following options: \c "all_committed"\, \c "last_checkpoint"\, \c + * "oldest"\, \c "pinned"\, \c "recovery"\, \c "stable"; default \c + * all_committed.} * @configend * @errors * If there is no matching timestamp (e.g., if this method is called @@ -5562,81 +5563,87 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1312 /*! transaction: read timestamp queue length */ #define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1313 +/*! transaction: rollback to stable calls */ +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1314 +/*! transaction: rollback to stable updates aborted */ +#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1315 +/*! transaction: rollback to stable updates removed from lookaside */ +#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1316 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1314 +#define WT_STAT_CONN_TXN_SET_TS 1317 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1315 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1318 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1316 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1319 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1317 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1320 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1318 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1321 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1319 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1322 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1320 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1323 /*! 
transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1321 +#define WT_STAT_CONN_TXN_BEGIN 1324 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1322 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1325 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1323 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1326 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1324 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1327 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1325 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1328 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1326 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1329 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1327 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1330 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1328 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1331 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1329 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1332 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1330 +#define WT_STAT_CONN_TXN_CHECKPOINT 1333 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1331 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1334 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1332 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1335 /*! 
* transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1333 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1336 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1334 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1337 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1335 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1338 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1336 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1339 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1337 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1340 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1338 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1341 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1339 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1342 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1340 +#define WT_STAT_CONN_TXN_SYNC 1343 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1341 +#define WT_STAT_CONN_TXN_COMMIT 1344 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1342 +#define WT_STAT_CONN_TXN_ROLLBACK 1345 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1343 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1346 /*! 
* @} diff --git a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c index 7050a66a558..4d9f6f92832 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_cursor.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_cursor.c @@ -593,7 +593,7 @@ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { if (clsm->chunks != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; - } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { + } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) { close_range_start = 0; close_range_end = WT_MIN(nchunks, clsm->nchunks); if (close_range_end > nupdates) diff --git a/src/third_party/wiredtiger/src/lsm/lsm_tree.c b/src/third_party/wiredtiger/src/lsm/lsm_tree.c index 1b92028072d..9a7ab20f18f 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_tree.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_tree.c @@ -911,8 +911,7 @@ __wt_lsm_tree_drop( int tret; bool locked; - locked = false; - WT_NOT_READ(locked); + WT_NOT_READ(locked, false); /* Get the LSM tree. */ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); @@ -971,8 +970,7 @@ __wt_lsm_tree_rename(WT_SESSION_IMPL *session, bool locked; old = NULL; - locked = false; - WT_NOT_READ(locked); + WT_NOT_READ(locked, false); /* Get the LSM tree. */ WT_RET(__wt_lsm_tree_get(session, olduri, true, &lsm_tree)); @@ -1043,8 +1041,7 @@ __wt_lsm_tree_truncate( WT_UNUSED(cfg); chunk = NULL; - locked = false; - WT_NOT_READ(locked); + WT_NOT_READ(locked, false); /* Get the LSM tree. 
*/ WT_RET(__wt_lsm_tree_get(session, name, true, &lsm_tree)); @@ -1382,8 +1379,8 @@ __wt_lsm_tree_worker(WT_SESSION_IMPL *session, u_int i; bool exclusive, locked, need_release; - locked = need_release = false; - WT_NOT_READ(locked); + WT_NOT_READ(locked, false); + WT_NOT_READ(need_release, false); exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE); WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index dcb9b34802a..6f18f4fb152 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -345,8 +345,8 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_TXN_ISOLATION saved_isolation; bool flush_set, release_dhandle; - flush_set = release_dhandle = false; - WT_NOT_READ(flush_set); + WT_NOT_READ(flush_set, false); + release_dhandle = false; /* * If the chunk is already checkpointed, make sure it is also evicted. @@ -360,8 +360,7 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) { - ret = 0; - WT_NOT_READ(ret); + WT_NOT_READ(ret, 0); } else WT_RET_MSG(session, ret, "discard handle"); } diff --git a/src/third_party/wiredtiger/src/os_posix/os_fs.c b/src/third_party/wiredtiger/src/os_posix/os_fs.c index ca810fa8d88..811c0576eef 100644 --- a/src/third_party/wiredtiger/src/os_posix/os_fs.c +++ b/src/third_party/wiredtiger/src/os_posix/os_fs.c @@ -97,7 +97,7 @@ __posix_directory_sync(WT_SESSION_IMPL *session, const char *path) dir = tmp->mem; strrchr(dir, '/')[1] = '\0'; - fd = -1; /* -Wconditional-uninitialized */ + fd = 0; /* -Wconditional-uninitialized */ WT_SYSCALL_RETRY(( (fd = open(dir, O_RDONLY, 0444)) == -1 ? 
-1 : 0), ret); if (ret != 0) diff --git a/src/third_party/wiredtiger/src/os_win/os_dir.c b/src/third_party/wiredtiger/src/os_win/os_dir.c index 3b78106f3b4..d5095e7ef78 100644 --- a/src/third_party/wiredtiger/src/os_win/os_dir.c +++ b/src/third_party/wiredtiger/src/os_win/os_dir.c @@ -59,30 +59,41 @@ __directory_list_worker(WT_FILE_SYSTEM *file_system, WT_ERR(__wt_map_windows_error(windows_error)); } - count = 0; - do { + for (count = 0;;) { /* * Skip . and .. */ if (wcscmp(finddata.cFileName, L".") == 0 || wcscmp(finddata.cFileName, L"..") == 0) - continue; + goto skip; /* The list of files is optionally filtered by a prefix. */ if (prefix != NULL && wcsncmp(finddata.cFileName, prefix_wide->data, prefix_widelen) != 0) - continue; + goto skip; WT_ERR(__wt_realloc_def( session, &dirallocsz, count + 1, &entries)); - WT_ERR(__wt_to_utf8_string( session, finddata.cFileName, &file_utf8)); WT_ERR(__wt_strdup(session, file_utf8->data, &entries[count])); ++count; __wt_scr_free(session, &file_utf8); - } while (!single && FindNextFileW(findhandle, &finddata) != 0); + + if (single) + break; + +skip: if (FindNextFileW(findhandle, &finddata) != 0) + continue; + windows_error = __wt_getlasterror(); + if (windows_error == ERROR_NO_MORE_FILES) + break; + __wt_errx(session, + "%s: directory-list: FindNextFileW: %s", + pathbuf->data, __wt_formatmessage(session, windows_error)); + WT_ERR(__wt_map_windows_error(windows_error)); + } *dirlistp = entries; *countp = count; diff --git a/src/third_party/wiredtiger/src/os_win/os_utf8.c b/src/third_party/wiredtiger/src/os_win/os_utf8.c index 077c39db3ef..1c9efe39506 100644 --- a/src/third_party/wiredtiger/src/os_win/os_utf8.c +++ b/src/third_party/wiredtiger/src/os_win/os_utf8.c @@ -14,7 +14,7 @@ */ int __wt_to_utf16_string( - WT_SESSION_IMPL *session, const char* utf8, WT_ITEM **outbuf) + WT_SESSION_IMPL *session, const char *utf8, WT_ITEM **outbuf) { DWORD windows_error; int bufferSize; @@ -50,7 +50,7 @@ __wt_to_utf16_string( */ int 
__wt_to_utf8_string( - WT_SESSION_IMPL *session, const wchar_t* wide, WT_ITEM **outbuf) + WT_SESSION_IMPL *session, const wchar_t *wide, WT_ITEM **outbuf) { DWORD windows_error; int bufferSize; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f4d0fc0b1ef..1c46da9be10 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1341,12 +1341,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * started. The global commit point can move forward during * reconciliation so we use a cached copy to avoid races when a * concurrent transaction commits or rolls back while we are - * examining its updates. + * examining its updates. As prepared transaction id's are + * globally visible, need to check the update state as well. */ if (F_ISSET(r, WT_REC_EVICT) && + (upd->state != WT_UPDATE_STATE_READY || (F_ISSET(r, WT_REC_VISIBLE_ALL) ? WT_TXNID_LE(r->last_running, txnid) : - !__txn_visible_id(session, txnid))) { + !__txn_visible_id(session, txnid)))) { uncommitted = r->update_uncommitted = true; continue; } @@ -1783,12 +1785,12 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * If called during checkpoint, the child is being * considered by the eviction server or the child is a - * fast-delete page being read. The eviction may have + * truncated page being read. The eviction may have * started before the checkpoint and so we must wait * for the eviction to be resolved. I suspect we could - * handle fast-delete reads, but we can't distinguish - * between the two and fast-delete reads aren't expected - * to be common. + * handle reads of truncated pages, but we can't + * distinguish between the two and reads of truncated + * pages aren't expected to be common. 
*/ break; @@ -5630,8 +5632,7 @@ build: if (key_onpage_ovfl) { WT_ERR(__wt_dsk_cell_data_ref(session, WT_PAGE_ROW_LEAF, kpack, r->cur)); - key_onpage_ovfl = false; - WT_NOT_READ(key_onpage_ovfl); + WT_NOT_READ(key_onpage_ovfl, false); } /* diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c index 572592b9fbc..fd66a1a40bb 100644 --- a/src/third_party/wiredtiger/src/support/mtx_rw.c +++ b/src/third_party/wiredtiger/src/support/mtx_rw.c @@ -48,9 +48,8 @@ * uint8_t current; // Current ticket * uint8_t next; // Next available ticket * uint8_t reader; // Read queue ticket - * uint8_t __notused; // Padding - * uint16_t readers_active; // Count of active readers - * uint16_t readers_queued; // Count of queued readers + * uint8_t readers_queued; // Count of queued readers + * uint32_t readers_active; // Count of active readers * } s; * } u; * @@ -75,6 +74,12 @@ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers, * avoiding starvation), then atomically incrementing 'readers_queued'. * + * We limit how many readers can queue: we don't allow more readers to queue + * than there are active writers (calculated as `next - current`): otherwise, + * in write-heavy workloads, readers can keep queuing up in front of writers + * and throughput is unstable. The remaining read requests wait without any + * ordering. + * * The 'next' field is a 1-byte value so the available ticket number wraps * after 256 requests. 
If a thread's write lock request would cause the 'next' * field to catch up with 'current', instead it waits to avoid the same ticket @@ -173,12 +178,10 @@ __wt_readlock(WT_SESSION_IMPL *session, WT_RWLOCK *l) int pause_cnt; bool set_stats; + stats = NULL; /* -Wconditional-uninitialized */ + time_start = time_stop = 0; /* -Wconditional-uninitialized */ + WT_STAT_CONN_INCR(session, rwlock_read); - stats = (int64_t **)S2C(session)->stats; - set_stats = (l->stat_read_count_off != -1 && WT_STAT_ENABLED(session)); - time_start = time_stop = 0; - if (set_stats) - stats[session->stat_bucket][l->stat_read_count_off]++; WT_DIAGNOSTIC_YIELD; @@ -236,8 +239,12 @@ stall: __wt_cond_wait(session, break; } - if (set_stats) + set_stats = (l->stat_read_count_off != -1 && WT_STAT_ENABLED(session)); + if (set_stats) { + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][l->stat_read_count_off]++; time_start = __wt_clock(session); + } /* Wait for our group to start. */ for (pause_cnt = 0; ticket != l->u.s.current; pause_cnt++) { if (pause_cnt < 1000) @@ -370,12 +377,10 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) int pause_cnt; bool set_stats; + stats = NULL; /* -Wconditional-uninitialized */ + time_start = time_stop = 0; /* -Wconditional-uninitialized */ + WT_STAT_CONN_INCR(session, rwlock_write); - stats = (int64_t **)S2C(session)->stats; - set_stats = (l->stat_write_count_off != -1 && WT_STAT_ENABLED(session)); - time_start = time_stop = 0; - if (set_stats) - stats[session->stat_bucket][l->stat_write_count_off]++; for (;;) { old.u.v = l->u.v; @@ -398,6 +403,12 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) break; } + set_stats = (l->stat_write_count_off != -1 && WT_STAT_ENABLED(session)); + if (set_stats) { + stats = (int64_t **)S2C(session)->stats; + stats[session->stat_bucket][l->stat_write_count_off]++; + time_start = __wt_clock(session); + } /* * Wait for our group to start and any readers to drain. 
* @@ -406,8 +417,6 @@ __wt_writelock(WT_SESSION_IMPL *session, WT_RWLOCK *l) * could see no readers active from a different batch and decide that * we have the lock. */ - if (set_stats) - time_start = __wt_clock(session); for (pause_cnt = 0, old.u.v = l->u.v; ticket != old.u.s.current || old.u.s.readers_active != 0; pause_cnt++, old.u.v = l->u.v) { diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 40a07be0174..ae13f7d8abe 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1052,6 +1052,9 @@ static const char * const __stats_connection_desc[] = { "transaction: read timestamp queue inserts to head", "transaction: read timestamp queue inserts total", "transaction: read timestamp queue length", + "transaction: rollback to stable calls", + "transaction: rollback to stable updates aborted", + "transaction: rollback to stable updates removed from lookaside", "transaction: set timestamp calls", "transaction: set timestamp commit calls", "transaction: set timestamp commit updates", @@ -1438,6 +1441,9 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_read_queue_head = 0; stats->txn_read_queue_inserts = 0; stats->txn_read_queue_len = 0; + stats->txn_rollback_to_stable = 0; + stats->txn_rollback_upd_aborted = 0; + stats->txn_rollback_las_removed = 0; stats->txn_set_ts = 0; stats->txn_set_ts_commit = 0; stats->txn_set_ts_commit_upd = 0; @@ -1956,6 +1962,12 @@ __wt_stat_connection_aggregate( to->txn_read_queue_inserts += WT_STAT_READ(from, txn_read_queue_inserts); to->txn_read_queue_len += WT_STAT_READ(from, txn_read_queue_len); + to->txn_rollback_to_stable += + WT_STAT_READ(from, txn_rollback_to_stable); + to->txn_rollback_upd_aborted += + WT_STAT_READ(from, txn_rollback_upd_aborted); + to->txn_rollback_las_removed += + WT_STAT_READ(from, txn_rollback_las_removed); to->txn_set_ts += WT_STAT_READ(from, txn_set_ts); 
to->txn_set_ts_commit += WT_STAT_READ(from, txn_set_ts_commit); to->txn_set_ts_commit_upd += diff --git a/src/third_party/wiredtiger/src/txn/txn.c b/src/third_party/wiredtiger/src/txn/txn.c index 561961f4e98..3a9b3755ff5 100644 --- a/src/third_party/wiredtiger/src/txn/txn.c +++ b/src/third_party/wiredtiger/src/txn/txn.c @@ -1394,8 +1394,7 @@ __wt_verbose_dump_txn_one(WT_SESSION_IMPL *session, WT_TXN *txn) #endif const char *iso_tag; - iso_tag = "INVALID"; - WT_NOT_READ(iso_tag); + WT_NOT_READ(iso_tag, "INVALID"); switch (txn->isolation) { case WT_ISO_READ_COMMITTED: iso_tag = "WT_ISO_READ_COMMITTED"; diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 1235bc8c2b2..d3f11c5fa69 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -748,6 +748,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_DECL_TIMESTAMP(ckpt_tmp_ts) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; @@ -899,6 +900,15 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * checkpointing the metadata since we know that all files in the * checkpoint are now in a consistent state. */ +#ifdef HAVE_TIMESTAMPS + /* + * Record the timestamp from the transaction if we were successful. + * Store it in a temp variable now because it will be invalidated during + * commit but we don't want to set it until we know the checkpoint + * is successful. 
+ */ + __wt_timestamp_set(&ckpt_tmp_ts, &txn->read_timestamp); +#endif WT_ERR(__wt_txn_commit(session, NULL)); /* @@ -942,8 +952,13 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; - if (full) + if (full) { __checkpoint_stats(session); +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set( + &conn->txn_global.last_ckpt_timestamp, &ckpt_tmp_ts); +#endif + } err: /* * Reset the timer so that next checkpoint tracks the progress only if diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index d31b3995092..eef2fde5284 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -74,6 +74,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) &rollback_timestamp, las_timestamp.data) < 0) { WT_ERR(cursor->remove(cursor)); ++remove_cnt; + WT_STAT_CONN_INCR(session, txn_rollback_las_removed); } else ++las_total; } @@ -111,6 +112,7 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, if (__wt_timestamp_cmp( rollback_timestamp, &next_upd->timestamp) < 0) { next_upd->txnid = WT_TXN_ABORTED; + WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted); __wt_timestamp_set_zero(&next_upd->timestamp); /* @@ -425,6 +427,7 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); + WT_STAT_CONN_INCR(session, txn_rollback_to_stable); /* * Mark that a rollback operation is in progress and wait for eviction * to drain. 
This is necessary because lookaside eviction uses diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 280425eb56e..2266a9cd6f5 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -239,7 +239,10 @@ __txn_global_query_timestamp( break; } __wt_readunlock(session, &txn_global->commit_timestamp_rwlock); - } else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { + } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len)) + /* Read-only value forever. No lock needed. */ + __wt_timestamp_set(&ts, &txn_global->last_ckpt_timestamp); + else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { if (!txn_global->has_oldest_timestamp) return (WT_NOTFOUND); WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, diff --git a/src/third_party/wiredtiger/src/utilities/util_load.c b/src/third_party/wiredtiger/src/utilities/util_load.c index 3ed21fc591c..dab24930fe6 100644 --- a/src/third_party/wiredtiger/src/utilities/util_load.c +++ b/src/third_party/wiredtiger/src/utilities/util_load.c @@ -124,8 +124,10 @@ load_dump(WT_SESSION *session) "dump=%s%s%s", hex ? "hex" : "print", append ? ",append" : "", - no_overwrite ? ",overwrite=false" : "")) != 0) - return (util_err(session, ret, NULL)); + no_overwrite ? ",overwrite=false" : "")) != 0) { + ret = util_err(session, ret, NULL); + goto err; + } if ((ret = session->open_cursor( session, uri, NULL, config, &cursor)) != 0) { ret = util_err(session, ret, "%s: session.open_cursor", uri); @@ -494,8 +496,10 @@ config_rename(WT_SESSION *session, char **urip, const char *name) *p = '\0'; p = strchr(p + 1, ':'); if ((ret = __wt_snprintf( - buf, len, "%s:%s%s", *urip, name, p == NULL ? "" : p)) != 0) + buf, len, "%s:%s%s", *urip, name, p == NULL ? 
"" : p)) != 0) { + free(buf); return (util_err(session, ret, NULL)); + } *urip = buf; return (0); diff --git a/src/third_party/wiredtiger/test/bloom/test_bloom.c b/src/third_party/wiredtiger/test/bloom/test_bloom.c index 10607a3719c..dcc7ab372a9 100644 --- a/src/third_party/wiredtiger/test/bloom/test_bloom.c +++ b/src/third_party/wiredtiger/test/bloom/test_bloom.c @@ -39,7 +39,8 @@ static struct { uint32_t c_ops; uint32_t c_k; /* Number of hash iterations */ uint32_t c_factor; /* Number of bits per item */ - uint32_t c_srand; + + WT_RAND_STATE rand; uint8_t **entries; } g; @@ -67,10 +68,9 @@ main(int argc, char *argv[]) g.c_key_max = 100; g.c_k = 8; g.c_factor = 16; - g.c_srand = 3233456; /* Set values from the command line. */ - while ((ch = __wt_getopt(progname, argc, argv, "c:f:k:o:s:")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "c:f:k:o:")) != EOF) switch (ch) { case 'c': /* Cache size */ g.c_cache = (u_int)atoi(__wt_optarg); @@ -78,15 +78,12 @@ main(int argc, char *argv[]) case 'f': /* Factor */ g.c_factor = (u_int)atoi(__wt_optarg); break; - case 'k': /* Number of hash functions */ + case 'k': /* Number of hash functions */ g.c_k = (u_int)atoi(__wt_optarg); break; case 'o': /* Number of ops */ g.c_ops = (u_int)atoi(__wt_optarg); break; - case 's': /* Number of ops */ - g.c_srand = (u_int)atoi(__wt_optarg); - break; default: usage(); } @@ -184,7 +181,7 @@ run(void) memset((void *)item.data, 'a', item.size); for (i = 0, fp = 0; i < g.c_ops; i++) { ((uint8_t *)item.data)[i % item.size] = - 'a' + ((uint8_t)rand() % 26); + 'a' + (__wt_random(&g.rand) % 26); if ((ret = __wt_bloom_get(bloomp, &item)) == 0) ++fp; if (ret != 0 && ret != WT_NOTFOUND) @@ -219,14 +216,14 @@ populate_entries(void) uint32_t i, j; uint8_t **entries; - srand(g.c_srand); + __wt_random_init_seed(NULL, &g.rand); entries = dcalloc(g.c_ops, sizeof(uint8_t *)); for (i = 0; i < g.c_ops; i++) { entries[i] = dcalloc(g.c_key_max, sizeof(uint8_t)); for (j = 0; j < g.c_key_max; j++) - 
entries[i][j] = 'a' + ((uint8_t)rand() % 26); + entries[i][j] = 'a' + (__wt_random(&g.rand) % 26); } g.entries = entries; @@ -239,13 +236,12 @@ populate_entries(void) void usage(void) { - fprintf(stderr, "usage: %s [-cfkos]\n", progname); + fprintf(stderr, "usage: %s [-cfko]\n", progname); fprintf(stderr, "%s", "\t-c cache size\n" "\t-f number of bits per item\n" "\t-k size of entry strings\n" - "\t-o number of operations to perform\n" - "\t-s random seed for run\n"); + "\t-o number of operations to perform\n"); exit(EXIT_FAILURE); } diff --git a/src/third_party/wiredtiger/test/csuite/rwlock/main.c b/src/third_party/wiredtiger/test/csuite/rwlock/main.c index e1d00344ee2..f69628dca40 100644 --- a/src/third_party/wiredtiger/test/csuite/rwlock/main.c +++ b/src/third_party/wiredtiger/test/csuite/rwlock/main.c @@ -171,8 +171,8 @@ thread_dump(void *arg) { sleep(1); printf("\n" "rwlock { current %" PRIu8 ", next %" PRIu8 - ", reader %" PRIu8 ", readers_active %" PRIu16 - ", readers_queued %" PRIu16 " }\n", + ", reader %" PRIu8 ", readers_active %" PRIu32 + ", readers_queued %" PRIu8 " }\n", rwlock.u.s.current, rwlock.u.s.next, rwlock.u.s.reader, diff --git a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c index a15baa0ba32..8a1781eae45 100644 --- a/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c +++ b/src/third_party/wiredtiger/test/csuite/timestamp_abort/main.c @@ -64,7 +64,7 @@ static char home[1024]; /* Program working dir */ #define MIN_TH 5 #define MIN_TIME 10 #define PREPARE_FREQ 5 -#define PREPARE_YIELD PREPARE_FREQ * 10 +#define PREPARE_YIELD (PREPARE_FREQ * 10) #define RECORDS_FILE "records-%" PRIu32 #define STABLE_PERIOD 100 @@ -184,10 +184,11 @@ thread_ckpt_run(void *arg) WT_RAND_STATE rnd; WT_SESSION *session; THREAD_DATA *td; - uint64_t ts; + uint64_t stable; uint32_t sleep_time; int i; bool first_ckpt; + char buf[128]; __wt_random_init(&rnd); @@ -198,20 +199,20 @@ 
thread_ckpt_run(void *arg) (void)unlink(ckpt_file); testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session)); first_ckpt = true; - ts = 0; for (i = 0; ;++i) { sleep_time = __wt_random(&rnd) % MAX_CKPT_INVL; sleep(sleep_time); - if (use_ts) - ts = global_ts; /* * Since this is the default, send in this string even if * running without timestamps. */ testutil_check(session->checkpoint( session, "use_timestamp=true")); - printf("Checkpoint %d complete. Minimum ts %" PRIu64 "\n", - i, ts); + testutil_check(td->conn->query_timestamp( + td->conn, buf, "get=last_checkpoint")); + sscanf(buf, "%" SCNx64, &stable); + printf("Checkpoint %d complete at stable %" + PRIu64 ".\n", i, stable); fflush(stdout); /* * Create the checkpoint file so that the parent process knows @@ -638,7 +639,7 @@ main(int argc, char *argv[]) use_ts ? "true" : "false"); printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout); - printf("CONFIG: %s%s%s%s -h %s -T %" PRIu32 "-t %" PRIu32 "\n", + printf("CONFIG: %s%s%s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname, compat ? " -C" : "", inmem ? " -m" : "", diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 8d85d331c89..d46b0868887 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -191,9 +191,13 @@ config_setup(void) /* * Turn off truncate for LSM runs (some configurations with truncate * always results in a timeout). + * + * WiredTiger doesn't currently support truncate and prepare at the + * same time, see WT-3922. For now, pick one on each run. */ - if (!config_is_perm("truncate") && DATASOURCE("lsm")) - config_single("truncate=off", 0); + if (!config_is_perm("truncate")) + if (DATASOURCE("lsm") || mmrand(NULL, 0, 1) == 1) + config_single("truncate=off", 0); /* Give Helium configuration a final review. 
*/ if (DATASOURCE("helium")) diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 4c54972516e..596d952dcc6 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -403,10 +403,8 @@ snap_check(WT_CURSOR *cursor, break; case WT_NOTFOUND: break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "WT_CURSOR.search"); + return (ret); } /* Check for simple matches. */ @@ -644,6 +642,19 @@ prepare_transaction(TINFO *tinfo, WT_SESSION *session) } /* + * OP_FAILED -- + * General error handling. + */ +#define OP_FAILED(notfound_ok) do { \ + positioned = false; \ + if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK)) \ + goto deadlock; \ + testutil_assert((notfound_ok && ret == WT_NOTFOUND) || \ + ret == WT_CACHE_FULL || \ + ret == WT_PREPARE_CONFLICT || ret == WT_ROLLBACK); \ +} while (0) + +/* * ops -- * Per-thread operations. */ @@ -825,11 +836,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(READ, tinfo); - } else { - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); } /* Optionally reserve a row. */ @@ -847,12 +855,8 @@ ops(void *arg) positioned = true; __wt_yield(); /* Let other threads proceed. */ - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); } /* Perform the operation. 
*/ @@ -881,11 +885,8 @@ ops(void *arg) if (ret == 0) { ++tinfo->insert; SNAP_TRACK(INSERT, tinfo); - } else { - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_ROLLBACK); - } + } else + OP_FAILED(false); break; case MODIFY: /* @@ -907,13 +908,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(MODIFY, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert( - ret == WT_NOTFOUND || ret == WT_ROLLBACK); - } + } else + OP_FAILED(true); break; case READ: ++tinfo->search; @@ -921,12 +917,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(READ, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); break; case REMOVE: remove_instead_of_truncate: @@ -946,12 +938,8 @@ remove_instead_of_truncate: * previous state, but not necessarily set. */ SNAP_TRACK(REMOVE, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); break; case TRUNCATE: /* @@ -1020,11 +1008,8 @@ remove_instead_of_truncate: if (ret == 0) { ++tinfo->truncate; SNAP_TRACK(TRUNCATE, tinfo); - } else { - testutil_assert(ret == WT_ROLLBACK); - if (intxn) - goto deadlock; - } + } else + OP_FAILED(false); break; case UPDATE: update_instead_of_chosen_op: @@ -1041,12 +1026,8 @@ update_instead_of_chosen_op: if (ret == 0) { positioned = true; SNAP_TRACK(UPDATE, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_ROLLBACK); - } + } else + OP_FAILED(false); break; } @@ -1061,9 +1042,8 @@ update_instead_of_chosen_op: for (i = 0; i < j; ++i) { if ((ret = nextprev(tinfo, cursor, next)) == 0) continue; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); + + OP_FAILED(true); break; } } @@ -1090,9 
+1070,11 @@ update_instead_of_chosen_op: goto deadlock; } - /* Prepare the transaction 10% of the time. */ - /* XXX: CONFIGURE PREPARE OFF FOR NOW */ - if (mmrand(&tinfo->rnd, 1, 10) == 0) { + /* + * Prepare the transaction 10% of the time. + * Currently doesn't work with truncation, see WT-3922. + */ + if (g.c_truncate == 0 && mmrand(&tinfo->rnd, 1, 10) == 1) { ret = prepare_transaction(tinfo, session); testutil_assert(ret == 0 || ret == WT_PREPARE_CONFLICT); if (ret == WT_PREPARE_CONFLICT) @@ -1138,7 +1120,7 @@ deadlock: ++tinfo->deadlock; /* * wts_read_scan -- - * Read and verify all elements in a file. + * Read and verify a subset of the elements in a file. */ void wts_read_scan(void) @@ -1182,6 +1164,7 @@ wts_read_scan(void) case 0: case WT_NOTFOUND: case WT_ROLLBACK: + case WT_PREPARE_CONFLICT: break; default: testutil_die( @@ -1209,11 +1192,6 @@ read_row_worker( session = cursor->session; - /* Log the operation */ - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - session, "%-10s%" PRIu64, "read", keyno); - /* Retrieve the key/value pair by key. 
*/ switch (g.type) { case FIX: @@ -1254,12 +1232,15 @@ read_row_worker( value->size = 1; } break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "read_row: read row %" PRIu64, keyno); + return (ret); } + /* Log the operation */ + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + session, "%-10s%" PRIu64, "read", keyno); + #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) return (ret); @@ -1394,24 +1375,39 @@ nextprev(TINFO *tinfo, WT_CURSOR *cursor, bool next) break; case WT_NOTFOUND: break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "%s", which); + return (ret); } + if (ret == 0 && g.logging == LOG_OPS) + switch (g.type) { + case FIX: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64 " {0x%02x}", + which, keyno, ((char *)value.data)[0]); + break; + case ROW: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s{%.*s}, {%.*s}", + which, (int)key.size, (char *)key.data, + (int)value.size, (char *)value.data); + break; + case VAR: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64 " {%.*s}", + which, keyno, (int)value.size, (char *)value.data); + break; + } + #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) return (ret); { WT_ITEM bdb_key, bdb_value; - WT_SESSION *session; int notfound; char *p; - session = cursor->session; - /* Retrieve the BDB key/value. 
*/ bdb_np(next, &bdb_key.data, &bdb_key.size, &bdb_value.data, &bdb_value.size, &notfound); @@ -1444,26 +1440,6 @@ mismatch: if (g.type == ROW) { print_item(" wt-value", &value); testutil_die(0, NULL); } - - if (g.logging == LOG_OPS) - switch (g.type) { - case FIX: - (void)g.wt_api->msg_printf(g.wt_api, - session, "%-10s%" PRIu64 " {0x%02x}", which, - keyno, ((char *)value.data)[0]); - break; - case ROW: - (void)g.wt_api->msg_printf( - g.wt_api, session, "%-10s{%.*s}, {%.*s}", which, - (int)key.size, (char *)key.data, - (int)value.size, (char *)value.data); - break; - case VAR: - (void)g.wt_api->msg_printf(g.wt_api, session, - "%-10s%" PRIu64 " {%.*s}", which, - keyno, (int)value.size, (char *)value.data); - break; - } } #endif return (ret); @@ -1483,24 +1459,14 @@ row_reserve(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->key); } + if ((ret = cursor->reserve(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s{%.*s}", "reserve", (int)tinfo->key->size, tinfo->key->data); - switch (ret = cursor->reserve(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "row_reserve: reserve row %" PRIu64 " by key", - tinfo->keyno); - } return (0); } @@ -1516,21 +1482,13 @@ col_reserve(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) if (!positioned) cursor->set_key(cursor, tinfo->keyno); + if ((ret = cursor->reserve(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64, "reserve", tinfo->keyno); - switch (ret = cursor->reserve(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, "col_reserve: %" PRIu64, tinfo->keyno); - } return (0); } @@ -1577,19 +1535,10 @@ row_modify(TINFO
*tinfo, WT_CURSOR *cursor, bool positioned) } modify_build(tinfo, entries, &nentries); - switch (ret = cursor->modify(cursor, entries, nentries)) { - case 0: - testutil_check(cursor->get_value(cursor, tinfo->value)); - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "row_modify: modify row %" PRIu64 " by key", tinfo->keyno); - } + if ((ret = cursor->modify(cursor, entries, nentries)) != 0) + return (ret); + + testutil_check(cursor->get_value(cursor, tinfo->value)); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -1624,25 +1573,16 @@ col_modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->keyno); modify_build(tinfo, entries, &nentries); - switch (ret = cursor->modify(cursor, entries, nentries)) { - case 0: - testutil_check(cursor->get_value(cursor, tinfo->value)); - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "col_modify: modify row %" PRIu64, tinfo->keyno); - } + if ((ret = cursor->modify(cursor, entries, nentries)) != 0) + return (ret); + + testutil_check(cursor->get_value(cursor, tinfo->value)); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, - "%-10s{%.*s}, {%.*s}", + "%-10s%" PRIu64 ", {%.*s}", "modify", - (int)tinfo->key->size, tinfo->key->data, + tinfo->keyno, (int)tinfo->value->size, tinfo->value->data); #ifdef HAVE_BERKELEY_DB @@ -1698,24 +1638,15 @@ row_truncate(TINFO *tinfo, WT_CURSOR *cursor) testutil_check(c2->close(c2)); } + if (ret != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, session, "%-10s%" PRIu64 ", %" PRIu64, "truncate", tinfo->keyno, tinfo->last); - switch (ret) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_truncate: row 
%" PRIu64 "-%" PRIu64, - tinfo->keyno, tinfo->last); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) bdb_truncate(tinfo->keyno, tinfo->last); @@ -1724,49 +1655,6 @@ row_truncate(TINFO *tinfo, WT_CURSOR *cursor) } /* - * row_update -- - * Update a row in a row-store file. - */ -static int -row_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) -{ - WT_DECL_RET; - - if (!positioned) { - key_gen(tinfo->key, tinfo->keyno); - cursor->set_key(cursor, tinfo->key); - } - val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); - cursor->set_value(cursor, tinfo->value); - - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, cursor->session, - "%-10s{%.*s}, {%.*s}", - "put", - (int)tinfo->key->size, tinfo->key->data, - (int)tinfo->value->size, tinfo->value->data); - - switch (ret = cursor->update(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_update: update row %" PRIu64 " by key", tinfo->keyno); - } - -#ifdef HAVE_BERKELEY_DB - if (SINGLETHREADED) - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); -#endif - return (0); -} - -/* * col_truncate -- * Truncate rows in a column-store file. */ @@ -1802,6 +1690,8 @@ col_truncate(TINFO *tinfo, WT_CURSOR *cursor) ret = session->truncate(session, NULL, cursor, c2, NULL); testutil_check(c2->close(c2)); } + if (ret != 0) + return (ret); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, session, @@ -1809,21 +1699,44 @@ col_truncate(TINFO *tinfo, WT_CURSOR *cursor) "truncate", tinfo->keyno, tinfo->last); - switch (ret) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "col_truncate: row %" PRIu64 "-%" PRIu64, - tinfo->keyno, tinfo->last); +#ifdef HAVE_BERKELEY_DB + if (SINGLETHREADED) + bdb_truncate(tinfo->keyno, tinfo->last); +#endif + return (0); +} + +/* + * row_update -- + * Update a row in a row-store file. 
+ */ +static int +row_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) +{ + WT_DECL_RET; + + if (!positioned) { + key_gen(tinfo->key, tinfo->keyno); + cursor->set_key(cursor, tinfo->key); } + val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); + cursor->set_value(cursor, tinfo->value); + + if ((ret = cursor->update(cursor)) != 0) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, + "%-10s{%.*s}, {%.*s}", + "put", + (int)tinfo->key->size, tinfo->key->data, + (int)tinfo->value->size, tinfo->value->data); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) - bdb_truncate(tinfo->keyno, tinfo->last); + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); #endif return (0); } @@ -1845,6 +1758,9 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) else cursor->set_value(cursor, tinfo->value); + if ((ret = cursor->update(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) { if (g.type == FIX) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -1859,16 +1775,6 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) (char *)tinfo->value->data); } - switch (ret = cursor->update(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, "col_update: %" PRIu64, tinfo->keyno); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { key_gen(tinfo->key, tinfo->keyno); @@ -1999,6 +1905,9 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); cursor->set_value(cursor, tinfo->value); + if ((ret = cursor->insert(cursor)) != 0) + return (ret); + /* Log the operation */ if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -2007,17 +1916,6 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) (int)tinfo->key->size, tinfo->key->data, (int)tinfo->value->size, tinfo->value->data); - switch (ret = 
cursor->insert(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_insert: insert row %" PRIu64 " by key", tinfo->keyno); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) bdb_update( @@ -2041,15 +1939,10 @@ col_insert(TINFO *tinfo, WT_CURSOR *cursor) cursor->set_value(cursor, *(uint8_t *)tinfo->value->data); else cursor->set_value(cursor, tinfo->value); - switch (ret = cursor->insert(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, "cursor.insert"); - } + + if ((ret = cursor->insert(cursor)) != 0) + return (ret); + testutil_check(cursor->get_key(cursor, &tinfo->keyno)); table_append(tinfo->keyno); /* Extend the object. */ @@ -2093,23 +1986,16 @@ row_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->key); } - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); - /* We use the cursor in overwrite mode, check for existence. */ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); - switch (ret) { - case 0: - case WT_NOTFOUND: - break; - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_remove: remove %" PRIu64 " by key", tinfo->keyno); - } + + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { @@ -2134,23 +2020,16 @@ col_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) if (!positioned) cursor->set_key(cursor, tinfo->keyno); - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); - /* We use the cursor in overwrite mode, check for existence. 
*/ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); - switch (ret) { - case 0: - case WT_NOTFOUND: - break; - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "col_remove: remove %" PRIu64 " by key", tinfo->keyno); - } + + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { diff --git a/src/third_party/wiredtiger/test/suite/test_bug019.py b/src/third_party/wiredtiger/test/suite/test_bug019.py new file mode 100644 index 00000000000..202ca6b6b60 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_bug019.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, time +import wiredtiger, wttest +from wtdataset import SimpleDataSet + +# test_bug019.py +# Test that pre-allocating log files only pre-allocates a small number. +class test_bug019(wttest.WiredTigerTestCase): + conn_config = 'log=(enabled,file_max=100K)' + uri = "table:bug019" + entries = 100000 + + # Modify rows so we write log records. We're writing a lot more than a + # single log file, so we know the underlying library will churn through + # log files. + def populate(self, nentries): + c = self.session.open_cursor(self.uri, None, None) + for i in range(0, nentries): + c[i] = i + c.close() + + # Wait for a log file to be pre-allocated. Avoid timing problems, but + # assert a file is created within 30 seconds. + def prepfiles(self): + for i in range(1,30): + f = fnmatch.filter(os.listdir('.'), "*Prep*") + if f: + return f + time.sleep(1) + self.assertFalse(not f) + + # There was a bug where pre-allocated log files accumulated on + # Windows systems due to an issue with the directory list code. + def test_bug019(self): + # Create a table just to write something into the log. + self.session.create(self.uri, 'key_format=i,value_format=i') + self.populate(self.entries) + self.session.checkpoint() + + # Loop, making sure pre-allocation is working and the range is moving. + older = self.prepfiles() + for i in range(1, 10): + self.populate(self.entries) + newer = self.prepfiles() + + # Files can be returned in any order when reading a directory, older + # pre-allocated files can persist longer than newer files when newer + # files are returned first. Confirm files are being consumed. 
+ self.assertFalse(set(older) < set(newer)) + + older = newer + self.session.checkpoint() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_cursor13.py b/src/third_party/wiredtiger/test/suite/test_cursor13.py index 27884b6726c..35a841ed78d 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor13.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor13.py @@ -509,7 +509,7 @@ class test_cursor13_sweep(test_cursor13_big_base): swept = end_sweep_stats[3] - begin_sweep_stats[3] # Although this is subject to tuning parameters, we know that - # in an active sesssion, we'll sweep through minimum of 1% of + # in an active session, we'll sweep through minimum of 1% of # the cached cursors per second. We've set this test to run # 5 rounds. In 2 of the 5 rounds (sandwiched between the others), # some of the uris are allowed to close. So during the 'closing rounds' diff --git a/src/third_party/wiredtiger/test/suite/test_cursor14.py b/src/third_party/wiredtiger/test/suite/test_cursor14.py new file mode 100644 index 00000000000..25bd0cec00a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_cursor14.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. 
We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtdataset import SimpleDataSet, ComplexDataSet, ComplexLSMDataSet +from wtscenario import make_scenarios + +# test_cursor14.py +# Test that more than 64K cursors can be opened on a data source +class test_cursor14(wttest.WiredTigerTestCase): + scenarios = make_scenarios([ + ('file-r', dict(type='file:', keyfmt='r', dataset=SimpleDataSet)), + ('file-S', dict(type='file:', keyfmt='S', dataset=SimpleDataSet)), + ('lsm-S', dict(type='lsm:', keyfmt='S', dataset=SimpleDataSet)), + ('table-r', dict(type='table:', keyfmt='r', dataset=SimpleDataSet)), + ('table-S', dict(type='table:', keyfmt='S', dataset=SimpleDataSet)), + ('table-r-complex', dict(type='table:', keyfmt='r', + dataset=ComplexDataSet)), + ('table-S-complex', dict(type='table:', keyfmt='S', + dataset=ComplexDataSet)), + ('table-S-complex-lsm', dict(type='table:', keyfmt='S', + dataset=ComplexLSMDataSet)), + ]) + + def test_cursor14(self): + uri = self.type + 'cursor14' + + ds = self.dataset(self, uri, 100, key_format=self.keyfmt) + ds.populate() + + for i in xrange(66000): + cursor = self.session.open_cursor(uri, None, None) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare04.py b/src/third_party/wiredtiger/test/suite/test_prepare04.py new file mode 100644 index 00000000000..af5dd12b1e5 --- /dev/null +++ 
b/src/third_party/wiredtiger/test/suite/test_prepare04.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+# +# test_prepare04.py +# Prepare: prepare conflict with update and read operations +# + +import random +from suite_subprocess import suite_subprocess +import wiredtiger, wttest +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +class test_prepare04(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_prepare_cursor' + uri = 'table:' + tablename + before_ts = timestamp_str(150) + prepare_ts = timestamp_str(200) + after_ts = timestamp_str(250) + + types = [ + ('col', dict(extra_config=',log=(enabled=false),key_format=r')), + ('lsm', dict(extra_config=',log=(enabled=false),type=lsm')), + ('row', dict(extra_config=',log=(enabled=false)')), + ] + + # Various begin_transaction config + txncfg = [ + ('before_ts', dict(txn_config='isolation=snapshot,read_timestamp=' + before_ts, after_ts=False)), + ('after_ts', dict(txn_config='isolation=snapshot,read_timestamp=' + after_ts, after_ts=True)), + ('no_ts', dict(txn_config='isolation=snapshot', after_ts=True)), + ] + + preparecfg = [ + ('ignore_false', dict(ignore_config=',ignore_prepare=false', ignore=False)), + ('ignore_true', dict(ignore_config=',ignore_prepare=true', ignore=True)), + ] + conn_config = 'log=(enabled)' + + scenarios = make_scenarios(types, txncfg, preparecfg) + + def test_prepare_conflict(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + self.session.create(self.uri, + 'key_format=i,value_format=i' + self.extra_config) + c = self.session.open_cursor(self.uri) + + # Insert keys 1..100 each with timestamp=key, in some order + orig_keys = range(1, 101) + keys = orig_keys[:] + random.shuffle(keys) + + k = 1 + self.session.begin_transaction() + c[k] = 1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(100)) + + # Everything up to and including timestamp 100 has been committed. 
+ self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100)) + + # Bump the oldest timestamp, we're not going back... + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(100)) + + # make prepared updates. + k = 1 + self.session.begin_transaction('isolation=snapshot') + c.set_key(1) + c.set_value(2) + c.update() + self.session.prepare_transaction('prepare_timestamp=' + self.prepare_ts) + conflictmsg = '/conflict between concurrent operations/' + preparemsg = '/conflict with a prepared update/' + + #''' + # Verify data visibility from a different session/transaction. + s_other = self.conn.open_session() + c_other = s_other.open_cursor(self.uri, None) + s_other.begin_transaction(self.txn_config + self.ignore_config) + c_other.set_key(1) + if self.ignore == False and self.after_ts == True: + self.assertRaises(wiredtiger.WiredTigerError, lambda:c_other.search()) + else: + c_other.search() + self.assertTrue(c_other.get_value() == 1) + c_other.set_value(3) + self.assertRaises(wiredtiger.WiredTigerError, lambda:c_other.update()) + s_other.commit_transaction() + #''' + + self.session.commit_transaction('commit_timestamp=' + timestamp_str(300)) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index 48ec7fac9a6..83ed4e904a6 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -32,6 +32,7 @@ from suite_subprocess import suite_subprocess import wiredtiger, wttest +from wiredtiger import stat from wtscenario import make_scenarios def timestamp_str(t): @@ -98,7 +99,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): def ConnectionOpen(self, cacheSize): self.home = '.' 
- conn_params = 'create,' + \ + conn_params = 'create,statistics=(fast),' + \ cacheSize + ',error_prefix="%s" %s' % (self.shortid(), self.conn_config) try: self.conn = wiredtiger.wiredtiger_open(self.home, conn_params) @@ -164,6 +165,12 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): stable_ts = timestamp_str(key_range / 2) self.conn.set_timestamp('stable_timestamp=' + stable_ts) self.conn.rollback_to_stable() + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rollback_to_stable][2] + upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2] + stat_cursor.close() + self.assertEqual(calls, 1) + self.assertTrue(upd_aborted >= key_range/2) # Check that we see the inserted value (i.e. 1) for all the keys in # non-timestamp tables. @@ -224,9 +231,20 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Scenario: 4 # Advance the stable_timestamp by a quarter range and rollback. # Three-fourths of the later timestamps will be rolled back. - stable_ts = timestamp_str(key_range + key_range / 4) + rolled_range = key_range + key_range / 4 + stable_ts = timestamp_str(rolled_range) self.conn.set_timestamp('stable_timestamp=' + stable_ts) self.conn.rollback_to_stable() + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rollback_to_stable][2] + upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2] + stat_cursor.close() + self.assertEqual(calls, 2) + # + # We rolled back half on the earlier call and now three-quarters on + # this call, which is one and one quarter of all keys rolled back. + # + self.assertTrue(upd_aborted >= rolled_range) # Check that we see the updated value (i.e. 2) for all the keys in # non-timestamped tables. 
diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp10.py b/src/third_party/wiredtiger/test/suite/test_timestamp10.py index a798f5ff355..02b22e6afbe 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp10.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp10.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. # # test_timestamp10.py -# Timestamps: Saving and querying the checkpoint recovery timestamp +# Timestamps: Saving and querying the last checkpoint and recovery timestamps # import fnmatch, os, shutil @@ -101,6 +101,8 @@ class test_timestamp10(wttest.WiredTigerTestCase, suite_subprocess): ',stable_timestamp=' + timestamp_str(ts)) # This forces a different checkpoint timestamp for each table. self.session.checkpoint() + q = self.conn.query_timestamp('get=last_checkpoint') + self.assertTimestampsEqual(q, timestamp_str(ts)) # Copy to a new database and then recover. self.copy_dir(".", "RESTART") |