diff options
31 files changed, 759 insertions, 438 deletions
diff --git a/src/third_party/wiredtiger/dist/api_data.py b/src/third_party/wiredtiger/dist/api_data.py index 17fe0d97735..f5e0b4a67a3 100644 --- a/src/third_party/wiredtiger/dist/api_data.py +++ b/src/third_party/wiredtiger/dist/api_data.py @@ -1371,7 +1371,8 @@ methods = { \c oldest_timestamp and the read timestamps of all active readers, and \c stable returns the most recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. See @ref transaction_timestamps''', - choices=['all_committed','oldest','pinned','recovery','stable']), + choices=['all_committed','last_checkpoint', + 'oldest','pinned','recovery','stable']), ]), 'WT_CONNECTION.set_timestamp' : Method([ diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index a630ebe3fa9..1441187812e 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -518,6 +518,9 @@ connection_stats = [ TxnStat('txn_read_queue_inserts', 'read timestamp queue inserts total'), TxnStat('txn_read_queue_len', 'read timestamp queue length'), TxnStat('txn_rollback', 'transactions rolled back'), + TxnStat('txn_rollback_las_removed', 'rollback to stable updates removed from lookaside'), + TxnStat('txn_rollback_to_stable', 'rollback to stable calls'), + TxnStat('txn_rollback_upd_aborted', 'rollback to stable updates aborted'), TxnStat('txn_set_ts', 'set timestamp calls'), TxnStat('txn_set_ts_commit', 'set timestamp commit calls'), TxnStat('txn_set_ts_commit_upd', 'set timestamp commit updates'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 16374d1697a..53642b757b5 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "60a06941b8d5d7ddc9f93646e75fc4b52d40f9b4", + "commit": "b33708d7d9b2971cda05e71fcba6067b230b97cc", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.8" diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 6737af9996b..63d2cda4714 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -54,12 +54,17 @@ __cursor_fix_append_next(WT_CURSOR_BTREE *cbt, bool newpage) * insert is aborted, we simply return zero (empty), regardless of * whether we are at the end of the data. */ - if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || - (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { + if (cbt->recno < WT_INSERT_RECNO(cbt->ins)) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else - cbt->iface.value.data = upd->data; + } else { + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) { + cbt->v = 0; + cbt->iface.value.data = &cbt->v; + } else + cbt->iface.value.data = upd->data; + } cbt->iface.value.size = 1; return (0); } @@ -79,6 +84,7 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage) session = (WT_SESSION_IMPL *)cbt->iface.session; btree = S2BT(session); page = cbt->ref->page; + upd = NULL; /* Initialize for each new page. */ if (newpage) { @@ -101,7 +107,8 @@ new_page: cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; @@ -134,7 +141,8 @@ new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); - if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -193,8 +201,9 @@ new_page: /* Find the matching WT_COL slot. */ /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = cbt->ins == NULL ? - NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd != NULL) { if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -311,7 +320,8 @@ __cursor_row_next(WT_CURSOR_BTREE *cbt, bool newpage) cbt->ins = WT_SKIP_NEXT(cbt->ins); new_insert: if ((ins = cbt->ins) != NULL) { - if ((upd = __wt_txn_read(session, ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -344,7 +354,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; - upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd)); if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) @@ -571,8 +581,9 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; + WT_UPDATE *upd; uint32_t flags; - bool newpage; + bool newpage, valid; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -582,6 +593,26 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* + * In case of retrying a next operation due to a prepare conflict, + * cursor would have been already positioned at an update structure + * which resulted in conflict. So, now when retrying we should examine + * the same update again instead of starting from the next one in the + * update chain. + */ + F_CLR(cbt, WT_CBT_RETRY_PREV); + if (F_ISSET(cbt, WT_CBT_RETRY_NEXT)) { + WT_RET(__wt_cursor_valid(cbt, &upd, &valid)); + F_CLR(cbt, WT_CBT_RETRY_NEXT); + if (valid) { + /* + * If the update, which returned prepared conflict is + * visible, return the value. + */ + return (__cursor_kv_return(session, cbt, upd)); + } + } + WT_RET(__cursor_func_init(cbt, false)); /* @@ -663,15 +694,24 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } - #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif - if (ret == 0) +err: switch (ret) { + case 0: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - -err: if (ret != 0) + break; + case WT_PREPARE_CONFLICT: + /* + * If prepare conflict occurs, cursor should not be reset, + * as current cursor position will be reused in case of a + * retry from user. + */ + F_SET(cbt, WT_CBT_RETRY_NEXT); + break; + default: WT_TRET(__cursor_reset(cbt)); + } return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 068a9915ab9..3356baeb24a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -199,13 +199,18 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) * created records written by reconciliation are deleted and so can be * never seen by a read. */ - if (cbt->ins == NULL || - cbt->recno > WT_INSERT_RECNO(cbt->ins) || - (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { + if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins)) { cbt->v = 0; cbt->iface.value.data = &cbt->v; - } else - cbt->iface.value.data = upd->data; + } else { + upd = NULL; + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) { + cbt->v = 0; + cbt->iface.value.data = &cbt->v; + } else + cbt->iface.value.data = upd->data; + } cbt->iface.value.size = 1; return (0); } @@ -247,7 +252,9 @@ new_page: cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno); if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) cbt->ins = NULL; - upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd == NULL) { cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); cbt->iface.value.data = &cbt->v; @@ -280,7 +287,8 @@ new_page: if (cbt->ins == NULL) return (WT_NOTFOUND); __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins)); - if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -340,8 +348,9 @@ new_page: if (cbt->recno < cbt->ref->ref_recno) /* Check any insert list for a matching record. */ cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot); cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno); - upd = cbt->ins == NULL ? - NULL : __wt_txn_read(session, cbt->ins->upd); + upd = NULL; + if (cbt->ins != NULL) + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); if (upd != NULL) { if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -468,7 +477,8 @@ __cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage) WT_RET(__cursor_skip_prev(cbt)); new_insert: if ((ins = cbt->ins) != NULL) { - if ((upd = __wt_txn_read(session, ins->upd)) == NULL) + WT_RET(__wt_txn_read(session, ins->upd, &upd)); + if (upd == NULL) continue; if (upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && @@ -503,7 +513,7 @@ new_insert: if ((ins = cbt->ins) != NULL) { cbt->slot = cbt->row_iteration_slot / 2 - 1; rip = &page->pg_row[cbt->slot]; - upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip)); + WT_RET(__wt_txn_read(session, WT_ROW_UPDATE(page, rip), &upd)); if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) { if (upd->txnid != WT_TXN_NONE && __wt_txn_upd_visible_all(session, upd)) @@ -526,8 +536,9 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; + WT_UPDATE *upd; uint32_t flags; - bool newpage; + bool newpage, valid; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; @@ -537,6 +548,26 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* + * In case of retrying a prev operation due to a prepare conflict, + * cursor would have been already positioned at an update structure + * which resulted in conflict. So, now when retrying we should examine + * the same update again instead of starting from the next one in the + * update chain. + */ + F_CLR(cbt, WT_CBT_RETRY_NEXT); + if (F_ISSET(cbt, WT_CBT_RETRY_PREV)) { + WT_RET(__wt_cursor_valid(cbt, &upd, &valid)); + F_CLR(cbt, WT_CBT_RETRY_PREV); + if (valid) { + /* + * If the update, which returned prepared conflict is + * visible, return the value. + */ + return (__cursor_kv_return(session, cbt, upd)); + } + } + WT_RET(__cursor_func_init(cbt, false)); /* @@ -622,10 +653,20 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, false)); #endif - if (ret == 0) +err: switch (ret) { + case 0: F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - -err: if (ret != 0) + break; + case WT_PREPARE_CONFLICT: + /* + * If prepare conflict occurs, cursor should not be reset, + * as current cursor position will be reused in case of a + * retry from user. + */ + F_SET(cbt, WT_CBT_RETRY_PREV); + break; + default: WT_TRET(__cursor_reset(cbt)); + } return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 29725e22b2c..bf535896c73 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -205,8 +205,8 @@ __cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt) * __wt_cursor_valid -- * Return if the cursor references an valid key/value pair. */ -bool -__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) +int +__wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid) { WT_BTREE *btree; WT_CELL *cell; @@ -215,11 +215,12 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_SESSION_IMPL *session; WT_UPDATE *upd; + if (updp != NULL) + *updp = NULL; + *valid = false; btree = cbt->btree; page = cbt->ref->page; session = (WT_SESSION_IMPL *)cbt->iface.session; - if (updp != NULL) - *updp = NULL; /* * We may be pointing to an insert object, and we may have a page with @@ -265,13 +266,16 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * First, check for an insert object with a visible update (a visible * update that's been deleted is not a valid key/value pair). */ - if (cbt->ins != NULL && - (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) - return (false); - if (updp != NULL) - *updp = upd; - return (true); + if (cbt->ins != NULL) { + WT_RET(__wt_txn_read(session, cbt->ins->upd, &upd)); + if (upd != NULL) { + if (upd->type == WT_UPDATE_TOMBSTONE) + return (0); + if (updp != NULL) + *updp = upd; + *valid = true; + return (0); + } } /* @@ -290,7 +294,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * keys, check for retrieval past the end of the page. */ if (cbt->recno >= cbt->ref->ref_recno + page->entries) - return (false); + return (0); /* * An update would have appeared as an "insert" object; no @@ -300,7 +304,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) case BTREE_COL_VAR: /* The search function doesn't check for empty pages. */ if (page->entries == 0) - return (false); + return (0); WT_ASSERT(session, cbt->slot < page->entries); /* @@ -309,7 +313,7 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * returned on-page object must be checked for a match. */ if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) - return (false); + return (0); /* * Although updates would have appeared as an "insert" objects, @@ -320,12 +324,12 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) cip = &page->pg_var[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || __wt_cell_type(cell) == WT_CELL_DEL) - return (false); + return (0); break; case BTREE_ROW: /* The search function doesn't check for empty pages. */ if (page->entries == 0) - return (false); + return (0); WT_ASSERT(session, cbt->slot < page->entries); /* @@ -333,34 +337,23 @@ __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * key as an on-page object, we're done. */ if (cbt->ins != NULL) - return (false); + return (0); /* Check for an update. */ if (page->modify != NULL && - page->modify->mod_row_update != NULL && - (upd = __wt_txn_read(session, - page->modify->mod_row_update[cbt->slot])) != NULL) { - if (upd->type == WT_UPDATE_TOMBSTONE) - return (false); - if (updp != NULL) - *updp = upd; + page->modify->mod_row_update != NULL) { + WT_RET(__wt_txn_read(session, + page->modify->mod_row_update[cbt->slot], &upd)); + if (upd != NULL) { + if (upd->type == WT_UPDATE_TOMBSTONE) + return (0); + if (updp != NULL) + *updp = upd; + } } break; } - return (true); -} - -/* - * __cursor_kv_return -- - * Return a page referenced key/value pair to the application. - */ -static inline int -__cursor_kv_return( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) -{ - WT_RET(__wt_key_return(session, cbt)); - WT_RET(__wt_value_return(session, cbt, upd)); - + *valid = true; return (0); } @@ -512,7 +505,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) : __cursor_col_search(session, cbt, cbt->ref)); - valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); + + /* Return, if prepare conflict encountered. */ + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); @@ -520,7 +516,10 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, false) : __cursor_col_search(session, cbt, NULL)); - valid = cbt->compare == 0 && __wt_cursor_valid(cbt, &upd); + + /* Return, if prepare conflict encountered. */ + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (valid) @@ -618,14 +617,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) * Ignore those cases, it makes things too complicated. */ if (cbt->slot != 0 && cbt->slot != cbt->ref->page->entries - 1) - valid = __wt_cursor_valid(cbt, &upd); + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } if (!valid) { WT_ERR(__cursor_func_init(cbt, true)); WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - valid = __wt_cursor_valid(cbt, &upd); + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); } /* @@ -656,9 +655,10 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) exact = 0; F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); - } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) + } else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND) { + WT_ERR(ret); exact = 1; - else { + } else { /* * The cursor next call may have overwritten our caller's key, * restore it to its original value. @@ -669,11 +669,14 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, NULL, true) : __cursor_col_search(session, cbt, NULL)); - if (__wt_cursor_valid(cbt, &upd)) { + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); + if (valid) { exact = cbt->compare; ret = __cursor_kv_return(session, cbt, upd); - } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) + } else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND) { + WT_ERR(ret); exact = -1; + } } err: if (ret == 0 && exactp != NULL) @@ -703,7 +706,7 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - bool append_key; + bool append_key, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -784,8 +787,11 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); * key/value pair. */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) - WT_ERR(WT_DUPLICATE_KEY); + cbt->compare == 0) { + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (valid) + WT_ERR(WT_DUPLICATE_KEY); + } ret = __cursor_row_modify(session, cbt, WT_UPDATE_STANDARD); } else { @@ -805,10 +811,14 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); * column-store implicitly fills the gap with empty records. * Fail in that case, the record exists. */ - if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) && - ((cbt->compare == 0 && __wt_cursor_valid(cbt, NULL)) || - (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) - WT_ERR(WT_DUPLICATE_KEY); + if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { + if (cbt->compare == 0) { + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (valid) + WT_ERR(WT_DUPLICATE_KEY); + } else if (__cursor_fix_implicit(btree, cbt)) + WT_ERR(WT_DUPLICATE_KEY); + } WT_ERR(__cursor_col_modify(session, cbt, WT_UPDATE_STANDARD)); @@ -932,7 +942,7 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; - bool iterating; + bool iterating, valid; btree = cbt->btree; cursor = &cbt->iface; @@ -1028,7 +1038,10 @@ retry: if (positioned == POSITIONED) /* Check whether an update would conflict. */ WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + if (cbt->compare != 0) + WT_ERR(WT_NOTFOUND); + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (!valid) WT_ERR(WT_NOTFOUND); ret = __cursor_row_modify(session, cbt, WT_UPDATE_TOMBSTONE); @@ -1043,7 +1056,10 @@ retry: if (positioned == POSITIONED) WT_ERR(__curfile_update_check(cbt)); /* Remove the record if it exists. */ - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) { + valid = false; + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (cbt->compare != 0 || !valid) { if (!__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); /* @@ -1143,6 +1159,7 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type) WT_CURSOR *cursor; WT_DECL_RET; WT_SESSION_IMPL *session; + bool valid; btree = cbt->btree; cursor = &cbt->iface; @@ -1207,7 +1224,10 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if (cbt->compare != 0 || !__wt_cursor_valid(cbt, NULL)) + if (cbt->compare != 0) + WT_ERR(WT_NOTFOUND); + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if (!valid) WT_ERR(WT_NOTFOUND); } ret = __cursor_row_modify_v(session, cbt, value, modify_type); @@ -1224,8 +1244,10 @@ retry: WT_ERR(__cursor_func_init(cbt, true)); */ if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) { WT_ERR(__curfile_update_check(cbt)); - if ((cbt->compare != 0 || - !__wt_cursor_valid(cbt, NULL)) && + valid = false; + if (cbt->compare == 0) + WT_ERR(__wt_cursor_valid(cbt, NULL, &valid)); + if ((cbt->compare != 0 || !valid) && !__cursor_fix_implicit(btree, cbt)) WT_ERR(WT_NOTFOUND); } diff --git a/src/third_party/wiredtiger/src/btree/bt_random.c b/src/third_party/wiredtiger/src/btree/bt_random.c index 03b5039b00b..8eb120f06ec 100644 --- a/src/third_party/wiredtiger/src/btree/bt_random.c +++ b/src/third_party/wiredtiger/src/btree/bt_random.c @@ -302,6 +302,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt) WT_UPDATE *upd; wt_off_t size; uint64_t n, skip; + bool valid; btree = cbt->btree; cursor = &cbt->iface; @@ -421,7 +422,8 @@ random_page_entry: * the next entry, if that doesn't work, move to the previous entry. */ WT_ERR(__wt_row_random_leaf(session, cbt)); - if (__wt_cursor_valid(cbt, &upd)) { + WT_ERR(__wt_cursor_valid(cbt, &upd, &valid)); + if (valid) { WT_ERR(__wt_key_return(session, cbt)); WT_ERR(__wt_value_return(session, cbt, upd)); } else { diff --git a/src/third_party/wiredtiger/src/config/config_def.c b/src/third_party/wiredtiger/src/config/config_def.c index f473cfe3e8d..ffcb2139330 100644 --- a/src/third_party/wiredtiger/src/config/config_def.c +++ b/src/third_party/wiredtiger/src/config/config_def.c @@ -46,8 +46,8 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_open_session[] = { static const WT_CONFIG_CHECK confchk_WT_CONNECTION_query_timestamp[] = { { "get", "string", - NULL, "choices=[\"all_committed\",\"oldest\",\"pinned\"," - "\"recovery\",\"stable\"]", + NULL, "choices=[\"all_committed\",\"last_checkpoint\",\"oldest\"" + ",\"pinned\",\"recovery\",\"stable\"]", NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index afefbe8ad5c..ff757c4bfb5 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -68,6 +68,7 @@ if ((ret) != 0 && \ (ret) != WT_NOTFOUND && \ (ret) != WT_DUPLICATE_KEY && \ + (ret) != WT_PREPARE_CONFLICT && \ F_ISSET(&(s)->txn, WT_TXN_RUNNING)) \ F_SET(&(s)->txn, WT_TXN_ERROR); \ /* \ @@ -237,6 +238,7 @@ JOINABLE_CURSOR_CALL_CHECK(cur) #define CURSOR_UPDATE_API_END(s, ret) \ + ((ret == WT_PREPARE_CONFLICT) ? (ret = WT_ROLLBACK) : ret ); \ TXN_API_END(s, ret) #define ASYNCOP_API_CALL(conn, s, n) \ diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index 7ba73d1b94f..d56a3773933 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -997,7 +997,7 @@ struct __wt_update { finalized prepare */ #define WT_UPDATE_STATE_LOCKED 1 /* locked */ #define WT_UPDATE_STATE_PREPARED 2 /* prepared */ - uint8_t state; /* state (one byte : conserve memory) */ + volatile uint8_t state; /* If the update includes a complete value. */ #define WT_UPDATE_DATA_VALUE(upd) \ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 70f9318f6d7..ec5c6689c3f 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -217,20 +217,23 @@ struct __wt_cursor_btree { #endif /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_CBT_ACTIVE 0x01u /* Active in the tree */ -#define WT_CBT_ITERATE_APPEND 0x02u /* Col-store: iterating append list */ -#define WT_CBT_ITERATE_NEXT 0x04u /* Next iteration configuration */ -#define WT_CBT_ITERATE_PREV 0x08u /* Prev iteration configuration */ -#define WT_CBT_NO_TXN 0x10u /* Non-txn cursor (e.g. a checkpoint) */ -#define WT_CBT_SEARCH_SMALLEST 0x20u /* Row-store: small-key insert list */ -#define WT_CBT_VAR_ONPAGE_MATCH 0x40u /* Var-store: on-page recno match */ +#define WT_CBT_ACTIVE 0x001u /* Active in the tree */ +#define WT_CBT_ITERATE_APPEND 0x002u /* Col-store: iterating append list */ +#define WT_CBT_ITERATE_NEXT 0x004u /* Next iteration configuration */ +#define WT_CBT_ITERATE_PREV 0x008u /* Prev iteration configuration */ +#define WT_CBT_NO_TXN 0x010u /* Non-txn cursor (e.g. a checkpoint) */ +#define WT_CBT_RETRY_NEXT 0x020u /* Next, resulted in prepare conflict */ +#define WT_CBT_RETRY_PREV 0x040u /* Prev, resulted in prepare conflict */ +#define WT_CBT_SEARCH_SMALLEST 0x080u /* Row-store: small-key insert list */ +#define WT_CBT_VAR_ONPAGE_MATCH 0x100u /* Var-store: on-page recno match */ /* AUTOMATIC FLAG VALUE GENERATION STOP */ #define WT_CBT_POSITION_MASK /* Flags associated with position */ \ (WT_CBT_ITERATE_APPEND | WT_CBT_ITERATE_NEXT | WT_CBT_ITERATE_PREV | \ - WT_CBT_SEARCH_SMALLEST | WT_CBT_VAR_ONPAGE_MATCH) + WT_CBT_RETRY_NEXT | WT_CBT_RETRY_PREV | WT_CBT_SEARCH_SMALLEST | \ + WT_CBT_VAR_ONPAGE_MATCH) - uint8_t flags; + uint32_t flags; }; struct __wt_cursor_bulk { diff --git a/src/third_party/wiredtiger/src/include/cursor.i b/src/third_party/wiredtiger/src/include/cursor.i index a4e986c4325..d338c47dfae 100644 --- a/src/third_party/wiredtiger/src/include/cursor.i +++ b/src/third_party/wiredtiger/src/include/cursor.i @@ -311,6 +311,20 @@ __wt_cursor_dhandle_decr_use(WT_SESSION_IMPL *session) } /* + * __cursor_kv_return -- + * Return a page referenced key/value pair to the application. + */ +static inline int +__cursor_kv_return( + WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) +{ + WT_RET(__wt_key_return(session, cbt)); + WT_RET(__wt_value_return(session, cbt, upd)); + + return (0); +} + +/* * __cursor_func_init -- * Cursor call setup. */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 8b69f9ef244..508480b95c2 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -100,7 +100,7 @@ extern void __wt_cursor_key_order_reset(WT_CURSOR_BTREE *cbt); extern void __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt); extern int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); -extern bool __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp); +extern int __wt_cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp, bool *valid) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_search(WT_CURSOR_BTREE *cbt) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/mutex.h b/src/third_party/wiredtiger/src/include/mutex.h index 2a3fc7448f8..ba32d166f03 100644 --- a/src/third_party/wiredtiger/src/include/mutex.h +++ b/src/third_party/wiredtiger/src/include/mutex.h @@ -44,9 +44,8 @@ struct __wt_rwlock { /* Read/write lock */ uint8_t current; /* Current ticket */ uint8_t next; /* Next available ticket */ uint8_t reader; /* Read queue ticket */ - uint8_t __notused; /* Padding */ - uint16_t readers_active;/* Count of active readers */ - uint16_t readers_queued;/* Count of queued readers */ + uint8_t readers_queued; /* Count of queued readers */ + uint32_t readers_active;/* Count of active readers */ } s; } u; diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 7ef63cb0eaf..01a982b8602 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -627,6 +627,9 @@ struct __wt_connection_stats { int64_t txn_read_queue_head; int64_t txn_read_queue_inserts; int64_t txn_read_queue_len; + int64_t txn_rollback_to_stable; + int64_t txn_rollback_upd_aborted; + int64_t txn_rollback_las_removed; int64_t txn_set_ts; int64_t txn_set_ts_commit; int64_t txn_set_ts_commit_upd; diff --git a/src/third_party/wiredtiger/src/include/txn.h b/src/third_party/wiredtiger/src/include/txn.h index dd7f5d4a8bc..19e0be2d695 100644 --- a/src/third_party/wiredtiger/src/include/txn.h +++ b/src/third_party/wiredtiger/src/include/txn.h @@ -102,6 +102,7 @@ struct __wt_txn_global { volatile uint64_t oldest_id; WT_DECL_TIMESTAMP(commit_timestamp) + WT_DECL_TIMESTAMP(last_ckpt_timestamp) WT_DECL_TIMESTAMP(oldest_timestamp) WT_DECL_TIMESTAMP(pinned_timestamp) WT_DECL_TIMESTAMP(recovery_timestamp) diff --git a/src/third_party/wiredtiger/src/include/txn.i b/src/third_party/wiredtiger/src/include/txn.i index 5fcf8ee11c9..36cac1a26f3 100644 --- a/src/third_party/wiredtiger/src/include/txn.i +++ b/src/third_party/wiredtiger/src/include/txn.i @@ -9,6 +9,11 @@ static inline int __wt_txn_id_check(WT_SESSION_IMPL *session); static inline void __wt_txn_read_last(WT_SESSION_IMPL *session); +typedef enum { + WT_VISIBLE_FALSE=0, /* Not a visible update */ + WT_VISIBLE_PREPARE=1, /* Prepared update */ + WT_VISIBLE_TRUE=2 /* A visible update */ +} WT_VISIBLE_TYPE; #ifdef HAVE_TIMESTAMPS /* * __wt_txn_timestamp_flags -- @@ -539,33 +544,74 @@ __wt_txn_visible( } /* + * __wt_txn_upd_visible_type -- + * Visible type of given update for the current transaction. + */ +static inline WT_VISIBLE_TYPE +__wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) +{ + uint8_t upd_state; + bool upd_visible; + + for (;;__wt_yield()) { + /* Commit is in progress, yield and try again. */ + if ((upd_state = upd->state) == WT_UPDATE_STATE_LOCKED) + continue; + + upd_visible = __wt_txn_visible( + session, upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp)); + + /* + * The visibility check is only valid if the update does not + * change state. If the state does change, recheck visibility. + */ + if (upd->state == upd_state) + break; + } + + if (!upd_visible) + return (WT_VISIBLE_FALSE); + + if (upd_state == WT_UPDATE_STATE_PREPARED) + return (F_ISSET(&session->txn, WT_TXN_IGNORE_PREPARE) ? + WT_VISIBLE_FALSE : WT_VISIBLE_PREPARE); + + return (WT_VISIBLE_TRUE); +} + +/* * __wt_txn_upd_visible -- * Can the current transaction see the given update. */ static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) { - return (__wt_txn_visible(session, - upd->txnid, WT_TIMESTAMP_NULL(&upd->timestamp))); + return (__wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE); } /* * __wt_txn_read -- * Get the first visible update in a list (or NULL if none are visible). */ -static inline WT_UPDATE * -__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) +static inline int +__wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_UPDATE **updp) { static WT_UPDATE tombstone = { .txnid = WT_TXN_NONE, .type = WT_UPDATE_TOMBSTONE }; + WT_VISIBLE_TYPE upd_visible; bool skipped_birthmark; + *updp = NULL; for (skipped_birthmark = false; upd != NULL; upd = upd->next) { /* Skip reserved place-holders, they're never visible. */ - if (upd->type != WT_UPDATE_RESERVE && - __wt_txn_upd_visible(session, upd)) - break; + if (upd->type != WT_UPDATE_RESERVE) { + upd_visible = __wt_txn_upd_visible_type(session, upd); + if (upd_visible == WT_VISIBLE_TRUE) + break; + if (upd_visible == WT_VISIBLE_PREPARE) + return (WT_PREPARE_CONFLICT); + } /* An invisible birthmark is equivalent to a tombstone. */ if (upd->type == WT_UPDATE_BIRTHMARK) skipped_birthmark = true; @@ -574,7 +620,8 @@ __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) if (upd == NULL && skipped_birthmark) upd = &tombstone; - return (upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? NULL : upd); + *updp = (upd == NULL || upd->type == WT_UPDATE_BIRTHMARK ? NULL : upd); + return (0); } /* @@ -786,21 +833,32 @@ static inline int __wt_txn_update_check(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_TXN *txn; + bool ignore_prepare_set; txn = &session->txn; - if (txn->isolation == WT_ISO_SNAPSHOT) - while (upd != NULL && !__wt_txn_upd_visible(session, upd)) { - if (upd->txnid != WT_TXN_ABORTED) { - WT_STAT_CONN_INCR( - session, txn_update_conflict); - WT_STAT_DATA_INCR( - session, txn_update_conflict); - return (__wt_txn_rollback_required(session, + if (txn->isolation != WT_ISO_SNAPSHOT) + return (0); + + /* + * Clear the ignore prepare setting of txn, as it is not supposed, to + * affect the visibility for update operations. + */ + ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE); + F_CLR(txn, WT_TXN_IGNORE_PREPARE); + for (;upd != NULL && !__wt_txn_upd_visible(session, upd); + upd = upd->next) { + if (upd->txnid != WT_TXN_ABORTED) { + if (ignore_prepare_set) + F_SET(txn, WT_TXN_IGNORE_PREPARE); + WT_STAT_CONN_INCR(session, txn_update_conflict); + WT_STAT_DATA_INCR(session, txn_update_conflict); + return (__wt_txn_rollback_required(session, "conflict between concurrent operations")); - } - upd = upd->next; } + } + if (ignore_prepare_set) + F_SET(txn, WT_TXN_IGNORE_PREPARE); return (0); } diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a75c22497ce..1f2a438b8e9 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -2431,8 +2431,9 @@ struct __wt_connection { * timestamps of all active readers\, and \c stable returns the most * recent \c stable_timestamp set with WT_CONNECTION::set_timestamp. * See @ref transaction_timestamps., a string\, chosen from the - * following options: \c "all_committed"\, \c "oldest"\, \c "pinned"\, - * \c "recovery"\, \c "stable"; default \c all_committed.} + * following options: \c "all_committed"\, \c "last_checkpoint"\, \c + * "oldest"\, \c "pinned"\, \c "recovery"\, \c "stable"; default \c + * all_committed.} * @configend * @errors * If there is no matching timestamp (e.g., if this method is called @@ -5562,81 +5563,87 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_READ_QUEUE_INSERTS 1312 /*! transaction: read timestamp queue length */ #define WT_STAT_CONN_TXN_READ_QUEUE_LEN 1313 +/*! transaction: rollback to stable calls */ +#define WT_STAT_CONN_TXN_ROLLBACK_TO_STABLE 1314 +/*! transaction: rollback to stable updates aborted */ +#define WT_STAT_CONN_TXN_ROLLBACK_UPD_ABORTED 1315 +/*! transaction: rollback to stable updates removed from lookaside */ +#define WT_STAT_CONN_TXN_ROLLBACK_LAS_REMOVED 1316 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1314 +#define WT_STAT_CONN_TXN_SET_TS 1317 /*! transaction: set timestamp commit calls */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1315 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT 1318 /*! transaction: set timestamp commit updates */ -#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1316 +#define WT_STAT_CONN_TXN_SET_TS_COMMIT_UPD 1319 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1317 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1320 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1318 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1321 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1319 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1322 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1320 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1323 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1321 +#define WT_STAT_CONN_TXN_BEGIN 1324 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1322 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1325 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1323 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1326 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1324 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1327 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1325 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1328 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1326 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1329 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1327 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1330 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1328 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1331 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1329 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1332 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1330 +#define WT_STAT_CONN_TXN_CHECKPOINT 1333 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1331 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1334 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1332 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1335 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1333 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1336 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1334 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1337 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1335 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1338 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1336 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1339 /*! * transaction: transaction range of IDs currently pinned by named * snapshots */ -#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1337 +#define WT_STAT_CONN_TXN_PINNED_SNAPSHOT_RANGE 1340 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1338 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1341 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1339 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1342 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1340 +#define WT_STAT_CONN_TXN_SYNC 1343 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1341 +#define WT_STAT_CONN_TXN_COMMIT 1344 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1342 +#define WT_STAT_CONN_TXN_ROLLBACK 1345 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1343 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1346 /*! * @} diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index f4d0fc0b1ef..a2ce2c136b5 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -1341,12 +1341,14 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, * started. The global commit point can move forward during * reconciliation so we use a cached copy to avoid races when a * concurrent transaction commits or rolls back while we are - * examining its updates. + * examining its updates. As prepared transaction id's are + * globally visible, need to check the update state as well. */ if (F_ISSET(r, WT_REC_EVICT) && + (upd->state != WT_UPDATE_STATE_READY || (F_ISSET(r, WT_REC_VISIBLE_ALL) ? WT_TXNID_LE(r->last_running, txnid) : - !__txn_visible_id(session, txnid))) { + !__txn_visible_id(session, txnid)))) { uncommitted = r->update_uncommitted = true; continue; } diff --git a/src/third_party/wiredtiger/src/support/mtx_rw.c b/src/third_party/wiredtiger/src/support/mtx_rw.c index 572592b9fbc..b554f589b3d 100644 --- a/src/third_party/wiredtiger/src/support/mtx_rw.c +++ b/src/third_party/wiredtiger/src/support/mtx_rw.c @@ -48,9 +48,8 @@ * uint8_t current; // Current ticket * uint8_t next; // Next available ticket * uint8_t reader; // Read queue ticket - * uint8_t __notused; // Padding - * uint16_t readers_active; // Count of active readers - * uint16_t readers_queued; // Count of queued readers + * uint8_t readers_queued; // Count of queued readers + * uint32_t readers_active; // Count of active readers * } s; * } u; * @@ -75,6 +74,12 @@ * 'reader' to 'next' (i.e. readers are scheduled after any queued writers, * avoiding starvation), then atomically incrementing 'readers_queued'. * + * We limit how many readers can queue: we don't allow more readers to queue + * than there are active writers (calculated as `next - current`): otherwise, + * in write-heavy workloads, readers can keep queuing up in front of writers + * and throughput is unstable. The remaining read requests wait without any + * ordering. + * * The 'next' field is a 1-byte value so the available ticket number wraps * after 256 requests. If a thread's write lock request would cause the 'next' * field to catch up with 'current', instead it waits to avoid the same ticket diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 40a07be0174..ae13f7d8abe 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1052,6 +1052,9 @@ static const char * const __stats_connection_desc[] = { "transaction: read timestamp queue inserts to head", "transaction: read timestamp queue inserts total", "transaction: read timestamp queue length", + "transaction: rollback to stable calls", + "transaction: rollback to stable updates aborted", + "transaction: rollback to stable updates removed from lookaside", "transaction: set timestamp calls", "transaction: set timestamp commit calls", "transaction: set timestamp commit updates", @@ -1438,6 +1441,9 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_read_queue_head = 0; stats->txn_read_queue_inserts = 0; stats->txn_read_queue_len = 0; + stats->txn_rollback_to_stable = 0; + stats->txn_rollback_upd_aborted = 0; + stats->txn_rollback_las_removed = 0; stats->txn_set_ts = 0; stats->txn_set_ts_commit = 0; stats->txn_set_ts_commit_upd = 0; @@ -1956,6 +1962,12 @@ __wt_stat_connection_aggregate( to->txn_read_queue_inserts += WT_STAT_READ(from, txn_read_queue_inserts); to->txn_read_queue_len += WT_STAT_READ(from, txn_read_queue_len); + to->txn_rollback_to_stable += + WT_STAT_READ(from, txn_rollback_to_stable); + to->txn_rollback_upd_aborted += + WT_STAT_READ(from, txn_rollback_upd_aborted); + to->txn_rollback_las_removed += + WT_STAT_READ(from, txn_rollback_las_removed); to->txn_set_ts += WT_STAT_READ(from, txn_set_ts); to->txn_set_ts_commit += WT_STAT_READ(from, txn_set_ts_commit); to->txn_set_ts_commit_upd += diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 1235bc8c2b2..d3f11c5fa69 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -748,6 +748,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + WT_DECL_TIMESTAMP(ckpt_tmp_ts) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; @@ -899,6 +900,15 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * checkpointing the metadata since we know that all files in the * checkpoint are now in a consistent state. */ +#ifdef HAVE_TIMESTAMPS + /* + * Record the timestamp from the transaction if we were successful. + * Store it in a temp variable now because it will be invalidated during + * commit but we don't want to set it until we know the checkpoint + * is successful. + */ + __wt_timestamp_set(&ckpt_tmp_ts, &txn->read_timestamp); +#endif WT_ERR(__wt_txn_commit(session, NULL)); /* @@ -942,8 +952,13 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) */ txn_global->checkpoint_state.pinned_id = WT_TXN_NONE; - if (full) + if (full) { __checkpoint_stats(session); +#ifdef HAVE_TIMESTAMPS + __wt_timestamp_set( + &conn->txn_global.last_ckpt_timestamp, &ckpt_tmp_ts); +#endif + } err: /* * Reset the timer so that next checkpoint tracks the progress only if diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index d31b3995092..eef2fde5284 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -74,6 +74,7 @@ __txn_rollback_to_stable_lookaside_fixup(WT_SESSION_IMPL *session) &rollback_timestamp, las_timestamp.data) < 0) { WT_ERR(cursor->remove(cursor)); ++remove_cnt; + WT_STAT_CONN_INCR(session, txn_rollback_las_removed); } else ++las_total; } @@ -111,6 +112,7 @@ __txn_abort_newer_update(WT_SESSION_IMPL *session, if (__wt_timestamp_cmp( rollback_timestamp, &next_upd->timestamp) < 0) { next_upd->txnid = WT_TXN_ABORTED; + WT_STAT_CONN_INCR(session, txn_rollback_upd_aborted); __wt_timestamp_set_zero(&next_upd->timestamp); /* @@ -425,6 +427,7 @@ __wt_txn_rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); + WT_STAT_CONN_INCR(session, txn_rollback_to_stable); /* * Mark that a rollback operation is in progress and wait for eviction * to drain. This is necessary because lookaside eviction uses diff --git a/src/third_party/wiredtiger/src/txn/txn_timestamp.c b/src/third_party/wiredtiger/src/txn/txn_timestamp.c index 280425eb56e..2266a9cd6f5 100644 --- a/src/third_party/wiredtiger/src/txn/txn_timestamp.c +++ b/src/third_party/wiredtiger/src/txn/txn_timestamp.c @@ -239,7 +239,10 @@ __txn_global_query_timestamp( break; } __wt_readunlock(session, &txn_global->commit_timestamp_rwlock); - } else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { + } else if (WT_STRING_MATCH("last_checkpoint", cval.str, cval.len)) + /* Read-only value forever. No lock needed. */ + __wt_timestamp_set(&ts, &txn_global->last_ckpt_timestamp); + else if (WT_STRING_MATCH("oldest", cval.str, cval.len)) { if (!txn_global->has_oldest_timestamp) return (WT_NOTFOUND); WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, diff --git a/src/third_party/wiredtiger/test/csuite/rwlock/main.c b/src/third_party/wiredtiger/test/csuite/rwlock/main.c index e1d00344ee2..f69628dca40 100644 --- a/src/third_party/wiredtiger/test/csuite/rwlock/main.c +++ b/src/third_party/wiredtiger/test/csuite/rwlock/main.c @@ -171,8 +171,8 @@ thread_dump(void *arg) { sleep(1); printf("\n" "rwlock { current %" PRIu8 ", next %" PRIu8 - ", reader %" PRIu8 ", readers_active %" PRIu16 - ", readers_queued %" PRIu16 " }\n", + ", reader %" PRIu8 ", readers_active %" PRIu32 + ", readers_queued %" PRIu8 " }\n", rwlock.u.s.current, rwlock.u.s.next, rwlock.u.s.reader, diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 8d85d331c89..d46b0868887 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -191,9 +191,13 @@ config_setup(void) /* * Turn off truncate for LSM runs (some configurations with truncate * always results in a timeout). + * + * WiredTiger doesn't currently support truncate and prepare at the + * same time, see WT-3922. For now, pick one on each run. */ - if (!config_is_perm("truncate") && DATASOURCE("lsm")) - config_single("truncate=off", 0); + if (!config_is_perm("truncate")) + if (DATASOURCE("lsm") || mmrand(NULL, 0, 1) == 1) + config_single("truncate=off", 0); /* Give Helium configuration a final review. */ if (DATASOURCE("helium")) diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 4c54972516e..2466fe4d64d 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -403,10 +403,8 @@ snap_check(WT_CURSOR *cursor, break; case WT_NOTFOUND: break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "WT_CURSOR.search"); + return (ret); } /* Check for simple matches. */ @@ -644,6 +642,19 @@ prepare_transaction(TINFO *tinfo, WT_SESSION *session) } /* + * OP_FAILED -- + * General error handling. + */ +#define OP_FAILED(notfound_ok) do { \ + positioned = false; \ + if (intxn && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK)) \ + goto deadlock; \ + testutil_assert((notfound_ok && ret == WT_NOTFOUND) || \ + ret == WT_CACHE_FULL || \ + ret == WT_PREPARE_CONFLICT || ret == WT_ROLLBACK); \ +} while (0) + +/* * ops -- * Per-thread operations. */ @@ -825,11 +836,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(READ, tinfo); - } else { - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); } /* Optionally reserve a row. */ @@ -847,12 +855,8 @@ ops(void *arg) positioned = true; __wt_yield(); /* Let other threads proceed. */ - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); } /* Perform the operation. */ @@ -881,11 +885,8 @@ ops(void *arg) if (ret == 0) { ++tinfo->insert; SNAP_TRACK(INSERT, tinfo); - } else { - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_ROLLBACK); - } + } else + OP_FAILED(false); break; case MODIFY: /* @@ -907,13 +908,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(MODIFY, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert( - ret == WT_NOTFOUND || ret == WT_ROLLBACK); - } + } else + OP_FAILED(true); break; case READ: ++tinfo->search; @@ -921,12 +917,8 @@ ops(void *arg) if (ret == 0) { positioned = true; SNAP_TRACK(READ, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); break; case REMOVE: remove_instead_of_truncate: @@ -946,12 +938,8 @@ remove_instead_of_truncate: * previous state, but not necessarily set. */ SNAP_TRACK(REMOVE, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); - } + } else + OP_FAILED(true); break; case TRUNCATE: /* @@ -1020,11 +1008,8 @@ remove_instead_of_truncate: if (ret == 0) { ++tinfo->truncate; SNAP_TRACK(TRUNCATE, tinfo); - } else { - testutil_assert(ret == WT_ROLLBACK); - if (intxn) - goto deadlock; - } + } else + OP_FAILED(false); break; case UPDATE: update_instead_of_chosen_op: @@ -1041,12 +1026,8 @@ update_instead_of_chosen_op: if (ret == 0) { positioned = true; SNAP_TRACK(UPDATE, tinfo); - } else { - positioned = false; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_ROLLBACK); - } + } else + OP_FAILED(false); break; } @@ -1061,9 +1042,8 @@ update_instead_of_chosen_op: for (i = 0; i < j; ++i) { if ((ret = nextprev(tinfo, cursor, next)) == 0) continue; - if (ret == WT_ROLLBACK && intxn) - goto deadlock; - testutil_assert(ret == WT_NOTFOUND); + + OP_FAILED(true); break; } } @@ -1090,9 +1070,11 @@ update_instead_of_chosen_op: goto deadlock; } - /* Prepare the transaction 10% of the time. */ - /* XXX: CONFIGURE PREPARE OFF FOR NOW */ - if (mmrand(&tinfo->rnd, 1, 10) == 0) { + /* + * Prepare the transaction 10% of the time. + * Currently doesn't work with truncation, see WT-3922. + */ + if (g.c_truncate == 0 && mmrand(&tinfo->rnd, 1, 10) == 1) { ret = prepare_transaction(tinfo, session); testutil_assert(ret == 0 || ret == WT_PREPARE_CONFLICT); if (ret == WT_PREPARE_CONFLICT) @@ -1138,7 +1120,7 @@ deadlock: ++tinfo->deadlock; /* * wts_read_scan -- - * Read and verify all elements in a file. + * Read and verify a subset of the elements in a file. */ void wts_read_scan(void) @@ -1182,6 +1164,7 @@ wts_read_scan(void) case 0: case WT_NOTFOUND: case WT_ROLLBACK: + case WT_PREPARE_CONFLICT: break; default: testutil_die( @@ -1209,11 +1192,6 @@ read_row_worker( session = cursor->session; - /* Log the operation */ - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - session, "%-10s%" PRIu64, "read", keyno); - /* Retrieve the key/value pair by key. */ switch (g.type) { case FIX: @@ -1254,12 +1232,15 @@ read_row_worker( value->size = 1; } break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "read_row: read row %" PRIu64, keyno); + return (ret); } + /* Log the operation */ + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + session, "%-10s%" PRIu64, "read", keyno); + #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) return (ret); @@ -1394,24 +1375,39 @@ nextprev(TINFO *tinfo, WT_CURSOR *cursor, bool next) break; case WT_NOTFOUND: break; - case WT_ROLLBACK: - return (WT_ROLLBACK); default: - testutil_die(ret, "%s", which); + return (ret); } + if (g.logging == LOG_OPS) + switch (g.type) { + case FIX: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64 " {0x%02x}", + which, keyno, ((char *)value.data)[0]); + break; + case ROW: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s{%.*s}, {%.*s}", + which, (int)key.size, (char *)key.data, + (int)value.size, (char *)value.data); + break; + case VAR: + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64 " {%.*s}", + which, keyno, (int)value.size, (char *)value.data); + break; + } + #ifdef HAVE_BERKELEY_DB if (!SINGLETHREADED) return (ret); { WT_ITEM bdb_key, bdb_value; - WT_SESSION *session; int notfound; char *p; - session = cursor->session; - /* Retrieve the BDB key/value. */ bdb_np(next, &bdb_key.data, &bdb_key.size, &bdb_value.data, &bdb_value.size, ¬found); @@ -1444,26 +1440,6 @@ mismatch: if (g.type == ROW) { print_item(" wt-value", &value); testutil_die(0, NULL); } - - if (g.logging == LOG_OPS) - switch (g.type) { - case FIX: - (void)g.wt_api->msg_printf(g.wt_api, - session, "%-10s%" PRIu64 " {0x%02x}", which, - keyno, ((char *)value.data)[0]); - break; - case ROW: - (void)g.wt_api->msg_printf( - g.wt_api, session, "%-10s{%.*s}, {%.*s}", which, - (int)key.size, (char *)key.data, - (int)value.size, (char *)value.data); - break; - case VAR: - (void)g.wt_api->msg_printf(g.wt_api, session, - "%-10s%" PRIu64 " {%.*s}", which, - keyno, (int)value.size, (char *)value.data); - break; - } } #endif return (ret); @@ -1483,24 +1459,14 @@ row_reserve(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->key); } + if ((ret = cursor->reserve(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s{%.*s}", "reserve", (int)tinfo->key->size, tinfo->key->data); - switch (ret = cursor->reserve(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "row_reserve: reserve row %" PRIu64 " by key", - tinfo->keyno); - } return (0); } @@ -1516,21 +1482,13 @@ col_reserve(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) if (!positioned) cursor->set_key(cursor, tinfo->keyno); + if ((ret = cursor->reserve(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, "%-10s%" PRIu64, "reserve", tinfo->keyno); - switch (ret = cursor->reserve(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, "col_reserve: %" PRIu64, tinfo->keyno); - } return (0); } @@ -1577,19 +1535,10 @@ row_modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) } modify_build(tinfo, entries, &nentries); - switch (ret = cursor->modify(cursor, entries, nentries)) { - case 0: - testutil_check(cursor->get_value(cursor, tinfo->value)); - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "row_modify: modify row %" PRIu64 " by key", tinfo->keyno); - } + if ((ret = cursor->modify(cursor, entries, nentries)) != 0) + return (ret); + + testutil_check(cursor->get_value(cursor, tinfo->value)); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -1624,25 +1573,16 @@ col_modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->keyno); modify_build(tinfo, entries, &nentries); - switch (ret = cursor->modify(cursor, entries, nentries)) { - case 0: - testutil_check(cursor->get_value(cursor, tinfo->value)); - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - case WT_NOTFOUND: - return (WT_NOTFOUND); - default: - testutil_die(ret, - "col_modify: modify row %" PRIu64, tinfo->keyno); - } + if ((ret = cursor->modify(cursor, entries, nentries)) != 0) + return (ret); + + testutil_check(cursor->get_value(cursor, tinfo->value)); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, - "%-10s{%.*s}, {%.*s}", + "%-10s%" PRIu64 ", {%.*s}", "modify", - (int)tinfo->key->size, tinfo->key->data, + tinfo->keyno, (int)tinfo->value->size, tinfo->value->data); #ifdef HAVE_BERKELEY_DB @@ -1698,24 +1638,15 @@ row_truncate(TINFO *tinfo, WT_CURSOR *cursor) testutil_check(c2->close(c2)); } + if (ret != 0) + return (ret); + if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, session, "%-10s%" PRIu64 ", %" PRIu64, "truncate", tinfo->keyno, tinfo->last); - switch (ret) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_truncate: row %" PRIu64 "-%" PRIu64, - tinfo->keyno, tinfo->last); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) bdb_truncate(tinfo->keyno, tinfo->last); @@ -1724,49 +1655,6 @@ row_truncate(TINFO *tinfo, WT_CURSOR *cursor) } /* - * row_update -- - * Update a row in a row-store file. - */ -static int -row_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) -{ - WT_DECL_RET; - - if (!positioned) { - key_gen(tinfo->key, tinfo->keyno); - cursor->set_key(cursor, tinfo->key); - } - val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); - cursor->set_value(cursor, tinfo->value); - - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, cursor->session, - "%-10s{%.*s}, {%.*s}", - "put", - (int)tinfo->key->size, tinfo->key->data, - (int)tinfo->value->size, tinfo->value->data); - - switch (ret = cursor->update(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_update: update row %" PRIu64 " by key", tinfo->keyno); - } - -#ifdef HAVE_BERKELEY_DB - if (SINGLETHREADED) - bdb_update( - tinfo->key->data, tinfo->key->size, - tinfo->value->data, tinfo->value->size); -#endif - return (0); -} - -/* * col_truncate -- * Truncate rows in a column-store file. */ @@ -1802,6 +1690,8 @@ col_truncate(TINFO *tinfo, WT_CURSOR *cursor) ret = session->truncate(session, NULL, cursor, c2, NULL); testutil_check(c2->close(c2)); } + if (ret != 0) + return (ret); if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, session, @@ -1809,21 +1699,44 @@ col_truncate(TINFO *tinfo, WT_CURSOR *cursor) "truncate", tinfo->keyno, tinfo->last); - switch (ret) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "col_truncate: row %" PRIu64 "-%" PRIu64, - tinfo->keyno, tinfo->last); +#ifdef HAVE_BERKELEY_DB + if (SINGLETHREADED) + bdb_truncate(tinfo->keyno, tinfo->last); +#endif + return (0); +} + +/* + * row_update -- + * Update a row in a row-store file. + */ +static int +row_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) +{ + WT_DECL_RET; + + if (!positioned) { + key_gen(tinfo->key, tinfo->keyno); + cursor->set_key(cursor, tinfo->key); } + val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); + cursor->set_value(cursor, tinfo->value); + + if ((ret = cursor->update(cursor)) != 0) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, cursor->session, + "%-10s{%.*s}, {%.*s}", + "put", + (int)tinfo->key->size, tinfo->key->data, + (int)tinfo->value->size, tinfo->value->data); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) - bdb_truncate(tinfo->keyno, tinfo->last); + bdb_update( + tinfo->key->data, tinfo->key->size, + tinfo->value->data, tinfo->value->size); #endif return (0); } @@ -1845,6 +1758,9 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) else cursor->set_value(cursor, tinfo->value); + if ((ret = cursor->update(cursor)) != 0) + return (ret); + if (g.logging == LOG_OPS) { if (g.type == FIX) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -1859,16 +1775,6 @@ col_update(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) (char *)tinfo->value->data); } - switch (ret = cursor->update(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, "col_update: %" PRIu64, tinfo->keyno); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { key_gen(tinfo->key, tinfo->keyno); @@ -1999,6 +1905,9 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) val_gen(&tinfo->rnd, tinfo->value, tinfo->keyno); cursor->set_value(cursor, tinfo->value); + if ((ret = cursor->insert(cursor)) != 0) + return (ret); + /* Log the operation */ if (g.logging == LOG_OPS) (void)g.wt_api->msg_printf(g.wt_api, cursor->session, @@ -2007,17 +1916,6 @@ row_insert(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) (int)tinfo->key->size, tinfo->key->data, (int)tinfo->value->size, tinfo->value->data); - switch (ret = cursor->insert(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_insert: insert row %" PRIu64 " by key", tinfo->keyno); - } - #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) bdb_update( @@ -2041,15 +1939,10 @@ col_insert(TINFO *tinfo, WT_CURSOR *cursor) cursor->set_value(cursor, *(uint8_t *)tinfo->value->data); else cursor->set_value(cursor, tinfo->value); - switch (ret = cursor->insert(cursor)) { - case 0: - break; - case WT_CACHE_FULL: - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, "cursor.insert"); - } + + if ((ret = cursor->insert(cursor)) != 0) + return (ret); + testutil_check(cursor->get_key(cursor, &tinfo->keyno)); table_append(tinfo->keyno); /* Extend the object. */ @@ -2093,23 +1986,16 @@ row_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) cursor->set_key(cursor, tinfo->key); } - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); - /* We use the cursor in overwrite mode, check for existence. */ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); - switch (ret) { - case 0: - case WT_NOTFOUND: - break; - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "row_remove: remove %" PRIu64 " by key", tinfo->keyno); - } + + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { @@ -2134,23 +2020,16 @@ col_remove(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) if (!positioned) cursor->set_key(cursor, tinfo->keyno); - if (g.logging == LOG_OPS) - (void)g.wt_api->msg_printf(g.wt_api, - cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); - /* We use the cursor in overwrite mode, check for existence. */ if ((ret = cursor->search(cursor)) == 0) ret = cursor->remove(cursor); - switch (ret) { - case 0: - case WT_NOTFOUND: - break; - case WT_ROLLBACK: - return (WT_ROLLBACK); - default: - testutil_die(ret, - "col_remove: remove %" PRIu64 " by key", tinfo->keyno); - } + + if (ret != 0 && ret != WT_NOTFOUND) + return (ret); + + if (g.logging == LOG_OPS) + (void)g.wt_api->msg_printf(g.wt_api, + cursor->session, "%-10s%" PRIu64, "remove", tinfo->keyno); #ifdef HAVE_BERKELEY_DB if (SINGLETHREADED) { diff --git a/src/third_party/wiredtiger/test/suite/test_cursor14.py b/src/third_party/wiredtiger/test/suite/test_cursor14.py new file mode 100644 index 00000000000..25bd0cec00a --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_cursor14.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtdataset import SimpleDataSet, ComplexDataSet, ComplexLSMDataSet +from wtscenario import make_scenarios + +# test_cursor14.py +# Test that more than 64K cursors can be opened on a data source +class test_cursor14(wttest.WiredTigerTestCase): + scenarios = make_scenarios([ + ('file-r', dict(type='file:', keyfmt='r', dataset=SimpleDataSet)), + ('file-S', dict(type='file:', keyfmt='S', dataset=SimpleDataSet)), + ('lsm-S', dict(type='lsm:', keyfmt='S', dataset=SimpleDataSet)), + ('table-r', dict(type='table:', keyfmt='r', dataset=SimpleDataSet)), + ('table-S', dict(type='table:', keyfmt='S', dataset=SimpleDataSet)), + ('table-r-complex', dict(type='table:', keyfmt='r', + dataset=ComplexDataSet)), + ('table-S-complex', dict(type='table:', keyfmt='S', + dataset=ComplexDataSet)), + ('table-S-complex-lsm', dict(type='table:', keyfmt='S', + dataset=ComplexLSMDataSet)), + ]) + + def test_cursor14(self): + uri = self.type + 'cursor14' + + ds = self.dataset(self, uri, 100, key_format=self.keyfmt) + ds.populate() + + for i in xrange(66000): + cursor = self.session.open_cursor(uri, None, None) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_prepare04.py b/src/third_party/wiredtiger/test/suite/test_prepare04.py new file mode 100644 index 00000000000..af5dd12b1e5 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare04.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2018 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# test_prepare04.py +# Prepare: prepare conflict with update and read operations +# + +import random +from suite_subprocess import suite_subprocess +import wiredtiger, wttest +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +class test_prepare04(wttest.WiredTigerTestCase, suite_subprocess): + tablename = 'test_prepare_cursor' + uri = 'table:' + tablename + before_ts = timestamp_str(150) + prepare_ts = timestamp_str(200) + after_ts = timestamp_str(250) + + types = [ + ('col', dict(extra_config=',log=(enabled=false),key_format=r')), + ('lsm', dict(extra_config=',log=(enabled=false),type=lsm')), + ('row', dict(extra_config=',log=(enabled=false)')), + ] + + # Various begin_transaction config + txncfg = [ + ('before_ts', dict(txn_config='isolation=snapshot,read_timestamp=' + before_ts, after_ts=False)), + ('after_ts', dict(txn_config='isolation=snapshot,read_timestamp=' + after_ts, after_ts=True)), + ('no_ts', dict(txn_config='isolation=snapshot', after_ts=True)), + ] + + preparecfg = [ + ('ignore_false', dict(ignore_config=',ignore_prepare=false', ignore=False)), + ('ignore_true', dict(ignore_config=',ignore_prepare=true', ignore=True)), + ] + conn_config = 'log=(enabled)' + + scenarios = make_scenarios(types, txncfg, preparecfg) + + def test_prepare_conflict(self): + if not wiredtiger.timestamp_build(): + self.skipTest('requires a timestamp build') + + self.session.create(self.uri, + 'key_format=i,value_format=i' + self.extra_config) + c = self.session.open_cursor(self.uri) + + # Insert keys 1..100 each with timestamp=key, in some order + orig_keys = range(1, 101) + keys = orig_keys[:] + random.shuffle(keys) + + k = 1 + self.session.begin_transaction() + c[k] = 1 + self.session.commit_transaction('commit_timestamp=' + timestamp_str(100)) + + # Everything up to and including timestamp 100 has been committed. + self.assertTimestampsEqual(self.conn.query_timestamp(), timestamp_str(100)) + + # Bump the oldest timestamp, we're not going back... + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(100)) + + # make prepared updates. + k = 1 + self.session.begin_transaction('isolation=snapshot') + c.set_key(1) + c.set_value(2) + c.update() + self.session.prepare_transaction('prepare_timestamp=' + self.prepare_ts) + conflictmsg = '/conflict between concurrent operations/' + preparemsg = '/conflict with a prepared update/' + + #''' + # Verify data visibility from a different session/transaction. + s_other = self.conn.open_session() + c_other = s_other.open_cursor(self.uri, None) + s_other.begin_transaction(self.txn_config + self.ignore_config) + c_other.set_key(1) + if self.ignore == False and self.after_ts == True: + self.assertRaises(wiredtiger.WiredTigerError, lambda:c_other.search()) + else: + c_other.search() + self.assertTrue(c_other.get_value() == 1) + c_other.set_value(3) + self.assertRaises(wiredtiger.WiredTigerError, lambda:c_other.update()) + s_other.commit_transaction() + #''' + + self.session.commit_transaction('commit_timestamp=' + timestamp_str(300)) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp04.py b/src/third_party/wiredtiger/test/suite/test_timestamp04.py index 48ec7fac9a6..83ed4e904a6 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp04.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp04.py @@ -32,6 +32,7 @@ from suite_subprocess import suite_subprocess import wiredtiger, wttest +from wiredtiger import stat from wtscenario import make_scenarios def timestamp_str(t): @@ -98,7 +99,7 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): def ConnectionOpen(self, cacheSize): self.home = '.' - conn_params = 'create,' + \ + conn_params = 'create,statistics=(fast),' + \ cacheSize + ',error_prefix="%s" %s' % (self.shortid(), self.conn_config) try: self.conn = wiredtiger.wiredtiger_open(self.home, conn_params) @@ -164,6 +165,12 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): stable_ts = timestamp_str(key_range / 2) self.conn.set_timestamp('stable_timestamp=' + stable_ts) self.conn.rollback_to_stable() + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rollback_to_stable][2] + upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2] + stat_cursor.close() + self.assertEqual(calls, 1) + self.assertTrue(upd_aborted >= key_range/2) # Check that we see the inserted value (i.e. 1) for all the keys in # non-timestamp tables. @@ -224,9 +231,20 @@ class test_timestamp04(wttest.WiredTigerTestCase, suite_subprocess): # Scenario: 4 # Advance the stable_timestamp by a quarter range and rollback. # Three-fourths of the later timestamps will be rolled back. - stable_ts = timestamp_str(key_range + key_range / 4) + rolled_range = key_range + key_range / 4 + stable_ts = timestamp_str(rolled_range) self.conn.set_timestamp('stable_timestamp=' + stable_ts) self.conn.rollback_to_stable() + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rollback_to_stable][2] + upd_aborted = stat_cursor[stat.conn.txn_rollback_upd_aborted][2] + stat_cursor.close() + self.assertEqual(calls, 2) + # + # We rolled back half on the earlier call and now three-quarters on + # this call, which is one and one quarter of all keys rolled back. + # + self.assertTrue(upd_aborted >= rolled_range) # Check that we see the updated value (i.e. 2) for all the keys in # non-timestamped tables. diff --git a/src/third_party/wiredtiger/test/suite/test_timestamp10.py b/src/third_party/wiredtiger/test/suite/test_timestamp10.py index a798f5ff355..02b22e6afbe 100644 --- a/src/third_party/wiredtiger/test/suite/test_timestamp10.py +++ b/src/third_party/wiredtiger/test/suite/test_timestamp10.py @@ -27,7 +27,7 @@ # OTHER DEALINGS IN THE SOFTWARE. # # test_timestamp10.py -# Timestamps: Saving and querying the checkpoint recovery timestamp +# Timestamps: Saving and querying the last checkpoint and recovery timestamps # import fnmatch, os, shutil @@ -101,6 +101,8 @@ class test_timestamp10(wttest.WiredTigerTestCase, suite_subprocess): ',stable_timestamp=' + timestamp_str(ts)) # This forces a different checkpoint timestamp for each table. self.session.checkpoint() + q = self.conn.query_timestamp('get=last_checkpoint') + self.assertTimestampsEqual(q, timestamp_str(ts)) # Copy to a new database and then recover. self.copy_dir(".", "RESTART") |