diff options
20 files changed, 1104 insertions, 280 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 23ba8dc737e..689aa30c192 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -336,6 +336,7 @@ RLEs RMW RNG RPC +RTS RUNDIR RWLOCK RXB @@ -1251,6 +1252,7 @@ srch ssize startup statlog +stb stderr stdin stdout @@ -1308,6 +1310,7 @@ timedwait timestamp timestamped timestamps +tinfo tmp todo tokenizer diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py index f4400cc8bdf..c7bfc07b404 100644 --- a/src/third_party/wiredtiger/dist/stat_data.py +++ b/src/third_party/wiredtiger/dist/stat_data.py @@ -651,10 +651,11 @@ connection_stats = [ TxnStat('txn_rollback', 'transactions rolled back'), TxnStat('txn_rts', 'rollback to stable calls'), TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'), - TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'), TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'), TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'), TxnStat('txn_rts_pages_visited', 'rollback to stable pages visited'), + TxnStat('txn_rts_skip_interal_pages_walk', 'rollback to stable skipping internal pages tree walk'), + TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'), TxnStat('txn_rts_upd_aborted', 'rollback to stable updates aborted'), TxnStat('txn_set_ts', 'set timestamp calls'), TxnStat('txn_set_ts_durable', 'set timestamp durable calls'), diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 3c04510d191..f36da07c129 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "930bbacc3761a10483875585dbd4ecb58271d57e" + "commit": "ab40833d9130b71f4b36a1a03fd8f4f137d11bdd" } diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index 03d36bf5d07..78b47ecd6aa 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -306,10 +306,38 @@ __wt_hs_cursor_close(WT_SESSION_IMPL *session, uint32_t session_flags, bool is_o static int __hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert) { + WT_CURSOR *hs_cursor; WT_DECL_RET; + bool leaf_found; + + hs_cursor = &hs_cbt->iface; + leaf_found = false; + + /* + * Check whether the search key can be find in the provided leaf page, if exists. Otherwise + * perform a full search. + */ + if (hs_cbt->ref != NULL) { + WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), + ret = __wt_row_search(hs_cbt, srch_key, insert, hs_cbt->ref, false, &leaf_found)); + WT_RET(ret); + + /* + * Only use the pinned page search results if search returns an exact match or a slot other + * than the page's boundary slots, if that's not the case, the record might belong on an + * entirely different page. + */ + if (leaf_found && (hs_cbt->compare != 0 && + (hs_cbt->slot == 0 || hs_cbt->slot == hs_cbt->ref->page->entries - 1))) + leaf_found = false; + if (!leaf_found) + hs_cursor->reset(hs_cursor); + } + + if (!leaf_found) + WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), + ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL)); - WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), - ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL)); #ifdef HAVE_DIAGNOSTIC WT_TRET(__wt_cursor_key_order_init(hs_cbt)); #endif @@ -398,18 +426,49 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W WT_HS_TIME_POINT *stop_time_point) { WT_CURSOR_BTREE *cbt; + WT_DECL_ITEM(hs_key); WT_DECL_RET; WT_UPDATE *hs_upd, *upd_local; + wt_timestamp_t hs_start_ts; + uint64_t counter, hs_counter; + uint32_t hs_btree_id; + int cmp; cbt = (WT_CURSOR_BTREE *)cursor; hs_upd = upd_local = NULL; + counter = 0; + + /* Allocate buffers for the data store and history store key. */ + WT_ERR(__wt_scr_alloc(session, 0, &hs_key)); + + /* + * Adjust counter if there exists an update in the history store with same btree id, key and + * timestamp. Otherwise the newly inserting history store record may fall behind the existing + * one can lead to wrong order. + */ + WT_ERR_NOTFOUND_OK( + __wt_hs_cursor_position(session, cursor, btree->id, key, upd->start_ts), true); + if (ret == 0) { + WT_ERR(cursor->get_key(cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter)); + + /* + * Check the whether the existing record is also from the same timestamp. + * + * Verify simple checks first to confirm whether the retrieved update same or not before + * performing the expensive key comparison. + */ + if (hs_btree_id == btree->id && upd->start_ts == hs_start_ts) { + WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp)); + if (cmp == 0) + counter = hs_counter + 1; + } + } /* * Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to * create an update chain for a direct insertion onto the history store page. */ - cursor->set_key( - cursor, btree->id, key, upd->start_ts, __wt_atomic_add64(&btree->hs_counter, 1)); + cursor->set_key(cursor, btree->id, key, upd->start_ts, counter); cursor->set_value( cursor, stop_time_point->durable_ts, upd->durable_ts, (uint64_t)type, hs_value); @@ -452,6 +511,7 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W WT_STAT_CONN_INCR(session, cache_hs_insert); err: + __wt_scr_free(session, &hs_key); if (ret != 0) { __wt_free_update_list(session, &hs_upd); @@ -950,7 +1010,7 @@ err: */ int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, - WT_ITEM *key, wt_timestamp_t timestamp) + const WT_ITEM *key, wt_timestamp_t timestamp) { WT_DECL_ITEM(srch_key); WT_DECL_RET; diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h index abf19855a3f..b691552dc1f 100644 --- a/src/third_party/wiredtiger/src/include/btree.h +++ b/src/third_party/wiredtiger/src/include/btree.h @@ -169,7 +169,6 @@ struct __wt_btree { uint64_t write_gen; /* Write generation */ uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */ wt_timestamp_t rec_max_timestamp; - uint64_t hs_counter; /* History store counter */ uint64_t checkpoint_gen; /* Checkpoint generation */ WT_SESSION_IMPL *sync_session; /* Syncing session */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 9ada3107728..a55267f1bc0 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -763,7 +763,7 @@ extern int __wt_hs_cursor_close(WT_SESSION_IMPL *session, uint32_t session_flags extern int __wt_hs_cursor_open(WT_SESSION_IMPL *session) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id, - WT_ITEM *key, wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + const WT_ITEM *key, wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep) @@ -1239,6 +1239,8 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf, bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[]) diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h index 3df5aa605d8..50cc22f6d4f 100644 --- a/src/third_party/wiredtiger/src/include/stat.h +++ b/src/third_party/wiredtiger/src/include/stat.h @@ -714,6 +714,7 @@ struct __wt_connection_stats { int64_t txn_rts_keys_removed; int64_t txn_rts_keys_restored; int64_t txn_rts_pages_visited; + int64_t txn_rts_skip_interal_pages_walk; int64_t txn_rts_sweep_hs_keys; int64_t txn_rts_upd_aborted; int64_t txn_rts_hs_removed; diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in index a3abb6c26af..efc658938d0 100644 --- a/src/third_party/wiredtiger/src/include/wiredtiger.in +++ b/src/third_party/wiredtiger/src/include/wiredtiger.in @@ -5886,106 +5886,108 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1417 /*! transaction: rollback to stable pages visited */ #define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1418 +/*! transaction: rollback to stable skipping internal pages tree walk */ +#define WT_STAT_CONN_TXN_RTS_SKIP_INTERAL_PAGES_WALK 1419 /*! transaction: rollback to stable sweeping history store keys */ -#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1419 +#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1420 /*! transaction: rollback to stable updates aborted */ -#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1420 +#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1421 /*! transaction: rollback to stable updates removed from history store */ -#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1421 +#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1422 /*! transaction: set timestamp calls */ -#define WT_STAT_CONN_TXN_SET_TS 1422 +#define WT_STAT_CONN_TXN_SET_TS 1423 /*! transaction: set timestamp durable calls */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1423 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1424 /*! transaction: set timestamp durable updates */ -#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1424 +#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1425 /*! transaction: set timestamp oldest calls */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1425 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1426 /*! transaction: set timestamp oldest updates */ -#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1426 +#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1427 /*! transaction: set timestamp stable calls */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE 1427 +#define WT_STAT_CONN_TXN_SET_TS_STABLE 1428 /*! transaction: set timestamp stable updates */ -#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1428 +#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1429 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1429 +#define WT_STAT_CONN_TXN_BEGIN 1430 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1430 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1431 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1431 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1432 /*! * transaction: transaction checkpoint history store file duration * (usecs) */ -#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1432 +#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1433 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1433 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1434 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1434 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1435 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1435 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1436 /*! transaction: transaction checkpoint prepare currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1436 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1437 /*! transaction: transaction checkpoint prepare max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1437 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1438 /*! transaction: transaction checkpoint prepare min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1438 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1439 /*! transaction: transaction checkpoint prepare most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1439 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1440 /*! transaction: transaction checkpoint prepare total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1440 +#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1441 /*! transaction: transaction checkpoint scrub dirty target */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1441 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1442 /*! transaction: transaction checkpoint scrub time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1442 +#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1443 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1443 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1444 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1444 +#define WT_STAT_CONN_TXN_CHECKPOINT 1445 /*! * transaction: transaction checkpoints skipped because database was * clean */ -#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1445 +#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1446 /*! transaction: transaction failures due to history store */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1446 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1447 /*! * transaction: transaction fsync calls for checkpoint after allocating * the transaction ID */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1447 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1448 /*! * transaction: transaction fsync duration for checkpoint after * allocating the transaction ID (usecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1448 +#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1449 /*! transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1449 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1450 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1450 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1451 /*! transaction: transaction range of timestamps currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1451 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1452 /*! transaction: transaction range of timestamps pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1452 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1453 /*! * transaction: transaction range of timestamps pinned by the oldest * active read timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1453 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1454 /*! * transaction: transaction range of timestamps pinned by the oldest * timestamp */ -#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1454 +#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1455 /*! transaction: transaction read timestamp of the oldest active reader */ -#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1455 +#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1456 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1456 +#define WT_STAT_CONN_TXN_SYNC 1457 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1457 +#define WT_STAT_CONN_TXN_COMMIT 1458 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1458 +#define WT_STAT_CONN_TXN_ROLLBACK 1459 /*! transaction: update conflicts */ -#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1459 +#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1460 /*! * @} diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c index 5e481d850f6..4c0b38d5192 100644 --- a/src/third_party/wiredtiger/src/support/stat.c +++ b/src/third_party/wiredtiger/src/support/stat.c @@ -1053,6 +1053,7 @@ static const char *const __stats_connection_desc[] = { "transaction: read timestamp queue inserts total", "transaction: read timestamp queue length", "transaction: rollback to stable calls", "transaction: rollback to stable keys removed", "transaction: rollback to stable keys restored", "transaction: rollback to stable pages visited", + "transaction: rollback to stable skipping internal pages tree walk", "transaction: rollback to stable sweeping history store keys", "transaction: rollback to stable updates aborted", "transaction: rollback to stable updates removed from history store", @@ -1547,6 +1548,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->txn_rts_keys_removed = 0; stats->txn_rts_keys_restored = 0; stats->txn_rts_pages_visited = 0; + stats->txn_rts_skip_interal_pages_walk = 0; stats->txn_rts_sweep_hs_keys = 0; stats->txn_rts_upd_aborted = 0; stats->txn_rts_hs_removed = 0; @@ -2050,6 +2052,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS * to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed); to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored); to->txn_rts_pages_visited += WT_STAT_READ(from, txn_rts_pages_visited); + to->txn_rts_skip_interal_pages_walk += WT_STAT_READ(from, txn_rts_skip_interal_pages_walk); to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys); to->txn_rts_upd_aborted += WT_STAT_READ(from, txn_rts_upd_aborted); to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed); diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c index 48dc34a2cf7..13c3725659d 100644 --- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c +++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c @@ -819,6 +819,40 @@ err: } /* + * __wt_rts_page_skip -- + * Skip if rollback to stable doesn't requires to read this page. + */ +int +__wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp) +{ + wt_timestamp_t rollback_timestamp; + + rollback_timestamp = *(wt_timestamp_t *)(context); + *skipp = false; /* Default to reading */ + + /* If the page is in-memory, we want to look at it. */ + if (ref->state != WT_REF_DISK) + return (0); + + /* + * Rollback to stable doesn't read leaf pages into memory as part of the tree walk. The leaf + * page is loaded into memory in the caller functions if it has newer updates that are need to + * be aborted. Don't process further on leaf pages as part of tree walk function. + */ + if (!F_ISSET(ref, WT_REF_FLAG_INTERNAL)) + return (0); + + /* Check whether this ref has any possible updates to be aborted. */ + if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) { + *skipp = true; + __wt_verbose(session, WT_VERB_RTS, "%p: internal page walk skipped", (void *)ref); + WT_STAT_CONN_INCR(session, txn_rts_skip_interal_pages_walk); + } + + return (0); +} + +/* * __rollback_to_stable_btree_walk -- * Called for each open handle - choose to either skip or wipe the commits */ @@ -830,8 +864,8 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac /* Walk the tree, marking commits aborted where appropriate. */ ref = NULL; - while ((ret = __wt_tree_walk( - session, &ref, WT_READ_CACHE_LEAF | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 && + while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp, + WT_READ_CACHE_LEAF | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 && ref != NULL) if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) { WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) { diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c index 89e0406c3cb..72c17366639 100644 --- a/src/third_party/wiredtiger/test/format/bulk.c +++ b/src/third_party/wiredtiger/test/format/bulk.c @@ -59,7 +59,7 @@ bulk_commit_transaction(WT_SESSION *session) testutil_check(session->commit_transaction(session, buf)); /* Update the oldest timestamp, otherwise updates are pinned in memory. */ - timestamp_once(session); + timestamp_once(session, false); } /* diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c index 1db9cc5854c..374bc470f5b 100644 --- a/src/third_party/wiredtiger/test/format/config.c +++ b/src/third_party/wiredtiger/test/format/config.c @@ -942,7 +942,9 @@ config_transaction(void) if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency")) testutil_die(EINVAL, "snapshot isolation requires transaction frequency set to 100"); } - + if (g.c_txn_rollback_to_stable && config_is_perm("transaction.rollback_to_stable") && + g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation")) + testutil_die(EINVAL, "rollback to stable requires snapshot isolation"); /* * The permanent configuration has no incompatible settings, adjust the temporary configuration * as necessary. Prepare overrides timestamps, overrides isolation, for no reason other than @@ -958,6 +960,12 @@ config_transaction(void) if (g.c_txn_freq != 100) config_single("transaction.frequency=100", false); } + if (g.c_txn_rollback_to_stable) { + if (!g.c_txn_timestamps) + config_single("transaction.timestamps=on", false); + if (g.c_logging) + config_single("logging=off", false); + } if (g.c_txn_timestamps) { if (g.c_isolation_flag != ISOLATION_SNAPSHOT) config_single("transaction.isolation=snapshot", false); diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 30b8f2a09a6..b8f2922021e 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -57,97 +57,93 @@ typedef struct { char **vstr; /* Value for string options */ } CONFIG; -#define COMPRESSION_LIST "(none | lz4 | snappy | zlib | zstd)" +#define COMPRESSION_LIST " (none | lz4 | snappy | zlib | zstd)" static CONFIG c[] = { /* 5% */ - {"assert.commit_timestamp", "if assert commit_timestamp", C_BOOL, 5, 0, 0, + {"assert.commit_timestamp", "assert commit_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_commit_timestamp, NULL}, /* 5% */ - {"assert.read_timestamp", "if assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp, + {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp, NULL}, /* 20% */ - {"backup", "if backups are enabled", C_BOOL, 20, 0, 0, &g.c_backups, NULL}, + {"backup", "configure backups", C_BOOL, 20, 0, 0, &g.c_backups, NULL}, - {"backup.incremental", "type of backup (block | log | off)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"backup.incremental", "backup type (block | log | off)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_backup_incremental}, - {"backup.incr_granularity", "incremental backup block granularity in KB", 0x0, 4, 16384, 16384, + {"backup.incr_granularity", "incremental backup block granularity (KB)", 0x0, 4, 16384, 16384, &g.c_backup_incr_granularity, NULL}, - {"btree.bitcnt", "number of bits for fixed-length column-store files", 0x0, 1, 8, 8, &g.c_bitcnt, - NULL}, + {"btree.bitcnt", "fixed-length column-store object size (number of bits)", 0x0, 1, 8, 8, + &g.c_bitcnt, NULL}, - {"btree.compression", "type of compression " COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"btree.compression", "compression type" COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_compression}, /* 20% */ - {"btree.dictionary", "if values are dictionary compressed", C_BOOL, 20, 0, 0, &g.c_dictionary, + {"btree.dictionary", "configure dictionary compressed values", C_BOOL, 20, 0, 0, &g.c_dictionary, NULL}, /* 20% */ - {"btree.huffman_key", "if keys are huffman encoded", C_BOOL, 20, 0, 0, &g.c_huffman_key, NULL}, + {"btree.huffman_key", "configure huffman encoded keys", C_BOOL, 20, 0, 0, &g.c_huffman_key, NULL}, /* 20% */ - {"btree.huffman_value", "if values are huffman encoded", C_BOOL, 20, 0, 0, &g.c_huffman_value, + {"btree.huffman_value", "configure huffman encoded values", C_BOOL, 20, 0, 0, &g.c_huffman_value, NULL}, /* 95% */ - {"btree.internal_key_truncation", "if internal keys are truncated", C_BOOL, 95, 0, 0, + {"btree.internal_key_truncation", "truncate internal keys", C_BOOL, 95, 0, 0, &g.c_internal_key_truncation, NULL}, - {"btree.internal_page_max", "maximum size of Btree internal nodes", 0x0, 9, 17, 27, + {"btree.internal_page_max", "btree internal node maximum size", 0x0, 9, 17, 27, &g.c_intl_page_max, NULL}, - {"btree.key_gap", "gap between instantiated keys on a Btree page", 0x0, 0, 20, 20, &g.c_key_gap, - NULL}, + {"btree.key_gap", "btree page instantiated key gap", 0x0, 0, 20, 20, &g.c_key_gap, NULL}, - {"btree.key_max", "maximum size of keys", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL}, + {"btree.key_max", "maximum key size", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL}, /* * A minimum key size of 11 is necessary. Row-store keys have a leading 10-digit number and the * 11 guarantees we never see a key that we can't convert to a numeric value without formatting * it first because there's a trailing non-digit character in every key. */ - {"btree.key_min", "minimum size of keys", 0x0, 11, 32, 256, &g.c_key_min, NULL}, + {"btree.key_min", "minimum key size", 0x0, 11, 32, 256, &g.c_key_min, NULL}, - {"btree.leaf_page_max", "maximum size of Btree leaf nodes", 0x0, 9, 17, 27, &g.c_leaf_page_max, - NULL}, + {"btree.leaf_page_max", "btree leaf node maximum size", 0x0, 9, 17, 27, &g.c_leaf_page_max, NULL}, - {"btree.memory_page_max", "maximum size of in-memory pages", 0x0, 1, 10, 128, - &g.c_memory_page_max, NULL}, + {"btree.memory_page_max", "maximum cache page size", 0x0, 1, 10, 128, &g.c_memory_page_max, NULL}, /* 80% */ - {"btree.prefix_compression", "if keys are prefix compressed", C_BOOL, 80, 0, 0, + {"btree.prefix_compression", "configure prefix compressed keys", C_BOOL, 80, 0, 0, &g.c_prefix_compression, NULL}, - {"btree.prefix_compression_min", "minimum gain before prefix compression is used", 0x0, 0, 8, 256, - &g.c_prefix_compression_min, NULL}, + {"btree.prefix_compression_min", "minimum gain before prefix compression is used (bytes)", 0x0, 0, + 8, 256, &g.c_prefix_compression_min, NULL}, - {"btree.repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0, - 90, 90, &g.c_repeat_data_pct, NULL}, + {"btree.repeat_data_pct", "duplicate values (percentage)", 0x0, 0, 90, 90, &g.c_repeat_data_pct, + NULL}, /* 10% */ - {"btree.reverse", "collate in reverse order", C_BOOL, 10, 0, 0, &g.c_reverse, NULL}, + {"btree.reverse", "reverse order collation", C_BOOL, 10, 0, 0, &g.c_reverse, NULL}, {"btree.split_pct", "page split size as a percentage of the maximum page size", 0x0, 50, 100, 100, &g.c_split_pct, NULL}, - {"btree.value_max", "maximum size of values", 0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL}, + {"btree.value_max", "maximum value size", 0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL}, - {"btree.value_min", "minimum size of values", 0x0, 0, 20, 4096, &g.c_value_min, NULL}, + {"btree.value_min", "minimum value size", 0x0, 0, 20, 4096, &g.c_value_min, NULL}, - {"cache", "size of the cache in MB", 0x0, 1, 100, 100 * 1024, &g.c_cache, NULL}, + {"cache", "cache size (MB)", 0x0, 1, 100, 100 * 1024, &g.c_cache, NULL}, - {"cache.evict_max", "the maximum number of eviction workers", 0x0, 0, 5, 100, &g.c_evict_max, - NULL}, + {"cache.evict_max", "maximum number of eviction workers", 0x0, 0, 5, 100, &g.c_evict_max, NULL}, - {"cache.minimum", "minimum size of the cache in MB", C_IGNORE, 0, 0, 100 * 1024, - &g.c_cache_minimum, NULL}, + {"cache.minimum", "minimum cache size (MB)", C_IGNORE, 0, 0, 100 * 1024, &g.c_cache_minimum, + NULL}, - {"checkpoint", "type of checkpoints (on | off | wiredtiger)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"checkpoint", "checkpoint type (on | off | wiredtiger)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_checkpoint}, {"checkpoint.log_size", "MB of log to wait if wiredtiger checkpoints configured", 0x0, 20, 200, @@ -156,151 +152,145 @@ static CONFIG c[] = { {"checkpoint.wait", "seconds to wait if wiredtiger checkpoints configured", 0x0, 5, 100, 3600, &g.c_checkpoint_wait, NULL}, - {"disk.checksum", "type of checksums (on | off | uncompressed)", C_IGNORE | C_STRING, 0, 0, 0, - NULL, &g.c_checksum}, + {"disk.checksum", "checksum type (on | off | uncompressed)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + &g.c_checksum}, /* 5% */ - {"disk.data_extend", "if data files are extended", C_BOOL, 5, 0, 0, &g.c_data_extend, NULL}, + {"disk.data_extend", "configure data file extension", C_BOOL, 5, 0, 0, &g.c_data_extend, NULL}, /* 0% */ - {"disk.direct_io", "if direct I/O is configured for data objects", C_IGNORE | C_BOOL, 0, 0, 1, + {"disk.direct_io", "configure direct I/O for data objects", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_direct_io, NULL}, - {"disk.encryption", "type of encryption (none | rotn-7)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"disk.encryption", "encryption type (none | rotn-7)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_encryption}, /* 10% */ - {"disk.firstfit", "if allocation is firstfit", C_BOOL, 10, 0, 0, &g.c_firstfit, NULL}, + {"disk.firstfit", "configure first-fit allocation", C_BOOL, 10, 0, 0, &g.c_firstfit, NULL}, /* 90% */ - {"disk.mmap", "configure for mmap operations (readonly)", C_BOOL, 90, 0, 0, &g.c_mmap, NULL}, + {"disk.mmap", "configure mmap operations (reads only)", C_BOOL, 90, 0, 0, &g.c_mmap, NULL}, /* 5% */ - {"disk.mmap_all", "configure for mmap operations (read and write)", C_BOOL, 5, 0, 0, - &g.c_mmap_all, NULL}, + {"disk.mmap_all", "configure mmap operations (read and write)", C_BOOL, 5, 0, 0, &g.c_mmap_all, + NULL}, /* 0% */ - {"format.abort", "if timed run should drop core", C_BOOL, 0, 0, 0, &g.c_abort, NULL}, + {"format.abort", "drop core during timed run", C_BOOL, 0, 0, 0, &g.c_abort, NULL}, /* 75% */ - {"format.independent_thread_rng", "if thread RNG space is independent", C_BOOL, 75, 0, 0, + {"format.independent_thread_rng", "configure independent thread RNG space", C_BOOL, 75, 0, 0, &g.c_independent_thread_rng, NULL}, - {"format.major_timeout", "timeout for long-running operations (minutes)", C_IGNORE, 0, 0, 1000, + {"format.major_timeout", "long-running operations timeout (minutes)", C_IGNORE, 0, 0, 1000, &g.c_major_timeout, NULL}, /* 50% */ - {"logging", "if logging configured", C_BOOL, 50, 0, 0, &g.c_logging, NULL}, + {"logging", "configure logging", C_BOOL, 50, 0, 0, &g.c_logging, NULL}, /* 50% */ - {"logging.archive", "if log file archival configured", C_BOOL, 50, 0, 0, &g.c_logging_archive, - NULL}, + {"logging.archive", "configure log file archival", C_BOOL, 50, 0, 0, &g.c_logging_archive, NULL}, - {"logging.compression", "type of logging compression " COMPRESSION_LIST, C_IGNORE | C_STRING, 0, - 0, 0, NULL, &g.c_logging_compression}, + {"logging.compression", "logging compression type" COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0, + NULL, &g.c_logging_compression}, - {"logging.file_max", "maximum log file size in KB", 0x0, 100, 512000, 2097152, + {"logging.file_max", "maximum log file size (KB)", 0x0, 100, 512000, 2097152, &g.c_logging_file_max, NULL}, /* 50% */ - {"logging.prealloc", "if log file pre-allocation configured", C_BOOL, 50, 0, 0, - &g.c_logging_prealloc, NULL}, + {"logging.prealloc", "configure log file pre-allocation", C_BOOL, 50, 0, 0, &g.c_logging_prealloc, + NULL}, /* 90% */ - {"lsm.auto_throttle", "if LSM inserts are throttled", C_BOOL, 90, 0, 0, &g.c_auto_throttle, NULL}, + {"lsm.auto_throttle", "throttle LSM inserts", C_BOOL, 90, 0, 0, &g.c_auto_throttle, NULL}, /* 95% */ - {"lsm.bloom", "if bloom filters are configured", C_BOOL, 95, 0, 0, &g.c_bloom, NULL}, + {"lsm.bloom", "configure bloom filters", C_BOOL, 95, 0, 0, &g.c_bloom, NULL}, - {"lsm.bloom_bit_count", "number of bits per item for LSM bloom filters", 0x0, 4, 64, 1000, + {"lsm.bloom_bit_count", "number of bits per item for bloom filters", 0x0, 4, 64, 1000, &g.c_bloom_bit_count, NULL}, - {"lsm.bloom_hash_count", "number of hash values per item for LSM bloom filters", 0x0, 4, 32, 100, + {"lsm.bloom_hash_count", "number of hash values per item for bloom filters", 0x0, 4, 32, 100, &g.c_bloom_hash_count, NULL}, /* 10% */ - {"lsm.bloom_oldest", "if bloom_oldest=true", C_BOOL, 10, 0, 0, &g.c_bloom_oldest, NULL}, + {"lsm.bloom_oldest", "configure bloom_oldest=true", C_BOOL, 10, 0, 0, &g.c_bloom_oldest, NULL}, - {"lsm.chunk_size", "LSM chunk size in MB", 0x0, 1, 10, 100, &g.c_chunk_size, NULL}, + {"lsm.chunk_size", "LSM chunk size (MB)", 0x0, 1, 10, 100, &g.c_chunk_size, NULL}, - {"lsm.merge_max", "the maximum number of chunks to include in a merge operation", 0x0, 4, 20, 100, - &g.c_merge_max, NULL}, + {"lsm.merge_max", "maximum number of chunks to include in an LSM merge operation", 0x0, 4, 20, + 100, &g.c_merge_max, NULL}, - {"lsm.worker_threads", "the number of LSM worker threads", 0x0, 3, 4, 20, &g.c_lsm_worker_threads, + {"lsm.worker_threads", "number of LSM worker threads", 0x0, 3, 4, 20, &g.c_lsm_worker_threads, NULL}, /* 10% */ - {"ops.alter", "if altering the table is enabled", C_BOOL, 10, 0, 0, &g.c_alter, NULL}, + {"ops.alter", "configure table alterations", C_BOOL, 10, 0, 0, &g.c_alter, NULL}, /* 10% */ - {"ops.compaction", "if compaction is running", C_BOOL, 10, 0, 0, &g.c_compact, NULL}, + {"ops.compaction", "configure compaction", C_BOOL, 10, 0, 0, &g.c_compact, NULL}, /* 50% */ - {"ops.hs_cursor", "if history store cursor reads configured", C_BOOL, 50, 0, 0, &g.c_hs_cursor, - NULL}, + {"ops.hs_cursor", "configure history store cursor reads", C_BOOL, 50, 0, 0, &g.c_hs_cursor, NULL}, - {"ops.pct.delete", "percent operations that are deletes", C_IGNORE, 0, 0, 100, &g.c_delete_pct, - NULL}, + {"ops.pct.delete", "delete operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_delete_pct, NULL}, - {"ops.pct.insert", "percent operations that are inserts", C_IGNORE, 0, 0, 100, &g.c_insert_pct, - NULL}, + {"ops.pct.insert", "insert operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_insert_pct, NULL}, - {"ops.pct.modify", "percent operations that are value modifications", C_IGNORE, 0, 0, 100, - &g.c_modify_pct, NULL}, + {"ops.pct.modify", "modify operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_modify_pct, NULL}, - {"ops.pct.read", "percent operations that are reads", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL}, + {"ops.pct.read", "read operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL}, - {"ops.pct.write", "percent operations that are value updates", C_IGNORE, 0, 0, 100, - &g.c_write_pct, NULL}, + {"ops.pct.write", "update operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_write_pct, NULL}, /* 5% */ {"ops.prepare", "configure transaction prepare", C_BOOL, 5, 0, 0, &g.c_prepare, NULL}, /* 10% */ - {"ops.random_cursor", "if random cursor reads configured", C_BOOL, 10, 0, 0, &g.c_random_cursor, + {"ops.random_cursor", "configure random cursor reads", C_BOOL, 10, 0, 0, &g.c_random_cursor, NULL}, /* 100% */ - {"ops.rebalance", "rebalance testing", C_BOOL, 100, 1, 0, &g.c_rebalance, NULL}, + {"ops.rebalance", "configure rebalance", C_BOOL, 100, 1, 0, &g.c_rebalance, NULL}, /* 100% */ - {"ops.salvage", "salvage testing", C_BOOL, 100, 1, 0, &g.c_salvage, NULL}, + {"ops.salvage", "configure salvage", C_BOOL, 100, 1, 0, &g.c_salvage, NULL}, /* 100% */ - {"ops.truncate", "enable truncation", C_BOOL, 100, 0, 0, &g.c_truncate, NULL}, + {"ops.truncate", "configure truncation", C_BOOL, 100, 0, 0, &g.c_truncate, NULL}, /* 100% */ - {"ops.verify", "to regularly verify during a run", C_BOOL, 100, 1, 0, &g.c_verify, NULL}, + {"ops.verify", "configure verify", C_BOOL, 100, 1, 0, &g.c_verify, NULL}, {"quiet", "quiet run (same as -q)", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_quiet, NULL}, - {"runs", "the number of runs", C_IGNORE, 0, 0, UINT_MAX, &g.c_runs, NULL}, + {"runs", "number of runs", C_IGNORE, 0, 0, UINT_MAX, &g.c_runs, NULL}, - {"runs.in_memory", "if in-memory configured", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_in_memory, NULL}, + {"runs.in_memory", "configure in-memory", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_in_memory, NULL}, - {"runs.ops", "the number of operations done per run", 0x0, 0, M(2), M(100), &g.c_ops, NULL}, + {"runs.ops", "operations per run", 0x0, 0, M(2), M(100), &g.c_ops, NULL}, - {"runs.rows", "the number of rows to create", 0x0, 10, M(1), M(100), &g.c_rows, NULL}, + {"runs.rows", "number of rows", 0x0, 10, M(1), M(100), &g.c_rows, NULL}, - {"runs.source", "data source (file | lsm | table)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"runs.source", "data source type (file | lsm | table)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_data_source}, - {"runs.threads", "the number of worker threads", 0x0, 1, 32, 128, &g.c_threads, NULL}, + {"runs.threads", "number of worker threads", 0x0, 1, 32, 128, &g.c_threads, NULL}, - {"runs.timer", "maximum time to run in minutes", C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL}, + {"runs.timer", "run time (minutes)", C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL}, - {"runs.type", "type of store to create (fix | var | row)", C_IGNORE | C_STRING, 0, 0, 0, NULL, + {"runs.type", "object type (fix | var | row)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_file_type}, - {"runs.verify_failure_dump", "attempt page dump on repeatable read error", C_IGNORE | C_BOOL, 0, + {"runs.verify_failure_dump", "configure page dump on repeatable read error", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_verify_failure_dump, NULL}, /* 20% */ - {"statistics", "maintain statistics", C_BOOL, 20, 0, 0, &g.c_statistics, NULL}, + {"statistics", "configure statistics", C_BOOL, 20, 0, 0, &g.c_statistics, NULL}, /* 5% */ - {"statistics.server", "run the statistics server thread", C_BOOL, 5, 0, 0, &g.c_statistics_server, - NULL}, + {"statistics.server", "configure statistics server thread", C_BOOL, 5, 0, 0, + &g.c_statistics_server, NULL}, /* 2% */ {"stress.aggressive_sweep", "stress aggressive sweep", C_BOOL, 2, 0, 0, @@ -341,25 +331,29 @@ static CONFIG c[] = { /* 2% */ {"stress.split_8", "stress splits (#8)", C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL}, - {"transaction.frequency", "percent operations done inside an explicit transaction", 0x0, 1, 100, + {"transaction.frequency", "operations inside an explicit transaction (percentage)", 0x0, 1, 100, 100, &g.c_txn_freq, NULL}, {"transaction.isolation", "isolation level (random | read-uncommitted | read-committed | snapshot)", C_IGNORE | C_STRING, 0, 0, 0, NULL, &g.c_isolation}, + /* 0% - By default, turned off until fallout has been debugged. */ + {"transaction.rollback_to_stable", "configure rollback_to_stable", C_BOOL, 0, 0, 0, + &g.c_txn_rollback_to_stable, NULL}, + /* 70% */ - {"transaction.timestamps", "enable transaction timestamp support", C_BOOL, 70, 0, 0, + {"transaction.timestamps", "configure transaction timestamps", C_BOOL, 70, 0, 0, &g.c_txn_timestamps, NULL}, - {"wiredtiger.config", "configuration string used to wiredtiger_open", C_IGNORE | C_STRING, 0, 0, - 0, NULL, &g.c_config_open}, + {"wiredtiger.config", "wiredtiger_open API configuration string", C_IGNORE | C_STRING, 0, 0, 0, + NULL, &g.c_config_open}, /* 80% */ - {"wiredtiger.rwlock", "if wiredtiger read/write mutexes should be used", C_BOOL, 80, 0, 0, - &g.c_wt_mutex, NULL}, + {"wiredtiger.rwlock", "configure wiredtiger read/write mutexes", C_BOOL, 80, 0, 0, &g.c_wt_mutex, + NULL}, - {"wiredtiger.leak_memory", "if memory should be leaked on close", C_BOOL, 0, 0, 0, + {"wiredtiger.leak_memory", "configure memory leaked on shutdown", C_BOOL, 0, 0, 0, &g.c_leak_memory, NULL}, {NULL, NULL, 0x0, 0, 0, 0, NULL, NULL}}; diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index af5e16cc60b..5b4e8589b8c 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -74,7 +74,6 @@ typedef struct { #define LOCK_INITIALIZED(lock) ((lock)->lock_type != LOCK_NONE) typedef struct { - wt_thread_t tid; /* thread ID */ char tidbuf[128]; /* thread ID in printable form */ WT_CONNECTION *wts_conn; @@ -112,6 +111,8 @@ typedef struct { WT_RAND_STATE rnd; /* Global RNG state */ + uint32_t rts_no_check; /* track unsuccessful RTS checking */ + /* * Prepare will return an error if the prepare timestamp is less than any active read timestamp. * Lock across allocating prepare and read timestamps. @@ -121,7 +122,9 @@ typedef struct { */ RWLOCK ts_lock; - uint64_t timestamp; /* Counter for timestamps */ + uint64_t timestamp; /* Counter for timestamps */ + uint64_t oldest_timestamp; /* Last timestamp used for oldest */ + uint64_t stable_timestamp; /* Last timestamp used for stable */ uint64_t truncate_cnt; /* Counter for truncation */ @@ -222,6 +225,7 @@ typedef struct { uint32_t c_timing_stress_split_8; uint32_t c_truncate; uint32_t c_txn_freq; + uint32_t c_txn_rollback_to_stable; uint32_t c_txn_timestamps; uint32_t c_value_max; uint32_t c_value_min; @@ -305,6 +309,13 @@ typedef struct { } SNAP_OPS; typedef struct { + SNAP_OPS *snap_state_current; + SNAP_OPS *snap_state_end; + SNAP_OPS *snap_state_first; + SNAP_OPS *snap_state_list; +} SNAP_STATE; + +typedef struct { int id; /* simple thread ID */ wt_thread_t tid; /* thread ID */ char tidbuf[128]; /* thread ID in printable form */ @@ -340,7 +351,14 @@ typedef struct { uint64_t opid; /* Operation ID */ uint64_t read_ts; /* read timestamp */ uint64_t commit_ts; /* commit timestamp */ - SNAP_OPS *snap, *snap_first, snap_list[512]; + uint64_t stable_ts; /* stable timestamp */ + SNAP_STATE snap_states[2]; + SNAP_STATE *s; /* points to one of the snap_states */ + +#define snap_current s->snap_state_current +#define snap_end s->snap_state_end +#define snap_first s->snap_state_first +#define snap_list s->snap_state_list uint64_t insert_list[256]; /* column-store inserted records */ u_int insert_list_cnt; @@ -354,6 +372,8 @@ typedef struct { } TINFO; extern TINFO **tinfo_list; +#define SNAP_LIST_SIZE 512 + WT_THREAD_RET alter(void *); WT_THREAD_RET backup(void *); WT_THREAD_RET checkpoint(void *); @@ -382,12 +402,16 @@ void operations(u_int, bool); void path_setup(const char *); void set_alarm(u_int); void set_core_off(void); -void snap_init(TINFO *, uint64_t, bool); +void snap_init(TINFO *); +void snap_teardown(TINFO *); +void snap_op_init(TINFO *, uint64_t, bool); +void snap_repeat_rollback(WT_CURSOR *, TINFO **, size_t); void snap_repeat_single(WT_CURSOR *, TINFO *); int snap_repeat_txn(WT_CURSOR *, TINFO *); void snap_repeat_update(TINFO *, bool); void snap_track(TINFO *, thread_op); -void timestamp_once(WT_SESSION *); +void timestamp_once(WT_SESSION *, bool); +void timestamp_parse(WT_SESSION *, const char *, uint64_t *); int trace_config(const char *); void trace_init(void); void trace_ops_init(TINFO *); diff --git a/src/third_party/wiredtiger/test/format/hs.c b/src/third_party/wiredtiger/test/format/hs.c index 55b298f20e7..0cb0cf352aa 100644 --- a/src/third_party/wiredtiger/test/format/hs.c +++ b/src/third_party/wiredtiger/test/format/hs.c @@ -83,8 +83,13 @@ hs_cursor(void *arg) /* Search to the last-known location. */ if (!restart) { cursor->set_key(cursor, hs_btree_id, &key, hs_start_ts, hs_counter); + + /* + * Limit expected errors because this is a diagnostic check (the WiredTiger API allows + * prepare-conflict, but that would be unexpected from the history store file). + */ ret = cursor->search_near(cursor, &exact); - testutil_assert(ret == 0 || ret == WT_NOTFOUND); + testutil_assert(ret == 0 || ret == WT_NOTFOUND || ret == WT_ROLLBACK); } /* @@ -99,7 +104,7 @@ hs_cursor(void *arg) cursor, &hs_stop_durable_ts, &hs_durable_timestamp, &hs_upd_type, &hs_value)); continue; } - testutil_assert(ret == WT_NOTFOUND); + testutil_assert(ret == WT_NOTFOUND || ret == WT_ROLLBACK); break; } diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 0d4692f0472..38c9a5b5690 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -102,6 +102,130 @@ random_failure(void) TINFO **tinfo_list; /* + * tinfo_init -- + * Initialize the worker thread structures. + */ +static void +tinfo_init(void) +{ + TINFO *tinfo; + u_int i; + + /* Allocate the thread structures separately to minimize false sharing. */ + if (tinfo_list == NULL) { + tinfo_list = dcalloc((size_t)g.c_threads + 1, sizeof(TINFO *)); + for (i = 0; i < g.c_threads; ++i) { + tinfo_list[i] = dcalloc(1, sizeof(TINFO)); + tinfo = tinfo_list[i]; + + tinfo->id = (int)i + 1; + + /* Set up the default key and value buffers. */ + tinfo->key = &tinfo->_key; + key_gen_init(tinfo->key); + tinfo->value = &tinfo->_value; + val_gen_init(tinfo->value); + tinfo->lastkey = &tinfo->_lastkey; + key_gen_init(tinfo->lastkey); + + snap_init(tinfo); + } + } + + /* Cleanup for each new run. */ + for (i = 0; i < g.c_threads; ++i) { + tinfo = tinfo_list[i]; + + tinfo->ops = 0; + tinfo->commit = 0; + tinfo->insert = 0; + tinfo->prepare = 0; + tinfo->remove = 0; + tinfo->rollback = 0; + tinfo->search = 0; + tinfo->truncate = 0; + tinfo->update = 0; + + tinfo->session = NULL; + tinfo->cursor = NULL; + + tinfo->insert_list_cnt = 0; + + tinfo->state = TINFO_RUNNING; + tinfo->quit = false; + } +} + +/* + * tinfo_teardown -- + * Tear down the worker thread structures. + */ +static void +tinfo_teardown(void) +{ + TINFO *tinfo; + u_int i; + + for (i = 0; i < g.c_threads; ++i) { + tinfo = tinfo_list[i]; + + __wt_buf_free(NULL, &tinfo->vprint); + + /* + * Assert records were not removed unless configured to do so, otherwise subsequent runs can + * incorrectly report scan errors. + */ + testutil_assert(g.c_delete_pct != 0 || tinfo->remove == 0); + + snap_teardown(tinfo); + key_gen_teardown(tinfo->key); + val_gen_teardown(tinfo->value); + key_gen_teardown(tinfo->lastkey); + + free(tinfo); + } + free(tinfo_list); + tinfo_list = NULL; +} + +/* + * Command used before rollback to stable to save the interesting files so we can replay the command + * as necessary. + * + * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working + * directory to our output. + */ +#define ROLLBACK_STABLE_COPY_CMD \ + "cd %s > /dev/null && " \ + "rm -rf ROLLBACK.copy && mkdir ROLLBACK.copy && " \ + "cp WiredTiger* wt* ROLLBACK.copy/" + +/* + * tinfo_rollback_to_stable_and_check -- + * Do a rollback to stable, then check that changes are correct from what we know in the worker + * thread structures. + */ +static void +tinfo_rollback_to_stable_and_check(WT_SESSION *session) +{ + WT_CURSOR *cursor; + WT_DECL_RET; + char cmd[512]; + + testutil_check(__wt_snprintf(cmd, sizeof(cmd), ROLLBACK_STABLE_COPY_CMD, g.home)); + if ((ret = system(cmd)) != 0) + testutil_die(ret, "rollback to stable copy (\"%s\") failed", cmd); + trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp); + + g.wts_conn->rollback_to_stable(g.wts_conn, NULL); + + /* Check the saved snap operations for consistency. */ + testutil_check(session->open_cursor(session, g.uri, NULL, NULL, &cursor)); + snap_repeat_rollback(cursor, tinfo_list, g.c_threads); + testutil_check(cursor->close(cursor)); +} + +/* * operations -- * Perform a number of operations in a set of threads. */ @@ -153,29 +277,23 @@ operations(u_int ops_seconds, bool lastrun) quit_fourths = fourths + 15 * 4 * 60; } + /* Get a session. */ testutil_check(conn->open_session(conn, NULL, NULL, &session)); + + /* Initialize and start the worker threads. */ + tinfo_init(); trace_msg("%s", "=============== thread ops start"); /* Initialize locks to single-thread backups, failures, and timestamp updates. */ lock_init(session, &g.backup_lock); lock_init(session, &g.ts_lock); - /* - * Create the per-thread structures and start the worker threads. Allocate the thread structures - * separately to minimize false sharing. - */ - tinfo_list = dcalloc((size_t)g.c_threads + 1, sizeof(TINFO *)); for (i = 0; i < g.c_threads; ++i) { - tinfo_list[i] = tinfo = dcalloc(1, sizeof(TINFO)); - - tinfo->id = (int)i + 1; - tinfo->state = TINFO_RUNNING; + tinfo = tinfo_list[i]; testutil_check(__wt_thread_create(NULL, &tinfo->tid, ops, tinfo)); } - /* - * If a multi-threaded run, start optional special-purpose threads. - */ + /* Start optional special-purpose threads. */ if (g.c_alter) testutil_check(__wt_thread_create(NULL, &alter_tid, alter, NULL)); if (g.c_backups) @@ -278,21 +396,14 @@ operations(u_int ops_seconds, bool lastrun) lock_destroy(session, &g.ts_lock); trace_msg("%s", "=============== thread ops stop"); - testutil_check(session->close(session, NULL)); - for (i = 0; i < g.c_threads; ++i) { - tinfo = tinfo_list[i]; + if (g.c_txn_rollback_to_stable) + tinfo_rollback_to_stable_and_check(session); - __wt_buf_free(NULL, &tinfo->vprint); + testutil_check(session->close(session, NULL)); - /* - * Assert records were not removed unless configured to do so, otherwise subsequent runs can - * incorrectly report scan errors. - */ - testutil_assert(g.c_delete_pct != 0 || tinfo->remove == 0); - free(tinfo); - } - free(tinfo_list); + if (lastrun) + tinfo_teardown(); } /* @@ -335,7 +446,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts)); ret = session->timestamp_transaction(session, buf); if (ret == 0) { - snap_init(tinfo, ts, true); + snap_op_init(tinfo, ts, true); trace_op(tinfo, "begin snapshot read-ts=%" PRIu64 " (repeatable)", ts); return; } @@ -363,7 +474,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp) lock_writeunlock(session, &g.ts_lock); - snap_init(tinfo, ts, false); + snap_op_init(tinfo, ts, false); trace_op(tinfo, "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts); } @@ -401,7 +512,7 @@ begin_transaction(TINFO *tinfo, u_int *iso_configp) wiredtiger_begin_transaction(session, config); - snap_init(tinfo, WT_TS_NONE, false); + snap_op_init(tinfo, WT_TS_NONE, false); trace_op(tinfo, "begin %s", config); } @@ -605,10 +716,10 @@ ops(void *arg) testutil_check(__wt_thread_str(tinfo->tidbuf, sizeof(tinfo->tidbuf))); /* - * Characterize the per-thread random number generator. Normally we want independent behavior - * so threads start in different parts of the RNG space, but we've found bugs by having the - * threads pound on the same key/value pairs, that is, by making them traverse the same RNG - * space. 75% of the time we run in independent RNG space. + * Characterize the per-thread random number generator. Normally we want independent behavior so + * threads start in different parts of the RNG space, but we've found bugs by having the threads + * pound on the same key/value pairs, that is, by making them traverse the same RNG space. 75% + * of the time we run in independent RNG space. */ if (g.c_independent_thread_rng) __wt_random_init_seed(NULL, &tinfo->rnd); @@ -617,17 +728,6 @@ ops(void *arg) iso_config = ISOLATION_RANDOM; /* -Wconditional-uninitialized */ - /* Tracking of transactional snapshot isolation operations. */ - tinfo->snap = tinfo->snap_first = tinfo->snap_list; - - /* Set up the default key and value buffers. */ - tinfo->key = &tinfo->_key; - key_gen_init(tinfo->key); - tinfo->value = &tinfo->_value; - val_gen_init(tinfo->value); - tinfo->lastkey = &tinfo->_lastkey; - key_gen_init(tinfo->lastkey); - /* Set the first operation where we'll create sessions and cursors. */ cursor = NULL; session = NULL; @@ -996,16 +1096,11 @@ rollback: intxn = false; } - if (session != NULL) + if (session != NULL) { testutil_check(session->close(session, NULL)); - - for (i = 0; i < WT_ELEMENTS(tinfo->snap_list); ++i) { - free(tinfo->snap_list[i].kdata); - free(tinfo->snap_list[i].vdata); + tinfo->cursor = NULL; + tinfo->session = NULL; } - key_gen_teardown(tinfo->key); - val_gen_teardown(tinfo->value); - key_gen_teardown(tinfo->lastkey); tinfo->state = TINFO_COMPLETE; return (WT_THREAD_RET_VALUE); diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index 3f82e5b6499..4fe784e87db 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -29,15 +29,111 @@ #include "format.h" /* + * Issue a warning when there enough consecutive unsuccessful checks for rollback to stable. + */ +#define WARN_RTS_NO_CHECK 5 + +/* * snap_init -- * Initialize the repeatable operation tracking. */ void -snap_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads) +snap_init(TINFO *tinfo) +{ + /* + * We maintain two snap lists. The current one is indicated by tinfo->s, and keeps the most + * recent operations. The other one is used when we are running with rollback_to_stable. When + * each thread notices that the stable timestamp has changed, it stashes the current snap list + * and starts fresh with the other snap list. After we've completed a rollback_to_stable, we can + * the secondary snap list to see the state of keys/values seen and updated at the time of the + * rollback. + */ + if (g.c_txn_rollback_to_stable) { + tinfo->s = &tinfo->snap_states[1]; + tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS)); + tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE]; + } + tinfo->s = &tinfo->snap_states[0]; + tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS)); + tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE]; + tinfo->snap_current = tinfo->snap_list; +} + +/* + * snap_teardown -- + * Tear down the repeatable operation tracking structures. + */ +void +snap_teardown(TINFO *tinfo) { + SNAP_OPS *snaplist; + u_int i, snap_index; + + for (snap_index = 0; snap_index < WT_ELEMENTS(tinfo->snap_states); snap_index++) + if ((snaplist = tinfo->snap_states[snap_index].snap_state_list) != NULL) { + for (i = 0; i < SNAP_LIST_SIZE; ++i) { + free(snaplist[i].kdata); + free(snaplist[i].vdata); + } + free(snaplist); + } +} + +/* + * snap_clear -- + * Clear a single snap entry. + */ +static void +snap_clear_one(SNAP_OPS *snap) +{ + snap->repeatable = false; +} + +/* + * snap_clear -- + * Clear the snap list. + */ +static void +snap_clear(TINFO *tinfo) +{ + SNAP_OPS *snap; + + for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap) + snap_clear_one(snap); +} + +/* + * snap_op_init -- + * Initialize the repeatable operation tracking for each new operation. + */ +void +snap_op_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads) +{ + uint64_t stable_ts; + ++tinfo->opid; - tinfo->snap_first = tinfo->snap; + if (g.c_txn_rollback_to_stable) { + /* + * If the stable timestamp has changed and we've advanced beyond it, preserve the current + * snapshot history up to this point, we'll use it verify rollback_to_stable. Switch our + * tracking to the other snap list. + */ + stable_ts = __wt_atomic_addv64(&g.stable_timestamp, 0); + if (stable_ts != tinfo->stable_ts && read_ts > stable_ts) { + tinfo->stable_ts = stable_ts; + if (tinfo->s == &tinfo->snap_states[0]) + tinfo->s = &tinfo->snap_states[1]; + else + tinfo->s = &tinfo->snap_states[0]; + tinfo->snap_current = tinfo->snap_list; + + /* Clear out older info from the snap list. */ + snap_clear(tinfo); + } + } + + tinfo->snap_first = tinfo->snap_current; tinfo->read_ts = read_ts; tinfo->repeatable_reads = repeatable_reads; @@ -54,7 +150,7 @@ snap_track(TINFO *tinfo, thread_op op) WT_ITEM *ip; SNAP_OPS *snap; - snap = tinfo->snap; + snap = tinfo->snap_current; snap->op = op; snap->opid = tinfo->opid; snap->keyno = tinfo->keyno; @@ -82,15 +178,15 @@ snap_track(TINFO *tinfo, thread_op op) } /* Move to the next slot, wrap at the end of the circular buffer. */ - if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) - tinfo->snap = tinfo->snap_list; + if (++tinfo->snap_current >= tinfo->snap_end) + tinfo->snap_current = tinfo->snap_list; /* * It's possible to pass this transaction's buffer starting point and start replacing our own * entries. If that happens, we can't repeat operations because we don't know which ones were * previously modified. */ - if (tinfo->snap->opid == tinfo->opid) + if (tinfo->snap_current->opid == tinfo->opid) tinfo->repeatable_wrap = true; } @@ -236,10 +332,9 @@ static void snap_ts_clear(TINFO *tinfo, uint64_t ts) { SNAP_OPS *snap; - int count; /* Check from the first slot to the last. */ - for (snap = tinfo->snap_list, count = WT_ELEMENTS(tinfo->snap_list); count > 0; --count, ++snap) + for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap) if (snap->repeatable && snap->ts <= ts) snap->repeatable = false; } @@ -297,7 +392,7 @@ snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current) */ for (p = current;;) { /* Wrap at the end of the circular buffer. */ - if (++p >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) + if (++p >= tinfo->snap_end) p = tinfo->snap_list; if (p->opid != tinfo->opid) break; @@ -311,7 +406,7 @@ snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current) for (p = current;;) { /* Wrap at the beginning of the circular buffer. */ if (--p < tinfo->snap_list) - p = &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list) - 1]; + p = &tinfo->snap_list[SNAP_LIST_SIZE - 1]; if (p->opid != tinfo->opid) break; @@ -341,7 +436,7 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current) for (p = current;;) { /* Wrap at the beginning of the circular buffer. */ if (--p < tinfo->snap_list) - p = &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list) - 1]; + p = &tinfo->snap_list[SNAP_LIST_SIZE - 1]; if (p->opid != tinfo->opid) break; @@ -367,7 +462,7 @@ snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo) /* Check from the first operation we saved to the last. */ for (current = tinfo->snap_first;; ++current) { /* Wrap at the end of the circular buffer. */ - if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) + if (current >= tinfo->snap_end) current = tinfo->snap_list; if (current->opid != tinfo->opid) break; @@ -401,7 +496,7 @@ snap_repeat_update(TINFO *tinfo, bool committed) /* Check from the first operation we saved to the last. */ for (current = tinfo->snap_first;; ++current) { /* Wrap at the end of the circular buffer. */ - if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) + if (current >= tinfo->snap_end) current = tinfo->snap_list; if (current->opid != tinfo->opid) break; @@ -429,45 +524,22 @@ snap_repeat_update(TINFO *tinfo, bool committed) } /* - * snap_repeat_single -- - * Repeat an historic operation. + * snap_repeat -- + * Repeat one operation. */ -void -snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo) +static void +snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap, bool rollback_allowed) { - SNAP_OPS *snap; WT_DECL_RET; WT_SESSION *session; - int count; - u_int v; char buf[64]; session = cursor->session; /* - * Start at a random spot in the list of operations and look for a read to retry. Stop when - * we've walked the entire list or found one. - */ - v = mmrand(&tinfo->rnd, 1, WT_ELEMENTS(tinfo->snap_list)) - 1; - for (snap = &tinfo->snap_list[v], count = WT_ELEMENTS(tinfo->snap_list); count > 0; - --count, ++snap) { - /* Wrap at the end of the circular buffer. */ - if (snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)]) - snap = tinfo->snap_list; - - if (snap->repeatable) - break; - } - - if (count == 0) - return; - - /* * Start a new transaction. Set the read timestamp. Verify the record. Discard the transaction. */ - while ((ret = session->begin_transaction(session, "isolation=snapshot")) == WT_CACHE_FULL) - __wt_yield(); - testutil_check(ret); + wiredtiger_begin_transaction(session, "isolation=snapshot"); /* * If the timestamp has aged out of the system, we'll get EINVAL when we try and set it. @@ -482,7 +554,7 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo) /* The only expected error is rollback. */ ret = snap_verify(cursor, tinfo, snap); - if (ret != 0 && ret != WT_ROLLBACK) + if (ret != 0 && (!rollback_allowed || ret != WT_ROLLBACK)) testutil_check(ret); } else if (ret == EINVAL) snap_ts_clear(tinfo, snap->ts); @@ -492,3 +564,91 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo) /* Discard the transaction. */ testutil_check(session->rollback_transaction(session, NULL)); } + +/* + * snap_repeat_single -- + * Repeat an historic operation. + */ +void +snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo) +{ + SNAP_OPS *snap; + u_int v; + int count; + + /* + * Start at a random spot in the list of operations and look for a read to retry. Stop when + * we've walked the entire list or found one. + */ + v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1; + for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) { + /* Wrap at the end of the circular buffer. */ + if (snap >= tinfo->snap_end) + snap = tinfo->snap_list; + + if (snap->repeatable) + break; + } + + if (count == 0) + return; + + snap_repeat(cursor, tinfo, snap, true); +} + +/* + * snap_repeat_rollback -- + * Repeat all known operations after a rollback. + */ +void +snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count) +{ + SNAP_OPS *snap; + SNAP_STATE *state; + TINFO *tinfo, **tinfop; + uint32_t count; + size_t i, statenum; + char buf[100]; + + count = 0; + + track("rollback_to_stable: checking", 0ULL, NULL); + for (i = 0, tinfop = tinfo_array; i < tinfo_count; ++i, ++tinfop) { + tinfo = *tinfop; + + /* + * For this thread, walk through both sets of snaps ("states"), looking for entries that are + * repeatable and have relevant timestamps. One set will have the most current operations, + * meaning they will likely be newer than the stable timestamp, and thus cannot be checked. + * The other set typically has operations that are just before the stable timestamp, so are + * candidates for checking. + */ + for (statenum = 0; statenum < WT_ELEMENTS(tinfo->snap_states); statenum++) { + state = &tinfo->snap_states[statenum]; + for (snap = state->snap_state_list; snap < state->snap_state_end; ++snap) { + if (snap->repeatable && snap->ts <= g.stable_timestamp && + snap->ts >= g.oldest_timestamp) { + snap_repeat(cursor, tinfo, snap, false); + ++count; + if (count % 100 == 0) { + testutil_check(__wt_snprintf( + buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count)); + track(buf, 0ULL, NULL); + } + } + snap_clear_one(snap); + } + } + } + + /* Show the final result and check that we're accomplishing some checking. */ + testutil_check( + __wt_snprintf(buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count)); + track(buf, 0ULL, NULL); + if (count == 0) { + if (++g.rts_no_check >= WARN_RTS_NO_CHECK) + fprintf(stderr, + "Warning: %" PRIu32 " consecutive runs with no rollback_to_stable checking\n", count); + } else + g.rts_no_check = 0; +} diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 787488c15b7..d2ae281e281 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -28,12 +28,46 @@ #include "format.h" +/* + * track_ts_diff -- + * Return a one character descriptor of relative timestamp values. + */ +static const char * +track_ts_diff(uint64_t left_ts, uint64_t right_ts) +{ + if (left_ts < right_ts) + return "+"; + else if (left_ts == right_ts) + return "="; + else + return "-"; +} + +/* + * track_ts_dots -- + * Return an entry in the time stamp progress indicator. + */ +static const char * +track_ts_dots(u_int dot_count) +{ + static const char *dots[] = {" ", ". ", ".. ", "..."}; + + return (dots[dot_count % WT_ELEMENTS(dots)]); +} + +/* + * track -- + * Show a status line of operations and time stamp progress. + */ void track(const char *tag, uint64_t cnt, TINFO *tinfo) { - static size_t lastlen = 0; + static size_t last_len; + static uint64_t last_cur, last_old, last_stable; + static u_int cur_dot_cnt, old_dot_cnt, stable_dot_cnt; size_t len; - char msg[128]; + uint64_t cur_ts, old_ts, stable_ts; + char msg[128], ts_msg[64]; if (g.c_quiet || tag == NULL) return; @@ -44,12 +78,49 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) else if (tinfo == NULL) testutil_check(__wt_snprintf_len_set( msg, sizeof(msg), &len, "%4" PRIu32 ": %s: %" PRIu64, g.run_cnt, tag, cnt)); - else + else { + ts_msg[0] = '\0'; + if (g.c_txn_timestamps) { + /* + * Don't worry about having a completely consistent set of timestamps. + */ + old_ts = g.oldest_timestamp; + stable_ts = g.stable_timestamp; + cur_ts = g.timestamp; + + if (old_ts != last_old) { + ++old_dot_cnt; + last_old = old_ts; + } + if (stable_ts != last_stable) { + ++stable_dot_cnt; + last_stable = stable_ts; + } + if (cur_ts != last_cur) { + ++cur_dot_cnt; + last_cur = cur_ts; + } + + if (g.c_txn_rollback_to_stable) + testutil_check(__wt_snprintf(ts_msg, sizeof(ts_msg), + " old%s" + "stb%s%s" + "ts%s%s", + track_ts_dots(old_dot_cnt), track_ts_diff(old_ts, stable_ts), + track_ts_dots(stable_dot_cnt), track_ts_diff(stable_ts, cur_ts), + track_ts_dots(cur_dot_cnt))); + else + testutil_check(__wt_snprintf(ts_msg, sizeof(ts_msg), + " old%s" + "ts%s%s", + track_ts_dots(old_dot_cnt), track_ts_diff(old_ts, cur_ts), + track_ts_dots(cur_dot_cnt))); + } testutil_check(__wt_snprintf_len_set(msg, sizeof(msg), &len, "%4" PRIu32 ": %s: " - "search %" PRIu64 "%s, " - "insert %" PRIu64 "%s, " - "update %" PRIu64 "%s, " - "remove %" PRIu64 "%s", + "S %" PRIu64 "%s, " + "I %" PRIu64 "%s, " + "U %" PRIu64 "%s, " + "R %" PRIu64 "%s%s", g.run_cnt, tag, tinfo->search > M(9) ? tinfo->search / M(1) : tinfo->search, tinfo->search > M(9) ? "M" : "", tinfo->insert > M(9) ? tinfo->insert / M(1) : tinfo->insert, @@ -57,13 +128,13 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) tinfo->update > M(9) ? tinfo->update / M(1) : tinfo->update, tinfo->update > M(9) ? "M" : "", tinfo->remove > M(9) ? tinfo->remove / M(1) : tinfo->remove, - tinfo->remove > M(9) ? "M" : "")); - - if (lastlen > len) { - memset(msg + len, ' ', (size_t)(lastlen - len)); - msg[lastlen] = '\0'; + tinfo->remove > M(9) ? "M" : "", ts_msg)); + } + if (last_len > len) { + memset(msg + len, ' ', (size_t)(last_len - len)); + msg[last_len] = '\0'; } - lastlen = len; + last_len = len; if (printf("%s\r", msg) < 0) testutil_die(EIO, "printf"); @@ -163,16 +234,18 @@ fclose_and_clear(FILE **fpp) * Update the timestamp once. */ void -timestamp_once(WT_SESSION *session) +timestamp_once(WT_SESSION *session, bool allow_lag) { static const char *oldest_timestamp_str = "oldest_timestamp="; + static const char *stable_timestamp_str = "stable_timestamp="; WT_CONNECTION *conn; WT_DECL_RET; - char buf[WT_TS_HEX_STRING_SIZE + 64]; + size_t len; + uint64_t all_durable, stable; + char buf[WT_TS_HEX_STRING_SIZE * 2 + 64], tsbuf[WT_TS_HEX_STRING_SIZE]; conn = g.wts_conn; - - testutil_check(__wt_snprintf(buf, sizeof(buf), "%s", oldest_timestamp_str)); + stable = 0ULL; /* * Lock out transaction timestamp operations. The lock acts as a barrier ensuring we've checked @@ -183,16 +256,57 @@ timestamp_once(WT_SESSION *session) if (LOCK_INITIALIZED(&g.ts_lock)) lock_writelock(session, &g.ts_lock); - ret = conn->query_timestamp(conn, buf + strlen(oldest_timestamp_str), "get=all_durable"); - testutil_assert(ret == 0 || ret == WT_NOTFOUND); - if (ret == 0) + if ((ret = conn->query_timestamp(conn, tsbuf, "get=all_durable")) == 0) { + timestamp_parse(session, tsbuf, &all_durable); + + /* + * If a lag is permitted, move the oldest timestamp half the way to the current + * "all_durable" timestamp. + */ + if (allow_lag) + g.oldest_timestamp = (all_durable + g.oldest_timestamp) / 2; + else + g.oldest_timestamp = all_durable; + testutil_check( + __wt_snprintf(buf, sizeof(buf), "%s%" PRIx64, oldest_timestamp_str, g.oldest_timestamp)); + + /* + * When we're doing rollback to stable operations, we'll advance the stable timestamp to the + * current timestamp value. + */ + if (g.c_txn_rollback_to_stable) { + stable = g.timestamp; + len = strlen(buf); + WT_ASSERT((WT_SESSION_IMPL *)session, len < sizeof(buf)); + testutil_check(__wt_snprintf( + buf + len, sizeof(buf) - len, ",%s%" PRIx64, stable_timestamp_str, stable)); + } testutil_check(conn->set_timestamp(conn, buf)); + trace_msg("%-10s oldest=%" PRIu64 ", stable=%" PRIu64, "setts", g.oldest_timestamp, stable); + if (g.c_txn_rollback_to_stable) + g.stable_timestamp = stable; + + } else + testutil_assert(ret == WT_NOTFOUND); if (LOCK_INITIALIZED(&g.ts_lock)) lock_writeunlock(session, &g.ts_lock); } /* + * timestamp_parse -- + * Parse a timestamp to an integral value. + */ +void +timestamp_parse(WT_SESSION *session, const char *str, uint64_t *tsp) +{ + char *p; + + *tsp = strtoull(str, &p, 16); + WT_ASSERT((WT_SESSION_IMPL *)session, p - str <= 16); +} + +/* * timestamp -- * Periodically update the oldest timestamp. */ @@ -212,16 +326,28 @@ timestamp(void *arg) /* Update the oldest timestamp at least once every 15 seconds. */ done = false; do { + random_sleep(&g.rnd, 15); + /* - * Do a final bump of the oldest timestamp as part of shutting down the worker threads, - * otherwise recent operations can prevent verify from running. + * If running without rollback_to_stable, do a final bump of the oldest timestamp as part of + * shutting down the worker threads, otherwise recent operations can prevent verify from + * running. + * + * With rollback_to_stable configured, don't do a bump at the end of the run. We need the + * worker threads to have time to see any changes in the stable timestamp, so they can stash + * their stable state - if we bump they will have no time to do that. And when we rollback, + * we'd like to see a reasonable amount of data changed. So we don't bump the stable + * timestamp, and we can't bump the oldest timestamp as well, as it would get ahead of the + * stable timestamp, which is not allowed. */ if (g.workers_finished) done = true; - else - random_sleep(&g.rnd, 15); - timestamp_once(session); + if (!done || !g.c_txn_rollback_to_stable) { + timestamp_once(session, true); + if (done) + timestamp_once(session, true); + } } while (!done); diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py new file mode 100755 index 00000000000..eed6d8e0c26 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, time +from helper import copy_wiredtiger_home +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +# test_rollback_to_stable11.py +# Test the rollback to stable is retrieving the proper hs update. +class test_rollback_to_stable11(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)) + ] + + scenarios = make_scenarios(prepare_values) + + def conn_config(self): + config = 'cache_size=1MB,statistics=(all),log=(enabled=true)' + return config + + def simulate_crash_restart(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # + # close the original connection and open to new directory + # NOTE: This really cannot test the difference between the + # write-no-sync (off) version of log_flush and the sync + # version since we're not crashing the system itself. + # + self.close_conn() + self.conn = self.setUpConnectionOpen(newdir) + self.session = self.setUpSessionOpen(self.conn) + + def test_rollback_to_stable(self): + nrows = 1 + + # Create a table without logging. + uri = "table:rollback_to_stable11" + ds = SimpleDataSet( + self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + value_c = "ccccc" * 100 + value_d = "ddddd" * 100 + + # Perform several updates. + self.large_updates(uri, value_a, ds, nrows, 20) + self.large_updates(uri, value_a, ds, nrows, 20) + self.large_updates(uri, value_a, ds, nrows, 20) + self.large_updates(uri, value_b, ds, nrows, 20) + + # Verify data is visible and correct. + self.check(value_b, uri, nrows, 20) + + # Pin stable to timestamp 30 if prepare otherwise 20. + if self.prepare: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(30)) + else: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(20)) + + # Checkpoint to ensure that all the updates are flushed to disk. + self.session.checkpoint() + + # Simulate a server crash and restart. + self.simulate_crash_restart(".", "RESTART") + + # Check that the correct data is seen at and after the stable timestamp. + self.check(value_b, uri, nrows, 20) + + # Perform several updates. + self.large_updates(uri, value_c, ds, nrows, 30) + self.large_updates(uri, value_c, ds, nrows, 30) + self.large_updates(uri, value_c, ds, nrows, 30) + self.large_updates(uri, value_d, ds, nrows, 30) + + # Verify data is visible and correct. + self.check(value_d, uri, nrows, 30) + + # Checkpoint to ensure that all the updates are flushed to disk. + self.session.checkpoint() + + # Simulate a server crash and restart. + self.simulate_crash_restart("RESTART", "RESTART2") + + # Check that the correct data is seen at and after the stable timestamp. + self.check(value_b, uri, nrows, 20) + self.check(value_b, uri, nrows, 40) + + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rts][2] + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertEqual(calls, 0) + self.assertEqual(keys_removed, 0) + self.assertEqual(keys_restored, 0) + self.assertEqual(upd_aborted, 0) + self.assertGreater(pages_visited, 0) + self.assertEqual(hs_removed, 4) + self.assertEqual(hs_sweep, 0) + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py new file mode 100755 index 00000000000..b4fa7a9087b --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2020 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, shutil, time +from helper import copy_wiredtiger_home +from test_rollback_to_stable01 import test_rollback_to_stable_base +from wiredtiger import stat +from wtdataset import SimpleDataSet +from wtscenario import make_scenarios + +def timestamp_str(t): + return '%x' % t + +# test_rollback_to_stable12.py +# Test the rollback to stable operation skipping subtrees in during tree walk. +class test_rollback_to_stable12(test_rollback_to_stable_base): + session_config = 'isolation=snapshot' + + prepare_values = [ + ('no_prepare', dict(prepare=False)), + ('prepare', dict(prepare=True)) + ] + + scenarios = make_scenarios(prepare_values) + + def conn_config(self): + config = 'cache_size=500MB,statistics=(all),log=(enabled=true)' + return config + + def simulate_crash_restart(self, olddir, newdir): + ''' Simulate a crash from olddir and restart in newdir. ''' + # with the connection still open, copy files to new directory + shutil.rmtree(newdir, ignore_errors=True) + os.mkdir(newdir) + for fname in os.listdir(olddir): + fullname = os.path.join(olddir, fname) + # Skip lock file on Windows since it is locked + if os.path.isfile(fullname) and \ + "WiredTiger.lock" not in fullname and \ + "Tmplog" not in fullname and \ + "Preplog" not in fullname: + shutil.copy(fullname, newdir) + # + # close the original connection and open to new directory + # NOTE: This really cannot test the difference between the + # write-no-sync (off) version of log_flush and the sync + # version since we're not crashing the system itself. + # + self.close_conn() + self.conn = self.setUpConnectionOpen(newdir) + self.session = self.setUpSessionOpen(self.conn) + + def test_rollback_to_stable(self): + nrows = 1000000 + + # Create a table without logging. + uri = "table:rollback_to_stable12" + ds = SimpleDataSet( + self, uri, 0, key_format="i", value_format="S", config='split_pct=50,log=(enabled=false)') + ds.populate() + + # Pin oldest and stable to timestamp 10. + self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) + + ',stable_timestamp=' + timestamp_str(10)) + + value_a = "aaaaa" * 100 + value_b = "bbbbb" * 100 + + # Perform several updates. + self.large_updates(uri, value_a, ds, nrows, 20) + + # Verify data is visible and correct. + self.check(value_a, uri, nrows, 20) + + # Pin stable to timestamp 30 if prepare otherwise 20. + if self.prepare: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(30)) + else: + self.conn.set_timestamp('stable_timestamp=' + timestamp_str(20)) + + # Load a single row modification to be removed. + commit_ts = 30 + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[ds.key(1)] = value_b + if self.prepare: + self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(commit_ts-1)) + self.session.timestamp_transaction('commit_timestamp=' + timestamp_str(commit_ts)) + self.session.timestamp_transaction('durable_timestamp=' + timestamp_str(commit_ts+1)) + self.session.commit_transaction() + else: + self.session.commit_transaction('commit_timestamp=' + timestamp_str(commit_ts)) + cursor.close() + + self.session.checkpoint() + + # Simulate a server crash and restart. + self.simulate_crash_restart(".", "RESTART") + + # Check that the correct data is seen at and after the stable timestamp. + self.check(value_a, uri, nrows, 30) + + stat_cursor = self.session.open_cursor('statistics:', None, None) + calls = stat_cursor[stat.conn.txn_rts][2] + hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2] + hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2] + keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2] + keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2] + pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2] + pages_walk_skipped = stat_cursor[stat.conn.txn_rts_skip_interal_pages_walk][2] + upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2] + stat_cursor.close() + + self.assertEqual(calls, 0) + self.assertEqual(keys_removed, 0) + self.assertEqual(keys_restored, 0) + self.assertGreaterEqual(upd_aborted, 0) + self.assertGreater(pages_visited, 0) + self.assertGreaterEqual(hs_removed, 0) + self.assertEqual(hs_sweep, 0) + self.assertGreater(pages_walk_skipped, 0) + +if __name__ == '__main__': + wttest.run() |