summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/third_party/wiredtiger/dist/s_string.ok3
-rw-r--r--src/third_party/wiredtiger/dist/stat_data.py3
-rw-r--r--src/third_party/wiredtiger/import.data2
-rw-r--r--src/third_party/wiredtiger/src/history/hs.c70
-rw-r--r--src/third_party/wiredtiger/src/include/btree.h1
-rw-r--r--src/third_party/wiredtiger/src/include/extern.h4
-rw-r--r--src/third_party/wiredtiger/src/include/stat.h1
-rw-r--r--src/third_party/wiredtiger/src/include/wiredtiger.in84
-rw-r--r--src/third_party/wiredtiger/src/support/stat.c3
-rw-r--r--src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c38
-rw-r--r--src/third_party/wiredtiger/test/format/bulk.c2
-rw-r--r--src/third_party/wiredtiger/test/format/config.c10
-rw-r--r--src/third_party/wiredtiger/test/format/config.h198
-rw-r--r--src/third_party/wiredtiger/test/format/format.h34
-rw-r--r--src/third_party/wiredtiger/test/format/hs.c9
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c195
-rw-r--r--src/third_party/wiredtiger/test/format/snap.c244
-rw-r--r--src/third_party/wiredtiger/test/format/util.c176
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py158
-rwxr-xr-xsrc/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py149
20 files changed, 1104 insertions, 280 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 23ba8dc737e..689aa30c192 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -336,6 +336,7 @@ RLEs
RMW
RNG
RPC
+RTS
RUNDIR
RWLOCK
RXB
@@ -1251,6 +1252,7 @@ srch
ssize
startup
statlog
+stb
stderr
stdin
stdout
@@ -1308,6 +1310,7 @@ timedwait
timestamp
timestamped
timestamps
+tinfo
tmp
todo
tokenizer
diff --git a/src/third_party/wiredtiger/dist/stat_data.py b/src/third_party/wiredtiger/dist/stat_data.py
index f4400cc8bdf..c7bfc07b404 100644
--- a/src/third_party/wiredtiger/dist/stat_data.py
+++ b/src/third_party/wiredtiger/dist/stat_data.py
@@ -651,10 +651,11 @@ connection_stats = [
TxnStat('txn_rollback', 'transactions rolled back'),
TxnStat('txn_rts', 'rollback to stable calls'),
TxnStat('txn_rts_hs_removed', 'rollback to stable updates removed from history store'),
- TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'),
TxnStat('txn_rts_keys_removed', 'rollback to stable keys removed'),
TxnStat('txn_rts_keys_restored', 'rollback to stable keys restored'),
TxnStat('txn_rts_pages_visited', 'rollback to stable pages visited'),
+ TxnStat('txn_rts_skip_interal_pages_walk', 'rollback to stable skipping internal pages tree walk'),
+ TxnStat('txn_rts_sweep_hs_keys', 'rollback to stable sweeping history store keys'),
TxnStat('txn_rts_upd_aborted', 'rollback to stable updates aborted'),
TxnStat('txn_set_ts', 'set timestamp calls'),
TxnStat('txn_set_ts_durable', 'set timestamp durable calls'),
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 3c04510d191..f36da07c129 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-4.4",
- "commit": "930bbacc3761a10483875585dbd4ecb58271d57e"
+ "commit": "ab40833d9130b71f4b36a1a03fd8f4f137d11bdd"
}
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c
index 03d36bf5d07..78b47ecd6aa 100644
--- a/src/third_party/wiredtiger/src/history/hs.c
+++ b/src/third_party/wiredtiger/src/history/hs.c
@@ -306,10 +306,38 @@ __wt_hs_cursor_close(WT_SESSION_IMPL *session, uint32_t session_flags, bool is_o
static int
__hs_row_search(WT_CURSOR_BTREE *hs_cbt, WT_ITEM *srch_key, bool insert)
{
+ WT_CURSOR *hs_cursor;
WT_DECL_RET;
+ bool leaf_found;
+
+ hs_cursor = &hs_cbt->iface;
+ leaf_found = false;
+
+ /*
+ * Check whether the search key can be find in the provided leaf page, if exists. Otherwise
+ * perform a full search.
+ */
+ if (hs_cbt->ref != NULL) {
+ WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
+ ret = __wt_row_search(hs_cbt, srch_key, insert, hs_cbt->ref, false, &leaf_found));
+ WT_RET(ret);
+
+ /*
+ * Only use the pinned page search results if search returns an exact match or a slot other
+ * than the page's boundary slots, if that's not the case, the record might belong on an
+ * entirely different page.
+ */
+ if (leaf_found && (hs_cbt->compare != 0 &&
+ (hs_cbt->slot == 0 || hs_cbt->slot == hs_cbt->ref->page->entries - 1)))
+ leaf_found = false;
+ if (!leaf_found)
+ hs_cursor->reset(hs_cursor);
+ }
+
+ if (!leaf_found)
+ WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
+ ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL));
- WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt),
- ret = __wt_row_search(hs_cbt, srch_key, insert, NULL, false, NULL));
#ifdef HAVE_DIAGNOSTIC
WT_TRET(__wt_cursor_key_order_init(hs_cbt));
#endif
@@ -398,18 +426,49 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W
WT_HS_TIME_POINT *stop_time_point)
{
WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(hs_key);
WT_DECL_RET;
WT_UPDATE *hs_upd, *upd_local;
+ wt_timestamp_t hs_start_ts;
+ uint64_t counter, hs_counter;
+ uint32_t hs_btree_id;
+ int cmp;
cbt = (WT_CURSOR_BTREE *)cursor;
hs_upd = upd_local = NULL;
+ counter = 0;
+
+ /* Allocate buffers for the data store and history store key. */
+ WT_ERR(__wt_scr_alloc(session, 0, &hs_key));
+
+ /*
+ * Adjust counter if there exists an update in the history store with same btree id, key and
+ * timestamp. Otherwise the newly inserting history store record may fall behind the existing
+ * one can lead to wrong order.
+ */
+ WT_ERR_NOTFOUND_OK(
+ __wt_hs_cursor_position(session, cursor, btree->id, key, upd->start_ts), true);
+ if (ret == 0) {
+ WT_ERR(cursor->get_key(cursor, &hs_btree_id, hs_key, &hs_start_ts, &hs_counter));
+
+ /*
+ * Check the whether the existing record is also from the same timestamp.
+ *
+ * Verify simple checks first to confirm whether the retrieved update same or not before
+ * performing the expensive key comparison.
+ */
+ if (hs_btree_id == btree->id && upd->start_ts == hs_start_ts) {
+ WT_ERR(__wt_compare(session, NULL, hs_key, key, &cmp));
+ if (cmp == 0)
+ counter = hs_counter + 1;
+ }
+ }
/*
* Use WT_CURSOR.set_key and WT_CURSOR.set_value to create key and value items, then use them to
* create an update chain for a direct insertion onto the history store page.
*/
- cursor->set_key(
- cursor, btree->id, key, upd->start_ts, __wt_atomic_add64(&btree->hs_counter, 1));
+ cursor->set_key(cursor, btree->id, key, upd->start_ts, counter);
cursor->set_value(
cursor, stop_time_point->durable_ts, upd->durable_ts, (uint64_t)type, hs_value);
@@ -452,6 +511,7 @@ __hs_insert_record_with_btree_int(WT_SESSION_IMPL *session, WT_CURSOR *cursor, W
WT_STAT_CONN_INCR(session, cache_hs_insert);
err:
+ __wt_scr_free(session, &hs_key);
if (ret != 0) {
__wt_free_update_list(session, &hs_upd);
@@ -950,7 +1010,7 @@ err:
*/
int
__wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
- WT_ITEM *key, wt_timestamp_t timestamp)
+ const WT_ITEM *key, wt_timestamp_t timestamp)
{
WT_DECL_ITEM(srch_key);
WT_DECL_RET;
diff --git a/src/third_party/wiredtiger/src/include/btree.h b/src/third_party/wiredtiger/src/include/btree.h
index abf19855a3f..b691552dc1f 100644
--- a/src/third_party/wiredtiger/src/include/btree.h
+++ b/src/third_party/wiredtiger/src/include/btree.h
@@ -169,7 +169,6 @@ struct __wt_btree {
uint64_t write_gen; /* Write generation */
uint64_t rec_max_txn; /* Maximum txn seen (clean trees) */
wt_timestamp_t rec_max_timestamp;
- uint64_t hs_counter; /* History store counter */
uint64_t checkpoint_gen; /* Checkpoint generation */
WT_SESSION_IMPL *sync_session; /* Syncing session */
diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h
index 9ada3107728..a55267f1bc0 100644
--- a/src/third_party/wiredtiger/src/include/extern.h
+++ b/src/third_party/wiredtiger/src/include/extern.h
@@ -763,7 +763,7 @@ extern int __wt_hs_cursor_close(WT_SESSION_IMPL *session, uint32_t session_flags
extern int __wt_hs_cursor_open(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_cursor_position(WT_SESSION_IMPL *session, WT_CURSOR *cursor, uint32_t btree_id,
- WT_ITEM *key, wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+ const WT_ITEM *key, wt_timestamp_t timestamp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, uint32_t btree_id,
const WT_ITEM *key, wt_timestamp_t ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_hs_get_btree(WT_SESSION_IMPL *session, WT_BTREE **hs_btreep)
@@ -1239,6 +1239,8 @@ extern int __wt_row_modify(WT_CURSOR_BTREE *cbt, const WT_ITEM *key, const WT_IT
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_row_search(WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool insert, WT_REF *leaf,
bool leaf_safe, bool *leaf_foundp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
+extern int __wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
+ WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_rwlock_init(WT_SESSION_IMPL *session, WT_RWLOCK *l)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_salvage(WT_SESSION_IMPL *session, const char *cfg[])
diff --git a/src/third_party/wiredtiger/src/include/stat.h b/src/third_party/wiredtiger/src/include/stat.h
index 3df5aa605d8..50cc22f6d4f 100644
--- a/src/third_party/wiredtiger/src/include/stat.h
+++ b/src/third_party/wiredtiger/src/include/stat.h
@@ -714,6 +714,7 @@ struct __wt_connection_stats {
int64_t txn_rts_keys_removed;
int64_t txn_rts_keys_restored;
int64_t txn_rts_pages_visited;
+ int64_t txn_rts_skip_interal_pages_walk;
int64_t txn_rts_sweep_hs_keys;
int64_t txn_rts_upd_aborted;
int64_t txn_rts_hs_removed;
diff --git a/src/third_party/wiredtiger/src/include/wiredtiger.in b/src/third_party/wiredtiger/src/include/wiredtiger.in
index a3abb6c26af..efc658938d0 100644
--- a/src/third_party/wiredtiger/src/include/wiredtiger.in
+++ b/src/third_party/wiredtiger/src/include/wiredtiger.in
@@ -5886,106 +5886,108 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection);
#define WT_STAT_CONN_TXN_RTS_KEYS_RESTORED 1417
/*! transaction: rollback to stable pages visited */
#define WT_STAT_CONN_TXN_RTS_PAGES_VISITED 1418
+/*! transaction: rollback to stable skipping internal pages tree walk */
+#define WT_STAT_CONN_TXN_RTS_SKIP_INTERAL_PAGES_WALK 1419
/*! transaction: rollback to stable sweeping history store keys */
-#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1419
+#define WT_STAT_CONN_TXN_RTS_SWEEP_HS_KEYS 1420
/*! transaction: rollback to stable updates aborted */
-#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1420
+#define WT_STAT_CONN_TXN_RTS_UPD_ABORTED 1421
/*! transaction: rollback to stable updates removed from history store */
-#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1421
+#define WT_STAT_CONN_TXN_RTS_HS_REMOVED 1422
/*! transaction: set timestamp calls */
-#define WT_STAT_CONN_TXN_SET_TS 1422
+#define WT_STAT_CONN_TXN_SET_TS 1423
/*! transaction: set timestamp durable calls */
-#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1423
+#define WT_STAT_CONN_TXN_SET_TS_DURABLE 1424
/*! transaction: set timestamp durable updates */
-#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1424
+#define WT_STAT_CONN_TXN_SET_TS_DURABLE_UPD 1425
/*! transaction: set timestamp oldest calls */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1425
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST 1426
/*! transaction: set timestamp oldest updates */
-#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1426
+#define WT_STAT_CONN_TXN_SET_TS_OLDEST_UPD 1427
/*! transaction: set timestamp stable calls */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE 1427
+#define WT_STAT_CONN_TXN_SET_TS_STABLE 1428
/*! transaction: set timestamp stable updates */
-#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1428
+#define WT_STAT_CONN_TXN_SET_TS_STABLE_UPD 1429
/*! transaction: transaction begins */
-#define WT_STAT_CONN_TXN_BEGIN 1429
+#define WT_STAT_CONN_TXN_BEGIN 1430
/*! transaction: transaction checkpoint currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1430
+#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1431
/*! transaction: transaction checkpoint generation */
-#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1431
+#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1432
/*!
* transaction: transaction checkpoint history store file duration
* (usecs)
*/
-#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1432
+#define WT_STAT_CONN_TXN_HS_CKPT_DURATION 1433
/*! transaction: transaction checkpoint max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1433
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1434
/*! transaction: transaction checkpoint min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1434
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1435
/*! transaction: transaction checkpoint most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1435
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1436
/*! transaction: transaction checkpoint prepare currently running */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1436
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RUNNING 1437
/*! transaction: transaction checkpoint prepare max time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1437
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MAX 1438
/*! transaction: transaction checkpoint prepare min time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1438
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_MIN 1439
/*! transaction: transaction checkpoint prepare most recent time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1439
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_RECENT 1440
/*! transaction: transaction checkpoint prepare total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1440
+#define WT_STAT_CONN_TXN_CHECKPOINT_PREP_TOTAL 1441
/*! transaction: transaction checkpoint scrub dirty target */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1441
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TARGET 1442
/*! transaction: transaction checkpoint scrub time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1442
+#define WT_STAT_CONN_TXN_CHECKPOINT_SCRUB_TIME 1443
/*! transaction: transaction checkpoint total time (msecs) */
-#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1443
+#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1444
/*! transaction: transaction checkpoints */
-#define WT_STAT_CONN_TXN_CHECKPOINT 1444
+#define WT_STAT_CONN_TXN_CHECKPOINT 1445
/*!
* transaction: transaction checkpoints skipped because database was
* clean
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1445
+#define WT_STAT_CONN_TXN_CHECKPOINT_SKIPPED 1446
/*! transaction: transaction failures due to history store */
-#define WT_STAT_CONN_TXN_FAIL_CACHE 1446
+#define WT_STAT_CONN_TXN_FAIL_CACHE 1447
/*!
* transaction: transaction fsync calls for checkpoint after allocating
* the transaction ID
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1447
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST 1448
/*!
* transaction: transaction fsync duration for checkpoint after
* allocating the transaction ID (usecs)
*/
-#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1448
+#define WT_STAT_CONN_TXN_CHECKPOINT_FSYNC_POST_DURATION 1449
/*! transaction: transaction range of IDs currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_RANGE 1449
+#define WT_STAT_CONN_TXN_PINNED_RANGE 1450
/*! transaction: transaction range of IDs currently pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1450
+#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1451
/*! transaction: transaction range of timestamps currently pinned */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1451
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP 1452
/*! transaction: transaction range of timestamps pinned by a checkpoint */
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1452
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_CHECKPOINT 1453
/*!
* transaction: transaction range of timestamps pinned by the oldest
* active read timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1453
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_READER 1454
/*!
* transaction: transaction range of timestamps pinned by the oldest
* timestamp
*/
-#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1454
+#define WT_STAT_CONN_TXN_PINNED_TIMESTAMP_OLDEST 1455
/*! transaction: transaction read timestamp of the oldest active reader */
-#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1455
+#define WT_STAT_CONN_TXN_TIMESTAMP_OLDEST_ACTIVE_READ 1456
/*! transaction: transaction sync calls */
-#define WT_STAT_CONN_TXN_SYNC 1456
+#define WT_STAT_CONN_TXN_SYNC 1457
/*! transaction: transactions committed */
-#define WT_STAT_CONN_TXN_COMMIT 1457
+#define WT_STAT_CONN_TXN_COMMIT 1458
/*! transaction: transactions rolled back */
-#define WT_STAT_CONN_TXN_ROLLBACK 1458
+#define WT_STAT_CONN_TXN_ROLLBACK 1459
/*! transaction: update conflicts */
-#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1459
+#define WT_STAT_CONN_TXN_UPDATE_CONFLICT 1460
/*!
* @}
diff --git a/src/third_party/wiredtiger/src/support/stat.c b/src/third_party/wiredtiger/src/support/stat.c
index 5e481d850f6..4c0b38d5192 100644
--- a/src/third_party/wiredtiger/src/support/stat.c
+++ b/src/third_party/wiredtiger/src/support/stat.c
@@ -1053,6 +1053,7 @@ static const char *const __stats_connection_desc[] = {
"transaction: read timestamp queue inserts total", "transaction: read timestamp queue length",
"transaction: rollback to stable calls", "transaction: rollback to stable keys removed",
"transaction: rollback to stable keys restored", "transaction: rollback to stable pages visited",
+ "transaction: rollback to stable skipping internal pages tree walk",
"transaction: rollback to stable sweeping history store keys",
"transaction: rollback to stable updates aborted",
"transaction: rollback to stable updates removed from history store",
@@ -1547,6 +1548,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->txn_rts_keys_removed = 0;
stats->txn_rts_keys_restored = 0;
stats->txn_rts_pages_visited = 0;
+ stats->txn_rts_skip_interal_pages_walk = 0;
stats->txn_rts_sweep_hs_keys = 0;
stats->txn_rts_upd_aborted = 0;
stats->txn_rts_hs_removed = 0;
@@ -2050,6 +2052,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->txn_rts_keys_removed += WT_STAT_READ(from, txn_rts_keys_removed);
to->txn_rts_keys_restored += WT_STAT_READ(from, txn_rts_keys_restored);
to->txn_rts_pages_visited += WT_STAT_READ(from, txn_rts_pages_visited);
+ to->txn_rts_skip_interal_pages_walk += WT_STAT_READ(from, txn_rts_skip_interal_pages_walk);
to->txn_rts_sweep_hs_keys += WT_STAT_READ(from, txn_rts_sweep_hs_keys);
to->txn_rts_upd_aborted += WT_STAT_READ(from, txn_rts_upd_aborted);
to->txn_rts_hs_removed += WT_STAT_READ(from, txn_rts_hs_removed);
diff --git a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
index 48dc34a2cf7..13c3725659d 100644
--- a/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
+++ b/src/third_party/wiredtiger/src/txn/txn_rollback_to_stable.c
@@ -819,6 +819,40 @@ err:
}
/*
+ * __wt_rts_page_skip --
+ * Skip if rollback to stable doesn't requires to read this page.
+ */
+int
+__wt_rts_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, void *context, bool *skipp)
+{
+ wt_timestamp_t rollback_timestamp;
+
+ rollback_timestamp = *(wt_timestamp_t *)(context);
+ *skipp = false; /* Default to reading */
+
+ /* If the page is in-memory, we want to look at it. */
+ if (ref->state != WT_REF_DISK)
+ return (0);
+
+ /*
+ * Rollback to stable doesn't read leaf pages into memory as part of the tree walk. The leaf
+ * page is loaded into memory in the caller functions if it has newer updates that are need to
+ * be aborted. Don't process further on leaf pages as part of tree walk function.
+ */
+ if (!F_ISSET(ref, WT_REF_FLAG_INTERNAL))
+ return (0);
+
+ /* Check whether this ref has any possible updates to be aborted. */
+ if (!__rollback_page_needs_abort(session, ref, rollback_timestamp)) {
+ *skipp = true;
+ __wt_verbose(session, WT_VERB_RTS, "%p: internal page walk skipped", (void *)ref);
+ WT_STAT_CONN_INCR(session, txn_rts_skip_interal_pages_walk);
+ }
+
+ return (0);
+}
+
+/*
* __rollback_to_stable_btree_walk --
* Called for each open handle - choose to either skip or wipe the commits
*/
@@ -830,8 +864,8 @@ __rollback_to_stable_btree_walk(WT_SESSION_IMPL *session, wt_timestamp_t rollbac
/* Walk the tree, marking commits aborted where appropriate. */
ref = NULL;
- while ((ret = __wt_tree_walk(
- session, &ref, WT_READ_CACHE_LEAF | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
+ while ((ret = __wt_tree_walk_custom_skip(session, &ref, __wt_rts_page_skip, &rollback_timestamp,
+ WT_READ_CACHE_LEAF | WT_READ_NO_EVICT | WT_READ_WONT_NEED)) == 0 &&
ref != NULL)
if (F_ISSET(ref, WT_REF_FLAG_INTERNAL)) {
WT_INTL_FOREACH_BEGIN (session, ref->page, child_ref) {
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index 89e0406c3cb..72c17366639 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -59,7 +59,7 @@ bulk_commit_transaction(WT_SESSION *session)
testutil_check(session->commit_transaction(session, buf));
/* Update the oldest timestamp, otherwise updates are pinned in memory. */
- timestamp_once(session);
+ timestamp_once(session, false);
}
/*
diff --git a/src/third_party/wiredtiger/test/format/config.c b/src/third_party/wiredtiger/test/format/config.c
index 1db9cc5854c..374bc470f5b 100644
--- a/src/third_party/wiredtiger/test/format/config.c
+++ b/src/third_party/wiredtiger/test/format/config.c
@@ -942,7 +942,9 @@ config_transaction(void)
if (g.c_txn_freq != 100 && config_is_perm("transaction.frequency"))
testutil_die(EINVAL, "snapshot isolation requires transaction frequency set to 100");
}
-
+ if (g.c_txn_rollback_to_stable && config_is_perm("transaction.rollback_to_stable") &&
+ g.c_isolation_flag != ISOLATION_SNAPSHOT && config_is_perm("transaction.isolation"))
+ testutil_die(EINVAL, "rollback to stable requires snapshot isolation");
/*
* The permanent configuration has no incompatible settings, adjust the temporary configuration
* as necessary. Prepare overrides timestamps, overrides isolation, for no reason other than
@@ -958,6 +960,12 @@ config_transaction(void)
if (g.c_txn_freq != 100)
config_single("transaction.frequency=100", false);
}
+ if (g.c_txn_rollback_to_stable) {
+ if (!g.c_txn_timestamps)
+ config_single("transaction.timestamps=on", false);
+ if (g.c_logging)
+ config_single("logging=off", false);
+ }
if (g.c_txn_timestamps) {
if (g.c_isolation_flag != ISOLATION_SNAPSHOT)
config_single("transaction.isolation=snapshot", false);
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 30b8f2a09a6..b8f2922021e 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -57,97 +57,93 @@ typedef struct {
char **vstr; /* Value for string options */
} CONFIG;
-#define COMPRESSION_LIST "(none | lz4 | snappy | zlib | zstd)"
+#define COMPRESSION_LIST " (none | lz4 | snappy | zlib | zstd)"
static CONFIG c[] = {
/* 5% */
- {"assert.commit_timestamp", "if assert commit_timestamp", C_BOOL, 5, 0, 0,
+ {"assert.commit_timestamp", "assert commit_timestamp", C_BOOL, 5, 0, 0,
&g.c_assert_commit_timestamp, NULL},
/* 5% */
- {"assert.read_timestamp", "if assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp,
+ {"assert.read_timestamp", "assert read_timestamp", C_BOOL, 5, 0, 0, &g.c_assert_read_timestamp,
NULL},
/* 20% */
- {"backup", "if backups are enabled", C_BOOL, 20, 0, 0, &g.c_backups, NULL},
+ {"backup", "configure backups", C_BOOL, 20, 0, 0, &g.c_backups, NULL},
- {"backup.incremental", "type of backup (block | log | off)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"backup.incremental", "backup type (block | log | off)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_backup_incremental},
- {"backup.incr_granularity", "incremental backup block granularity in KB", 0x0, 4, 16384, 16384,
+ {"backup.incr_granularity", "incremental backup block granularity (KB)", 0x0, 4, 16384, 16384,
&g.c_backup_incr_granularity, NULL},
- {"btree.bitcnt", "number of bits for fixed-length column-store files", 0x0, 1, 8, 8, &g.c_bitcnt,
- NULL},
+ {"btree.bitcnt", "fixed-length column-store object size (number of bits)", 0x0, 1, 8, 8,
+ &g.c_bitcnt, NULL},
- {"btree.compression", "type of compression " COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"btree.compression", "compression type" COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_compression},
/* 20% */
- {"btree.dictionary", "if values are dictionary compressed", C_BOOL, 20, 0, 0, &g.c_dictionary,
+ {"btree.dictionary", "configure dictionary compressed values", C_BOOL, 20, 0, 0, &g.c_dictionary,
NULL},
/* 20% */
- {"btree.huffman_key", "if keys are huffman encoded", C_BOOL, 20, 0, 0, &g.c_huffman_key, NULL},
+ {"btree.huffman_key", "configure huffman encoded keys", C_BOOL, 20, 0, 0, &g.c_huffman_key, NULL},
/* 20% */
- {"btree.huffman_value", "if values are huffman encoded", C_BOOL, 20, 0, 0, &g.c_huffman_value,
+ {"btree.huffman_value", "configure huffman encoded values", C_BOOL, 20, 0, 0, &g.c_huffman_value,
NULL},
/* 95% */
- {"btree.internal_key_truncation", "if internal keys are truncated", C_BOOL, 95, 0, 0,
+ {"btree.internal_key_truncation", "truncate internal keys", C_BOOL, 95, 0, 0,
&g.c_internal_key_truncation, NULL},
- {"btree.internal_page_max", "maximum size of Btree internal nodes", 0x0, 9, 17, 27,
+ {"btree.internal_page_max", "btree internal node maximum size", 0x0, 9, 17, 27,
&g.c_intl_page_max, NULL},
- {"btree.key_gap", "gap between instantiated keys on a Btree page", 0x0, 0, 20, 20, &g.c_key_gap,
- NULL},
+ {"btree.key_gap", "btree page instantiated key gap", 0x0, 0, 20, 20, &g.c_key_gap, NULL},
- {"btree.key_max", "maximum size of keys", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL},
+ {"btree.key_max", "maximum key size", 0x0, 20, 128, MEGABYTE(10), &g.c_key_max, NULL},
/*
* A minimum key size of 11 is necessary. Row-store keys have a leading 10-digit number and the
* 11 guarantees we never see a key that we can't convert to a numeric value without formatting
* it first because there's a trailing non-digit character in every key.
*/
- {"btree.key_min", "minimum size of keys", 0x0, 11, 32, 256, &g.c_key_min, NULL},
+ {"btree.key_min", "minimum key size", 0x0, 11, 32, 256, &g.c_key_min, NULL},
- {"btree.leaf_page_max", "maximum size of Btree leaf nodes", 0x0, 9, 17, 27, &g.c_leaf_page_max,
- NULL},
+ {"btree.leaf_page_max", "btree leaf node maximum size", 0x0, 9, 17, 27, &g.c_leaf_page_max, NULL},
- {"btree.memory_page_max", "maximum size of in-memory pages", 0x0, 1, 10, 128,
- &g.c_memory_page_max, NULL},
+ {"btree.memory_page_max", "maximum cache page size", 0x0, 1, 10, 128, &g.c_memory_page_max, NULL},
/* 80% */
- {"btree.prefix_compression", "if keys are prefix compressed", C_BOOL, 80, 0, 0,
+ {"btree.prefix_compression", "configure prefix compressed keys", C_BOOL, 80, 0, 0,
&g.c_prefix_compression, NULL},
- {"btree.prefix_compression_min", "minimum gain before prefix compression is used", 0x0, 0, 8, 256,
- &g.c_prefix_compression_min, NULL},
+ {"btree.prefix_compression_min", "minimum gain before prefix compression is used (bytes)", 0x0, 0,
+ 8, 256, &g.c_prefix_compression_min, NULL},
- {"btree.repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0,
- 90, 90, &g.c_repeat_data_pct, NULL},
+ {"btree.repeat_data_pct", "duplicate values (percentage)", 0x0, 0, 90, 90, &g.c_repeat_data_pct,
+ NULL},
/* 10% */
- {"btree.reverse", "collate in reverse order", C_BOOL, 10, 0, 0, &g.c_reverse, NULL},
+ {"btree.reverse", "reverse order collation", C_BOOL, 10, 0, 0, &g.c_reverse, NULL},
{"btree.split_pct", "page split size as a percentage of the maximum page size", 0x0, 50, 100, 100,
&g.c_split_pct, NULL},
- {"btree.value_max", "maximum size of values", 0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL},
+ {"btree.value_max", "maximum value size", 0x0, 32, 4096, MEGABYTE(10), &g.c_value_max, NULL},
- {"btree.value_min", "minimum size of values", 0x0, 0, 20, 4096, &g.c_value_min, NULL},
+ {"btree.value_min", "minimum value size", 0x0, 0, 20, 4096, &g.c_value_min, NULL},
- {"cache", "size of the cache in MB", 0x0, 1, 100, 100 * 1024, &g.c_cache, NULL},
+ {"cache", "cache size (MB)", 0x0, 1, 100, 100 * 1024, &g.c_cache, NULL},
- {"cache.evict_max", "the maximum number of eviction workers", 0x0, 0, 5, 100, &g.c_evict_max,
- NULL},
+ {"cache.evict_max", "maximum number of eviction workers", 0x0, 0, 5, 100, &g.c_evict_max, NULL},
- {"cache.minimum", "minimum size of the cache in MB", C_IGNORE, 0, 0, 100 * 1024,
- &g.c_cache_minimum, NULL},
+ {"cache.minimum", "minimum cache size (MB)", C_IGNORE, 0, 0, 100 * 1024, &g.c_cache_minimum,
+ NULL},
- {"checkpoint", "type of checkpoints (on | off | wiredtiger)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"checkpoint", "checkpoint type (on | off | wiredtiger)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_checkpoint},
{"checkpoint.log_size", "MB of log to wait if wiredtiger checkpoints configured", 0x0, 20, 200,
@@ -156,151 +152,145 @@ static CONFIG c[] = {
{"checkpoint.wait", "seconds to wait if wiredtiger checkpoints configured", 0x0, 5, 100, 3600,
&g.c_checkpoint_wait, NULL},
- {"disk.checksum", "type of checksums (on | off | uncompressed)", C_IGNORE | C_STRING, 0, 0, 0,
- NULL, &g.c_checksum},
+ {"disk.checksum", "checksum type (on | off | uncompressed)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ &g.c_checksum},
/* 5% */
- {"disk.data_extend", "if data files are extended", C_BOOL, 5, 0, 0, &g.c_data_extend, NULL},
+ {"disk.data_extend", "configure data file extension", C_BOOL, 5, 0, 0, &g.c_data_extend, NULL},
/* 0% */
- {"disk.direct_io", "if direct I/O is configured for data objects", C_IGNORE | C_BOOL, 0, 0, 1,
+ {"disk.direct_io", "configure direct I/O for data objects", C_IGNORE | C_BOOL, 0, 0, 1,
&g.c_direct_io, NULL},
- {"disk.encryption", "type of encryption (none | rotn-7)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"disk.encryption", "encryption type (none | rotn-7)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_encryption},
/* 10% */
- {"disk.firstfit", "if allocation is firstfit", C_BOOL, 10, 0, 0, &g.c_firstfit, NULL},
+ {"disk.firstfit", "configure first-fit allocation", C_BOOL, 10, 0, 0, &g.c_firstfit, NULL},
/* 90% */
- {"disk.mmap", "configure for mmap operations (readonly)", C_BOOL, 90, 0, 0, &g.c_mmap, NULL},
+ {"disk.mmap", "configure mmap operations (reads only)", C_BOOL, 90, 0, 0, &g.c_mmap, NULL},
/* 5% */
- {"disk.mmap_all", "configure for mmap operations (read and write)", C_BOOL, 5, 0, 0,
- &g.c_mmap_all, NULL},
+ {"disk.mmap_all", "configure mmap operations (read and write)", C_BOOL, 5, 0, 0, &g.c_mmap_all,
+ NULL},
/* 0% */
- {"format.abort", "if timed run should drop core", C_BOOL, 0, 0, 0, &g.c_abort, NULL},
+ {"format.abort", "drop core during timed run", C_BOOL, 0, 0, 0, &g.c_abort, NULL},
/* 75% */
- {"format.independent_thread_rng", "if thread RNG space is independent", C_BOOL, 75, 0, 0,
+ {"format.independent_thread_rng", "configure independent thread RNG space", C_BOOL, 75, 0, 0,
&g.c_independent_thread_rng, NULL},
- {"format.major_timeout", "timeout for long-running operations (minutes)", C_IGNORE, 0, 0, 1000,
+ {"format.major_timeout", "long-running operations timeout (minutes)", C_IGNORE, 0, 0, 1000,
&g.c_major_timeout, NULL},
/* 50% */
- {"logging", "if logging configured", C_BOOL, 50, 0, 0, &g.c_logging, NULL},
+ {"logging", "configure logging", C_BOOL, 50, 0, 0, &g.c_logging, NULL},
/* 50% */
- {"logging.archive", "if log file archival configured", C_BOOL, 50, 0, 0, &g.c_logging_archive,
- NULL},
+ {"logging.archive", "configure log file archival", C_BOOL, 50, 0, 0, &g.c_logging_archive, NULL},
- {"logging.compression", "type of logging compression " COMPRESSION_LIST, C_IGNORE | C_STRING, 0,
- 0, 0, NULL, &g.c_logging_compression},
+ {"logging.compression", "logging compression type" COMPRESSION_LIST, C_IGNORE | C_STRING, 0, 0, 0,
+ NULL, &g.c_logging_compression},
- {"logging.file_max", "maximum log file size in KB", 0x0, 100, 512000, 2097152,
+ {"logging.file_max", "maximum log file size (KB)", 0x0, 100, 512000, 2097152,
&g.c_logging_file_max, NULL},
/* 50% */
- {"logging.prealloc", "if log file pre-allocation configured", C_BOOL, 50, 0, 0,
- &g.c_logging_prealloc, NULL},
+ {"logging.prealloc", "configure log file pre-allocation", C_BOOL, 50, 0, 0, &g.c_logging_prealloc,
+ NULL},
/* 90% */
- {"lsm.auto_throttle", "if LSM inserts are throttled", C_BOOL, 90, 0, 0, &g.c_auto_throttle, NULL},
+ {"lsm.auto_throttle", "throttle LSM inserts", C_BOOL, 90, 0, 0, &g.c_auto_throttle, NULL},
/* 95% */
- {"lsm.bloom", "if bloom filters are configured", C_BOOL, 95, 0, 0, &g.c_bloom, NULL},
+ {"lsm.bloom", "configure bloom filters", C_BOOL, 95, 0, 0, &g.c_bloom, NULL},
- {"lsm.bloom_bit_count", "number of bits per item for LSM bloom filters", 0x0, 4, 64, 1000,
+ {"lsm.bloom_bit_count", "number of bits per item for bloom filters", 0x0, 4, 64, 1000,
&g.c_bloom_bit_count, NULL},
- {"lsm.bloom_hash_count", "number of hash values per item for LSM bloom filters", 0x0, 4, 32, 100,
+ {"lsm.bloom_hash_count", "number of hash values per item for bloom filters", 0x0, 4, 32, 100,
&g.c_bloom_hash_count, NULL},
/* 10% */
- {"lsm.bloom_oldest", "if bloom_oldest=true", C_BOOL, 10, 0, 0, &g.c_bloom_oldest, NULL},
+ {"lsm.bloom_oldest", "configure bloom_oldest=true", C_BOOL, 10, 0, 0, &g.c_bloom_oldest, NULL},
- {"lsm.chunk_size", "LSM chunk size in MB", 0x0, 1, 10, 100, &g.c_chunk_size, NULL},
+ {"lsm.chunk_size", "LSM chunk size (MB)", 0x0, 1, 10, 100, &g.c_chunk_size, NULL},
- {"lsm.merge_max", "the maximum number of chunks to include in a merge operation", 0x0, 4, 20, 100,
- &g.c_merge_max, NULL},
+ {"lsm.merge_max", "maximum number of chunks to include in an LSM merge operation", 0x0, 4, 20,
+ 100, &g.c_merge_max, NULL},
- {"lsm.worker_threads", "the number of LSM worker threads", 0x0, 3, 4, 20, &g.c_lsm_worker_threads,
+ {"lsm.worker_threads", "number of LSM worker threads", 0x0, 3, 4, 20, &g.c_lsm_worker_threads,
NULL},
/* 10% */
- {"ops.alter", "if altering the table is enabled", C_BOOL, 10, 0, 0, &g.c_alter, NULL},
+ {"ops.alter", "configure table alterations", C_BOOL, 10, 0, 0, &g.c_alter, NULL},
/* 10% */
- {"ops.compaction", "if compaction is running", C_BOOL, 10, 0, 0, &g.c_compact, NULL},
+ {"ops.compaction", "configure compaction", C_BOOL, 10, 0, 0, &g.c_compact, NULL},
/* 50% */
- {"ops.hs_cursor", "if history store cursor reads configured", C_BOOL, 50, 0, 0, &g.c_hs_cursor,
- NULL},
+ {"ops.hs_cursor", "configure history store cursor reads", C_BOOL, 50, 0, 0, &g.c_hs_cursor, NULL},
- {"ops.pct.delete", "percent operations that are deletes", C_IGNORE, 0, 0, 100, &g.c_delete_pct,
- NULL},
+ {"ops.pct.delete", "delete operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_delete_pct, NULL},
- {"ops.pct.insert", "percent operations that are inserts", C_IGNORE, 0, 0, 100, &g.c_insert_pct,
- NULL},
+ {"ops.pct.insert", "insert operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_insert_pct, NULL},
- {"ops.pct.modify", "percent operations that are value modifications", C_IGNORE, 0, 0, 100,
- &g.c_modify_pct, NULL},
+ {"ops.pct.modify", "modify operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_modify_pct, NULL},
- {"ops.pct.read", "percent operations that are reads", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL},
+ {"ops.pct.read", "read operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_read_pct, NULL},
- {"ops.pct.write", "percent operations that are value updates", C_IGNORE, 0, 0, 100,
- &g.c_write_pct, NULL},
+ {"ops.pct.write", "update operations (percentage)", C_IGNORE, 0, 0, 100, &g.c_write_pct, NULL},
/* 5% */
{"ops.prepare", "configure transaction prepare", C_BOOL, 5, 0, 0, &g.c_prepare, NULL},
/* 10% */
- {"ops.random_cursor", "if random cursor reads configured", C_BOOL, 10, 0, 0, &g.c_random_cursor,
+ {"ops.random_cursor", "configure random cursor reads", C_BOOL, 10, 0, 0, &g.c_random_cursor,
NULL},
/* 100% */
- {"ops.rebalance", "rebalance testing", C_BOOL, 100, 1, 0, &g.c_rebalance, NULL},
+ {"ops.rebalance", "configure rebalance", C_BOOL, 100, 1, 0, &g.c_rebalance, NULL},
/* 100% */
- {"ops.salvage", "salvage testing", C_BOOL, 100, 1, 0, &g.c_salvage, NULL},
+ {"ops.salvage", "configure salvage", C_BOOL, 100, 1, 0, &g.c_salvage, NULL},
/* 100% */
- {"ops.truncate", "enable truncation", C_BOOL, 100, 0, 0, &g.c_truncate, NULL},
+ {"ops.truncate", "configure truncation", C_BOOL, 100, 0, 0, &g.c_truncate, NULL},
/* 100% */
- {"ops.verify", "to regularly verify during a run", C_BOOL, 100, 1, 0, &g.c_verify, NULL},
+ {"ops.verify", "configure verify", C_BOOL, 100, 1, 0, &g.c_verify, NULL},
{"quiet", "quiet run (same as -q)", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_quiet, NULL},
- {"runs", "the number of runs", C_IGNORE, 0, 0, UINT_MAX, &g.c_runs, NULL},
+ {"runs", "number of runs", C_IGNORE, 0, 0, UINT_MAX, &g.c_runs, NULL},
- {"runs.in_memory", "if in-memory configured", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_in_memory, NULL},
+ {"runs.in_memory", "configure in-memory", C_IGNORE | C_BOOL, 0, 0, 1, &g.c_in_memory, NULL},
- {"runs.ops", "the number of operations done per run", 0x0, 0, M(2), M(100), &g.c_ops, NULL},
+ {"runs.ops", "operations per run", 0x0, 0, M(2), M(100), &g.c_ops, NULL},
- {"runs.rows", "the number of rows to create", 0x0, 10, M(1), M(100), &g.c_rows, NULL},
+ {"runs.rows", "number of rows", 0x0, 10, M(1), M(100), &g.c_rows, NULL},
- {"runs.source", "data source (file | lsm | table)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"runs.source", "data source type (file | lsm | table)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_data_source},
- {"runs.threads", "the number of worker threads", 0x0, 1, 32, 128, &g.c_threads, NULL},
+ {"runs.threads", "number of worker threads", 0x0, 1, 32, 128, &g.c_threads, NULL},
- {"runs.timer", "maximum time to run in minutes", C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL},
+ {"runs.timer", "run time (minutes)", C_IGNORE, 0, 0, UINT_MAX, &g.c_timer, NULL},
- {"runs.type", "type of store to create (fix | var | row)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
+ {"runs.type", "object type (fix | var | row)", C_IGNORE | C_STRING, 0, 0, 0, NULL,
&g.c_file_type},
- {"runs.verify_failure_dump", "attempt page dump on repeatable read error", C_IGNORE | C_BOOL, 0,
+ {"runs.verify_failure_dump", "configure page dump on repeatable read error", C_IGNORE | C_BOOL, 0,
0, 1, &g.c_verify_failure_dump, NULL},
/* 20% */
- {"statistics", "maintain statistics", C_BOOL, 20, 0, 0, &g.c_statistics, NULL},
+ {"statistics", "configure statistics", C_BOOL, 20, 0, 0, &g.c_statistics, NULL},
/* 5% */
- {"statistics.server", "run the statistics server thread", C_BOOL, 5, 0, 0, &g.c_statistics_server,
- NULL},
+ {"statistics.server", "configure statistics server thread", C_BOOL, 5, 0, 0,
+ &g.c_statistics_server, NULL},
/* 2% */
{"stress.aggressive_sweep", "stress aggressive sweep", C_BOOL, 2, 0, 0,
@@ -341,25 +331,29 @@ static CONFIG c[] = {
/* 2% */
{"stress.split_8", "stress splits (#8)", C_BOOL, 2, 0, 0, &g.c_timing_stress_split_8, NULL},
- {"transaction.frequency", "percent operations done inside an explicit transaction", 0x0, 1, 100,
+ {"transaction.frequency", "operations inside an explicit transaction (percentage)", 0x0, 1, 100,
100, &g.c_txn_freq, NULL},
{"transaction.isolation",
"isolation level (random | read-uncommitted | read-committed | snapshot)", C_IGNORE | C_STRING,
0, 0, 0, NULL, &g.c_isolation},
+ /* 0% - By default, turned off until fallout has been debugged. */
+ {"transaction.rollback_to_stable", "configure rollback_to_stable", C_BOOL, 0, 0, 0,
+ &g.c_txn_rollback_to_stable, NULL},
+
/* 70% */
- {"transaction.timestamps", "enable transaction timestamp support", C_BOOL, 70, 0, 0,
+ {"transaction.timestamps", "configure transaction timestamps", C_BOOL, 70, 0, 0,
&g.c_txn_timestamps, NULL},
- {"wiredtiger.config", "configuration string used to wiredtiger_open", C_IGNORE | C_STRING, 0, 0,
- 0, NULL, &g.c_config_open},
+ {"wiredtiger.config", "wiredtiger_open API configuration string", C_IGNORE | C_STRING, 0, 0, 0,
+ NULL, &g.c_config_open},
/* 80% */
- {"wiredtiger.rwlock", "if wiredtiger read/write mutexes should be used", C_BOOL, 80, 0, 0,
- &g.c_wt_mutex, NULL},
+ {"wiredtiger.rwlock", "configure wiredtiger read/write mutexes", C_BOOL, 80, 0, 0, &g.c_wt_mutex,
+ NULL},
- {"wiredtiger.leak_memory", "if memory should be leaked on close", C_BOOL, 0, 0, 0,
+ {"wiredtiger.leak_memory", "configure memory leaked on shutdown", C_BOOL, 0, 0, 0,
&g.c_leak_memory, NULL},
{NULL, NULL, 0x0, 0, 0, 0, NULL, NULL}};
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index af5e16cc60b..5b4e8589b8c 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -74,7 +74,6 @@ typedef struct {
#define LOCK_INITIALIZED(lock) ((lock)->lock_type != LOCK_NONE)
typedef struct {
- wt_thread_t tid; /* thread ID */
char tidbuf[128]; /* thread ID in printable form */
WT_CONNECTION *wts_conn;
@@ -112,6 +111,8 @@ typedef struct {
WT_RAND_STATE rnd; /* Global RNG state */
+ uint32_t rts_no_check; /* track unsuccessful RTS checking */
+
/*
* Prepare will return an error if the prepare timestamp is less than any active read timestamp.
* Lock across allocating prepare and read timestamps.
@@ -121,7 +122,9 @@ typedef struct {
*/
RWLOCK ts_lock;
- uint64_t timestamp; /* Counter for timestamps */
+ uint64_t timestamp; /* Counter for timestamps */
+ uint64_t oldest_timestamp; /* Last timestamp used for oldest */
+ uint64_t stable_timestamp; /* Last timestamp used for stable */
uint64_t truncate_cnt; /* Counter for truncation */
@@ -222,6 +225,7 @@ typedef struct {
uint32_t c_timing_stress_split_8;
uint32_t c_truncate;
uint32_t c_txn_freq;
+ uint32_t c_txn_rollback_to_stable;
uint32_t c_txn_timestamps;
uint32_t c_value_max;
uint32_t c_value_min;
@@ -305,6 +309,13 @@ typedef struct {
} SNAP_OPS;
typedef struct {
+ SNAP_OPS *snap_state_current;
+ SNAP_OPS *snap_state_end;
+ SNAP_OPS *snap_state_first;
+ SNAP_OPS *snap_state_list;
+} SNAP_STATE;
+
+typedef struct {
int id; /* simple thread ID */
wt_thread_t tid; /* thread ID */
char tidbuf[128]; /* thread ID in printable form */
@@ -340,7 +351,14 @@ typedef struct {
uint64_t opid; /* Operation ID */
uint64_t read_ts; /* read timestamp */
uint64_t commit_ts; /* commit timestamp */
- SNAP_OPS *snap, *snap_first, snap_list[512];
+ uint64_t stable_ts; /* stable timestamp */
+ SNAP_STATE snap_states[2];
+ SNAP_STATE *s; /* points to one of the snap_states */
+
+#define snap_current s->snap_state_current
+#define snap_end s->snap_state_end
+#define snap_first s->snap_state_first
+#define snap_list s->snap_state_list
uint64_t insert_list[256]; /* column-store inserted records */
u_int insert_list_cnt;
@@ -354,6 +372,8 @@ typedef struct {
} TINFO;
extern TINFO **tinfo_list;
+#define SNAP_LIST_SIZE 512
+
WT_THREAD_RET alter(void *);
WT_THREAD_RET backup(void *);
WT_THREAD_RET checkpoint(void *);
@@ -382,12 +402,16 @@ void operations(u_int, bool);
void path_setup(const char *);
void set_alarm(u_int);
void set_core_off(void);
-void snap_init(TINFO *, uint64_t, bool);
+void snap_init(TINFO *);
+void snap_teardown(TINFO *);
+void snap_op_init(TINFO *, uint64_t, bool);
+void snap_repeat_rollback(WT_CURSOR *, TINFO **, size_t);
void snap_repeat_single(WT_CURSOR *, TINFO *);
int snap_repeat_txn(WT_CURSOR *, TINFO *);
void snap_repeat_update(TINFO *, bool);
void snap_track(TINFO *, thread_op);
-void timestamp_once(WT_SESSION *);
+void timestamp_once(WT_SESSION *, bool);
+void timestamp_parse(WT_SESSION *, const char *, uint64_t *);
int trace_config(const char *);
void trace_init(void);
void trace_ops_init(TINFO *);
diff --git a/src/third_party/wiredtiger/test/format/hs.c b/src/third_party/wiredtiger/test/format/hs.c
index 55b298f20e7..0cb0cf352aa 100644
--- a/src/third_party/wiredtiger/test/format/hs.c
+++ b/src/third_party/wiredtiger/test/format/hs.c
@@ -83,8 +83,13 @@ hs_cursor(void *arg)
/* Search to the last-known location. */
if (!restart) {
cursor->set_key(cursor, hs_btree_id, &key, hs_start_ts, hs_counter);
+
+ /*
+ * Limit expected errors because this is a diagnostic check (the WiredTiger API allows
+ * prepare-conflict, but that would be unexpected from the history store file).
+ */
ret = cursor->search_near(cursor, &exact);
- testutil_assert(ret == 0 || ret == WT_NOTFOUND);
+ testutil_assert(ret == 0 || ret == WT_NOTFOUND || ret == WT_ROLLBACK);
}
/*
@@ -99,7 +104,7 @@ hs_cursor(void *arg)
cursor, &hs_stop_durable_ts, &hs_durable_timestamp, &hs_upd_type, &hs_value));
continue;
}
- testutil_assert(ret == WT_NOTFOUND);
+ testutil_assert(ret == WT_NOTFOUND || ret == WT_ROLLBACK);
break;
}
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 0d4692f0472..38c9a5b5690 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -102,6 +102,130 @@ random_failure(void)
TINFO **tinfo_list;
/*
+ * tinfo_init --
+ * Initialize the worker thread structures.
+ */
+static void
+tinfo_init(void)
+{
+ TINFO *tinfo;
+ u_int i;
+
+ /* Allocate the thread structures separately to minimize false sharing. */
+ if (tinfo_list == NULL) {
+ tinfo_list = dcalloc((size_t)g.c_threads + 1, sizeof(TINFO *));
+ for (i = 0; i < g.c_threads; ++i) {
+ tinfo_list[i] = dcalloc(1, sizeof(TINFO));
+ tinfo = tinfo_list[i];
+
+ tinfo->id = (int)i + 1;
+
+ /* Set up the default key and value buffers. */
+ tinfo->key = &tinfo->_key;
+ key_gen_init(tinfo->key);
+ tinfo->value = &tinfo->_value;
+ val_gen_init(tinfo->value);
+ tinfo->lastkey = &tinfo->_lastkey;
+ key_gen_init(tinfo->lastkey);
+
+ snap_init(tinfo);
+ }
+ }
+
+ /* Cleanup for each new run. */
+ for (i = 0; i < g.c_threads; ++i) {
+ tinfo = tinfo_list[i];
+
+ tinfo->ops = 0;
+ tinfo->commit = 0;
+ tinfo->insert = 0;
+ tinfo->prepare = 0;
+ tinfo->remove = 0;
+ tinfo->rollback = 0;
+ tinfo->search = 0;
+ tinfo->truncate = 0;
+ tinfo->update = 0;
+
+ tinfo->session = NULL;
+ tinfo->cursor = NULL;
+
+ tinfo->insert_list_cnt = 0;
+
+ tinfo->state = TINFO_RUNNING;
+ tinfo->quit = false;
+ }
+}
+
+/*
+ * tinfo_teardown --
+ * Tear down the worker thread structures.
+ */
+static void
+tinfo_teardown(void)
+{
+ TINFO *tinfo;
+ u_int i;
+
+ for (i = 0; i < g.c_threads; ++i) {
+ tinfo = tinfo_list[i];
+
+ __wt_buf_free(NULL, &tinfo->vprint);
+
+ /*
+ * Assert records were not removed unless configured to do so, otherwise subsequent runs can
+ * incorrectly report scan errors.
+ */
+ testutil_assert(g.c_delete_pct != 0 || tinfo->remove == 0);
+
+ snap_teardown(tinfo);
+ key_gen_teardown(tinfo->key);
+ val_gen_teardown(tinfo->value);
+ key_gen_teardown(tinfo->lastkey);
+
+ free(tinfo);
+ }
+ free(tinfo_list);
+ tinfo_list = NULL;
+}
+
+/*
+ * Command used before rollback to stable to save the interesting files so we can replay the command
+ * as necessary.
+ *
+ * Redirect the "cd" command to /dev/null so chatty cd implementations don't add the new working
+ * directory to our output.
+ */
+#define ROLLBACK_STABLE_COPY_CMD \
+ "cd %s > /dev/null && " \
+ "rm -rf ROLLBACK.copy && mkdir ROLLBACK.copy && " \
+ "cp WiredTiger* wt* ROLLBACK.copy/"
+
+/*
+ * tinfo_rollback_to_stable_and_check --
+ * Do a rollback to stable, then check that changes are correct from what we know in the worker
+ * thread structures.
+ */
+static void
+tinfo_rollback_to_stable_and_check(WT_SESSION *session)
+{
+ WT_CURSOR *cursor;
+ WT_DECL_RET;
+ char cmd[512];
+
+ testutil_check(__wt_snprintf(cmd, sizeof(cmd), ROLLBACK_STABLE_COPY_CMD, g.home));
+ if ((ret = system(cmd)) != 0)
+ testutil_die(ret, "rollback to stable copy (\"%s\") failed", cmd);
+ trace_msg("%-10s ts=%" PRIu64, "rts", g.stable_timestamp);
+
+ g.wts_conn->rollback_to_stable(g.wts_conn, NULL);
+
+ /* Check the saved snap operations for consistency. */
+ testutil_check(session->open_cursor(session, g.uri, NULL, NULL, &cursor));
+ snap_repeat_rollback(cursor, tinfo_list, g.c_threads);
+ testutil_check(cursor->close(cursor));
+}
+
+/*
* operations --
* Perform a number of operations in a set of threads.
*/
@@ -153,29 +277,23 @@ operations(u_int ops_seconds, bool lastrun)
quit_fourths = fourths + 15 * 4 * 60;
}
+ /* Get a session. */
testutil_check(conn->open_session(conn, NULL, NULL, &session));
+
+ /* Initialize and start the worker threads. */
+ tinfo_init();
trace_msg("%s", "=============== thread ops start");
/* Initialize locks to single-thread backups, failures, and timestamp updates. */
lock_init(session, &g.backup_lock);
lock_init(session, &g.ts_lock);
- /*
- * Create the per-thread structures and start the worker threads. Allocate the thread structures
- * separately to minimize false sharing.
- */
- tinfo_list = dcalloc((size_t)g.c_threads + 1, sizeof(TINFO *));
for (i = 0; i < g.c_threads; ++i) {
- tinfo_list[i] = tinfo = dcalloc(1, sizeof(TINFO));
-
- tinfo->id = (int)i + 1;
- tinfo->state = TINFO_RUNNING;
+ tinfo = tinfo_list[i];
testutil_check(__wt_thread_create(NULL, &tinfo->tid, ops, tinfo));
}
- /*
- * If a multi-threaded run, start optional special-purpose threads.
- */
+ /* Start optional special-purpose threads. */
if (g.c_alter)
testutil_check(__wt_thread_create(NULL, &alter_tid, alter, NULL));
if (g.c_backups)
@@ -278,21 +396,14 @@ operations(u_int ops_seconds, bool lastrun)
lock_destroy(session, &g.ts_lock);
trace_msg("%s", "=============== thread ops stop");
- testutil_check(session->close(session, NULL));
- for (i = 0; i < g.c_threads; ++i) {
- tinfo = tinfo_list[i];
+ if (g.c_txn_rollback_to_stable)
+ tinfo_rollback_to_stable_and_check(session);
- __wt_buf_free(NULL, &tinfo->vprint);
+ testutil_check(session->close(session, NULL));
- /*
- * Assert records were not removed unless configured to do so, otherwise subsequent runs can
- * incorrectly report scan errors.
- */
- testutil_assert(g.c_delete_pct != 0 || tinfo->remove == 0);
- free(tinfo);
- }
- free(tinfo_list);
+ if (lastrun)
+ tinfo_teardown();
}
/*
@@ -335,7 +446,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, ts));
ret = session->timestamp_transaction(session, buf);
if (ret == 0) {
- snap_init(tinfo, ts, true);
+ snap_op_init(tinfo, ts, true);
trace_op(tinfo, "begin snapshot read-ts=%" PRIu64 " (repeatable)", ts);
return;
}
@@ -363,7 +474,7 @@ begin_transaction_ts(TINFO *tinfo, u_int *iso_configp)
lock_writeunlock(session, &g.ts_lock);
- snap_init(tinfo, ts, false);
+ snap_op_init(tinfo, ts, false);
trace_op(tinfo, "begin snapshot read-ts=%" PRIu64 " (not repeatable)", ts);
}
@@ -401,7 +512,7 @@ begin_transaction(TINFO *tinfo, u_int *iso_configp)
wiredtiger_begin_transaction(session, config);
- snap_init(tinfo, WT_TS_NONE, false);
+ snap_op_init(tinfo, WT_TS_NONE, false);
trace_op(tinfo, "begin %s", config);
}
@@ -605,10 +716,10 @@ ops(void *arg)
testutil_check(__wt_thread_str(tinfo->tidbuf, sizeof(tinfo->tidbuf)));
/*
- * Characterize the per-thread random number generator. Normally we want independent behavior
- * so threads start in different parts of the RNG space, but we've found bugs by having the
- * threads pound on the same key/value pairs, that is, by making them traverse the same RNG
- * space. 75% of the time we run in independent RNG space.
+ * Characterize the per-thread random number generator. Normally we want independent behavior so
+ * threads start in different parts of the RNG space, but we've found bugs by having the threads
+ * pound on the same key/value pairs, that is, by making them traverse the same RNG space. 75%
+ * of the time we run in independent RNG space.
*/
if (g.c_independent_thread_rng)
__wt_random_init_seed(NULL, &tinfo->rnd);
@@ -617,17 +728,6 @@ ops(void *arg)
iso_config = ISOLATION_RANDOM; /* -Wconditional-uninitialized */
- /* Tracking of transactional snapshot isolation operations. */
- tinfo->snap = tinfo->snap_first = tinfo->snap_list;
-
- /* Set up the default key and value buffers. */
- tinfo->key = &tinfo->_key;
- key_gen_init(tinfo->key);
- tinfo->value = &tinfo->_value;
- val_gen_init(tinfo->value);
- tinfo->lastkey = &tinfo->_lastkey;
- key_gen_init(tinfo->lastkey);
-
/* Set the first operation where we'll create sessions and cursors. */
cursor = NULL;
session = NULL;
@@ -996,16 +1096,11 @@ rollback:
intxn = false;
}
- if (session != NULL)
+ if (session != NULL) {
testutil_check(session->close(session, NULL));
-
- for (i = 0; i < WT_ELEMENTS(tinfo->snap_list); ++i) {
- free(tinfo->snap_list[i].kdata);
- free(tinfo->snap_list[i].vdata);
+ tinfo->cursor = NULL;
+ tinfo->session = NULL;
}
- key_gen_teardown(tinfo->key);
- val_gen_teardown(tinfo->value);
- key_gen_teardown(tinfo->lastkey);
tinfo->state = TINFO_COMPLETE;
return (WT_THREAD_RET_VALUE);
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index 3f82e5b6499..4fe784e87db 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -29,15 +29,111 @@
#include "format.h"
/*
+ * Issue a warning when there enough consecutive unsuccessful checks for rollback to stable.
+ */
+#define WARN_RTS_NO_CHECK 5
+
+/*
* snap_init --
* Initialize the repeatable operation tracking.
*/
void
-snap_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads)
+snap_init(TINFO *tinfo)
+{
+ /*
+ * We maintain two snap lists. The current one is indicated by tinfo->s, and keeps the most
+ * recent operations. The other one is used when we are running with rollback_to_stable. When
+ * each thread notices that the stable timestamp has changed, it stashes the current snap list
+ * and starts fresh with the other snap list. After we've completed a rollback_to_stable, we can
+ * the secondary snap list to see the state of keys/values seen and updated at the time of the
+ * rollback.
+ */
+ if (g.c_txn_rollback_to_stable) {
+ tinfo->s = &tinfo->snap_states[1];
+ tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS));
+ tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE];
+ }
+ tinfo->s = &tinfo->snap_states[0];
+ tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS));
+ tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE];
+ tinfo->snap_current = tinfo->snap_list;
+}
+
+/*
+ * snap_teardown --
+ * Tear down the repeatable operation tracking structures.
+ */
+void
+snap_teardown(TINFO *tinfo)
{
+ SNAP_OPS *snaplist;
+ u_int i, snap_index;
+
+ for (snap_index = 0; snap_index < WT_ELEMENTS(tinfo->snap_states); snap_index++)
+ if ((snaplist = tinfo->snap_states[snap_index].snap_state_list) != NULL) {
+ for (i = 0; i < SNAP_LIST_SIZE; ++i) {
+ free(snaplist[i].kdata);
+ free(snaplist[i].vdata);
+ }
+ free(snaplist);
+ }
+}
+
+/*
+ * snap_clear --
+ * Clear a single snap entry.
+ */
+static void
+snap_clear_one(SNAP_OPS *snap)
+{
+ snap->repeatable = false;
+}
+
+/*
+ * snap_clear --
+ * Clear the snap list.
+ */
+static void
+snap_clear(TINFO *tinfo)
+{
+ SNAP_OPS *snap;
+
+ for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap)
+ snap_clear_one(snap);
+}
+
+/*
+ * snap_op_init --
+ * Initialize the repeatable operation tracking for each new operation.
+ */
+void
+snap_op_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads)
+{
+ uint64_t stable_ts;
+
++tinfo->opid;
- tinfo->snap_first = tinfo->snap;
+ if (g.c_txn_rollback_to_stable) {
+ /*
+ * If the stable timestamp has changed and we've advanced beyond it, preserve the current
+ * snapshot history up to this point, we'll use it verify rollback_to_stable. Switch our
+ * tracking to the other snap list.
+ */
+ stable_ts = __wt_atomic_addv64(&g.stable_timestamp, 0);
+ if (stable_ts != tinfo->stable_ts && read_ts > stable_ts) {
+ tinfo->stable_ts = stable_ts;
+ if (tinfo->s == &tinfo->snap_states[0])
+ tinfo->s = &tinfo->snap_states[1];
+ else
+ tinfo->s = &tinfo->snap_states[0];
+ tinfo->snap_current = tinfo->snap_list;
+
+ /* Clear out older info from the snap list. */
+ snap_clear(tinfo);
+ }
+ }
+
+ tinfo->snap_first = tinfo->snap_current;
tinfo->read_ts = read_ts;
tinfo->repeatable_reads = repeatable_reads;
@@ -54,7 +150,7 @@ snap_track(TINFO *tinfo, thread_op op)
WT_ITEM *ip;
SNAP_OPS *snap;
- snap = tinfo->snap;
+ snap = tinfo->snap_current;
snap->op = op;
snap->opid = tinfo->opid;
snap->keyno = tinfo->keyno;
@@ -82,15 +178,15 @@ snap_track(TINFO *tinfo, thread_op op)
}
/* Move to the next slot, wrap at the end of the circular buffer. */
- if (++tinfo->snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
- tinfo->snap = tinfo->snap_list;
+ if (++tinfo->snap_current >= tinfo->snap_end)
+ tinfo->snap_current = tinfo->snap_list;
/*
* It's possible to pass this transaction's buffer starting point and start replacing our own
* entries. If that happens, we can't repeat operations because we don't know which ones were
* previously modified.
*/
- if (tinfo->snap->opid == tinfo->opid)
+ if (tinfo->snap_current->opid == tinfo->opid)
tinfo->repeatable_wrap = true;
}
@@ -236,10 +332,9 @@ static void
snap_ts_clear(TINFO *tinfo, uint64_t ts)
{
SNAP_OPS *snap;
- int count;
/* Check from the first slot to the last. */
- for (snap = tinfo->snap_list, count = WT_ELEMENTS(tinfo->snap_list); count > 0; --count, ++snap)
+ for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap)
if (snap->repeatable && snap->ts <= ts)
snap->repeatable = false;
}
@@ -297,7 +392,7 @@ snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current)
*/
for (p = current;;) {
/* Wrap at the end of the circular buffer. */
- if (++p >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
+ if (++p >= tinfo->snap_end)
p = tinfo->snap_list;
if (p->opid != tinfo->opid)
break;
@@ -311,7 +406,7 @@ snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current)
for (p = current;;) {
/* Wrap at the beginning of the circular buffer. */
if (--p < tinfo->snap_list)
- p = &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list) - 1];
+ p = &tinfo->snap_list[SNAP_LIST_SIZE - 1];
if (p->opid != tinfo->opid)
break;
@@ -341,7 +436,7 @@ snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current)
for (p = current;;) {
/* Wrap at the beginning of the circular buffer. */
if (--p < tinfo->snap_list)
- p = &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list) - 1];
+ p = &tinfo->snap_list[SNAP_LIST_SIZE - 1];
if (p->opid != tinfo->opid)
break;
@@ -367,7 +462,7 @@ snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo)
/* Check from the first operation we saved to the last. */
for (current = tinfo->snap_first;; ++current) {
/* Wrap at the end of the circular buffer. */
- if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
+ if (current >= tinfo->snap_end)
current = tinfo->snap_list;
if (current->opid != tinfo->opid)
break;
@@ -401,7 +496,7 @@ snap_repeat_update(TINFO *tinfo, bool committed)
/* Check from the first operation we saved to the last. */
for (current = tinfo->snap_first;; ++current) {
/* Wrap at the end of the circular buffer. */
- if (current >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
+ if (current >= tinfo->snap_end)
current = tinfo->snap_list;
if (current->opid != tinfo->opid)
break;
@@ -429,45 +524,22 @@ snap_repeat_update(TINFO *tinfo, bool committed)
}
/*
- * snap_repeat_single --
- * Repeat an historic operation.
+ * snap_repeat --
+ * Repeat one operation.
*/
-void
-snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
+static void
+snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap, bool rollback_allowed)
{
- SNAP_OPS *snap;
WT_DECL_RET;
WT_SESSION *session;
- int count;
- u_int v;
char buf[64];
session = cursor->session;
/*
- * Start at a random spot in the list of operations and look for a read to retry. Stop when
- * we've walked the entire list or found one.
- */
- v = mmrand(&tinfo->rnd, 1, WT_ELEMENTS(tinfo->snap_list)) - 1;
- for (snap = &tinfo->snap_list[v], count = WT_ELEMENTS(tinfo->snap_list); count > 0;
- --count, ++snap) {
- /* Wrap at the end of the circular buffer. */
- if (snap >= &tinfo->snap_list[WT_ELEMENTS(tinfo->snap_list)])
- snap = tinfo->snap_list;
-
- if (snap->repeatable)
- break;
- }
-
- if (count == 0)
- return;
-
- /*
* Start a new transaction. Set the read timestamp. Verify the record. Discard the transaction.
*/
- while ((ret = session->begin_transaction(session, "isolation=snapshot")) == WT_CACHE_FULL)
- __wt_yield();
- testutil_check(ret);
+ wiredtiger_begin_transaction(session, "isolation=snapshot");
/*
* If the timestamp has aged out of the system, we'll get EINVAL when we try and set it.
@@ -482,7 +554,7 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
/* The only expected error is rollback. */
ret = snap_verify(cursor, tinfo, snap);
- if (ret != 0 && ret != WT_ROLLBACK)
+ if (ret != 0 && (!rollback_allowed || ret != WT_ROLLBACK))
testutil_check(ret);
} else if (ret == EINVAL)
snap_ts_clear(tinfo, snap->ts);
@@ -492,3 +564,91 @@ snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
/* Discard the transaction. */
testutil_check(session->rollback_transaction(session, NULL));
}
+
+/*
+ * snap_repeat_single --
+ * Repeat an historic operation.
+ */
+void
+snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
+{
+ SNAP_OPS *snap;
+ u_int v;
+ int count;
+
+ /*
+ * Start at a random spot in the list of operations and look for a read to retry. Stop when
+ * we've walked the entire list or found one.
+ */
+ v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1;
+ for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) {
+ /* Wrap at the end of the circular buffer. */
+ if (snap >= tinfo->snap_end)
+ snap = tinfo->snap_list;
+
+ if (snap->repeatable)
+ break;
+ }
+
+ if (count == 0)
+ return;
+
+ snap_repeat(cursor, tinfo, snap, true);
+}
+
+/*
+ * snap_repeat_rollback --
+ * Repeat all known operations after a rollback.
+ */
+void
+snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count)
+{
+ SNAP_OPS *snap;
+ SNAP_STATE *state;
+ TINFO *tinfo, **tinfop;
+ uint32_t count;
+ size_t i, statenum;
+ char buf[100];
+
+ count = 0;
+
+ track("rollback_to_stable: checking", 0ULL, NULL);
+ for (i = 0, tinfop = tinfo_array; i < tinfo_count; ++i, ++tinfop) {
+ tinfo = *tinfop;
+
+ /*
+ * For this thread, walk through both sets of snaps ("states"), looking for entries that are
+ * repeatable and have relevant timestamps. One set will have the most current operations,
+ * meaning they will likely be newer than the stable timestamp, and thus cannot be checked.
+ * The other set typically has operations that are just before the stable timestamp, so are
+ * candidates for checking.
+ */
+ for (statenum = 0; statenum < WT_ELEMENTS(tinfo->snap_states); statenum++) {
+ state = &tinfo->snap_states[statenum];
+ for (snap = state->snap_state_list; snap < state->snap_state_end; ++snap) {
+ if (snap->repeatable && snap->ts <= g.stable_timestamp &&
+ snap->ts >= g.oldest_timestamp) {
+ snap_repeat(cursor, tinfo, snap, false);
+ ++count;
+ if (count % 100 == 0) {
+ testutil_check(__wt_snprintf(
+ buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count));
+ track(buf, 0ULL, NULL);
+ }
+ }
+ snap_clear_one(snap);
+ }
+ }
+ }
+
+ /* Show the final result and check that we're accomplishing some checking. */
+ testutil_check(
+ __wt_snprintf(buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count));
+ track(buf, 0ULL, NULL);
+ if (count == 0) {
+ if (++g.rts_no_check >= WARN_RTS_NO_CHECK)
+ fprintf(stderr,
+ "Warning: %" PRIu32 " consecutive runs with no rollback_to_stable checking\n", count);
+ } else
+ g.rts_no_check = 0;
+}
diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c
index 787488c15b7..d2ae281e281 100644
--- a/src/third_party/wiredtiger/test/format/util.c
+++ b/src/third_party/wiredtiger/test/format/util.c
@@ -28,12 +28,46 @@
#include "format.h"
+/*
+ * track_ts_diff --
+ * Return a one character descriptor of relative timestamp values.
+ */
+static const char *
+track_ts_diff(uint64_t left_ts, uint64_t right_ts)
+{
+ if (left_ts < right_ts)
+ return "+";
+ else if (left_ts == right_ts)
+ return "=";
+ else
+ return "-";
+}
+
+/*
+ * track_ts_dots --
+ * Return an entry in the time stamp progress indicator.
+ */
+static const char *
+track_ts_dots(u_int dot_count)
+{
+ static const char *dots[] = {" ", ". ", ".. ", "..."};
+
+ return (dots[dot_count % WT_ELEMENTS(dots)]);
+}
+
+/*
+ * track --
+ * Show a status line of operations and time stamp progress.
+ */
void
track(const char *tag, uint64_t cnt, TINFO *tinfo)
{
- static size_t lastlen = 0;
+ static size_t last_len;
+ static uint64_t last_cur, last_old, last_stable;
+ static u_int cur_dot_cnt, old_dot_cnt, stable_dot_cnt;
size_t len;
- char msg[128];
+ uint64_t cur_ts, old_ts, stable_ts;
+ char msg[128], ts_msg[64];
if (g.c_quiet || tag == NULL)
return;
@@ -44,12 +78,49 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo)
else if (tinfo == NULL)
testutil_check(__wt_snprintf_len_set(
msg, sizeof(msg), &len, "%4" PRIu32 ": %s: %" PRIu64, g.run_cnt, tag, cnt));
- else
+ else {
+ ts_msg[0] = '\0';
+ if (g.c_txn_timestamps) {
+ /*
+ * Don't worry about having a completely consistent set of timestamps.
+ */
+ old_ts = g.oldest_timestamp;
+ stable_ts = g.stable_timestamp;
+ cur_ts = g.timestamp;
+
+ if (old_ts != last_old) {
+ ++old_dot_cnt;
+ last_old = old_ts;
+ }
+ if (stable_ts != last_stable) {
+ ++stable_dot_cnt;
+ last_stable = stable_ts;
+ }
+ if (cur_ts != last_cur) {
+ ++cur_dot_cnt;
+ last_cur = cur_ts;
+ }
+
+ if (g.c_txn_rollback_to_stable)
+ testutil_check(__wt_snprintf(ts_msg, sizeof(ts_msg),
+ " old%s"
+ "stb%s%s"
+ "ts%s%s",
+ track_ts_dots(old_dot_cnt), track_ts_diff(old_ts, stable_ts),
+ track_ts_dots(stable_dot_cnt), track_ts_diff(stable_ts, cur_ts),
+ track_ts_dots(cur_dot_cnt)));
+ else
+ testutil_check(__wt_snprintf(ts_msg, sizeof(ts_msg),
+ " old%s"
+ "ts%s%s",
+ track_ts_dots(old_dot_cnt), track_ts_diff(old_ts, cur_ts),
+ track_ts_dots(cur_dot_cnt)));
+ }
testutil_check(__wt_snprintf_len_set(msg, sizeof(msg), &len, "%4" PRIu32 ": %s: "
- "search %" PRIu64 "%s, "
- "insert %" PRIu64 "%s, "
- "update %" PRIu64 "%s, "
- "remove %" PRIu64 "%s",
+ "S %" PRIu64 "%s, "
+ "I %" PRIu64 "%s, "
+ "U %" PRIu64 "%s, "
+ "R %" PRIu64 "%s%s",
g.run_cnt, tag, tinfo->search > M(9) ? tinfo->search / M(1) : tinfo->search,
tinfo->search > M(9) ? "M" : "",
tinfo->insert > M(9) ? tinfo->insert / M(1) : tinfo->insert,
@@ -57,13 +128,13 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo)
tinfo->update > M(9) ? tinfo->update / M(1) : tinfo->update,
tinfo->update > M(9) ? "M" : "",
tinfo->remove > M(9) ? tinfo->remove / M(1) : tinfo->remove,
- tinfo->remove > M(9) ? "M" : ""));
-
- if (lastlen > len) {
- memset(msg + len, ' ', (size_t)(lastlen - len));
- msg[lastlen] = '\0';
+ tinfo->remove > M(9) ? "M" : "", ts_msg));
+ }
+ if (last_len > len) {
+ memset(msg + len, ' ', (size_t)(last_len - len));
+ msg[last_len] = '\0';
}
- lastlen = len;
+ last_len = len;
if (printf("%s\r", msg) < 0)
testutil_die(EIO, "printf");
@@ -163,16 +234,18 @@ fclose_and_clear(FILE **fpp)
* Update the timestamp once.
*/
void
-timestamp_once(WT_SESSION *session)
+timestamp_once(WT_SESSION *session, bool allow_lag)
{
static const char *oldest_timestamp_str = "oldest_timestamp=";
+ static const char *stable_timestamp_str = "stable_timestamp=";
WT_CONNECTION *conn;
WT_DECL_RET;
- char buf[WT_TS_HEX_STRING_SIZE + 64];
+ size_t len;
+ uint64_t all_durable, stable;
+ char buf[WT_TS_HEX_STRING_SIZE * 2 + 64], tsbuf[WT_TS_HEX_STRING_SIZE];
conn = g.wts_conn;
-
- testutil_check(__wt_snprintf(buf, sizeof(buf), "%s", oldest_timestamp_str));
+ stable = 0ULL;
/*
* Lock out transaction timestamp operations. The lock acts as a barrier ensuring we've checked
@@ -183,16 +256,57 @@ timestamp_once(WT_SESSION *session)
if (LOCK_INITIALIZED(&g.ts_lock))
lock_writelock(session, &g.ts_lock);
- ret = conn->query_timestamp(conn, buf + strlen(oldest_timestamp_str), "get=all_durable");
- testutil_assert(ret == 0 || ret == WT_NOTFOUND);
- if (ret == 0)
+ if ((ret = conn->query_timestamp(conn, tsbuf, "get=all_durable")) == 0) {
+ timestamp_parse(session, tsbuf, &all_durable);
+
+ /*
+ * If a lag is permitted, move the oldest timestamp half the way to the current
+ * "all_durable" timestamp.
+ */
+ if (allow_lag)
+ g.oldest_timestamp = (all_durable + g.oldest_timestamp) / 2;
+ else
+ g.oldest_timestamp = all_durable;
+ testutil_check(
+ __wt_snprintf(buf, sizeof(buf), "%s%" PRIx64, oldest_timestamp_str, g.oldest_timestamp));
+
+ /*
+ * When we're doing rollback to stable operations, we'll advance the stable timestamp to the
+ * current timestamp value.
+ */
+ if (g.c_txn_rollback_to_stable) {
+ stable = g.timestamp;
+ len = strlen(buf);
+ WT_ASSERT((WT_SESSION_IMPL *)session, len < sizeof(buf));
+ testutil_check(__wt_snprintf(
+ buf + len, sizeof(buf) - len, ",%s%" PRIx64, stable_timestamp_str, stable));
+ }
testutil_check(conn->set_timestamp(conn, buf));
+ trace_msg("%-10s oldest=%" PRIu64 ", stable=%" PRIu64, "setts", g.oldest_timestamp, stable);
+ if (g.c_txn_rollback_to_stable)
+ g.stable_timestamp = stable;
+
+ } else
+ testutil_assert(ret == WT_NOTFOUND);
if (LOCK_INITIALIZED(&g.ts_lock))
lock_writeunlock(session, &g.ts_lock);
}
/*
+ * timestamp_parse --
+ * Parse a timestamp to an integral value.
+ */
+void
+timestamp_parse(WT_SESSION *session, const char *str, uint64_t *tsp)
+{
+ char *p;
+
+ *tsp = strtoull(str, &p, 16);
+ WT_ASSERT((WT_SESSION_IMPL *)session, p - str <= 16);
+}
+
+/*
* timestamp --
* Periodically update the oldest timestamp.
*/
@@ -212,16 +326,28 @@ timestamp(void *arg)
/* Update the oldest timestamp at least once every 15 seconds. */
done = false;
do {
+ random_sleep(&g.rnd, 15);
+
/*
- * Do a final bump of the oldest timestamp as part of shutting down the worker threads,
- * otherwise recent operations can prevent verify from running.
+ * If running without rollback_to_stable, do a final bump of the oldest timestamp as part of
+ * shutting down the worker threads, otherwise recent operations can prevent verify from
+ * running.
+ *
+ * With rollback_to_stable configured, don't do a bump at the end of the run. We need the
+ * worker threads to have time to see any changes in the stable timestamp, so they can stash
+ * their stable state - if we bump they will have no time to do that. And when we rollback,
+ * we'd like to see a reasonable amount of data changed. So we don't bump the stable
+ * timestamp, and we can't bump the oldest timestamp as well, as it would get ahead of the
+ * stable timestamp, which is not allowed.
*/
if (g.workers_finished)
done = true;
- else
- random_sleep(&g.rnd, 15);
- timestamp_once(session);
+ if (!done || !g.c_txn_rollback_to_stable) {
+ timestamp_once(session, true);
+ if (done)
+ timestamp_once(session, true);
+ }
} while (!done);
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py
new file mode 100755
index 00000000000..eed6d8e0c26
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable11.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, time
+from helper import copy_wiredtiger_home
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_rollback_to_stable11.py
+# Test the rollback to stable is retrieving the proper hs update.
+class test_rollback_to_stable11(test_rollback_to_stable_base):
+ session_config = 'isolation=snapshot'
+
+ prepare_values = [
+ ('no_prepare', dict(prepare=False)),
+ ('prepare', dict(prepare=True))
+ ]
+
+ scenarios = make_scenarios(prepare_values)
+
+ def conn_config(self):
+ config = 'cache_size=1MB,statistics=(all),log=(enabled=true)'
+ return config
+
+ def simulate_crash_restart(self, olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+ #
+ # close the original connection and open to new directory
+ # NOTE: This really cannot test the difference between the
+ # write-no-sync (off) version of log_flush and the sync
+ # version since we're not crashing the system itself.
+ #
+ self.close_conn()
+ self.conn = self.setUpConnectionOpen(newdir)
+ self.session = self.setUpSessionOpen(self.conn)
+
+ def test_rollback_to_stable(self):
+ nrows = 1
+
+ # Create a table without logging.
+ uri = "table:rollback_to_stable11"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format="i", value_format="S", config='log=(enabled=false)')
+ ds.populate()
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+ value_c = "ccccc" * 100
+ value_d = "ddddd" * 100
+
+ # Perform several updates.
+ self.large_updates(uri, value_a, ds, nrows, 20)
+ self.large_updates(uri, value_a, ds, nrows, 20)
+ self.large_updates(uri, value_a, ds, nrows, 20)
+ self.large_updates(uri, value_b, ds, nrows, 20)
+
+ # Verify data is visible and correct.
+ self.check(value_b, uri, nrows, 20)
+
+ # Pin stable to timestamp 30 if prepare otherwise 20.
+ if self.prepare:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(30))
+ else:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(20))
+
+ # Checkpoint to ensure that all the updates are flushed to disk.
+ self.session.checkpoint()
+
+ # Simulate a server crash and restart.
+ self.simulate_crash_restart(".", "RESTART")
+
+ # Check that the correct data is seen at and after the stable timestamp.
+ self.check(value_b, uri, nrows, 20)
+
+ # Perform several updates.
+ self.large_updates(uri, value_c, ds, nrows, 30)
+ self.large_updates(uri, value_c, ds, nrows, 30)
+ self.large_updates(uri, value_c, ds, nrows, 30)
+ self.large_updates(uri, value_d, ds, nrows, 30)
+
+ # Verify data is visible and correct.
+ self.check(value_d, uri, nrows, 30)
+
+ # Checkpoint to ensure that all the updates are flushed to disk.
+ self.session.checkpoint()
+
+ # Simulate a server crash and restart.
+ self.simulate_crash_restart("RESTART", "RESTART2")
+
+ # Check that the correct data is seen at and after the stable timestamp.
+ self.check(value_b, uri, nrows, 20)
+ self.check(value_b, uri, nrows, 40)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ calls = stat_cursor[stat.conn.txn_rts][2]
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+
+ self.assertEqual(calls, 0)
+ self.assertEqual(keys_removed, 0)
+ self.assertEqual(keys_restored, 0)
+ self.assertEqual(upd_aborted, 0)
+ self.assertGreater(pages_visited, 0)
+ self.assertEqual(hs_removed, 4)
+ self.assertEqual(hs_sweep, 0)
+
+if __name__ == '__main__':
+ wttest.run()
diff --git a/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
new file mode 100755
index 00000000000..b4fa7a9087b
--- /dev/null
+++ b/src/third_party/wiredtiger/test/suite/test_rollback_to_stable12.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python
+#
+# Public Domain 2014-2020 MongoDB, Inc.
+# Public Domain 2008-2014 WiredTiger, Inc.
+#
+# This is free and unencumbered software released into the public domain.
+#
+# Anyone is free to copy, modify, publish, use, compile, sell, or
+# distribute this software, either in source code form or as a compiled
+# binary, for any purpose, commercial or non-commercial, and by any
+# means.
+#
+# In jurisdictions that recognize copyright laws, the author or authors
+# of this software dedicate any and all copyright interest in the
+# software to the public domain. We make this dedication for the benefit
+# of the public at large and to the detriment of our heirs and
+# successors. We intend this dedication to be an overt act of
+# relinquishment in perpetuity of all present and future rights to this
+# software under copyright law.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+import fnmatch, os, shutil, time
+from helper import copy_wiredtiger_home
+from test_rollback_to_stable01 import test_rollback_to_stable_base
+from wiredtiger import stat
+from wtdataset import SimpleDataSet
+from wtscenario import make_scenarios
+
+def timestamp_str(t):
+ return '%x' % t
+
+# test_rollback_to_stable12.py
+# Test the rollback to stable operation skipping subtrees in during tree walk.
+class test_rollback_to_stable12(test_rollback_to_stable_base):
+ session_config = 'isolation=snapshot'
+
+ prepare_values = [
+ ('no_prepare', dict(prepare=False)),
+ ('prepare', dict(prepare=True))
+ ]
+
+ scenarios = make_scenarios(prepare_values)
+
+ def conn_config(self):
+ config = 'cache_size=500MB,statistics=(all),log=(enabled=true)'
+ return config
+
+ def simulate_crash_restart(self, olddir, newdir):
+ ''' Simulate a crash from olddir and restart in newdir. '''
+ # with the connection still open, copy files to new directory
+ shutil.rmtree(newdir, ignore_errors=True)
+ os.mkdir(newdir)
+ for fname in os.listdir(olddir):
+ fullname = os.path.join(olddir, fname)
+ # Skip lock file on Windows since it is locked
+ if os.path.isfile(fullname) and \
+ "WiredTiger.lock" not in fullname and \
+ "Tmplog" not in fullname and \
+ "Preplog" not in fullname:
+ shutil.copy(fullname, newdir)
+ #
+ # close the original connection and open to new directory
+ # NOTE: This really cannot test the difference between the
+ # write-no-sync (off) version of log_flush and the sync
+ # version since we're not crashing the system itself.
+ #
+ self.close_conn()
+ self.conn = self.setUpConnectionOpen(newdir)
+ self.session = self.setUpSessionOpen(self.conn)
+
+ def test_rollback_to_stable(self):
+ nrows = 1000000
+
+ # Create a table without logging.
+ uri = "table:rollback_to_stable12"
+ ds = SimpleDataSet(
+ self, uri, 0, key_format="i", value_format="S", config='split_pct=50,log=(enabled=false)')
+ ds.populate()
+
+ # Pin oldest and stable to timestamp 10.
+ self.conn.set_timestamp('oldest_timestamp=' + timestamp_str(10) +
+ ',stable_timestamp=' + timestamp_str(10))
+
+ value_a = "aaaaa" * 100
+ value_b = "bbbbb" * 100
+
+ # Perform several updates.
+ self.large_updates(uri, value_a, ds, nrows, 20)
+
+ # Verify data is visible and correct.
+ self.check(value_a, uri, nrows, 20)
+
+ # Pin stable to timestamp 30 if prepare otherwise 20.
+ if self.prepare:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(30))
+ else:
+ self.conn.set_timestamp('stable_timestamp=' + timestamp_str(20))
+
+ # Load a single row modification to be removed.
+ commit_ts = 30
+ cursor = self.session.open_cursor(uri)
+ self.session.begin_transaction()
+ cursor[ds.key(1)] = value_b
+ if self.prepare:
+ self.session.prepare_transaction('prepare_timestamp=' + timestamp_str(commit_ts-1))
+ self.session.timestamp_transaction('commit_timestamp=' + timestamp_str(commit_ts))
+ self.session.timestamp_transaction('durable_timestamp=' + timestamp_str(commit_ts+1))
+ self.session.commit_transaction()
+ else:
+ self.session.commit_transaction('commit_timestamp=' + timestamp_str(commit_ts))
+ cursor.close()
+
+ self.session.checkpoint()
+
+ # Simulate a server crash and restart.
+ self.simulate_crash_restart(".", "RESTART")
+
+ # Check that the correct data is seen at and after the stable timestamp.
+ self.check(value_a, uri, nrows, 30)
+
+ stat_cursor = self.session.open_cursor('statistics:', None, None)
+ calls = stat_cursor[stat.conn.txn_rts][2]
+ hs_removed = stat_cursor[stat.conn.txn_rts_hs_removed][2]
+ hs_sweep = stat_cursor[stat.conn.txn_rts_sweep_hs_keys][2]
+ keys_removed = stat_cursor[stat.conn.txn_rts_keys_removed][2]
+ keys_restored = stat_cursor[stat.conn.txn_rts_keys_restored][2]
+ pages_visited = stat_cursor[stat.conn.txn_rts_pages_visited][2]
+ pages_walk_skipped = stat_cursor[stat.conn.txn_rts_skip_interal_pages_walk][2]
+ upd_aborted = stat_cursor[stat.conn.txn_rts_upd_aborted][2]
+ stat_cursor.close()
+
+ self.assertEqual(calls, 0)
+ self.assertEqual(keys_removed, 0)
+ self.assertEqual(keys_restored, 0)
+ self.assertGreaterEqual(upd_aborted, 0)
+ self.assertGreater(pages_visited, 0)
+ self.assertGreaterEqual(hs_removed, 0)
+ self.assertEqual(hs_sweep, 0)
+ self.assertGreater(pages_walk_skipped, 0)
+
+if __name__ == '__main__':
+ wttest.run()