summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Chen <luke.chen@mongodb.com>2023-02-13 09:36:07 +1100
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2023-02-12 23:24:00 +0000
commit2d695bc7f8eb7328efd728ab0419255bbcc5beed (patch)
tree889c07054168a84eb3c811d90e4693614dbdab4f
parentec7653ce39c817c1832fd6238474b1264a31b7b0 (diff)
downloadmongo-2d695bc7f8eb7328efd728ab0419255bbcc5beed.tar.gz
Import wiredtiger: 604dd69988250e1c8698cf7e5ac5dbce4a8f88f7 from branch mongodb-master
ref: bb3421a839..604dd69988 for: 7.0.0-rc0 WT-9915 For tiered storage testing, get predictable outputs for test/format
-rw-r--r--src/third_party/wiredtiger/import.data2
-rwxr-xr-xsrc/third_party/wiredtiger/test/evergreen.yml61
-rw-r--r--src/third_party/wiredtiger/test/format/CONFIG.replay26
-rw-r--r--src/third_party/wiredtiger/test/format/alter.c4
-rw-r--r--src/third_party/wiredtiger/test/format/backup.c10
-rw-r--r--src/third_party/wiredtiger/test/format/bulk.c8
-rw-r--r--src/third_party/wiredtiger/test/format/checkpoint.c10
-rw-r--r--src/third_party/wiredtiger/test/format/compact.c4
-rw-r--r--src/third_party/wiredtiger/test/format/config.h75
-rwxr-xr-xsrc/third_party/wiredtiger/test/format/config.sh8
-rw-r--r--src/third_party/wiredtiger/test/format/format.h81
-rw-r--r--src/third_party/wiredtiger/test/format/format_config.c207
-rw-r--r--src/third_party/wiredtiger/test/format/format_config_def.c15
-rw-r--r--src/third_party/wiredtiger/test/format/format_inline.h32
-rw-r--r--src/third_party/wiredtiger/test/format/format_salvage.c2
-rw-r--r--src/third_party/wiredtiger/test/format/format_timestamp.c43
-rw-r--r--src/third_party/wiredtiger/test/format/hs.c6
-rw-r--r--src/third_party/wiredtiger/test/format/import.c4
-rw-r--r--src/third_party/wiredtiger/test/format/kv.c6
-rw-r--r--src/third_party/wiredtiger/test/format/ops.c319
-rw-r--r--src/third_party/wiredtiger/test/format/random.c6
-rw-r--r--src/third_party/wiredtiger/test/format/replay.c548
-rw-r--r--src/third_party/wiredtiger/test/format/snap.c2
-rw-r--r--src/third_party/wiredtiger/test/format/t.c26
-rw-r--r--src/third_party/wiredtiger/test/format/wts.c6
-rwxr-xr-xsrc/third_party/wiredtiger/tools/wt_cmp_dir2
26 files changed, 1301 insertions, 212 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 17bb59ba27a..332601000e4 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger.git",
"branch": "mongodb-master",
- "commit": "bb3421a83981c5ece92579e9689e1636db90b559"
+ "commit": "604dd69988250e1c8698cf7e5ac5dbce4a8f88f7"
}
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index 07931f47523..1d360bfdd6e 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -473,6 +473,53 @@ functions:
for i in $(seq ${times|1}); do
./t -c ${config|../../../test/format/CONFIG.stress} ${trace_args|-T bulk,txn,retain=100} ${extra_args|} || ( [ -f RUNDIR/CONFIG ] && cat RUNDIR/CONFIG ) 2>&1
done
+ "format test predictable":
+ command: shell.exec
+ params:
+ working_dir: "wiredtiger/cmake_build/test/format"
+ script: |
+ # To test predictable replay, we run test/format three times with the same data seed
+ # each time, and compare the keys and values found in the WT home directories.
+ # The first run is a timed one. When it's completed, we get the run's stable timestamp,
+ # and do the subsequent runs up to that stable timestamp. This, along with predictable
+ # replay using the same data seed, should guarantee we have equivalent data created.
+ set -o errexit
+ set -o verbose
+ fail() {
+ echo "======= FAILURE =========="
+ for file; do
+ if [ -f "$file" ]; then
+ echo Contents of "$file":
+ cat "$file"
+ echo "================"
+ fi
+ done
+ exit 1
+ }
+ runtime=3 # minutes
+ config=../../../test/format/CONFIG.replay
+ for i in $(seq ${times}); do
+ echo Iteration $i/${times}
+ x2=$RANDOM$RANDOM
+ x3=$RANDOM$RANDOM
+ rm -rf RUNDIR_{1,2,3}
+
+ first_run_args="-c $config runs.timer=$runtime"
+ ./t -h RUNDIR_1 $first_run_args ${extra_args} || fail RUNDIR_1/CONFIG 2>&1
+ stable_hex=$(../../../tools/wt_timestamps RUNDIR_1 | sed -e '/stable=/!d' -e 's/.*=//')
+ ops=$(echo $((0x$stable_hex)))
+
+ # Do the second run up to the stable timestamp, using the same data seed,
+ # but with a different extra seed. Compare it when done.
+ common_args="-c RUNDIR_1/CONFIG runs.timer=0 runs.ops=$ops"
+ ./t -h RUNDIR_2 $common_args random.extra_seed=$x2 || fail RUNDIR_2/CONFIG 2>&1
+ ../../../tools/wt_cmp_dir RUNDIR_1 RUNDIR_2 || fail RUNDIR_1/CONFIG RUNDIR_2/CONFIG 2>&1
+
+ # Do the third run up to the stable timestamp, using the same data seed,
+ # but with a different extra seed. Compare it to the second run when done.
+ ./t -h RUNDIR_3 $common_args random.extra_seed=$x3 || fail RUNDIR_3/CONFIG 2>&1
+ ../../../tools/wt_cmp_dir RUNDIR_2 RUNDIR_3 || fail RUNDIR_2/CONFIG RUNDIR_3/CONFIG 2>&1
+ done
"format test script":
command: shell.exec
params:
@@ -3727,6 +3774,19 @@ tasks:
vars:
format_test_script_args: -a -t 30
+ - name: format-predictable-test
+ # Set 2.5 hour timeout (60 * 60 * 2.5)
+ exec_timeout_secs: 9000
+ commands:
+ - func: "get project"
+ - func: "compile wiredtiger"
+ vars:
+ <<: *configure_flags_with_builtins
+ CMAKE_TOOLCHAIN_FILE: -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/mongodbtoolchain_v4_gcc.cmake
+ - func: "format test predictable"
+ vars:
+ times: 5
+
- name: many-collection-test
commands:
- command: timeout.update
@@ -4918,6 +4978,7 @@ buildvariants:
- name: ".stress-test-4-nonstandalone"
- name: ".stress-test-no-barrier-nonstandalone"
- name: format-abort-recovery-stress-test-nonstandalone
+ - name: format-predictable-test
# When running the Python tests on this variant tcmalloc must be preloaded otherwise the wiredtiger library
# fails to load and resolve its dependency.
diff --git a/src/third_party/wiredtiger/test/format/CONFIG.replay b/src/third_party/wiredtiger/test/format/CONFIG.replay
new file mode 100644
index 00000000000..9e42cf00cfd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/CONFIG.replay
@@ -0,0 +1,26 @@
+############################################
+# RUN PARAMETERS: V3
+############################################
+# A configuration for predictable replay.
+# Some things are locked down at the moment.
+backup=0
+btree.huffman_value=0
+cache.minimum=20
+format.abort=0
+format.independent_thread_rng=1
+import=0
+ops.alter=0
+ops.compaction=0
+ops.truncate=0
+ops.salvage=0
+quiet=0
+runs.in_memory=0
+runs.mirror=0
+runs.predictable_replay=1
+runs.rows=1000000:5000000
+runs.tables=3:10
+runs.threads=4:32
+runs.timer=6:30
+runs.timer=30
+transaction.implicit=0
+transaction.timestamps=1
diff --git a/src/third_party/wiredtiger/test/format/alter.c b/src/third_party/wiredtiger/test/format/alter.c
index 546bfb8a8d5..8f5e0000662 100644
--- a/src/third_party/wiredtiger/test/format/alter.c
+++ b/src/third_party/wiredtiger/test/format/alter.c
@@ -60,14 +60,14 @@ alter(void *arg)
counter = 0;
while (!g.workers_finished) {
- period = mmrand(NULL, 1, 10);
+ period = mmrand(&g.extra_rnd, 1, 10);
testutil_check(__wt_snprintf(
buf, sizeof(buf), "access_pattern_hint=%s", access_value ? "random" : "none"));
access_value = !access_value;
/* Alter can return EBUSY if concurrent with other operations. */
- table = table_select(NULL);
+ table = table_select(NULL, false);
trace_msg(session, "Alter #%u URI %s start %s", ++counter, table->uri, buf);
while ((ret = session->alter(session, table->uri, buf)) != 0 && ret != EBUSY)
diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c
index f463b20d5a4..31f8aa5ee2e 100644
--- a/src/third_party/wiredtiger/test/format/backup.c
+++ b/src/third_party/wiredtiger/test/format/backup.c
@@ -539,7 +539,7 @@ backup(void *arg)
* larger intervals, optionally do incremental backups between full backups.
*/
this_id = 0;
- for (period = mmrand(NULL, 1, 10);; period = mmrand(NULL, 20, 45)) {
+ for (period = mmrand(&g.extra_rnd, 1, 10);; period = mmrand(&g.extra_rnd, 20, 45)) {
/* Sleep for short periods so we don't make the run wait. */
while (period > 0 && !g.workers_finished) {
--period;
@@ -584,7 +584,7 @@ backup(void *arg)
src_id, g.backup_id));
/* Restart a full incremental every once in a while. */
full = false;
- incr_full = mmrand(NULL, 1, 8) == 1;
+ incr_full = mmrand(&g.extra_rnd, 1, 8) == 1;
}
this_id = g.backup_id++;
config = cfg;
@@ -600,7 +600,7 @@ backup(void *arg)
config = cfg;
full = false;
/* Restart a full incremental every once in a while. */
- incr_full = mmrand(NULL, 1, 8) == 1;
+ incr_full = mmrand(&g.extra_rnd, 1, 8) == 1;
}
} else {
config = NULL;
@@ -679,9 +679,9 @@ backup(void *arg)
if (full) {
incremental = 1;
if (g.backup_incr_flag == INCREMENTAL_LOG)
- incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(NULL, 1, 8);
+ incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(&g.extra_rnd, 1, 8);
else if (g.backup_incr_flag == INCREMENTAL_BLOCK)
- incremental = mmrand(NULL, 1, 8);
+ incremental = mmrand(&g.extra_rnd, 1, 8);
}
if (--incremental == 0) {
check_copy();
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index a811b3866b0..c53f88ab0ab 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -121,7 +121,7 @@ table_load(TABLE *base, TABLE *table)
if (table->type == ROW)
key_gen(table, &key, keyno);
if (base == NULL)
- val_gen(table, NULL, &value, &bitv, keyno);
+ val_gen(table, &g.data_rnd, &value, &bitv, keyno);
else {
testutil_check(read_op(base_cursor, NEXT, NULL));
testutil_check(base_cursor->get_value(base_cursor, &value));
@@ -168,6 +168,12 @@ table_load(TABLE *base, TABLE *table)
testutil_assertfmt(base == NULL && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK),
"WT_CURSOR.insert failed: %d", ret);
+ /*
+ * If this occurs with predictable replay, we may need to redo the bulk load with fewer
+ * keys in each batch. For now, we just don't handle it.
+ */
+ testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
+
if (g.transaction_timestamps_config) {
bulk_rollback_transaction(session);
bulk_begin_transaction(session);
diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c
index 7b63eaba648..f7afc24374b 100644
--- a/src/third_party/wiredtiger/test/format/checkpoint.c
+++ b/src/third_party/wiredtiger/test/format/checkpoint.c
@@ -79,7 +79,7 @@ checkpoint(void *arg)
wt_wrap_open_session(conn, &sap, NULL, &session);
named_checkpoints = !g.lsm_config;
- for (secs = mmrand(NULL, 1, 10); !g.workers_finished;) {
+ for (secs = mmrand(&g.extra_rnd, 1, 10); !g.workers_finished;) {
if (secs > 0) {
__wt_sleep(1, 0);
--secs;
@@ -96,7 +96,7 @@ checkpoint(void *arg)
ckpt_vrfy_name = "WiredTigerCheckpoint";
backup_locked = false;
if (named_checkpoints)
- switch (mmrand(NULL, 1, 20)) {
+ switch (mmrand(&g.extra_rnd, 1, 20)) {
case 1:
/*
* 5% create a named snapshot. Rotate between a few names to test multiple named
@@ -105,8 +105,8 @@ checkpoint(void *arg)
ret = lock_try_writelock(session, &g.backup_lock);
if (ret == 0) {
backup_locked = true;
- testutil_check(__wt_snprintf(
- config_buf, sizeof(config_buf), "name=mine.%" PRIu32, mmrand(NULL, 1, 4)));
+ testutil_check(__wt_snprintf(config_buf, sizeof(config_buf),
+ "name=mine.%" PRIu32, mmrand(&g.extra_rnd, 1, 4)));
ckpt_config = config_buf;
ckpt_vrfy_name = config_buf + strlen("name=");
} else if (ret != EBUSY)
@@ -143,7 +143,7 @@ checkpoint(void *arg)
/* Verify the checkpoints. */
wts_verify_checkpoint(conn, ckpt_vrfy_name);
- secs = mmrand(NULL, 5, 40);
+ secs = mmrand(&g.extra_rnd, 5, 40);
}
wt_wrap_open_session(conn, &sap, NULL, &session);
diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c
index feea20fa092..84a467d0734 100644
--- a/src/third_party/wiredtiger/test/format/compact.c
+++ b/src/third_party/wiredtiger/test/format/compact.c
@@ -54,7 +54,7 @@ compact(void *arg)
* Perform compaction at somewhere under 15 seconds (so we get at least one done), and then at
* 23 second intervals.
*/
- for (period = mmrand(NULL, 1, 15);; period = 23) {
+ for (period = mmrand(&g.extra_rnd, 1, 15);; period = 23) {
/* Sleep for short periods so we don't make the run wait. */
while (period > 0 && !g.workers_finished) {
--period;
@@ -70,7 +70,7 @@ compact(void *arg)
* Compact returns ETIMEDOUT if the compaction doesn't finish in some number of seconds. We
* don't configure a timeout and occasionally exceed the default of 1200 seconds.
*/
- table = table_select(NULL);
+ table = table_select(NULL, false);
ret = session->compact(session, table->uri, NULL);
testutil_assertfmt(ret == 0 || ret == EBUSY || ret == ETIMEDOUT || ret == WT_CACHE_FULL ||
ret == WT_ROLLBACK,
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 201df695c97..bfdc9650699 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -110,40 +110,43 @@ typedef struct {
#define V_TABLE_OPS_TRUNCATE 78
#define V_GLOBAL_OPS_VERIFY 79
#define V_GLOBAL_QUIET 80
-#define V_GLOBAL_RUNS_IN_MEMORY 81
-#define V_GLOBAL_RUNS_OPS 82
-#define V_TABLE_RUNS_MIRROR 83
-#define V_TABLE_RUNS_ROWS 84
-#define V_TABLE_RUNS_SOURCE 85
-#define V_GLOBAL_RUNS_TABLES 86
-#define V_GLOBAL_RUNS_THREADS 87
-#define V_GLOBAL_RUNS_TIMER 88
-#define V_TABLE_RUNS_TYPE 89
-#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 90
-#define V_GLOBAL_STATISTICS_MODE 91
-#define V_GLOBAL_STATISTICS_LOG_SOURCES 92
-#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 93
-#define V_GLOBAL_STRESS_CHECKPOINT 94
-#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 95
-#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 96
-#define V_GLOBAL_STRESS_EVICT_REPOSITION 97
-#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 98
-#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 99
-#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 100
-#define V_GLOBAL_STRESS_HS_SEARCH 101
-#define V_GLOBAL_STRESS_HS_SWEEP 102
-#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 103
-#define V_GLOBAL_STRESS_SPLIT_1 104
-#define V_GLOBAL_STRESS_SPLIT_2 105
-#define V_GLOBAL_STRESS_SPLIT_3 106
-#define V_GLOBAL_STRESS_SPLIT_4 107
-#define V_GLOBAL_STRESS_SPLIT_5 108
-#define V_GLOBAL_STRESS_SPLIT_6 109
-#define V_GLOBAL_STRESS_SPLIT_7 110
-#define V_GLOBAL_TRANSACTION_IMPLICIT 111
-#define V_GLOBAL_TRANSACTION_TIMESTAMPS 112
-#define V_GLOBAL_WIREDTIGER_CONFIG 113
-#define V_GLOBAL_WIREDTIGER_RWLOCK 114
-#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 115
+#define V_GLOBAL_RANDOM_DATA_SEED 81
+#define V_GLOBAL_RANDOM_EXTRA_SEED 82
+#define V_GLOBAL_RUNS_IN_MEMORY 83
+#define V_TABLE_RUNS_MIRROR 84
+#define V_GLOBAL_RUNS_OPS 85
+#define V_GLOBAL_RUNS_PREDICTABLE_REPLAY 86
+#define V_TABLE_RUNS_ROWS 87
+#define V_TABLE_RUNS_SOURCE 88
+#define V_GLOBAL_RUNS_TABLES 89
+#define V_GLOBAL_RUNS_THREADS 90
+#define V_GLOBAL_RUNS_TIMER 91
+#define V_TABLE_RUNS_TYPE 92
+#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 93
+#define V_GLOBAL_STATISTICS_MODE 94
+#define V_GLOBAL_STATISTICS_LOG_SOURCES 95
+#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 96
+#define V_GLOBAL_STRESS_CHECKPOINT 97
+#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 98
+#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 99
+#define V_GLOBAL_STRESS_EVICT_REPOSITION 100
+#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 101
+#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 102
+#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 103
+#define V_GLOBAL_STRESS_HS_SEARCH 104
+#define V_GLOBAL_STRESS_HS_SWEEP 105
+#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 106
+#define V_GLOBAL_STRESS_SPLIT_1 107
+#define V_GLOBAL_STRESS_SPLIT_2 108
+#define V_GLOBAL_STRESS_SPLIT_3 109
+#define V_GLOBAL_STRESS_SPLIT_4 110
+#define V_GLOBAL_STRESS_SPLIT_5 111
+#define V_GLOBAL_STRESS_SPLIT_6 112
+#define V_GLOBAL_STRESS_SPLIT_7 113
+#define V_GLOBAL_TRANSACTION_IMPLICIT 114
+#define V_GLOBAL_TRANSACTION_TIMESTAMPS 115
+#define V_GLOBAL_WIREDTIGER_CONFIG 116
+#define V_GLOBAL_WIREDTIGER_RWLOCK 117
+#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 118
-#define V_ELEMENT_COUNT 116
+#define V_ELEMENT_COUNT 119
diff --git a/src/third_party/wiredtiger/test/format/config.sh b/src/third_party/wiredtiger/test/format/config.sh
index 9f9a68003e8..21a9a6984f5 100755
--- a/src/third_party/wiredtiger/test/format/config.sh
+++ b/src/third_party/wiredtiger/test/format/config.sh
@@ -238,11 +238,17 @@ CONFIG configuration_list[] = {
{"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1}
+{"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX}
+
+{"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX}
+
{"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1}
+{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0}
+
{"runs.ops", "operations per run", 0x0, 0, M(2), M(100)}
-{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0}
+{"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0}
{"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100)}
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 3b43c304435..1a94278364d 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -84,6 +84,38 @@
#define STR(s) #s
#define XSTR(s) STR(s)
+#include "config.h"
+extern CONFIG configuration_list[];
+
+typedef struct {
+ uint32_t v; /* integral value */
+ char *vstr; /* string value */
+ bool set; /* value explicitly set */
+} CONFIGV;
+
+/*
+ * The LANE data structure is used with predictable replay. With predictable replay, we want to make
+ * sure that two threads can never act on the same key. The last bits of the timestamp to be used to
+ * determine a lane, so it takes a while (LANE_COUNT operations) to cycle through the lanes. A lane
+ * only acts on key numbers whose last bits match the lane. We also keep track of lanes via the
+ * g.lanes array. This guarantees that a lane is only being used one at a time, which in turn
+ * guarantees that a key can only be used once at a time.
+ *
+ * A more complete description of how this fits into predictable replay is in replay.c .
+ */
+typedef struct {
+ uint64_t last_commit_ts;
+ bool in_use;
+} LANE;
+#define LANE_NONE UINT32_MAX /* A lane number guaranteed to be illegal */
+#define LANE_COUNT 1024u
+
+/* Arguments to the read scanner. */
+typedef struct {
+ WT_CONNECTION *conn;
+ WT_RAND_STATE *rnd;
+} READ_SCAN_ARGS;
+
/*
* Abstract lock that lets us use either pthread reader-writer locks or WiredTiger's own (likely
* faster) implementation.
@@ -112,15 +144,6 @@ typedef struct {
*/
#define FIX_VALUE_WRONG 0xff
-#include "config.h"
-extern CONFIG configuration_list[];
-
-typedef struct {
- uint32_t v; /* integral value */
- char *vstr; /* string value */
- bool set; /* value explicitly set */
-} CONFIGV;
-
typedef enum { FIX, ROW, VAR } table_type;
typedef struct {
u_int id; /* table ID */
@@ -224,7 +247,8 @@ typedef struct {
#define INCREMENTAL_OFF 3
u_int backup_incr_flag; /* Incremental backup configuration */
- WT_RAND_STATE rnd; /* Global RNG state */
+ WT_RAND_STATE data_rnd; /* Global RNG state for data operations */
+ WT_RAND_STATE extra_rnd; /* Global RNG state for extra operations */
uint64_t timestamp; /* Counter for timestamps */
uint64_t oldest_timestamp; /* Last timestamp used for oldest */
@@ -232,6 +256,12 @@ typedef struct {
uint64_t truncate_cnt; /* truncation operation counter */
+ uint64_t replay_cached_committed; /* Our committed timestamp, cached */
+ uint32_t replay_calculate_committed; /* Times before recalculating cached committed */
+ uint64_t replay_start_timestamp; /* Timestamp at the beginning of a run */
+ uint64_t stop_timestamp; /* If non-zero, stop when stable reaches this */
+ uint64_t timestamp_copy; /* A copy of the timestamp, for safety checks */
+
/*
* Lock to prevent the stable timestamp from moving during the commit of prepared transactions.
* Otherwise, it may panic if the stable timestamp is moved to greater than or equal to the
@@ -271,9 +301,15 @@ typedef struct {
#define CHECKPOINT_ON 2
#define CHECKPOINT_WIREDTIGER 3
u_int checkpoint_config; /* Checkpoint configuration */
+
+ LANE lanes[LANE_COUNT]; /* The lanes for multithreaded coordination */
+ pthread_rwlock_t lane_lock; /* Lock used when modifying lanes */
} GLOBAL;
extern GLOBAL g;
+/* Timestamp to lane number */
+#define LANE_NUMBER(ts) (ts & (LANE_COUNT - 1))
+
/* Worker thread operations. */
typedef enum { INSERT = 1, MODIFY, READ, REMOVE, TRUNCATE, UPDATE } thread_op;
@@ -311,7 +347,12 @@ typedef struct {
SAP sap; /* Thread's session event handler information */
- WT_RAND_STATE rnd; /* thread RNG state */
+ WT_RAND_STATE data_rnd; /* thread RNG state for data operations */
+ WT_RAND_STATE extra_rnd; /* thread RNG state for extra operations */
+
+ uint32_t lane; /* Current lane for replay */
+ thread_op op; /* Operation */
+ bool replay_again; /* Need to redo an operation at a timestamp. */
volatile bool quit; /* thread should quit */
@@ -348,8 +389,9 @@ typedef struct {
bool repeatable_reads; /* if read ops repeatable */
bool repeatable_wrap; /* if circular buffer wrapped */
uint64_t opid; /* Operation ID */
- uint64_t read_ts; /* read timestamp */
uint64_t commit_ts; /* commit timestamp */
+ uint64_t read_ts; /* read timestamp */
+ uint64_t replay_ts; /* allocated timestamp for predictable replay */
uint64_t stable_ts; /* stable timestamp */
SNAP_STATE snap_states[2];
SNAP_STATE *s; /* points to one of the snap_states */
@@ -398,7 +440,7 @@ void key_gen_teardown(WT_ITEM *);
void key_init(TABLE *, void *);
void lock_destroy(WT_SESSION *, RWLOCK *);
void lock_init(WT_SESSION *, RWLOCK *);
-void operations(u_int, bool);
+void operations(u_int, u_int, u_int);
void path_setup(const char *);
void set_alarm(u_int);
void set_core(bool);
@@ -415,6 +457,19 @@ void table_verify(TABLE *, void *);
void timestamp_init(void);
uint64_t timestamp_maximum_committed(void);
void timestamp_once(WT_SESSION *, bool, bool);
+void replay_adjust_key(TINFO *, uint64_t);
+uint64_t replay_commit_ts(TINFO *);
+void replay_committed(TINFO *);
+void replay_end_timed_run(void);
+void replay_loop_begin(TINFO *, bool);
+uint64_t replay_maximum_committed(void);
+bool replay_operation_enabled(thread_op);
+void replay_pause_after_rollback(TINFO *, uint32_t);
+uint64_t replay_prepare_ts(TINFO *);
+uint64_t replay_read_ts(TINFO *);
+void replay_rollback(TINFO *);
+void replay_run_begin(WT_SESSION *);
+void replay_run_end(WT_SESSION *);
void timestamp_query(const char *, uint64_t *);
void timestamp_set_oldest(void);
void timestamp_teardown(WT_SESSION *);
diff --git a/src/third_party/wiredtiger/test/format/format_config.c b/src/third_party/wiredtiger/test/format/format_config.c
index e3ac21a724b..c28c4a3c976 100644
--- a/src/third_party/wiredtiger/test/format/format_config.c
+++ b/src/third_party/wiredtiger/test/format/format_config.c
@@ -53,6 +53,63 @@ static void config_off_all(const char *);
static void config_pct(TABLE *);
static void config_statistics(void);
static void config_transaction(void);
+static bool config_var(TABLE *);
+
+/*
+ * config_random_generator --
+ * For a given seed/RNG combination, generate a seed if not given, and initialize the RNG.
+ */
+static void
+config_random_generator(
+ const char *config_name, uint64_t seed, uint32_t rand_count, WT_RAND_STATE *rnd)
+{
+ char buf[128];
+ bool seed_set;
+
+ /* See if the seed is already present in the configuration. */
+ seed_set = (seed != 0);
+
+ /* Initialize the RNG, and potentially the seed. */
+ testutil_random_init(rnd, &seed, rand_count);
+
+ /* If we generated a seed just now, put it into the configuration file. */
+ if (!seed_set) {
+ testutil_assert(seed != 0);
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu64, config_name, seed));
+ config_single(NULL, buf, true);
+ }
+
+ /* Make sure the generator is ready. */
+ testutil_assert(rnd->v != 0);
+}
+
+/*
+ * config_random_generators --
+ * Initialize our global random generators using provided seeds.
+ */
+static void
+config_random_generators(void)
+{
+ config_random_generator("random.data_seed", GV(RANDOM_DATA_SEED), 0, &g.data_rnd);
+ config_random_generator("random.extra_seed", GV(RANDOM_EXTRA_SEED), 1, &g.extra_rnd);
+}
+
+/*
+ * config_random_generators_before_run --
+ * One use case for predictable replay is to run test/format once with little or no
+ * configuration values set. test/format rolls the dice and picks the configuration, recording
+ * it along with the random seeds. If we want to rerun it predictably, we can use the same
+ * seeds. However, the second run will not need to roll the dice during configuration, so the
+ * state of the RNG after configuration would be different than after configuration during the
+ * first run. To make everything line up, we re-seed the generator after the configuration, and
+ * before execution begins.
+ */
+static void
+config_random_generators_before_run(void)
+{
+ testutil_random_from_seed(&g.data_rnd, GV(RANDOM_DATA_SEED));
+ testutil_random_from_seed(&g.extra_rnd, GV(RANDOM_EXTRA_SEED));
+}
/*
* config_random --
@@ -85,7 +142,7 @@ config_random(TABLE *table, bool table_only)
continue;
/* Configure key prefixes only rarely, 5% if the length isn't set explicitly. */
- if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(NULL, 1, 100) > 5)
+ if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(&g.extra_rnd, 1, 100) > 5)
continue;
/*
@@ -93,11 +150,11 @@ config_random(TABLE *table, bool table_only)
* is "on" (so "on" if random rolled <= N, otherwise "off").
*/
if (F_ISSET(cp, C_BOOL))
- testutil_check(__wt_snprintf(
- buf, sizeof(buf), "%s=%s", cp->name, mmrand(NULL, 1, 100) <= cp->min ? "on" : "off"));
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%s", cp->name,
+ mmrand(&g.data_rnd, 1, 100) <= cp->min ? "on" : "off"));
else
- testutil_check(__wt_snprintf(
- buf, sizeof(buf), "%s=%" PRIu32, cp->name, mmrand(NULL, cp->min, cp->maxrand)));
+ testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu32, cp->name,
+ mmrand(&g.data_rnd, cp->min, cp->maxrand)));
config_single(table, buf, false);
}
}
@@ -141,12 +198,15 @@ config_table_am(TABLE *table)
if (config_explicit(table, "runs.source") && DATASOURCE(table, "lsm"))
config_single(table, "runs.type=row", false);
else
- switch (mmrand(NULL, 1, 10)) {
+ switch (mmrand(&g.data_rnd, 1, 10)) {
case 1:
case 2:
case 3: /* 30% */
- config_single(table, "runs.type=var", false);
- break;
+ if (config_var(table)) {
+ config_single(table, "runs.type=var", false);
+ break;
+ }
+ /* FALLTHROUGH */
case 4: /* 10% */
if (config_fix(table)) {
config_single(table, "runs.type=fix", false);
@@ -165,7 +225,7 @@ config_table_am(TABLE *table)
}
if (!config_explicit(table, "runs.source"))
- switch (mmrand(NULL, 1, 5)) {
+ switch (mmrand(&g.data_rnd, 1, 5)) {
case 1: /* 20% */
config_single(table, "runs.source=file", false);
break;
@@ -335,6 +395,31 @@ config_table(TABLE *table, void *arg)
if (TV(BTREE_VALUE_MIN) > TV(BTREE_VALUE_MAX))
testutil_die(EINVAL, "btree.value_min may not be larger than btree.value_max");
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ /*
+ * In predictable replay, force the number of rows in a table to be a manageable size so we
+ * can modify key numbers without problems.
+ */
+ TV(RUNS_ROWS) = WT_MAX(TV(RUNS_ROWS), 2 * LANE_COUNT);
+
+ /*
+ * We don't support some operations in predictable replay.
+ */
+ if (!replay_operation_enabled(MODIFY)) {
+ if (config_explicit(table, "ops.pct.modify") && TV(OPS_PCT_MODIFY))
+ WARN("turning off modify operations for table%" PRIu32
+ " to work with predictable replay",
+ table->id);
+ config_single(table, "ops.pct.modify=0", false);
+ }
+ if (!replay_operation_enabled(TRUNCATE)) {
+ if (config_explicit(table, "ops.truncate") && TV(OPS_TRUNCATE))
+ WARN("turning off truncate for table%" PRIu32 " to work with predictable replay",
+ table->id);
+ config_single(table, "ops.truncate=0", false);
+ }
+ }
+
/*
* If common key prefixes are configured, add prefix compression if no explicit choice was made
* and track the largest common key prefix in the run.
@@ -372,6 +457,8 @@ config_table(TABLE *table, void *arg)
void
config_run(void)
{
+ config_random_generators(); /* Configure the random number generators. */
+
config_random(tables[0], false); /* Configure the remaining global name space. */
/*
@@ -433,6 +520,8 @@ config_run(void)
else
config_single(NULL, "runs.timer=360", false);
}
+
+ config_random_generators_before_run();
}
/*
@@ -463,7 +552,7 @@ config_backup_incr(void)
* Choose a type of incremental backup, where the log remove setting can eliminate incremental
* backup based on log files.
*/
- switch (mmrand(NULL, 1, 10)) {
+ switch (mmrand(&g.extra_rnd, 1, 10)) {
case 1: /* 30% full backup only */
case 2:
case 3:
@@ -508,7 +597,7 @@ config_backup_incr_granularity(void)
* granularity is in units of KB.
*/
granularity = 0;
- i = mmrand(NULL, 1, 10);
+ i = mmrand(&g.extra_rnd, 1, 10);
switch (i) {
case 1: /* 50% small size for stress testing */
case 2:
@@ -669,7 +758,7 @@ config_checkpoint(void)
{
/* Choose a checkpoint mode if nothing was specified. */
if (!config_explicit(NULL, "checkpoint"))
- switch (mmrand(NULL, 1, 20)) {
+ switch (mmrand(&g.extra_rnd, 1, 20)) {
case 1:
case 2:
case 3:
@@ -694,7 +783,7 @@ config_checksum(TABLE *table)
{
/* Choose a checksum mode if nothing was specified. */
if (!config_explicit(table, "disk.checksum"))
- switch (mmrand(NULL, 1, 10)) {
+ switch (mmrand(&g.extra_rnd, 1, 10)) {
case 1:
case 2:
case 3:
@@ -746,7 +835,7 @@ config_compression(TABLE *table, const char *conf_name)
* correct if all of the possible engines are compiled in.
*/
cstr = "off";
- switch (mmrand(NULL, 1, 20)) {
+ switch (mmrand(&g.extra_rnd, 1, 20)) {
#ifdef HAVE_BUILTIN_EXTENSION_LZ4
case 1:
case 2:
@@ -858,7 +947,7 @@ config_encryption(void)
return;
/* 70% no encryption, 30% rotn */
- if (mmrand(NULL, 1, 10) < 8)
+ if (mmrand(&g.data_rnd, 1, 10) < 8)
config_off(NULL, "disk.encryption");
else
config_single(NULL, "disk.encryption=rotn-7", false);
@@ -871,8 +960,24 @@ config_encryption(void)
static bool
config_fix(TABLE *table)
{
- /* Fixed-length column stores don't support modify operations. */
- return (!config_explicit(table, "ops.pct.modify"));
+ /*
+ * Fixed-length column stores don't support modify operations, and can't be used with
+ * predictable replay.
+ */
+ return (!GV(RUNS_PREDICTABLE_REPLAY) && !config_explicit(table, "ops.pct.modify"));
+}
+
+/*
+ * config_var --
+ * Variable-length column-store configuration.
+ */
+static bool
+config_var(TABLE *table)
+{
+ /*
+ * Variable-length column store insertions can't be used with predictable replay.
+ */
+ return (!GV(RUNS_PREDICTABLE_REPLAY) || !config_explicit(table, "ops.pct.insert"));
}
/*
@@ -918,8 +1023,10 @@ config_in_memory(void)
return;
if (config_explicit(NULL, "runs.mirror"))
return;
+ if (config_explicit(NULL, "runs.predictable_replay"))
+ return;
- if (!config_explicit(NULL, "runs.in_memory") && mmrand(NULL, 1, 20) == 1) {
+ if (!config_explicit(NULL, "runs.in_memory") && mmrand(&g.extra_rnd, 1, 20) == 1) {
config_single(NULL, "runs.in_memory=1", false);
/* Use table[0] to access the global value (RUN_ROWS is a table value). */
if (NTV(tables[0], RUNS_ROWS) > WT_MILLION) {
@@ -1064,7 +1171,18 @@ config_mirrors(void)
* tables.
*/
explicit_mirror = config_explicit(NULL, "runs.mirror");
- if (!explicit_mirror && mmrand(NULL, 1, 10) < 9) {
+ if (!explicit_mirror && mmrand(&g.data_rnd, 1, 10) < 9) {
+ config_off_all("runs.mirror");
+ return;
+ }
+
+ /*
+ * In theory, mirroring should work with predictable replay, although there's some overlap in
+ * functionality. That is, we usually do multiple runs with the same key with predictable replay
+ * and would notice if data was different or missing. We disable it to keep runs simple.
+ */
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ WARN("%s", "turning off mirroring for predictable replay");
config_off_all("runs.mirror");
return;
}
@@ -1122,7 +1240,7 @@ config_mirrors(void)
* Pick some number of tables to mirror, then turn on mirroring the next (n-1) tables, where
* allowed.
*/
- for (mirrors = mmrand(NULL, 2, ntables) - 1, i = 1; i <= ntables; ++i) {
+ for (mirrors = mmrand(&g.data_rnd, 2, ntables) - 1, i = 1; i <= ntables; ++i) {
if (NT_EXPLICIT_OFF(tables[i], RUNS_MIRROR))
continue;
if (tables[i] != g.base_mirror) {
@@ -1155,25 +1273,32 @@ config_pct(TABLE *table)
const char *name; /* Operation */
uint32_t *vp; /* Value store */
u_int order; /* Order of assignment */
+ bool enabled; /* Enabled for this configuration */
} list[5];
u_int i, max_order, max_slot, n, pct;
bool slot_available;
+ /* We explicitly disable modify operations for predictable replay. */
list[0].name = "ops.pct.delete";
list[0].vp = &TV(OPS_PCT_DELETE);
list[0].order = 0;
+ list[0].enabled = replay_operation_enabled(REMOVE);
list[1].name = "ops.pct.insert";
list[1].vp = &TV(OPS_PCT_INSERT);
list[1].order = 0;
+ list[1].enabled = replay_operation_enabled(INSERT);
list[2].name = "ops.pct.modify";
list[2].vp = &TV(OPS_PCT_MODIFY);
list[2].order = 0;
+ list[2].enabled = replay_operation_enabled(MODIFY);
list[3].name = "ops.pct.read";
list[3].vp = &TV(OPS_PCT_READ);
list[3].order = 0;
+ list[3].enabled = replay_operation_enabled(READ);
list[4].name = "ops.pct.write";
list[4].vp = &TV(OPS_PCT_WRITE);
list[4].order = 0;
+ list[4].enabled = replay_operation_enabled(UPDATE);
/*
* Walk the list of operations, checking for an illegal configuration and creating a random
@@ -1182,11 +1307,13 @@ config_pct(TABLE *table)
pct = 0;
slot_available = false;
for (i = 0; i < WT_ELEMENTS(list); ++i)
- if (config_explicit(table, list[i].name))
- pct += *list[i].vp;
- else {
- list[i].order = mmrand(NULL, 1, WT_THOUSAND);
- slot_available = true;
+ if (list[i].enabled) {
+ if (config_explicit(table, list[i].name))
+ pct += *list[i].vp;
+ else {
+ list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND);
+ slot_available = true;
+ }
}
/*
@@ -1197,7 +1324,7 @@ config_pct(TABLE *table)
WARN("operation percentages %s than 100, resetting to random values",
pct > 100 ? "greater" : "less");
for (i = 0; i < WT_ELEMENTS(list); ++i)
- list[i].order = mmrand(NULL, 1, WT_THOUSAND);
+ list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND);
pct = 0;
}
@@ -1210,9 +1337,9 @@ config_pct(TABLE *table)
*/
for (pct = 100 - pct;;) {
for (i = n = max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) {
- if (list[i].order != 0)
+ if (list[i].order != 0 && list[i].enabled)
++n;
- if (list[i].order > max_order) {
+ if (list[i].order > max_order && list[i].enabled) {
max_order = list[i].order;
max_slot = i;
}
@@ -1223,7 +1350,7 @@ config_pct(TABLE *table)
*list[max_slot].vp = pct;
break;
}
- *list[max_slot].vp = mmrand(NULL, 0, pct);
+ *list[max_slot].vp = mmrand(&g.data_rnd, 0, pct);
list[max_slot].order = 0;
pct -= *list[max_slot].vp;
}
@@ -1246,7 +1373,7 @@ config_statistics(void)
if (!config_explicit(NULL, "statistics.mode")) {
/* 70% of the time set statistics to fast. */
- if (mmrand(NULL, 1, 10) < 8)
+ if (mmrand(&g.extra_rnd, 1, 10) < 8)
config_single(NULL, "statistics.mode=fast", false);
else
config_single(NULL, "statistics.mode=all", false);
@@ -1254,7 +1381,7 @@ config_statistics(void)
if (!config_explicit(NULL, "statistics_log.sources")) {
/* 10% of the time use sources if all. */
- if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(NULL, 1, 10) == 1)
+ if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(&g.extra_rnd, 1, 10) == 1)
config_single(NULL, "statistics_log.sources=file:", false);
}
}
@@ -1266,6 +1393,12 @@ config_statistics(void)
static void
config_transaction(void)
{
+ /* Predictable replay requires timestamps. */
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ config_single(NULL, "transaction.implicit=0", false);
+ config_single(NULL, "transaction.timestamps=on", true);
+ }
+
/* Transaction prepare requires timestamps and is incompatible with logging. */
if (GV(OPS_PREPARE) && config_explicit(NULL, "ops.prepare")) {
if (!GV(TRANSACTION_TIMESTAMPS) && config_explicit(NULL, "transaction.timestamps"))
@@ -1664,6 +1797,7 @@ config_table_extend(u_int ntable)
void
config_single(TABLE *table, const char *s, bool explicit)
{
+ WT_RAND_STATE *rnd;
enum { RANGE_FIXED, RANGE_NONE, RANGE_WEIGHTED } range;
CONFIG *cp;
CONFIGV *v;
@@ -1726,6 +1860,11 @@ config_single(TABLE *table, const char *s, bool explicit)
++equalp;
v = &table->v[cp->off];
+ /*
+ * Use the data RNG for these options, that's conservative.
+ */
+ rnd = &g.data_rnd;
+
if (F_ISSET(cp, C_STRING)) {
/*
* Historically, both "none" and "off" were used for turning off string configurations, now
@@ -1822,7 +1961,7 @@ config_single(TABLE *table, const char *s, bool explicit)
testutil_die(EINVAL, "%s: %s: illegal numeric range", progname, s);
if (range == RANGE_FIXED)
- v1 = mmrand(NULL, (u_int)v1, (u_int)v2);
+ v1 = mmrand(rnd, (u_int)v1, (u_int)v2);
else {
/*
* Roll dice, 50% chance of proceeding to the next larger value, and 5 steps to the
@@ -1832,7 +1971,7 @@ config_single(TABLE *table, const char *s, bool explicit)
if (steps == 0)
steps = 1;
for (i = 0; i < 5; ++i, v1 += steps)
- if (mmrand(NULL, 0, 1) == 0)
+ if (mmrand(rnd, 0, 1) == 0)
break;
v1 = WT_MIN(v1, v2);
}
@@ -1897,7 +2036,7 @@ config_map_file_type(const char *s, u_int *vp)
*
* Variable-length column-store is 90% vs. fixed, 30% vs. fixed and row, and 40% vs row.
*/
- v = mmrand(NULL, 1, 10);
+ v = mmrand(&g.data_rnd, 1, 10);
if (fix && v == 1)
*vp = FIX;
else if (var && (v < 5 || !row))
diff --git a/src/third_party/wiredtiger/test/format/format_config_def.c b/src/third_party/wiredtiger/test/format/format_config_def.c
index de704a1ac71..399b323d0c0 100644
--- a/src/third_party/wiredtiger/test/format/format_config_def.c
+++ b/src/third_party/wiredtiger/test/format/format_config_def.c
@@ -96,7 +96,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
{"checkpoint.wait", "seconds to wait if wiredtiger checkpoints configured", 0x0, 5, 100, 3600,
V_GLOBAL_CHECKPOINT_WAIT},
- {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 128, 1024,
+ {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 10, 1024,
V_GLOBAL_DEBUG_CHECKPOINT_RETENTION},
{"debug.cursor_reposition",
@@ -109,7 +109,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
C_BOOL, 2, 0, 0, V_GLOBAL_DEBUG_EVICTION},
{"debug.log_retention", "adjust log removal to retain at least this number of log files", 0x0, 0,
- 128, 1024, V_GLOBAL_DEBUG_LOG_RETENTION},
+ 10, 1024, V_GLOBAL_DEBUG_LOG_RETENTION},
{"debug.realloc_exact", "reallocation of memory will only provide the exact amount requested",
C_BOOL, 0, 0, 0, V_GLOBAL_DEBUG_REALLOC_EXACT},
@@ -236,11 +236,20 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
{"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_QUIET},
+ {"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX,
+ V_GLOBAL_RANDOM_DATA_SEED},
+
+ {"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX,
+ V_GLOBAL_RANDOM_EXTRA_SEED},
+
{"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_RUNS_IN_MEMORY},
+ {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR},
+
{"runs.ops", "operations per run", 0x0, 0, M(2), M(100), V_GLOBAL_RUNS_OPS},
- {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR},
+ {"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0,
+ V_GLOBAL_RUNS_PREDICTABLE_REPLAY},
{"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100), V_TABLE_RUNS_ROWS},
diff --git a/src/third_party/wiredtiger/test/format/format_inline.h b/src/third_party/wiredtiger/test/format/format_inline.h
index 07f33f5319c..0d6d42befa9 100644
--- a/src/third_party/wiredtiger/test/format/format_inline.h
+++ b/src/third_party/wiredtiger/test/format/format_inline.h
@@ -112,10 +112,7 @@ read_op(WT_CURSOR *cursor, read_operation op, int *exactp)
static inline uint32_t
rng(WT_RAND_STATE *rnd)
{
- /* Threaded operations have their own RNG information, otherwise we use the default. */
- if (rnd == NULL)
- rnd = &g.rnd;
-
+ testutil_assert(rnd != NULL);
return (__wt_random(rnd));
}
@@ -228,12 +225,25 @@ table_sumv(u_int off)
* Randomly select a table.
*/
static inline TABLE *
-table_select(TINFO *tinfo)
+table_select(TINFO *tinfo, bool modifies_data)
{
+ WT_RAND_STATE *rnd;
+
if (ntables == 0)
return (tables[0]);
- return (tables[mmrand(tinfo == NULL ? NULL : &tinfo->rnd, 1, ntables)]);
+ if (tinfo == NULL) {
+ if (modifies_data)
+ rnd = &g.data_rnd;
+ else
+ rnd = &g.extra_rnd;
+ } else {
+ if (modifies_data)
+ rnd = &tinfo->data_rnd;
+ else
+ rnd = &tinfo->extra_rnd;
+ }
+ return (tables[mmrand(rnd, 1, ntables)]);
}
/*
@@ -241,14 +251,20 @@ table_select(TINFO *tinfo)
* Randomly select a table of a specific type.
*/
static inline TABLE *
-table_select_type(table_type type)
+table_select_type(table_type type, bool modifies_data)
{
+ WT_RAND_STATE *rnd;
u_int i;
if (ntables == 0)
return (tables[0]->type == type ? tables[0] : NULL);
- for (i = mmrand(NULL, 1, ntables);; ++i) {
+ if (modifies_data)
+ rnd = &g.data_rnd;
+ else
+ rnd = &g.extra_rnd;
+
+ for (i = mmrand(rnd, 1, ntables);; ++i) {
if (i > ntables)
i = 1;
if (tables[i]->type == type)
diff --git a/src/third_party/wiredtiger/test/format/format_salvage.c b/src/third_party/wiredtiger/test/format/format_salvage.c
index 8c23fae5cf9..787ecdade1d 100644
--- a/src/third_party/wiredtiger/test/format/format_salvage.c
+++ b/src/third_party/wiredtiger/test/format/format_salvage.c
@@ -84,7 +84,7 @@ corrupt(TABLE *table)
* exceeding a megabyte (so we aren't just corrupting the whole file).
*/
testutil_check(fstat(fd, &sb));
- offset = mmrand(NULL, 0, (u_int)sb.st_size - 1024);
+ offset = mmrand(&g.data_rnd, 0, (u_int)sb.st_size - 1024);
len = (size_t)(sb.st_size * 2) / 100;
len += 4 * 1024;
len = WT_MIN(len, WT_MEGABYTE);
diff --git a/src/third_party/wiredtiger/test/format/format_timestamp.c b/src/third_party/wiredtiger/test/format/format_timestamp.c
index db243be7335..aacc2c76c91 100644
--- a/src/third_party/wiredtiger/test/format/format_timestamp.c
+++ b/src/third_party/wiredtiger/test/format/format_timestamp.c
@@ -38,6 +38,9 @@ timestamp_maximum_committed(void)
TINFO **tlp;
uint64_t commit_ts, ts;
+ if (GV(RUNS_PREDICTABLE_REPLAY))
+ return replay_maximum_committed();
+
/* A barrier additionally prevents using cache values here. */
WT_ORDERED_READ(ts, g.timestamp);
if (tinfo_list != NULL)
@@ -96,7 +99,7 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final)
static const char *oldest_timestamp_str = "oldest_timestamp=";
static const char *stable_timestamp_str = "stable_timestamp=";
WT_CONNECTION *conn;
- uint64_t oldest_timestamp, stable_timestamp;
+ uint64_t oldest_timestamp, stable_timestamp, stop_timestamp;
char buf[WT_TS_HEX_STRING_SIZE * 2 + 64];
conn = g.wts_conn;
@@ -106,7 +109,24 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final)
if (oldest_timestamp == 0)
return;
- if (!final) {
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ /*
+ * For predictable replay, we need the oldest timestamp to lag when the process exits. That
+ * allows two runs that finish with stable timestamps in the same ballpark to be compared.
+ */
+ if (stable_timestamp > 10 * WT_THOUSAND)
+ oldest_timestamp = stable_timestamp - 10 * WT_THOUSAND;
+ else
+ oldest_timestamp = stable_timestamp / 2;
+
+ /*
+ * For predictable replay, our end state is to have the stable timestamp represent a precise
+ * number of operations.
+ */
+ WT_ORDERED_READ(stop_timestamp, g.stop_timestamp);
+ if (stable_timestamp > stop_timestamp && stop_timestamp != 0)
+ stable_timestamp = stop_timestamp;
+ } else if (!final) {
/*
* If lag is permitted, update the oldest timestamp halfway to the largest timestamp that's
* no longer in use, otherwise update the oldest timestamp to that timestamp. Update stable
@@ -152,11 +172,22 @@ timestamp(void *arg)
memset(&sap, 0, sizeof(sap));
wt_wrap_open_session(conn, &sap, NULL, &session);
- /* Update the oldest and stable timestamps at least once every 15 seconds. */
+ /*
+ * Update the oldest and stable timestamps at least once every 15 seconds. For predictable
+ * replay, update at a much faster pace. We can't afford to get behind because that means more
+ * rollback errors, and we don't have the luxury of giving up on an operation that has rolled
+ * back.
+ */
while (!g.workers_finished) {
- random_sleep(&g.rnd, 15);
-
- timestamp_once(session, true, false);
+ if (!GV(RUNS_PREDICTABLE_REPLAY))
+ random_sleep(&g.extra_rnd, 15);
+ else {
+ if ((rng(&g.extra_rnd) & 0x1) == 1)
+ __wt_yield();
+ else
+ __wt_sleep(0, 10 * WT_THOUSAND);
+ }
+ timestamp_once(session, !GV(RUNS_PREDICTABLE_REPLAY), false);
}
wt_wrap_close_session(session);
diff --git a/src/third_party/wiredtiger/test/format/hs.c b/src/third_party/wiredtiger/test/format/hs.c
index fbad2aeefa6..808ca204aea 100644
--- a/src/third_party/wiredtiger/test/format/hs.c
+++ b/src/third_party/wiredtiger/test/format/hs.c
@@ -78,8 +78,8 @@ hs_cursor(void *arg)
* cursor, so we should be able to traverse large chunks of the HS store quickly, without
* blocking normal operations.
*/
- next = mmrand(NULL, 0, 1) == 1;
- for (i = mmrand(NULL, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) {
+ next = mmrand(&g.extra_rnd, 0, 1) == 1;
+ for (i = mmrand(&g.extra_rnd, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) {
if ((ret = (next ? cursor->next(cursor) : cursor->prev(cursor))) != 0) {
testutil_assertfmt(ret == WT_NOTFOUND || ret == WT_CACHE_FULL || ret == WT_ROLLBACK,
"WT_CURSOR.%s failed: %d", next ? "next" : "prev", ret);
@@ -94,7 +94,7 @@ hs_cursor(void *arg)
testutil_check(cursor->close(cursor));
/* Sleep for some number of seconds, in short intervals so we don't make the run wait. */
- for (period = mmrand(NULL, 1, 10); period > 0 && !g.workers_finished; --period)
+ for (period = mmrand(&g.extra_rnd, 1, 10); period > 0 && !g.workers_finished; --period)
__wt_sleep(1, 0);
if (g.workers_finished)
break;
diff --git a/src/third_party/wiredtiger/test/format/import.c b/src/third_party/wiredtiger/test/format/import.c
index 77c42435672..27fc14acee2 100644
--- a/src/third_party/wiredtiger/test/format/import.c
+++ b/src/third_party/wiredtiger/test/format/import.c
@@ -103,7 +103,7 @@ import(void *arg)
copy_file_into_directory(import_session, "import.wt");
/* Perform import with either repair or file metadata. */
- import_value = mmrand(NULL, 0, 1);
+ import_value = mmrand(&g.extra_rnd, 0, 1);
if (import_value == 0)
testutil_check(__wt_snprintf(buf, sizeof(buf), "import=(enabled,repair=true)"));
else
@@ -116,7 +116,7 @@ import(void *arg)
/* Drop import table, so we can import the table again */
testutil_drop(session, IMPORT_URI, NULL);
- period = mmrand(NULL, 1, 10);
+ period = mmrand(&g.extra_rnd, 1, 10);
while (period > 0 && !g.workers_finished) {
--period;
__wt_sleep(1, 0);
diff --git a/src/third_party/wiredtiger/test/format/kv.c b/src/third_party/wiredtiger/test/format/kv.c
index bf9bd7a689f..1b1ace2ffd4 100644
--- a/src/third_party/wiredtiger/test/format/kv.c
+++ b/src/third_party/wiredtiger/test/format/kv.c
@@ -47,7 +47,7 @@ key_init_random(TABLE *table)
max = TV(BTREE_KEY_MAX);
if (i % 20 != 0 && max > TV(BTREE_KEY_MIN) + 20)
max = TV(BTREE_KEY_MIN) + 20;
- table->key_rand_len[i] = mmrand(NULL, TV(BTREE_KEY_MIN), max);
+ table->key_rand_len[i] = mmrand(&g.data_rnd, TV(BTREE_KEY_MIN), max);
}
}
@@ -241,6 +241,7 @@ val_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
void
val_init(TABLE *table, void *arg)
{
+ WT_RAND_STATE *rnd;
size_t i;
uint32_t len;
@@ -263,8 +264,9 @@ val_init(TABLE *table, void *arg)
for (i = 0; i < len; ++i)
table->val_base[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26];
+ rnd = &g.data_rnd;
table->val_dup_data_len =
- val_len(NULL, (uint64_t)mmrand(NULL, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX));
+ val_len(rnd, (uint64_t)mmrand(rnd, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX));
}
/*
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 1b59ec9e80c..da1431e0bcb 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -28,7 +28,7 @@
#include "format.h"
-static void apply_bounds(WT_CURSOR *, TABLE *);
+static void apply_bounds(WT_CURSOR *, TABLE *, WT_RAND_STATE *);
static void clear_bounds(WT_CURSOR *, TABLE *);
static int col_insert(TINFO *);
static void col_insert_resolve(TABLE *, void *);
@@ -40,6 +40,7 @@ static int col_update(TINFO *, bool);
static int nextprev(TINFO *, bool);
static WT_THREAD_RET ops(void *);
static int read_row(TINFO *);
+static void rollback_transaction(TINFO *);
static int row_insert(TINFO *, bool);
static int row_modify(TINFO *, bool);
static int row_remove(TINFO *, bool);
@@ -72,16 +73,16 @@ modify_build(TINFO *tinfo)
int i, nentries;
/* Randomly select a number of byte changes, offsets and lengths. */
- nentries = (int)mmrand(&tinfo->rnd, 1, MAX_MODIFY_ENTRIES);
+ nentries = (int)mmrand(&tinfo->data_rnd, 1, MAX_MODIFY_ENTRIES);
for (i = 0; i < nentries; ++i) {
tinfo->entries[i].data.data =
- modify_repl + mmrand(&tinfo->rnd, 1, sizeof(modify_repl) - 10);
- tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->rnd, 0, 10);
+ modify_repl + mmrand(&tinfo->data_rnd, 1, sizeof(modify_repl) - 10);
+ tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->data_rnd, 0, 10);
/*
* Start at least 11 bytes into the buffer so we skip leading key information.
*/
- tinfo->entries[i].offset = (size_t)mmrand(&tinfo->rnd, 20, 40);
- tinfo->entries[i].size = (size_t)mmrand(&tinfo->rnd, 0, 10);
+ tinfo->entries[i].offset = (size_t)mmrand(&tinfo->data_rnd, 20, 40);
+ tinfo->entries[i].size = (size_t)mmrand(&tinfo->data_rnd, 0, 10);
}
tinfo->nentries = nentries;
@@ -169,6 +170,25 @@ tinfo_init(void)
tinfo->state = TINFO_RUNNING;
tinfo->quit = false;
+
+ testutil_random_from_random(&tinfo->data_rnd, &g.data_rnd);
+ testutil_random_from_random(&tinfo->extra_rnd, &g.extra_rnd);
+ }
+}
+
+/*
+ * lanes_init --
+ * Initialize the lanes structures.
+ */
+static void
+lanes_init(void)
+{
+ uint32_t lane;
+
+ /* Cleanup for each new run. */
+ for (lane = 0; lane < LANE_COUNT; ++lane) {
+ g.lanes[lane].in_use = false;
+ g.lanes[lane].last_commit_ts = 0;
}
}
@@ -226,6 +246,14 @@ rollback_to_stable(WT_SESSION *session)
/* Check the saved snap operations for consistency. */
snap_repeat_rollback(session, tinfo_list, GV(RUNS_THREADS));
+
+ /*
+ * For a predictable run, the final stable timestamp is known and fixed, but individual threads
+ * may have gone beyond that. Now that we've rolled back, set the current timestamp to the
+ * stable so that next run starts from a known value.
+ */
+ if (GV(RUNS_PREDICTABLE_REPLAY))
+ g.timestamp = g.stable_timestamp;
}
/*
@@ -233,7 +261,7 @@ rollback_to_stable(WT_SESSION *session)
* Perform a number of operations in a set of threads.
*/
void
-operations(u_int ops_seconds, bool lastrun)
+operations(u_int ops_seconds, u_int run_current, u_int run_total)
{
SAP sap;
TINFO *tinfo, total;
@@ -243,9 +271,10 @@ operations(u_int ops_seconds, bool lastrun)
wt_thread_t timestamp_tid;
int64_t fourths, quit_fourths, thread_ops;
uint32_t i;
- bool running;
+ bool lastrun, running;
conn = g.wts_conn;
+ lastrun = (run_current == run_total);
/* Make the modify pad character printable to simplify debugging and logging. */
__wt_process.modify_pad_byte = FORMAT_PAD_BYTE;
@@ -266,17 +295,30 @@ operations(u_int ops_seconds, bool lastrun)
* There are two mechanisms to specify the length of the run, a number of operations and a
* timer, when either expire the run terminates.
*
- * Each thread does an equal share of the total operations (and make sure that it's not 0).
+ * If we have a number of operations with predictable replay, we set a stop timestamp. Without
+ * predictable replay, each thread does an equal share of the total operations (and make sure
+ * that it's not 0).
*
- * Calculate how many fourth-of-a-second sleeps until the timer expires. If the timer expires
- * and threads don't return in 15 minutes, assume there is something hung, and force the quit.
+ * With a timer, calculate how many fourth-of-a-second sleeps until the timer expires. If the
+ * timer expires and threads don't return in 15 minutes, assume there is something hung, and
+ * force the quit.
*/
+ g.stop_timestamp = 0;
if (GV(RUNS_OPS) == 0)
thread_ops = -1;
else {
if (GV(RUNS_OPS) < GV(RUNS_THREADS))
GV(RUNS_OPS) = GV(RUNS_THREADS);
- thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS);
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ /*
+ * If running with an operation count for predictable replay, ignore other ways of
+ * stopping.
+ */
+ thread_ops = -1;
+ ops_seconds = 0;
+ g.stop_timestamp = (GV(RUNS_OPS) * run_current) / run_total;
+ } else
+ thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS);
}
if (ops_seconds == 0)
fourths = quit_fourths = -1;
@@ -290,9 +332,12 @@ operations(u_int ops_seconds, bool lastrun)
wt_wrap_open_session(conn, &sap, NULL, &session);
/* Initialize and start the worker threads. */
+ lanes_init();
tinfo_init();
trace_msg(session, "%s", "=============== thread ops start");
+ replay_run_begin(session);
+
for (i = 0; i < GV(RUNS_THREADS); ++i) {
tinfo = tinfo_list[i];
testutil_check(__wt_thread_create(NULL, &tinfo->tid, ops, tinfo));
@@ -356,13 +401,24 @@ operations(u_int ops_seconds, bool lastrun)
*/
if (lastrun && GV(FORMAT_ABORT))
random_failure();
- tinfo->quit = true;
+
+ /*
+ * Predictable replay cannot independently tag every thread to stop, we would end up
+ * with a mix of commits at the end of the run. Rather, later in this loop, when we
+ * see we are finishing, we give all threads stop timestamp that they must run to,
+ * but not exceed.
+ */
+ if (!GV(RUNS_PREDICTABLE_REPLAY))
+ tinfo->quit = true;
}
}
track_ops(&total);
if (!running)
break;
__wt_sleep(0, 250 * WT_THOUSAND); /* 1/4th of a second */
+
+ if (fourths == 1 && GV(RUNS_PREDICTABLE_REPLAY))
+ replay_end_timed_run();
if (fourths != -1)
--fourths;
if (quit_fourths != -1 && --quit_fourths == 0) {
@@ -417,6 +473,8 @@ operations(u_int ops_seconds, bool lastrun)
*/
rollback_to_stable(session);
+ replay_run_end(session);
+
if (lastrun) {
tinfo_teardown();
timestamp_teardown(session);
@@ -438,14 +496,18 @@ begin_transaction_ts(TINFO *tinfo)
session = tinfo->session;
- /*
- * Transaction timestamp reads are repeatable, but read timestamps must be before any possible
- * commit timestamp. Without a read timestamp, reads are based on the transaction snapshot,
- * which will include the latest values as of when the snapshot is taken. Test in both modes:
- * 75% of the time, pick a read timestamp before any commit timestamp still in use, 25% of the
- * time don't set a timestamp at all.
- */
- ts = mmrand(&tinfo->rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed();
+ /* Pick a read timestamp. */
+ if (GV(RUNS_PREDICTABLE_REPLAY))
+ ts = replay_read_ts(tinfo);
+ else
+ /*
+ * Transaction timestamp reads are repeatable, but read timestamps must be before any
+ * possible commit timestamp. Without a read timestamp, reads are based on the transaction
+ * snapshot, which will include the latest values as of when the snapshot is taken. Test in
+ * both modes: 75% of the time, pick a read timestamp before any commit timestamp still in
+ * use, 25% of the time don't set a timestamp at all.
+ */
+ ts = mmrand(&tinfo->data_rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed();
if (ts != 0) {
wt_wrap_begin_transaction(session, NULL);
@@ -459,6 +521,13 @@ begin_transaction_ts(TINFO *tinfo)
trace_uri_op(tinfo, NULL, "begin snapshot read-ts=%" PRIu64 " (repeatable)", ts);
return;
}
+
+ /*
+ * It should not be possible for a timestamp to age out of the system with predictable
+ * replay. If a begin transaction were to fail, we'd need to begin the transaction again
+ * with the same replay timestamp; we can never give up on a timestamp.
+ */
+ testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
testutil_assert(ret == EINVAL);
testutil_check(session->rollback_transaction(session, NULL));
}
@@ -505,7 +574,10 @@ commit_transaction(TINFO *tinfo, bool prepared)
if (prepared)
lock_readlock(session, &g.prepare_commit_lock);
- ts = __wt_atomic_addv64(&g.timestamp, 1);
+ if (GV(RUNS_PREDICTABLE_REPLAY))
+ ts = replay_commit_ts(tinfo);
+ else
+ ts = __wt_atomic_addv64(&g.timestamp, 1);
testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_COMMIT, ts));
if (prepared)
@@ -515,6 +587,7 @@ commit_transaction(TINFO *tinfo, bool prepared)
testutil_check(session->commit_transaction(session, NULL));
if (prepared)
lock_readunlock(session, &g.prepare_commit_lock);
+ replay_committed(tinfo);
} else
testutil_check(session->commit_transaction(session, NULL));
@@ -542,6 +615,7 @@ rollback_transaction(TINFO *tinfo)
++tinfo->rollback;
testutil_check(session->rollback_transaction(session, NULL));
+ replay_rollback(tinfo);
trace_uri_op(tinfo, NULL, "abort read-ts=%" PRIu64, tinfo->read_ts);
}
@@ -561,12 +635,15 @@ prepare_transaction(TINFO *tinfo)
++tinfo->prepare;
- /*
- * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
- * prepare timestamp to whatever the global value is now. The subsequent commit will increment
- * it, ensuring correctness.
- */
- ts = __wt_atomic_fetch_addv64(&g.timestamp, 1);
+ if (GV(RUNS_PREDICTABLE_REPLAY))
+ ts = replay_prepare_ts(tinfo);
+ else
+ /*
+ * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
+ * prepare timestamp to whatever the global value is now. The subsequent commit will
+ * increment it, ensuring correctness.
+ */
+ ts = __wt_atomic_fetch_addv64(&g.timestamp, 1);
testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_PREPARE, ts));
ret = session->prepare_transaction(session, NULL);
@@ -623,6 +700,20 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
tinfo->cursor = table_cursor(tinfo, table->id);
/*
+ * Predictable replay has some restrictions. Someday we may be able to resolve some of these
+ * restrictions, this may require adding complexity.
+ *
+ * We disallow inserts into column stores, as column stores do inserts by expanding the number
+ * of keys in the table. This has an interplay with other threads that are trying to predictably
+ * generate key numbers since the key space is growing at a random time. Thus column stores are
+ * restricted to accessing keys that were inserted via bulk load.
+ */
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ if (table->type != ROW && op == INSERT)
+ op = READ;
+ }
+
+ /*
* Truncate has the key set to before/after rows in the table, skip pre-fetch and reserve for
* simplicity.
*
@@ -641,7 +732,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
* position taken from a previous search. If not already doing a read, position the cursor
* at an existing point in the tree 20% of the time.
*/
- if (op != READ && mmrand(&tinfo->rnd, 1, 5) == 1) {
+ if (op != READ && mmrand(&tinfo->data_rnd, 1, 5) == 1) {
++tinfo->search;
ret = read_row(tinfo);
if (ret == 0) {
@@ -657,7 +748,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
* work, but doesn't make sense. Reserving a row before a read won't be useful but it's not
* unexpected.
*/
- if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->rnd, 0, 20) == 1) {
+ if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->data_rnd, 0, 20) == 1) {
switch (table->type) {
case ROW:
ret = row_reserve(tinfo, positioned);
@@ -718,14 +809,14 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
case READ:
++tinfo->search;
- if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 2) == 1) {
+ if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(&tinfo->extra_rnd, 1, 2) == 1) {
bound_set = true;
/*
* FIXME-WT-9883: It is possible that the underlying cursor is still positioned even
* though the positioned variable is false. Reset the position through reset for now.
*/
testutil_check(tinfo->cursor->reset(tinfo->cursor));
- apply_bounds(tinfo->cursor, tinfo->table);
+ apply_bounds(tinfo->cursor, tinfo->table, &tinfo->extra_rnd);
}
ret = read_row(tinfo);
@@ -803,8 +894,8 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
* direction.
*/
if (positioned) {
- next = mmrand(&tinfo->rnd, 0, 1) == 1;
- j = mmrand(&tinfo->rnd, 1, 100);
+ next = mmrand(&tinfo->extra_rnd, 0, 1) == 1;
+ j = mmrand(&tinfo->extra_rnd, 1, 100);
for (i = 0; i < j; ++i) {
if ((ret = nextprev(tinfo, next)) == 0)
continue;
@@ -818,7 +909,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
* Reset the cursor: there is no reason to keep pages pinned, periodically forcibly evict the
* underlying page.
*/
- evict_page = mmrand(&tinfo->rnd, 1, 20) == 1;
+ evict_page = mmrand(&tinfo->extra_rnd, 1, 20) == 1;
if (evict_page)
F_SET(tinfo->cursor, WT_CURSTD_DEBUG_RESET_EVICT);
testutil_check(tinfo->cursor->reset(tinfo->cursor));
@@ -865,7 +956,7 @@ ops(void *arg)
iso_level_t iso_level;
thread_op op;
uint64_t reset_op, session_op, truncate_op;
- uint32_t max_rows, range, rnd;
+ uint32_t max_rows, ntries, range, rnd;
u_int i;
const char *iso_config;
bool greater_than, intxn, prepared;
@@ -878,25 +969,59 @@ ops(void *arg)
* pound on the same key/value pairs, that is, by making them traverse the same RNG space. 75%
* of the time we run in independent RNG space.
*/
- if (GV(FORMAT_INDEPENDENT_THREAD_RNG))
- __wt_random_init_seed(NULL, &tinfo->rnd);
- else
- __wt_random_init(&tinfo->rnd);
+ if (GV(FORMAT_INDEPENDENT_THREAD_RNG)) {
+ testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED) + (u_int)tinfo->id);
+ testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED) + (u_int)tinfo->id);
+ } else {
+ testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED));
+ testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED));
+ }
iso_level = ISOLATION_SNAPSHOT; /* -Wconditional-uninitialized */
+ tinfo->replay_again = false;
+ tinfo->lane = LANE_NONE;
/* Set the first operation where we'll create a new session and cursors. */
session = NULL;
session_op = 0;
+ ntries = 0;
/* Set the first operation where we'll reset the session. */
- reset_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND);
+ reset_op = mmrand(&tinfo->extra_rnd, 100, 10 * WT_THOUSAND);
/* Set the first operation where we'll truncate a range. */
- truncate_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND);
+ truncate_op = mmrand(&tinfo->data_rnd, 100, 10 * WT_THOUSAND);
for (intxn = false; !tinfo->quit;) {
+rollback_retry:
+ if (tinfo->quit)
+ break;
+
++tinfo->ops;
+ if (!tinfo->replay_again)
+ /*
+ * Number of failures so far for the current operation and key. In predictable replay,
+ * unless we have a read operation, we cannot give up on any operation and maintain the
+ * integrity of the replay.
+ */
+ ntries = 0;
+
+ /* Number of tries only gets incremented during predictable replay. */
+ testutil_assert(ntries == 0 || (!intxn && tinfo->replay_again));
+
+ /*
+ * In predictable replay, put each operation in its own transaction. It's possible we could
+ * make multiple operations work predictably in the future.
+ */
+ if (intxn && GV(RUNS_PREDICTABLE_REPLAY)) {
+ commit_transaction(tinfo, false);
+ intxn = false;
+ }
+
+ replay_loop_begin(tinfo, intxn);
+ if (tinfo->quit)
+ break;
+
/* Periodically open up a new session and cursors. */
if (tinfo->ops > session_op) {
/* Resolve any running transaction. */
@@ -909,7 +1034,7 @@ ops(void *arg)
session = tinfo->session;
/* Pick the next session/cursor close/open. */
- session_op += mmrand(&tinfo->rnd, 100, 5 * WT_THOUSAND);
+ session_op += mmrand(&tinfo->extra_rnd, 100, 5 * WT_THOUSAND);
}
/* If not in a transaction, reset the session periodically so that operation is tested. */
@@ -917,20 +1042,20 @@ ops(void *arg)
testutil_check(session->reset(session));
/* Pick the next reset operation. */
- reset_op += mmrand(&tinfo->rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND);
+ reset_op += mmrand(&tinfo->extra_rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND);
}
/*
* If not in a transaction and in a timestamp world, occasionally repeat timestamped
* operations.
*/
- if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->rnd, 1, 15) == 1) {
+ if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->extra_rnd, 1, 15) == 1) {
++tinfo->search;
snap_repeat_single(tinfo);
}
/* Select a table. */
- table = tinfo->table = table_select(tinfo);
+ table = tinfo->table = table_select(tinfo, true);
/*
* If not in a transaction and in a timestamp world, start a transaction (which is always at
@@ -946,14 +1071,15 @@ ops(void *arg)
intxn = true;
}
if (!intxn) {
+ testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
iso_level = ISOLATION_IMPLICIT;
- if (table->mirror || mmrand(&tinfo->rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) {
+ if (table->mirror || mmrand(&tinfo->data_rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) {
iso_level = ISOLATION_SNAPSHOT;
iso_config = "isolation=snapshot";
/* Occasionally do reads at an isolation level lower than snapshot. */
- switch (mmrand(NULL, 1, 20)) {
+ switch (mmrand(&tinfo->data_rnd, 1, 20)) {
case 1:
iso_level = ISOLATION_READ_COMMITTED; /* 5% */
iso_config = "isolation=read-committed";
@@ -975,7 +1101,7 @@ ops(void *arg)
*/
op = READ;
if (iso_level == ISOLATION_IMPLICIT || iso_level == ISOLATION_SNAPSHOT) {
- i = mmrand(&tinfo->rnd, 1, 100);
+ i = mmrand(&tinfo->data_rnd, 1, 100);
if (i < TV(OPS_PCT_DELETE)) {
op = REMOVE;
if (TV(OPS_TRUNCATE) && tinfo->ops > truncate_op) {
@@ -986,7 +1112,7 @@ ops(void *arg)
op = TRUNCATE;
/* Pick the next truncate operation. */
- truncate_op += mmrand(&tinfo->rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND);
+ truncate_op += mmrand(&tinfo->data_rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND);
}
} else if (i < TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT))
op = INSERT;
@@ -996,6 +1122,10 @@ ops(void *arg)
TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT) + TV(OPS_PCT_MODIFY) + TV(OPS_PCT_WRITE))
op = UPDATE;
}
+ tinfo->op = op; /* Keep the op in the thread info for debugging */
+
+ /* Make sure this is an operation that is permitted for this kind of run. */
+ testutil_assert(replay_operation_enabled(op));
/*
* Get the number of rows. Column-store extends the object, use that extended count if this
@@ -1005,7 +1135,8 @@ ops(void *arg)
max_rows = TV(RUNS_ROWS);
if (table->type != ROW && !table->mirror)
WT_ORDERED_READ(max_rows, table->rows_current);
- tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows);
+ tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows);
+ replay_adjust_key(tinfo, max_rows);
/*
* If the operation is a truncate, select a range.
@@ -1020,9 +1151,9 @@ ops(void *arg)
* from lower keys to higher keys or vice-versa).
*/
if (op == TRUNCATE) {
- tinfo->last = tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows);
- greater_than = mmrand(&tinfo->rnd, 0, 1) == 1;
- range = max_rows < 20 ? 0 : mmrand(&tinfo->rnd, 0, (u_int)max_rows / 50);
+ tinfo->last = tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows);
+ greater_than = mmrand(&tinfo->data_rnd, 0, 1) == 1;
+ range = max_rows < 20 ? 0 : mmrand(&tinfo->data_rnd, 0, (u_int)max_rows / 50);
if (greater_than) {
if (TV(BTREE_REVERSE)) {
if (tinfo->keyno <= range)
@@ -1069,9 +1200,10 @@ ops(void *arg)
*/
if (op == INSERT || op == UPDATE) {
if (table->type == FIX && table->mirror)
- val_gen(g.base_mirror, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+ val_gen(
+ g.base_mirror, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
else
- val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+ val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
}
/*
@@ -1085,7 +1217,7 @@ ops(void *arg)
if (table->type != FIX || table->mirror)
modify_build(tinfo);
else
- val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+ val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
}
/*
@@ -1106,7 +1238,14 @@ ops(void *arg)
* skip the operation. This isn't to avoid wasted work: any FLCS table in the mirrored
* will do an update as FLCS doesn't support modify, and we'll fail when we compare the
* remove to the FLCS value.
+ *
+ * For predictable replay if the record doesn't exist (that's predictable), and we must
+ * force a rollback, we always finish a loop iteration in a committed or rolled back
+ * state.
*/
+ if (GV(RUNS_PREDICTABLE_REPLAY) && (ret == WT_ROLLBACK || tinfo->op_ret == WT_NOTFOUND))
+ goto rollback;
+
if (tinfo->op_ret == WT_NOTFOUND)
goto skip_operation;
@@ -1116,6 +1255,8 @@ ops(void *arg)
tinfo->table = table;
ret = table_op(tinfo, intxn, iso_level, op);
testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+ if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK)
+ goto rollback;
skip2 = table;
}
if (ret == 0 && table->mirror)
@@ -1124,6 +1265,8 @@ ops(void *arg)
tinfo->table = tables[i];
ret = table_op(tinfo, intxn, iso_level, op);
testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+ if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK)
+ goto rollback;
if (ret == WT_ROLLBACK)
break;
}
@@ -1144,9 +1287,22 @@ skip_operation:
/*
* If not in a transaction, we're done with this operation. If in a transaction, add more
- * operations to the transaction half the time.
+ * operations to the transaction half the time. For predictable replay runs, always complete
+ * the transaction.
*/
- if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5)
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ rnd = mmrand(&tinfo->data_rnd, 1, 5);
+
+ /*
+ * Note that a random value of 5 would result in a rollback per the switch below. For
+ * predictable replay, only do that once per timestamp. If we didn't have this check, a
+ * retry would start again with the same timestamp and RNG state, and get the same dice
+ * roll. This would happen every time and the thread will be get stuck doing continuous
+ * rollbacks.
+ */
+ if (rnd == 5 && ntries != 0)
+ rnd = 4; /* Choose to do a commit this time. */
+ } else if (!intxn || (rnd = mmrand(&tinfo->data_rnd, 1, 10)) > 5)
continue;
/*
@@ -1168,7 +1324,7 @@ skip_operation:
* timestamped world, which means we're in a snapshot-isolation transaction by definition.
*/
prepared = false;
- if (GV(OPS_PREPARE) && mmrand(&tinfo->rnd, 1, 10) == 1) {
+ if (GV(OPS_PREPARE) && mmrand(&tinfo->data_rnd, 1, 10) == 1) {
if ((ret = prepare_transaction(tinfo)) != 0) {
testutil_assert(ret == WT_ROLLBACK);
goto rollback;
@@ -1191,6 +1347,18 @@ skip_operation:
break;
case 5: /* 10% */
rollback:
+ if (GV(RUNS_PREDICTABLE_REPLAY)) {
+ if (tinfo->quit)
+ goto loop_exit;
+ /* Force a rollback */
+ testutil_assert(intxn);
+ rollback_transaction(tinfo);
+ intxn = false;
+ ++ntries;
+ replay_pause_after_rollback(tinfo, ntries);
+ ret = 0;
+ goto rollback_retry;
+ }
__wt_yield(); /* Encourage races */
rollback_transaction(tinfo);
snap_repeat_update(tinfo, false);
@@ -1200,6 +1368,7 @@ rollback:
intxn = false;
}
+loop_exit:
if (session != NULL)
testutil_check(session->close(session, NULL));
tinfo->session = NULL;
@@ -1238,7 +1407,11 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W
break;
}
- if (sn) {
+ /*
+ * We don't use search near for predictable replay runs, as the return key can be variable
+ * depending on the structure of the Btree.
+ */
+ if (sn && !GV(RUNS_PREDICTABLE_REPLAY)) {
ret = read_op(cursor, SEARCH_NEAR, &exact);
if (ret == 0 && exact != 0)
ret = WT_NOTFOUND;
@@ -1293,7 +1466,7 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W
* Apply lower and upper bounds on the cursor. The lower and upper bound is randomly generated.
*/
static void
-apply_bounds(WT_CURSOR *cursor, TABLE *table)
+apply_bounds(WT_CURSOR *cursor, TABLE *table, WT_RAND_STATE *rnd)
{
WT_ITEM key;
uint32_t lower_keyno, max_rows, upper_keyno;
@@ -1310,7 +1483,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table)
* Generate a random lower key and apply to the lower bound or upper bound depending on the
* reverse collator.
*/
- lower_keyno = mmrand(NULL, 1, max_rows);
+ lower_keyno = mmrand(rnd, 1, max_rows);
/* Retrieve the key/value pair by key. */
switch (table->type) {
case FIX:
@@ -1331,7 +1504,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table)
* Generate a random upper key and apply to the upper bound or lower bound depending on the
* reverse collator.
*/
- upper_keyno = mmrand(NULL, lower_keyno, max_rows);
+ upper_keyno = mmrand(rnd, lower_keyno, max_rows);
/* Retrieve the key/value pair by key. */
switch (table->type) {
@@ -1371,20 +1544,22 @@ clear_bounds(WT_CURSOR *cursor, TABLE *table)
* Read and verify a subset of the elements in a file.
*/
void
-wts_read_scan(TABLE *table, void *arg)
+wts_read_scan(TABLE *table, void *args)
{
SAP sap;
WT_CONNECTION *conn;
WT_CURSOR *cursor;
WT_DECL_RET;
WT_ITEM key, value;
+ WT_RAND_STATE *rnd;
WT_SESSION *session;
uint64_t keyno;
uint32_t max_rows;
uint8_t bitv;
- conn = (WT_CONNECTION *)arg;
testutil_assert(table != NULL);
+ conn = ((READ_SCAN_ARGS *)args)->conn;
+ rnd = ((READ_SCAN_ARGS *)args)->rnd;
/*
* We're not configuring transactions or read timestamps: if there's a diagnostic check that all
@@ -1406,14 +1581,14 @@ wts_read_scan(TABLE *table, void *arg)
WT_ORDERED_READ(max_rows, table->rows_current);
for (keyno = 0; keyno < max_rows;) {
if (++keyno > 50)
- keyno += mmrand(NULL, 1, WT_THOUSAND);
+ keyno += mmrand(rnd, 1, WT_THOUSAND);
if (keyno > max_rows)
keyno = max_rows;
- if (GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 10) == 1) {
+ if (GV(OPS_BOUND_CURSOR) && mmrand(rnd, 1, 10) == 1) {
/* Reset the position of the cursor, so that we can apply bounds on the cursor. */
testutil_check(cursor->reset(cursor));
- apply_bounds(cursor, table);
+ apply_bounds(cursor, table, rnd);
}
switch (ret = read_row_worker(NULL, table, cursor, keyno, &key, &value, &bitv, false)) {
@@ -1444,7 +1619,7 @@ read_row(TINFO *tinfo)
{
/* 25% of the time we call search-near. */
return (read_row_worker(tinfo, NULL, tinfo->cursor, tinfo->keyno, tinfo->key, tinfo->value,
- &tinfo->bitv, mmrand(&tinfo->rnd, 0, 3) == 1));
+ &tinfo->bitv, mmrand(&tinfo->extra_rnd, 0, 3) == 1));
}
/*
@@ -1563,7 +1738,7 @@ modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned)
bool modify_check;
/* Periodically verify the WT_CURSOR.modify return. */
- modify_check = positioned && mmrand(&tinfo->rnd, 1, 20) == 1;
+ modify_check = positioned && mmrand(&tinfo->extra_rnd, 1, 20) == 1;
if (modify_check) {
testutil_check(cursor->get_value(cursor, &tinfo->moda));
testutil_check(
@@ -1794,7 +1969,7 @@ row_insert(TINFO *tinfo, bool positioned)
* Otherwise, generate a unique key and insert (or update an already inserted record).
*/
if (!positioned) {
- key_gen_insert(tinfo->table, &tinfo->rnd, tinfo->key, tinfo->keyno);
+ key_gen_insert(tinfo->table, &tinfo->data_rnd, tinfo->key, tinfo->keyno);
cursor->set_key(cursor, tinfo->key);
}
cursor->set_value(cursor, tinfo->new_value);
diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c
index c88fe8e21f7..859186fa29f 100644
--- a/src/third_party/wiredtiger/test/format/random.c
+++ b/src/third_party/wiredtiger/test/format/random.c
@@ -72,11 +72,11 @@ random_kv(void *arg)
simple = !simple;
/* Select a table and open a cursor. */
- table = table_select_type(ROW);
+ table = table_select_type(ROW, false);
wt_wrap_open_cursor(session, table->uri, config, &cursor);
/* This is just a smoke-test, get some key/value pairs. */
- for (i = mmrand(NULL, 0, WT_THOUSAND); i > 0; --i) {
+ for (i = mmrand(&g.extra_rnd, 0, WT_THOUSAND); i > 0; --i) {
switch (ret = cursor->next(cursor)) {
case 0:
break;
@@ -95,7 +95,7 @@ random_kv(void *arg)
testutil_check(cursor->close(cursor));
/* Sleep for some number of seconds. */
- period = mmrand(NULL, 1, 10);
+ period = mmrand(&g.extra_rnd, 1, 10);
/* Sleep for short periods so we don't make the run wait. */
while (period > 0 && !g.workers_finished) {
diff --git a/src/third_party/wiredtiger/test/format/replay.c b/src/third_party/wiredtiger/test/format/replay.c
new file mode 100644
index 00000000000..e730119d70f
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/replay.c
@@ -0,0 +1,548 @@
+/*-
+ * Public Domain 2014-present MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "format.h"
+
+/*
+ * Predictable replay is the ability to do test runs multiple times and always have predictable
+ * changes made at every timestamp. Two predictable runs with the same starting data seed executed
+ * up to the same timestamp will always have their data compare identically. Predictable replay only
+ * works with timestamped transactions and to avoid complexity, only a single operation is allowed
+ * in a transaction.
+ *
+ * To achieve the predictability we use two random number generators (the data RNG and the extra
+ * RNG) with known start seeds, the data seed and the extra seed. Every single-threaded modification
+ * (like bulk loading) when deciding on a random course, uses the global data RNG, which is seeded
+ * by the data seed. Global decisions that don't affect data, like whether to turn on verbose, or
+ * even the rate of checkpointing, use the global extra RNG, which is seeded by the extra seed.
+ * Changing the extra seed may change some characteristics of how a workload is tested, but should
+ * not change any data on disk. When worker threads run, they have their own data and extra RNGs,
+ * and these are seeded by the timestamp they are working on.
+ *
+ * Before a worker thread can decide on what operation to do on which key in which table, it must
+ * obtain the next timestamp. Timestamps are doled out atomically, so no two worker threads can ever
+ * perform operations using the same timestamp. The timestamp is XOR-ed with the data seed, the
+ * result is the seed of the thread's private data RNG for the duration of that operation. Likewise,
+ * a private extra RNG is seeded from the timestamp and the extra seed. This ensures that all
+ * decisions about what is committed at that timestamp are predictable based on the timestamp. As
+ * you might expect, the thread's data RNG is used to decide what operation to do, which table to
+ * use, and which key within the table. Other random decisions, like whether to reopen a session, or
+ * whether to repeat a read from the snap list, use the extra RNG.
+ *
+ * Note that once a thread has started to work on an operation at a timestamp, it cannot give up on
+ * the effort. If, for example, a rollback error naturally happens, we can rollback the transaction.
+ * However, immediately getting a new timestamp would mean that we would lose the consequences of
+ * the previous timestamp, perhaps a record would not be updated in a particular way. Thus, after a
+ * rollback, a thread starts again, using the same timestamp it had before, and it seeds its RNGs
+ * again using this timestamp. This gives full predictability, even in the face of temporary
+ * failures.
+ *
+ * To avoid the possibility that two threads work on the same key at the same time, we have the
+ * concept of lanes, and only one thread can be working in a lane at once. There are LANE_COUNT
+ * lanes, where LANE_COUNT is 2^k for some k. A thread uses a data RNG to choose the top bits of a
+ * key number, but the bottom k bits of the key number are set to the bottom k bits of the timestamp
+ * being worked. Those bottom k bits also determine the lane we are in. Each lane has a flag that
+ * determines whether the lane is in use by some operation. If thread T1 working an operation at
+ * timestamp X takes a sufficiently long time relative to other operations, it may be that the
+ * current timestamp has advanced to X + LANE_COUNT. If that is the case, a different thread T2 that
+ * gets that larger timestamp will see that the lane is occupied. Rather than using that timestamp
+ * and potentially getting the same key number, the T2 leaves that timestamp, knowing that T1 will
+ * do it, and advances to another timestamp to work on. When T1 finishes its long operation, it will
+ * notice if there are other timestamps that have been left for it. If so, it keeps the lane
+ * occupied, and works on the new timestamp. At some point, it will notice that all the timestamps
+ * in the lane have been processed up to that point, and it can release the lane, and go back to
+ * choosing the next available timestamp to process.
+ *
+ * Having some operations lag behind is a natural part of processing. This leads to a stable
+ * timestamp that may lag significantly. Due to the possibility of dependencies between operations,
+ * the more lag, the more chance that a rollback error occurs. Without predictable replay, this is
+ * not a problem, any operation that produces a rollback can be freely abandoned, and threads
+ * generally continue moving quickly ahead with more work. However, with predictable replay, no
+ * operation can be abandoned, and an operation that failed because of a dependency will repeatedly
+ * fail until the stable timestamp advances. For that reason, we keep calculating and moving the
+ * stable timestamp ahead at a much faster pace when predictable replay is configured. We also use
+ * an algorithm that only uses lanes that are in use to calculate the stable timestamp. This is safe
+ * and more responsive than the default calculation. And when there is a rollback error, we try to
+ * be smart whether we need to yield or pause. These modifications allow predictable performance to
+ * be on par with regular performance.
+ */
+
+/*
+ * replay_end_timed_run --
+ *     In a timed run, get everyone to stop.
+ */
+void
+replay_end_timed_run(void)
+{
+    /*
+     * We'll post a stop timestamp that all worker threads should abide by. There's a potential race
+     * between when we read the current timestamp and before we publish the stop timestamp. During
+     * that time, other threads could do work and advance the current timestamp, potentially beyond
+     * the intended stop timestamp. We pick a stop timestamp far enough in the future that it's
+     * rather unlikely to happen.
+     */
+    /* The 0x10000 margin is a heuristic "far enough" value, not a tuned constant. */
+    WT_PUBLISH(g.stop_timestamp, g.timestamp + 0x10000);
+}
+
+/*
+ * replay_maximum_committed --
+ *     For predictable replay runs, return the largest timestamp that's no longer in use. Callers
+ *     use the result as a read timestamp, so it is never zero.
+ */
+uint64_t
+replay_maximum_committed(void)
+{
+    uint64_t commit_ts, ts;
+    uint32_t lane;
+
+    /*
+     * The calculation is expensive, and does not need to be accurate all the time, and it's okay to
+     * be behind. So we use a cached value most of the time.
+     */
+    /* NOTE(review): the cache is read without the lane lock -- presumably a benign race. */
+    ts = g.replay_cached_committed;
+    if (ts == 0 || __wt_atomic_addv32(&g.replay_calculate_committed, 1) % 20 == 0) {
+        WT_ORDERED_READ(ts, g.timestamp);
+        testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+        /* The answer is the minimum last-committed timestamp over all lanes still in use. */
+        for (lane = 0; lane < LANE_COUNT; ++lane) {
+            if (g.lanes[lane].in_use) {
+                commit_ts = g.lanes[lane].last_commit_ts;
+                if (commit_ts != 0)
+                    ts = WT_MIN(ts, commit_ts);
+            }
+        }
+        /* Never return zero, a zero read timestamp is invalid (see replay_read_ts). */
+        if (ts == 0)
+            ts = 1;
+        g.replay_cached_committed = ts;
+        testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+    }
+    return (ts);
+}
+
+/*
+ * replay_operation_enabled --
+ *     Return whether an operation type should be enabled in the configuration. Everything is
+ *     allowed unless predictable replay is configured; replay restricts the operation mix.
+ */
+bool
+replay_operation_enabled(thread_op op)
+{
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return (true);
+
+    /*
+     * We don't permit modify operations with predictable replay.
+     *
+     * The problem is read timestamps. As currently implemented, the read timestamp selected is
+     * variable, based on the state of other threads and their progress with other timestamped
+     * operations. And if two changes are made to the same key in a short amount of time, if the
+     * second operation were to be performed sometimes with a read timestamp before the first
+     * operation, and sometimes with a read timestamp after the first operation, then the results
+     * would be variable.
+     *
+     * We could track recent operations on a key (in its lane, for instance), but when we realize
+     * the read timestamp isn't recent enough, we would need to wait for the stable timestamp to
+     * move forward (and our waiting can affect/delay other thread's operations as well). Having the
+     * stable timestamp move forward is the only way our read timestamp can progress.
+     *
+     * Another possibility that also involves tracking recent operations on a key would be to
+     * disallow modifies that occur within, say 10000 timestamps of a previous write operation on
+     * the same key. Those modifies could be silently converted to reads, for instance. If our read
+     * timestamp was greater than 10000 timestamps behind, we'd still need to wait for the stable
+     * timestamp to catch up.
+     */
+    if (op == MODIFY)
+        return (false);
+
+    /*
+     * FIXME-WT-10570. We don't permit remove operations with predictable replay.
+     *
+     * This should be something we can and should fix. The problem may be similar to the problem
+     * with modify, where having a varying read timestamp can cause different results for different
+     * runs.
+     */
+    if (op == REMOVE)
+        return (false);
+
+    /*
+     * We don't permit truncate operations with predictable replay.
+     *
+     * Currently, we use an operation's timestamp to help derive the operation's key. The last N
+     * bits of the timestamp are used as the last bits of the key (where 2^N == LANE_COUNT). These
+     * last N bits give the lane number, and within each lane we track the progress of operations
+     * for that lane. Using lanes, we can track and guarantee that only a single operation is
+     * active in a lane at once, and therefore we can't have multiple operations on a single key
+     * performed out of order or simultaneously. The truncate operation, for a small set of keys,
+     * would reserve multiple consecutive lanes (probably okay) and for larger sets, would reserve
+     * the entire set of lanes. This would effectively require all threads to get into a holding
+     * state, waiting for the truncate to start and then complete before continuing with their next
+     * operation. While we could fudge this in certain ways (e.g. operations with 10000 timestamps
+     * of a truncate would be forced to stay out of its table), there still would be a lot of
+     * details, and some rethink of our lane strategy. Even getting this to work, we would have a
+     * truncate that had the whole table to itself, which doesn't seem like an effective test.
+     */
+    if (op == TRUNCATE)
+        return (false);
+
+    return (true);
+}
+
+/*
+ * replay_pick_timestamp --
+ *     Pick the next timestamp for this operation. That timestamp is used for any commits and also
+ *     determines which lane we are in, to prevent races from occurring on operations on a single
+ *     key. Also, by using the timestamp to seed the random number generators, it also determines
+ *     precisely the nature of the operation.
+ */
+static void
+replay_pick_timestamp(TINFO *tinfo)
+{
+    uint64_t replay_seed, stop_ts, ts;
+    uint32_t lane;
+    bool in_use;
+
+    /*
+     * Choose a unique timestamp for commits when we do predictable replay. If the field for
+     * replaying again is set, we already have a timestamp picked for us.
+     */
+    if (tinfo->replay_again) {
+        /*
+         * Timestamp is already picked for us.
+         */
+        testutil_assert(tinfo->lane == LANE_NUMBER(tinfo->replay_ts));
+        tinfo->replay_again = false;
+    } else {
+        testutil_assert(tinfo->lane == LANE_NONE);
+
+        /* Honor a posted stop timestamp once the stable timestamp has caught up to it. */
+        stop_ts = g.stop_timestamp;
+        if (stop_ts != 0 && g.stable_timestamp >= stop_ts && tinfo->replay_ts == 0) {
+            tinfo->quit = true;
+            return;
+        }
+
+        testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+        do {
+            /*
+             * For predictable replay, this is the only place we increment the timestamp. We keep a
+             * copy to check that assumption. If we were to mistakenly change the timestamp
+             * elsewhere (as might be done in non-predictable runs), we would lose the integrity of
+             * the predictable run.
+             */
+            testutil_assert(g.timestamp_copy == g.timestamp);
+            ts = __wt_atomic_addv64(&g.timestamp, 1);
+            g.timestamp_copy = g.timestamp;
+            lane = LANE_NUMBER(ts);
+            WT_ORDERED_READ(in_use, g.lanes[lane].in_use);
+        } while (in_use);
+        /*
+         * Timestamps skipped above belong to occupied lanes; each lane's current owner is
+         * obligated to process them (see the comment at the top of this file).
+         */
+
+        tinfo->replay_ts = ts;
+        WT_PUBLISH(g.lanes[lane].in_use, true);
+        testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+        tinfo->lane = lane;
+    }
+
+    testutil_assert(tinfo->lane != LANE_NONE);
+    testutil_assert(g.lanes[tinfo->lane].in_use);
+
+    /*
+     * For this operation, seed the RNG used for data operations according to the timestamp and the
+     * global data seed. This allows us to have a predictable set of actions related to commits at
+     * this timestamp, so long as we are running with the same global data seed.
+     */
+    replay_seed = tinfo->replay_ts ^ GV(RANDOM_DATA_SEED);
+    testutil_random_from_seed(&tinfo->data_rnd, replay_seed);
+    replay_seed = tinfo->replay_ts ^ GV(RANDOM_EXTRA_SEED);
+    testutil_random_from_seed(&tinfo->extra_rnd, replay_seed);
+}
+
+/*
+ * replay_loop_begin --
+ *     Called at the top of the operation loop. A no-op unless predictable replay is configured.
+ */
+void
+replay_loop_begin(TINFO *tinfo, bool intxn)
+{
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /*
+         * Predictable replay, as it works now, requires that we're not in transaction when we start
+         * the loop.
+         */
+        testutil_assert(!intxn);
+
+        /*
+         * We're here at the start of the loop for one of four reasons:
+         * 1) We needed to rollback the transaction, so we didn't give up our replay timestamp,
+         *    and we set the again flag.
+         * 2) We successfully committed the last transaction, but our lane was behind,
+         *    and was skipped over, so we're obligated to perform the next timestamp in our lane.
+         *    In that case, we have a replay timestamp and the again flag is set.
+         * 3) We successfully committed the last transaction, and our lane was not behind.
+         *    We don't have a replay timestamp and the again flag is off.
+         * 4) It's our first time through the loop, this is equivalent to the previous case.
+         */
+        testutil_assert(tinfo->replay_again == (tinfo->replay_ts != 0));
+        /*
+         * Choose a unique timestamp for commits, based on the conditions above.
+         */
+        replay_pick_timestamp(tinfo);
+
+        /* Either we were told to quit (stop timestamp reached) or we now own a timestamp. */
+        testutil_assert(tinfo->quit || tinfo->replay_ts != 0);
+    }
+}
+
+/*
+ * replay_run_reset --
+ *     Called at beginning and end of runs to set up the lanes. All lanes are marked as committed
+ *     at the current timestamp and every worker thread's replay state is cleared.
+ */
+static void
+replay_run_reset(void)
+{
+    TINFO *tinfo, **tlp;
+    uint64_t ts;
+    uint32_t lane;
+
+    /* Set every lane's commit timestamp to the current timestamp. */
+    ts = g.timestamp;
+    g.timestamp_copy = ts;
+    for (lane = 0; lane < LANE_COUNT; ++lane)
+        g.lanes[lane].last_commit_ts = ts;
+    g.replay_cached_committed = ts;
+
+    /* Reset fields in tinfo. */
+    if (tinfo_list != NULL)
+        for (tlp = tinfo_list; *tlp != NULL; ++tlp) {
+            tinfo = *tlp;
+            tinfo->replay_again = false;
+            tinfo->replay_ts = 0;
+            /*
+             * No thread holds a lane between runs. Use LANE_NONE, not zero: zero is a valid lane
+             * number, and replay_pick_timestamp asserts the lane is LANE_NONE before assigning a
+             * new one.
+             */
+            tinfo->lane = LANE_NONE;
+            tinfo->op = (thread_op)0;
+        }
+}
+
+/*
+ * replay_run_begin --
+ *     Called at the beginning of a run.
+ */
+void
+replay_run_begin(WT_SESSION *session)
+{
+    /* Unused; presumably kept so the begin/end hooks share a signature. */
+    (void)session;
+
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        replay_run_reset();
+}
+
+/*
+ * replay_run_end --
+ *     Called when finishing processing for a run.
+ */
+void
+replay_run_end(WT_SESSION *session)
+{
+    /* Unused; presumably kept so the begin/end hooks share a signature. */
+    (void)session;
+
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        replay_run_reset();
+}
+
+/*
+ * replay_read_ts --
+ *     Return a read timestamp for a begin transaction call. The result is the largest timestamp
+ *     no lane is still working at or below, so reads never see in-flight operations.
+ */
+uint64_t
+replay_read_ts(TINFO *tinfo)
+{
+    uint64_t commit_ts;
+
+    /* Only valid while this thread holds a lane and has a commit timestamp picked. */
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY) && tinfo->lane != LANE_NONE &&
+      g.lanes[tinfo->lane].in_use && tinfo->replay_ts != 0);
+
+    commit_ts = replay_maximum_committed();
+    testutil_assert(commit_ts != 0);
+    return (commit_ts);
+}
+
+/*
+ * replay_prepare_ts --
+ *     Return a timestamp to be used for prepare. Returns either a timestamp halfway back to the
+ *     lane's previous commit, or the commit timestamp itself when starting a run or when the
+ *     halfway point has aged out.
+ */
+uint64_t
+replay_prepare_ts(TINFO *tinfo)
+{
+    uint64_t prepare_ts, ts;
+
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY));
+
+    /* See if we're just starting a run. */
+    if (tinfo->replay_ts == 0 || tinfo->replay_ts <= g.replay_start_timestamp + LANE_COUNT)
+        /*
+         * When we're starting a run, we'll just use the final commit timestamp for our prepare
+         * timestamp. We know that's safe.
+         */
+        prepare_ts = tinfo->replay_ts;
+    else {
+        /*
+         * Our lane's current operation will have a commit timestamp tinfo->replay_ts. Our lane's
+         * previous commit timestamp was that number minus LANE_COUNT. The global stable timestamp
+         * generally should not be advanced past our lane's previous commit timestamp. So a prepare
+         * timestamp halfway between the lane's previous commit timestamp and the current commit
+         * timestamp should be valid.
+         */
+        ts = tinfo->replay_ts - LANE_COUNT / 2;
+
+        /*
+         * As a sanity check, make sure the timestamp hasn't completely aged out: only use the
+         * halfway timestamp if it is still newer than the oldest timestamp, otherwise fall back to
+         * the commit timestamp. (The original comparison was inverted, selecting the aged-out
+         * value.)
+         */
+        if (ts > g.oldest_timestamp)
+            prepare_ts = ts;
+        else
+            prepare_ts = tinfo->replay_ts;
+    }
+    return (prepare_ts);
+}
+
+/*
+ * replay_commit_ts --
+ *     Return the commit timestamp. It was fixed when this thread picked its timestamp/lane.
+ */
+uint64_t
+replay_commit_ts(TINFO *tinfo)
+{
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY));
+
+    testutil_assert(tinfo->replay_ts != 0);
+    return (tinfo->replay_ts);
+}
+
+/*
+ * replay_committed --
+ *     Called when a transaction was successfully committed. We can give up a lane if appropriate.
+ */
+void
+replay_committed(TINFO *tinfo)
+{
+    uint32_t lane;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    testutil_assert(tinfo->replay_ts != 0);
+
+    lane = tinfo->lane;
+    testutil_assert(!tinfo->replay_again);
+    testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+
+    /*
+     * Updating the last commit timestamp for a lane in use allows read, oldest and stable
+     * timestamps to advance.
+     */
+    WT_PUBLISH(g.lanes[lane].last_commit_ts, tinfo->replay_ts);
+    if (g.timestamp <= tinfo->replay_ts + LANE_COUNT) {
+        /* No later timestamp in our lane was skipped over; release the lane. */
+        WT_PUBLISH(g.lanes[lane].in_use, false);
+        tinfo->lane = LANE_NONE;
+        tinfo->replay_ts = 0;
+    } else {
+        /*
+         * The global timestamp has advanced past our lane's next slot, so another thread skipped
+         * it; we keep the lane and are obligated to replay that next timestamp ourselves.
+         */
+        tinfo->replay_ts += LANE_COUNT;
+        tinfo->replay_again = true;
+    }
+    testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+}
+
+/*
+ * replay_adjust_key --
+ *     Given a fully random key number, modify the key that is in our lane.
+ */
+void
+replay_adjust_key(TINFO *tinfo, uint64_t max_rows)
+{
+    uint64_t keyno;
+    uint32_t lane;
+
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /* Force the key's low bits to the lane number: only one thread works a lane at once. */
+        lane = tinfo->lane;
+        keyno = (tinfo->keyno & ~(LANE_COUNT - 1)) | lane;
+
+        /* Keys are 1-based; zero means lane 0, so move up a full stride (lane bits unchanged). */
+        if (keyno == 0)
+            keyno = LANE_COUNT;
+        else if (keyno >= max_rows)
+            /* Too large, step down one stride; the lane bits are again unchanged. */
+            keyno -= LANE_COUNT;
+
+        tinfo->keyno = keyno;
+    }
+}
+
+/*
+ * replay_rollback --
+ *     Called after a rollback.
+ */
+void
+replay_rollback(TINFO *tinfo)
+{
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    /*
+     * After a rollback, we don't give up our timestamp or our lane, we need to retry at the top of
+     * the operations loop.
+     */
+    tinfo->replay_again = true;
+
+    /* We must still own our timestamp and lane; the retry reuses both. */
+    testutil_assert(tinfo->replay_ts != 0);
+    testutil_assert(tinfo->lane != LANE_NONE);
+    testutil_assert(g.lanes[tinfo->lane].in_use);
+}
+
+/*
+ * replay_pause_after_rollback --
+ *     Called after a rollback, allowing us to yield or pause. Threads that are far behind keep
+ *     going at full speed; threads near the front of the pack back off so the stable timestamp
+ *     can advance.
+ */
+void
+replay_pause_after_rollback(TINFO *tinfo, uint32_t ntries)
+{
+    uint64_t high, low, mid;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    /* Generally, the more behind we are, the less we want to wait. */
+    low = replay_maximum_committed();
+    high = g.timestamp;
+    /*
+     * Midpoint of the committed/current timestamp range. The parentheses matter: without them,
+     * "high + low / 2" is beyond high and the half-way comparison below is always true.
+     */
+    mid = (high + low) / 2;
+
+    /* If we're in the furthest group behind, don't wait at all. */
+    if (low + LANE_COUNT <= tinfo->replay_ts)
+        return;
+
+    /*
+     * If we're in the last half, don't sleep. If we're in the front half, occasionally sleep.
+     */
+    if (tinfo->replay_ts < mid && ntries % 10 != 0)
+        __wt_yield();
+    else {
+        /* Never sleep more than .1 seconds */
+        __wt_sleep(0, ntries > 100 ? 100 * WT_THOUSAND : ntries * WT_THOUSAND);
+    }
+}
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index 2f1d5a366b9..3855d05f379 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -677,7 +677,7 @@ snap_repeat_single(TINFO *tinfo)
* Start at a random spot in the list of operations and look for a read to retry. Stop when
* we've walked the entire list or found one.
*/
- v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1;
+ v = mmrand(&tinfo->extra_rnd, 1, SNAP_LIST_SIZE) - 1;
for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) {
/* Wrap at the end of the circular buffer. */
if (snap >= tinfo->snap_end)
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 632a457d04f..7066dedf6dd 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -179,14 +179,15 @@ static bool syntax_check; /* Only checking configuration syntax. */
/*
* main --
- * TODO: Add a comment describing this function.
+ * Run a variety of multithreaded WiredTiger operations based on a set of configurations.
*/
int
main(int argc, char *argv[])
{
+ READ_SCAN_ARGS scan_args;
uint64_t now, start;
- u_int ops_seconds;
- int ch, reps;
+ u_int ops_seconds, reps;
+ int ch;
const char *config, *home;
bool is_backup, quiet_flag, verify_only;
@@ -252,11 +253,20 @@ main(int argc, char *argv[])
fflush(stdout);
}
- __wt_random_init_seed(NULL, &g.rnd); /* Initialize the RNG. */
+ /*
+ * Initialize the RNGs. This is needed early because some random decisions are made while
+ * reading configuration. There may be random seeds in the configuration, however, so we will
+ * reinitialize the RNGs later.
+ */
+ __wt_random_init_seed(NULL, &g.data_rnd);
+ __wt_random_init_seed(NULL, &g.extra_rnd);
- /* Initialize lock to ensure single threading during failure handling */
+ /* Initialize lock to ensure single threading during failure handling. */
testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
+ /* Initialize lock to ensure single threading for lane operations in predictable replay. */
+ testutil_check(pthread_rwlock_init(&g.lane_lock, NULL));
+
/*
* Initialize the tables array and default to multi-table testing if not in backward-compatible
* mode.
@@ -357,7 +367,9 @@ main(int argc, char *argv[])
TIMED_MAJOR_OP(wts_verify(g.wts_conn, true));
if (verify_only)
goto skip_operations;
- TIMED_MAJOR_OP(tables_apply(wts_read_scan, g.wts_conn));
+ scan_args.conn = g.wts_conn;
+ scan_args.rnd = &g.extra_rnd;
+ TIMED_MAJOR_OP(tables_apply(wts_read_scan, &scan_args));
/* Optionally start checkpoints. */
wts_checkpoints();
@@ -373,7 +385,7 @@ main(int argc, char *argv[])
*/
ops_seconds = GV(RUNS_TIMER) == 0 ? 0 : ((GV(RUNS_TIMER) * 60) - 15) / FORMAT_OPERATION_REPS;
for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps)
- operations(ops_seconds, reps == FORMAT_OPERATION_REPS);
+ operations(ops_seconds, reps, FORMAT_OPERATION_REPS);
/* Copy out the run's statistics. */
TIMED_MAJOR_OP(wts_stats());
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index 657c30b202f..4ea4a429be9 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -387,10 +387,10 @@ create_object(TABLE *table, void *arg)
* Configure the maximum key/value sizes, but leave it as the default if we come up with
* something crazy.
*/
- maxleafkey = mmrand(NULL, table->max_leaf_page / 50, table->max_leaf_page / 40);
+ maxleafkey = mmrand(&g.extra_rnd, table->max_leaf_page / 50, table->max_leaf_page / 40);
if (maxleafkey > 20)
CONFIG_APPEND(p, ",leaf_key_max=%" PRIu32, maxleafkey);
- maxleafvalue = mmrand(NULL, table->max_leaf_page * 10, table->max_leaf_page / 40);
+ maxleafvalue = mmrand(&g.extra_rnd, table->max_leaf_page * 10, table->max_leaf_page / 40);
if (maxleafvalue > 40 && maxleafvalue < 100 * 1024)
CONFIG_APPEND(p, ",leaf_value_max=%" PRIu32, maxleafvalue);
@@ -408,7 +408,7 @@ create_object(TABLE *table, void *arg)
if (TV(BTREE_HUFFMAN_VALUE))
CONFIG_APPEND(p, ",huffman_value=english");
if (TV(BTREE_DICTIONARY))
- CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(NULL, 123, 517));
+ CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(&g.extra_rnd, 123, 517));
break;
}
diff --git a/src/third_party/wiredtiger/tools/wt_cmp_dir b/src/third_party/wiredtiger/tools/wt_cmp_dir
index 7e4e5d87a8b..3cd39557c40 100755
--- a/src/third_party/wiredtiger/tools/wt_cmp_dir
+++ b/src/third_party/wiredtiger/tools/wt_cmp_dir
@@ -145,7 +145,7 @@ cmp_uri_script=$(dirname "$0")/wt_cmp_uri
ecode=0
for f in $files1; do
echo $f
- if ! python $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then
+ if ! python3 $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then
ecode=1
fi
done