Import wiredtiger: 604dd69988250e1c8698cf7e5ac5dbce4a8f88f7 from branch mongodb-master

ref: bb3421a839..604dd69988 for: 7.0.0-rc0 WT-9915 For tiered storage testing, get predictable outputs for test/format
author: Luke Chen <luke.chen@mongodb.com> 2023-02-13 09:36:07 +1100
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-02-12 23:24:00 +0000
commit: 2d695bc7f8eb7328efd728ab0419255bbcc5beed (patch)
tree: 889c07054168a84eb3c811d90e4693614dbdab4f
parent: ec7653ce39c817c1832fd6238474b1264a31b7b0 (diff)
download: mongo-2d695bc7f8eb7328efd728ab0419255bbcc5beed.tar.gz
26 files changed, 1301 insertions, 212 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index 17bb59ba27a..332601000e4 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
     "vendor": "wiredtiger",
     "github": "wiredtiger/wiredtiger.git",
     "branch": "mongodb-master",
-    "commit": "bb3421a83981c5ece92579e9689e1636db90b559"
+    "commit": "604dd69988250e1c8698cf7e5ac5dbce4a8f88f7"
 }
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index 07931f47523..1d360bfdd6e 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -473,6 +473,53 @@ functions:
         for i in $(seq ${times|1}); do
           ./t -c ${config|../../../test/format/CONFIG.stress} ${trace_args|-T bulk,txn,retain=100} ${extra_args|} || ( [ -f RUNDIR/CONFIG ] && cat RUNDIR/CONFIG ) 2>&1
         done
+  "format test predictable":
+    command: shell.exec
+    params:
+      working_dir: "wiredtiger/cmake_build/test/format"
+      script: |
+        # To test predictable replay, we run test/format three times with the same data seed
+        # each time, and compare the keys and values found in the WT home directories.
+        # The first run is a timed one. When it's completed, we get the run's stable timestamp,
+        # and do the subsequent runs up to that stable timestamp.  This, along with predictable
+        # replay using the same data seed, should guarantee we have equivalent data created.
+        set -o errexit
+        set -o verbose
+        fail() {
+          echo "======= FAILURE =========="
+          for file; do
+            if [ -f "$file" ]; then
+              echo Contents of "$file":
+              cat "$file"
+              echo "================"
+            fi
+          done
+          exit 1
+        }
+        runtime=3  # minutes
+        config=../../../test/format/CONFIG.replay
+        for i in $(seq ${times}); do
+          echo Iteration $i/${times}
+          x2=$RANDOM$RANDOM
+          x3=$RANDOM$RANDOM
+          rm -rf RUNDIR_{1,2,3}
+
+          first_run_args="-c $config runs.timer=$runtime"
+          ./t -h RUNDIR_1 $first_run_args ${extra_args} || fail RUNDIR_1/CONFIG 2>&1
+          stable_hex=$(../../../tools/wt_timestamps RUNDIR_1 | sed -e '/stable=/!d' -e 's/.*=//')
+          ops=$(echo $((0x$stable_hex)))
+
+          # Do the second run up to the stable timestamp, using the same data seed,
+          # but with a different extra seed.  Compare it when done.
+          common_args="-c RUNDIR_1/CONFIG runs.timer=0 runs.ops=$ops"
+          ./t -h RUNDIR_2 $common_args random.extra_seed=$x2 || fail RUNDIR_2/CONFIG 2>&1
+          ../../../tools/wt_cmp_dir RUNDIR_1 RUNDIR_2 || fail RUNDIR_1/CONFIG RUNDIR_2/CONFIG 2>&1
+
+          # Do the third run up to the stable timestamp, using the same data seed,
+          # but with a different extra seed.  Compare it to the second run when done.
+          ./t -h RUNDIR_3 $common_args random.extra_seed=$x3 || fail RUNDIR_3/CONFIG 2>&1
+          ../../../tools/wt_cmp_dir RUNDIR_2 RUNDIR_3 || fail RUNDIR_2/CONFIG RUNDIR_3/CONFIG 2>&1
+        done
   "format test script":
     command: shell.exec
     params:
@@ -3727,6 +3774,19 @@ tasks:
         vars:
           format_test_script_args: -a -t 30
 
+  - name: format-predictable-test
+    # Set 2.5 hour timeout (60 * 60 * 2.5)
+    exec_timeout_secs: 9000
+    commands:
+      - func: "get project"
+      - func: "compile wiredtiger"
+        vars:
+          <<: *configure_flags_with_builtins
+          CMAKE_TOOLCHAIN_FILE: -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/mongodbtoolchain_v4_gcc.cmake
+      - func: "format test predictable"
+        vars:
+          times: 5
+
   - name: many-collection-test
     commands:
       - command: timeout.update
@@ -4918,6 +4978,7 @@ buildvariants:
     - name: ".stress-test-4-nonstandalone"
     - name: ".stress-test-no-barrier-nonstandalone"
     - name: format-abort-recovery-stress-test-nonstandalone
+    - name: format-predictable-test
 
 # When running the Python tests on this variant tcmalloc must be preloaded otherwise the wiredtiger library
 # fails to load and resolve its dependency.
diff --git a/src/third_party/wiredtiger/test/format/CONFIG.replay b/src/third_party/wiredtiger/test/format/CONFIG.replay
new file mode 100644
index 00000000000..9e42cf00cfd
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/CONFIG.replay
@@ -0,0 +1,26 @@
+############################################
+#  RUN PARAMETERS: V3
+############################################
+# A configuration for predictable replay.
+# Some things are locked down at the moment.
+backup=0
+btree.huffman_value=0
+cache.minimum=20
+format.abort=0
+format.independent_thread_rng=1
+import=0
+ops.alter=0
+ops.compaction=0
+ops.truncate=0
+ops.salvage=0
+quiet=0
+runs.in_memory=0
+runs.mirror=0
+runs.predictable_replay=1
+runs.rows=1000000:5000000
+runs.tables=3:10
+runs.threads=4:32
+runs.timer=6:30
+runs.timer=30
+transaction.implicit=0
+transaction.timestamps=1
diff --git a/src/third_party/wiredtiger/test/format/alter.c b/src/third_party/wiredtiger/test/format/alter.c
index 546bfb8a8d5..8f5e0000662 100644
--- a/src/third_party/wiredtiger/test/format/alter.c
+++ b/src/third_party/wiredtiger/test/format/alter.c
@@ -60,14 +60,14 @@ alter(void *arg)
     counter = 0;
 
     while (!g.workers_finished) {
-        period = mmrand(NULL, 1, 10);
+        period = mmrand(&g.extra_rnd, 1, 10);
 
         testutil_check(__wt_snprintf(
           buf, sizeof(buf), "access_pattern_hint=%s", access_value ? "random" : "none"));
         access_value = !access_value;
 
         /* Alter can return EBUSY if concurrent with other operations. */
-        table = table_select(NULL);
+        table = table_select(NULL, false);
         trace_msg(session, "Alter #%u URI %s start %s", ++counter, table->uri, buf);
 
         while ((ret = session->alter(session, table->uri, buf)) != 0 && ret != EBUSY)
diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c
index f463b20d5a4..31f8aa5ee2e 100644
--- a/src/third_party/wiredtiger/test/format/backup.c
+++ b/src/third_party/wiredtiger/test/format/backup.c
@@ -539,7 +539,7 @@ backup(void *arg)
      * larger intervals, optionally do incremental backups between full backups.
      */
     this_id = 0;
-    for (period = mmrand(NULL, 1, 10);; period = mmrand(NULL, 20, 45)) {
+    for (period = mmrand(&g.extra_rnd, 1, 10);; period = mmrand(&g.extra_rnd, 20, 45)) {
         /* Sleep for short periods so we don't make the run wait. */
         while (period > 0 && !g.workers_finished) {
             --period;
@@ -584,7 +584,7 @@ backup(void *arg)
                   src_id, g.backup_id));
                 /* Restart a full incremental every once in a while. */
                 full = false;
-                incr_full = mmrand(NULL, 1, 8) == 1;
+                incr_full = mmrand(&g.extra_rnd, 1, 8) == 1;
             }
             this_id = g.backup_id++;
             config = cfg;
@@ -600,7 +600,7 @@ backup(void *arg)
                 config = cfg;
                 full = false;
                 /* Restart a full incremental every once in a while. */
-                incr_full = mmrand(NULL, 1, 8) == 1;
+                incr_full = mmrand(&g.extra_rnd, 1, 8) == 1;
             }
         } else {
             config = NULL;
@@ -679,9 +679,9 @@ backup(void *arg)
         if (full) {
             incremental = 1;
             if (g.backup_incr_flag == INCREMENTAL_LOG)
-                incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(NULL, 1, 8);
+                incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(&g.extra_rnd, 1, 8);
             else if (g.backup_incr_flag == INCREMENTAL_BLOCK)
-                incremental = mmrand(NULL, 1, 8);
+                incremental = mmrand(&g.extra_rnd, 1, 8);
         }
         if (--incremental == 0) {
             check_copy();
diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c
index a811b3866b0..c53f88ab0ab 100644
--- a/src/third_party/wiredtiger/test/format/bulk.c
+++ b/src/third_party/wiredtiger/test/format/bulk.c
@@ -121,7 +121,7 @@ table_load(TABLE *base, TABLE *table)
         if (table->type == ROW)
             key_gen(table, &key, keyno);
         if (base == NULL)
-            val_gen(table, NULL, &value, &bitv, keyno);
+            val_gen(table, &g.data_rnd, &value, &bitv, keyno);
         else {
             testutil_check(read_op(base_cursor, NEXT, NULL));
             testutil_check(base_cursor->get_value(base_cursor, &value));
@@ -168,6 +168,12 @@ table_load(TABLE *base, TABLE *table)
             testutil_assertfmt(base == NULL && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK),
               "WT_CURSOR.insert failed: %d", ret);
 
+            /*
+             * If this occurs with predictable replay, we may need to redo the bulk load with fewer
+             * keys in each batch. For now, we just don't handle it.
+             */
+            testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
+
             if (g.transaction_timestamps_config) {
                 bulk_rollback_transaction(session);
                 bulk_begin_transaction(session);
diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c
index 7b63eaba648..f7afc24374b 100644
--- a/src/third_party/wiredtiger/test/format/checkpoint.c
+++ b/src/third_party/wiredtiger/test/format/checkpoint.c
@@ -79,7 +79,7 @@ checkpoint(void *arg)
     wt_wrap_open_session(conn, &sap, NULL, &session);
 
     named_checkpoints = !g.lsm_config;
-    for (secs = mmrand(NULL, 1, 10); !g.workers_finished;) {
+    for (secs = mmrand(&g.extra_rnd, 1, 10); !g.workers_finished;) {
         if (secs > 0) {
             __wt_sleep(1, 0);
             --secs;
@@ -96,7 +96,7 @@ checkpoint(void *arg)
         ckpt_vrfy_name = "WiredTigerCheckpoint";
         backup_locked = false;
         if (named_checkpoints)
-            switch (mmrand(NULL, 1, 20)) {
+            switch (mmrand(&g.extra_rnd, 1, 20)) {
             case 1:
                 /*
                  * 5% create a named snapshot. Rotate between a few names to test multiple named
@@ -105,8 +105,8 @@ checkpoint(void *arg)
                 ret = lock_try_writelock(session, &g.backup_lock);
                 if (ret == 0) {
                     backup_locked = true;
-                    testutil_check(__wt_snprintf(
-                      config_buf, sizeof(config_buf), "name=mine.%" PRIu32, mmrand(NULL, 1, 4)));
+                    testutil_check(__wt_snprintf(config_buf, sizeof(config_buf),
+                      "name=mine.%" PRIu32, mmrand(&g.extra_rnd, 1, 4)));
                     ckpt_config = config_buf;
                     ckpt_vrfy_name = config_buf + strlen("name=");
                 } else if (ret != EBUSY)
@@ -143,7 +143,7 @@ checkpoint(void *arg)
         /* Verify the checkpoints. */
         wts_verify_checkpoint(conn, ckpt_vrfy_name);
 
-        secs = mmrand(NULL, 5, 40);
+        secs = mmrand(&g.extra_rnd, 5, 40);
     }
 
     wt_wrap_open_session(conn, &sap, NULL, &session);
diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c
index feea20fa092..84a467d0734 100644
--- a/src/third_party/wiredtiger/test/format/compact.c
+++ b/src/third_party/wiredtiger/test/format/compact.c
@@ -54,7 +54,7 @@ compact(void *arg)
      * Perform compaction at somewhere under 15 seconds (so we get at least one done), and then at
      * 23 second intervals.
      */
-    for (period = mmrand(NULL, 1, 15);; period = 23) {
+    for (period = mmrand(&g.extra_rnd, 1, 15);; period = 23) {
         /* Sleep for short periods so we don't make the run wait. */
         while (period > 0 && !g.workers_finished) {
             --period;
@@ -70,7 +70,7 @@ compact(void *arg)
          * Compact returns ETIMEDOUT if the compaction doesn't finish in some number of seconds. We
          * don't configure a timeout and occasionally exceed the default of 1200 seconds.
          */
-        table = table_select(NULL);
+        table = table_select(NULL, false);
         ret = session->compact(session, table->uri, NULL);
         testutil_assertfmt(ret == 0 || ret == EBUSY || ret == ETIMEDOUT || ret == WT_CACHE_FULL ||
             ret == WT_ROLLBACK,
diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h
index 201df695c97..bfdc9650699 100644
--- a/src/third_party/wiredtiger/test/format/config.h
+++ b/src/third_party/wiredtiger/test/format/config.h
@@ -110,40 +110,43 @@ typedef struct {
 #define V_TABLE_OPS_TRUNCATE 78
 #define V_GLOBAL_OPS_VERIFY 79
 #define V_GLOBAL_QUIET 80
-#define V_GLOBAL_RUNS_IN_MEMORY 81
-#define V_GLOBAL_RUNS_OPS 82
-#define V_TABLE_RUNS_MIRROR 83
-#define V_TABLE_RUNS_ROWS 84
-#define V_TABLE_RUNS_SOURCE 85
-#define V_GLOBAL_RUNS_TABLES 86
-#define V_GLOBAL_RUNS_THREADS 87
-#define V_GLOBAL_RUNS_TIMER 88
-#define V_TABLE_RUNS_TYPE 89
-#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 90
-#define V_GLOBAL_STATISTICS_MODE 91
-#define V_GLOBAL_STATISTICS_LOG_SOURCES 92
-#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 93
-#define V_GLOBAL_STRESS_CHECKPOINT 94
-#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 95
-#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 96
-#define V_GLOBAL_STRESS_EVICT_REPOSITION 97
-#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 98
-#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 99
-#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 100
-#define V_GLOBAL_STRESS_HS_SEARCH 101
-#define V_GLOBAL_STRESS_HS_SWEEP 102
-#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 103
-#define V_GLOBAL_STRESS_SPLIT_1 104
-#define V_GLOBAL_STRESS_SPLIT_2 105
-#define V_GLOBAL_STRESS_SPLIT_3 106
-#define V_GLOBAL_STRESS_SPLIT_4 107
-#define V_GLOBAL_STRESS_SPLIT_5 108
-#define V_GLOBAL_STRESS_SPLIT_6 109
-#define V_GLOBAL_STRESS_SPLIT_7 110
-#define V_GLOBAL_TRANSACTION_IMPLICIT 111
-#define V_GLOBAL_TRANSACTION_TIMESTAMPS 112
-#define V_GLOBAL_WIREDTIGER_CONFIG 113
-#define V_GLOBAL_WIREDTIGER_RWLOCK 114
-#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 115
+#define V_GLOBAL_RANDOM_DATA_SEED 81
+#define V_GLOBAL_RANDOM_EXTRA_SEED 82
+#define V_GLOBAL_RUNS_IN_MEMORY 83
+#define V_TABLE_RUNS_MIRROR 84
+#define V_GLOBAL_RUNS_OPS 85
+#define V_GLOBAL_RUNS_PREDICTABLE_REPLAY 86
+#define V_TABLE_RUNS_ROWS 87
+#define V_TABLE_RUNS_SOURCE 88
+#define V_GLOBAL_RUNS_TABLES 89
+#define V_GLOBAL_RUNS_THREADS 90
+#define V_GLOBAL_RUNS_TIMER 91
+#define V_TABLE_RUNS_TYPE 92
+#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 93
+#define V_GLOBAL_STATISTICS_MODE 94
+#define V_GLOBAL_STATISTICS_LOG_SOURCES 95
+#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 96
+#define V_GLOBAL_STRESS_CHECKPOINT 97
+#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 98
+#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 99
+#define V_GLOBAL_STRESS_EVICT_REPOSITION 100
+#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 101
+#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 102
+#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 103
+#define V_GLOBAL_STRESS_HS_SEARCH 104
+#define V_GLOBAL_STRESS_HS_SWEEP 105
+#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 106
+#define V_GLOBAL_STRESS_SPLIT_1 107
+#define V_GLOBAL_STRESS_SPLIT_2 108
+#define V_GLOBAL_STRESS_SPLIT_3 109
+#define V_GLOBAL_STRESS_SPLIT_4 110
+#define V_GLOBAL_STRESS_SPLIT_5 111
+#define V_GLOBAL_STRESS_SPLIT_6 112
+#define V_GLOBAL_STRESS_SPLIT_7 113
+#define V_GLOBAL_TRANSACTION_IMPLICIT 114
+#define V_GLOBAL_TRANSACTION_TIMESTAMPS 115
+#define V_GLOBAL_WIREDTIGER_CONFIG 116
+#define V_GLOBAL_WIREDTIGER_RWLOCK 117
+#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 118
 
-#define V_ELEMENT_COUNT 116
+#define V_ELEMENT_COUNT 119
diff --git a/src/third_party/wiredtiger/test/format/config.sh b/src/third_party/wiredtiger/test/format/config.sh
index 9f9a68003e8..21a9a6984f5 100755
--- a/src/third_party/wiredtiger/test/format/config.sh
+++ b/src/third_party/wiredtiger/test/format/config.sh
@@ -238,11 +238,17 @@ CONFIG configuration_list[] = {
 
 {"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1}
 
+{"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX}
+
+{"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX}
+
 {"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1}
 
+{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0}
+
 {"runs.ops", "operations per run", 0x0, 0, M(2), M(100)}
 
-{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0}
+{"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0}
 
 {"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100)}
 
diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h
index 3b43c304435..1a94278364d 100644
--- a/src/third_party/wiredtiger/test/format/format.h
+++ b/src/third_party/wiredtiger/test/format/format.h
@@ -84,6 +84,38 @@
 #define STR(s) #s
 #define XSTR(s) STR(s)
 
+#include "config.h"
+extern CONFIG configuration_list[];
+
+typedef struct {
+    uint32_t v; /* integral value */
+    char *vstr; /* string value */
+    bool set;   /* value explicitly set */
+} CONFIGV;
+
+/*
+ * The LANE data structure is used with predictable replay. With predictable replay, we want to make
+ * sure that two threads can never act on the same key. The last bits of the timestamp are used to
+ * determine a lane, so it takes a while (LANE_COUNT operations) to cycle through the lanes. A lane
+ * only acts on key numbers whose last bits match the lane. We also keep track of lanes via the
+ * g.lanes array. This guarantees that a lane is used by only one thread at a time, which in turn
+ * guarantees that a key can only be used by one thread at a time.
+ *
+ * A more complete description of how this fits into predictable replay is in replay.c .
+ */
+typedef struct {
+    uint64_t last_commit_ts;
+    bool in_use;
+} LANE;
+#define LANE_NONE UINT32_MAX /* A lane number guaranteed to be illegal */
+#define LANE_COUNT 1024u
+
+/* Arguments to the read scanner. */
+typedef struct {
+    WT_CONNECTION *conn;
+    WT_RAND_STATE *rnd;
+} READ_SCAN_ARGS;
+
 /*
  * Abstract lock that lets us use either pthread reader-writer locks or WiredTiger's own (likely
  * faster) implementation.
@@ -112,15 +144,6 @@ typedef struct {
  */
 #define FIX_VALUE_WRONG 0xff
 
-#include "config.h"
-extern CONFIG configuration_list[];
-
-typedef struct {
-    uint32_t v; /* integral value */
-    char *vstr; /* string value */
-    bool set;   /* value explicitly set */
-} CONFIGV;
-
 typedef enum { FIX, ROW, VAR } table_type;
 typedef struct {
     u_int id;              /* table ID */
@@ -224,7 +247,8 @@ typedef struct {
 #define INCREMENTAL_OFF 3
     u_int backup_incr_flag; /* Incremental backup configuration */
 
-    WT_RAND_STATE rnd; /* Global RNG state */
+    WT_RAND_STATE data_rnd;  /* Global RNG state for data operations */
+    WT_RAND_STATE extra_rnd; /* Global RNG state for extra operations */
 
     uint64_t timestamp;        /* Counter for timestamps */
     uint64_t oldest_timestamp; /* Last timestamp used for oldest */
@@ -232,6 +256,12 @@ typedef struct {
 
     uint64_t truncate_cnt; /* truncation operation counter */
 
+    uint64_t replay_cached_committed;    /* Our committed timestamp, cached */
+    uint32_t replay_calculate_committed; /* Times before recalculating cached committed */
+    uint64_t replay_start_timestamp;     /* Timestamp at the beginning of a run */
+    uint64_t stop_timestamp;             /* If non-zero, stop when stable reaches this */
+    uint64_t timestamp_copy;             /* A copy of the timestamp, for safety checks */
+
     /*
      * Lock to prevent the stable timestamp from moving during the commit of prepared transactions.
      * Otherwise, it may panic if the stable timestamp is moved to greater than or equal to the
@@ -271,9 +301,15 @@ typedef struct {
 #define CHECKPOINT_ON 2
 #define CHECKPOINT_WIREDTIGER 3
     u_int checkpoint_config; /* Checkpoint configuration */
+
+    LANE lanes[LANE_COUNT];     /* The lanes for multithreaded coordination  */
+    pthread_rwlock_t lane_lock; /* Lock used when modifying lanes */
 } GLOBAL;
 extern GLOBAL g;
 
+/* Map a timestamp to its lane number (the mask requires LANE_COUNT to be a power of two). */
+#define LANE_NUMBER(ts) ((ts) & (LANE_COUNT - 1))
+
 /* Worker thread operations. */
 typedef enum { INSERT = 1, MODIFY, READ, REMOVE, TRUNCATE, UPDATE } thread_op;
 
@@ -311,7 +347,12 @@ typedef struct {
 
     SAP sap; /* Thread's session event handler information */
 
-    WT_RAND_STATE rnd; /* thread RNG state */
+    WT_RAND_STATE data_rnd;  /* thread RNG state for data operations */
+    WT_RAND_STATE extra_rnd; /* thread RNG state for extra operations */
+
+    uint32_t lane;     /* Current lane for replay */
+    thread_op op;      /* Operation */
+    bool replay_again; /* Need to redo an operation at a timestamp. */
 
     volatile bool quit; /* thread should quit */
 
@@ -348,8 +389,9 @@ typedef struct {
     bool repeatable_reads; /* if read ops repeatable */
     bool repeatable_wrap;  /* if circular buffer wrapped */
     uint64_t opid;         /* Operation ID */
-    uint64_t read_ts;      /* read timestamp */
     uint64_t commit_ts;    /* commit timestamp */
+    uint64_t read_ts;      /* read timestamp */
+    uint64_t replay_ts;    /* allocated timestamp for predictable replay */
     uint64_t stable_ts;    /* stable timestamp */
     SNAP_STATE snap_states[2];
     SNAP_STATE *s; /* points to one of the snap_states */
@@ -398,7 +440,7 @@ void key_gen_teardown(WT_ITEM *);
 void key_init(TABLE *, void *);
 void lock_destroy(WT_SESSION *, RWLOCK *);
 void lock_init(WT_SESSION *, RWLOCK *);
-void operations(u_int, bool);
+void operations(u_int, u_int, u_int);
 void path_setup(const char *);
 void set_alarm(u_int);
 void set_core(bool);
@@ -415,6 +457,19 @@ void table_verify(TABLE *, void *);
 void timestamp_init(void);
 uint64_t timestamp_maximum_committed(void);
 void timestamp_once(WT_SESSION *, bool, bool);
+void replay_adjust_key(TINFO *, uint64_t);
+uint64_t replay_commit_ts(TINFO *);
+void replay_committed(TINFO *);
+void replay_end_timed_run(void);
+void replay_loop_begin(TINFO *, bool);
+uint64_t replay_maximum_committed(void);
+bool replay_operation_enabled(thread_op);
+void replay_pause_after_rollback(TINFO *, uint32_t);
+uint64_t replay_prepare_ts(TINFO *);
+uint64_t replay_read_ts(TINFO *);
+void replay_rollback(TINFO *);
+void replay_run_begin(WT_SESSION *);
+void replay_run_end(WT_SESSION *);
 void timestamp_query(const char *, uint64_t *);
 void timestamp_set_oldest(void);
 void timestamp_teardown(WT_SESSION *);
diff --git a/src/third_party/wiredtiger/test/format/format_config.c b/src/third_party/wiredtiger/test/format/format_config.c
index e3ac21a724b..c28c4a3c976 100644
--- a/src/third_party/wiredtiger/test/format/format_config.c
+++ b/src/third_party/wiredtiger/test/format/format_config.c
@@ -53,6 +53,63 @@ static void config_off_all(const char *);
 static void config_pct(TABLE *);
 static void config_statistics(void);
 static void config_transaction(void);
+static bool config_var(TABLE *);
+
+/*
+ * config_random_generator --
+ *     For a given seed/RNG combination, generate a seed if not given, and initialize the RNG.
+ */
+static void
+config_random_generator(
+  const char *config_name, uint64_t seed, uint32_t rand_count, WT_RAND_STATE *rnd)
+{
+    char buf[128];
+    bool seed_set;
+
+    /* See if the seed is already present in the configuration. */
+    seed_set = (seed != 0);
+
+    /* Initialize the RNG, and potentially the seed. */
+    testutil_random_init(rnd, &seed, rand_count);
+
+    /* If we generated a seed just now, put it into the configuration file. */
+    if (!seed_set) {
+        testutil_assert(seed != 0);
+        testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu64, config_name, seed));
+        config_single(NULL, buf, true);
+    }
+
+    /* Make sure the generator is ready. */
+    testutil_assert(rnd->v != 0);
+}
+
+/*
+ * config_random_generators --
+ *     Initialize our global random generators using provided seeds.
+ */
+static void
+config_random_generators(void)
+{
+    config_random_generator("random.data_seed", GV(RANDOM_DATA_SEED), 0, &g.data_rnd);
+    config_random_generator("random.extra_seed", GV(RANDOM_EXTRA_SEED), 1, &g.extra_rnd);
+}
+
+/*
+ * config_random_generators_before_run --
+ *     One use case for predictable replay is to run test/format once with little or no
+ *     configuration values set. test/format rolls the dice and picks the configuration, recording
+ *     it along with the random seeds. If we want to rerun it predictably, we can use the same
+ *     seeds. However, the second run will not need to roll the dice during configuration, so the
+ *     state of the RNG after configuration would be different than after configuration during the
+ *     first run. To make everything line up, we re-seed the generator after the configuration, and
+ *     before execution begins.
+ */
+static void
+config_random_generators_before_run(void)
+{
+    testutil_random_from_seed(&g.data_rnd, GV(RANDOM_DATA_SEED));
+    testutil_random_from_seed(&g.extra_rnd, GV(RANDOM_EXTRA_SEED));
+}
 
 /*
  * config_random --
@@ -85,7 +142,7 @@ config_random(TABLE *table, bool table_only)
             continue;
 
         /* Configure key prefixes only rarely, 5% if the length isn't set explicitly. */
-        if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(NULL, 1, 100) > 5)
+        if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(&g.extra_rnd, 1, 100) > 5)
             continue;
 
         /*
@@ -93,11 +150,11 @@ config_random(TABLE *table, bool table_only)
          * is "on" (so "on" if random rolled <= N, otherwise "off").
          */
         if (F_ISSET(cp, C_BOOL))
-            testutil_check(__wt_snprintf(
-              buf, sizeof(buf), "%s=%s", cp->name, mmrand(NULL, 1, 100) <= cp->min ? "on" : "off"));
+            testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%s", cp->name,
+              mmrand(&g.data_rnd, 1, 100) <= cp->min ? "on" : "off"));
         else
-            testutil_check(__wt_snprintf(
-              buf, sizeof(buf), "%s=%" PRIu32, cp->name, mmrand(NULL, cp->min, cp->maxrand)));
+            testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu32, cp->name,
+              mmrand(&g.data_rnd, cp->min, cp->maxrand)));
         config_single(table, buf, false);
     }
 }
@@ -141,12 +198,15 @@ config_table_am(TABLE *table)
         if (config_explicit(table, "runs.source") && DATASOURCE(table, "lsm"))
             config_single(table, "runs.type=row", false);
         else
-            switch (mmrand(NULL, 1, 10)) {
+            switch (mmrand(&g.data_rnd, 1, 10)) {
             case 1:
             case 2:
             case 3: /* 30% */
-                config_single(table, "runs.type=var", false);
-                break;
+                if (config_var(table)) {
+                    config_single(table, "runs.type=var", false);
+                    break;
+                }
+                /* FALLTHROUGH */
             case 4: /* 10% */
                 if (config_fix(table)) {
                     config_single(table, "runs.type=fix", false);
@@ -165,7 +225,7 @@ config_table_am(TABLE *table)
     }
 
     if (!config_explicit(table, "runs.source"))
-        switch (mmrand(NULL, 1, 5)) {
+        switch (mmrand(&g.data_rnd, 1, 5)) {
         case 1: /* 20% */
             config_single(table, "runs.source=file", false);
             break;
@@ -335,6 +395,31 @@ config_table(TABLE *table, void *arg)
     if (TV(BTREE_VALUE_MIN) > TV(BTREE_VALUE_MAX))
         testutil_die(EINVAL, "btree.value_min may not be larger than btree.value_max");
 
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /*
+         * In predictable replay, force the number of rows in a table to be a manageable size so we
+         * can modify key numbers without problems.
+         */
+        TV(RUNS_ROWS) = WT_MAX(TV(RUNS_ROWS), 2 * LANE_COUNT);
+
+        /*
+         * We don't support some operations in predictable replay.
+         */
+        if (!replay_operation_enabled(MODIFY)) {
+            if (config_explicit(table, "ops.pct.modify") && TV(OPS_PCT_MODIFY))
+                WARN("turning off modify operations for table%" PRIu32
+                     " to work with predictable replay",
+                  table->id);
+            config_single(table, "ops.pct.modify=0", false);
+        }
+        if (!replay_operation_enabled(TRUNCATE)) {
+            if (config_explicit(table, "ops.truncate") && TV(OPS_TRUNCATE))
+                WARN("turning off truncate for table%" PRIu32 " to work with predictable replay",
+                  table->id);
+            config_single(table, "ops.truncate=0", false);
+        }
+    }
+
     /*
      * If common key prefixes are configured, add prefix compression if no explicit choice was made
      * and track the largest common key prefix in the run.
@@ -372,6 +457,8 @@ config_table(TABLE *table, void *arg)
 void
 config_run(void)
 {
+    config_random_generators(); /* Configure the random number generators. */
+
     config_random(tables[0], false); /* Configure the remaining global name space. */
 
     /*
@@ -433,6 +520,8 @@ config_run(void)
         else
             config_single(NULL, "runs.timer=360", false);
     }
+
+    config_random_generators_before_run();
 }
 
 /*
@@ -463,7 +552,7 @@ config_backup_incr(void)
      * Choose a type of incremental backup, where the log remove setting can eliminate incremental
      * backup based on log files.
      */
-    switch (mmrand(NULL, 1, 10)) {
+    switch (mmrand(&g.extra_rnd, 1, 10)) {
     case 1: /* 30% full backup only */
     case 2:
     case 3:
@@ -508,7 +597,7 @@ config_backup_incr_granularity(void)
      * granularity is in units of KB.
      */
     granularity = 0;
-    i = mmrand(NULL, 1, 10);
+    i = mmrand(&g.extra_rnd, 1, 10);
     switch (i) {
     case 1: /* 50% small size for stress testing */
     case 2:
@@ -669,7 +758,7 @@ config_checkpoint(void)
 {
     /* Choose a checkpoint mode if nothing was specified. */
     if (!config_explicit(NULL, "checkpoint"))
-        switch (mmrand(NULL, 1, 20)) {
+        switch (mmrand(&g.extra_rnd, 1, 20)) {
         case 1:
         case 2:
         case 3:
@@ -694,7 +783,7 @@ config_checksum(TABLE *table)
 {
     /* Choose a checksum mode if nothing was specified. */
     if (!config_explicit(table, "disk.checksum"))
-        switch (mmrand(NULL, 1, 10)) {
+        switch (mmrand(&g.extra_rnd, 1, 10)) {
         case 1:
         case 2:
         case 3:
@@ -746,7 +835,7 @@ config_compression(TABLE *table, const char *conf_name)
      * correct if all of the possible engines are compiled in.
      */
     cstr = "off";
-    switch (mmrand(NULL, 1, 20)) {
+    switch (mmrand(&g.extra_rnd, 1, 20)) {
 #ifdef HAVE_BUILTIN_EXTENSION_LZ4
     case 1:
     case 2:
@@ -858,7 +947,7 @@ config_encryption(void)
         return;
 
     /* 70% no encryption, 30% rotn */
-    if (mmrand(NULL, 1, 10) < 8)
+    if (mmrand(&g.data_rnd, 1, 10) < 8)
         config_off(NULL, "disk.encryption");
     else
         config_single(NULL, "disk.encryption=rotn-7", false);
@@ -871,8 +960,24 @@ config_encryption(void)
 static bool
 config_fix(TABLE *table)
 {
-    /* Fixed-length column stores don't support modify operations. */
-    return (!config_explicit(table, "ops.pct.modify"));
+    /*
+     * Fixed-length column stores don't support modify operations, and can't be used with
+     * predictable replay.
+     */
+    return (!GV(RUNS_PREDICTABLE_REPLAY) && !config_explicit(table, "ops.pct.modify"));
+}
+
+/*
+ * config_var --
+ *     Variable-length column-store configuration.
+ */
+static bool
+config_var(TABLE *table)
+{
+    /*
+     * Variable-length column store insertions can't be used with predictable replay.
+     */
+    return (!GV(RUNS_PREDICTABLE_REPLAY) || !config_explicit(table, "ops.pct.insert"));
 }
 
 /*
@@ -918,8 +1023,10 @@ config_in_memory(void)
         return;
     if (config_explicit(NULL, "runs.mirror"))
         return;
+    if (config_explicit(NULL, "runs.predictable_replay"))
+        return;
 
-    if (!config_explicit(NULL, "runs.in_memory") && mmrand(NULL, 1, 20) == 1) {
+    if (!config_explicit(NULL, "runs.in_memory") && mmrand(&g.extra_rnd, 1, 20) == 1) {
         config_single(NULL, "runs.in_memory=1", false);
         /* Use table[0] to access the global value (RUN_ROWS is a table value). */
         if (NTV(tables[0], RUNS_ROWS) > WT_MILLION) {
@@ -1064,7 +1171,18 @@ config_mirrors(void)
      * tables.
      */
     explicit_mirror = config_explicit(NULL, "runs.mirror");
-    if (!explicit_mirror && mmrand(NULL, 1, 10) < 9) {
+    if (!explicit_mirror && mmrand(&g.data_rnd, 1, 10) < 9) {
+        config_off_all("runs.mirror");
+        return;
+    }
+
+    /*
+     * In theory, mirroring should work with predictable replay, although there's some overlap in
+     * functionality. That is, we usually do multiple runs with the same key with predictable replay
+     * and would notice if data was different or missing. We disable it to keep runs simple.
+     */
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        WARN("%s", "turning off mirroring for predictable replay");
         config_off_all("runs.mirror");
         return;
     }
@@ -1122,7 +1240,7 @@ config_mirrors(void)
      * Pick some number of tables to mirror, then turn on mirroring the next (n-1) tables, where
      * allowed.
      */
-    for (mirrors = mmrand(NULL, 2, ntables) - 1, i = 1; i <= ntables; ++i) {
+    for (mirrors = mmrand(&g.data_rnd, 2, ntables) - 1, i = 1; i <= ntables; ++i) {
         if (NT_EXPLICIT_OFF(tables[i], RUNS_MIRROR))
             continue;
         if (tables[i] != g.base_mirror) {
@@ -1155,25 +1273,32 @@ config_pct(TABLE *table)
         const char *name; /* Operation */
         uint32_t *vp;     /* Value store */
         u_int order;      /* Order of assignment */
+        bool enabled;     /* Enabled for this configuration */
     } list[5];
     u_int i, max_order, max_slot, n, pct;
     bool slot_available;
 
+    /* We explicitly disable modify operations for predictable replay. */
     list[0].name = "ops.pct.delete";
     list[0].vp = &TV(OPS_PCT_DELETE);
     list[0].order = 0;
+    list[0].enabled = replay_operation_enabled(REMOVE);
     list[1].name = "ops.pct.insert";
     list[1].vp = &TV(OPS_PCT_INSERT);
     list[1].order = 0;
+    list[1].enabled = replay_operation_enabled(INSERT);
     list[2].name = "ops.pct.modify";
     list[2].vp = &TV(OPS_PCT_MODIFY);
     list[2].order = 0;
+    list[2].enabled = replay_operation_enabled(MODIFY);
     list[3].name = "ops.pct.read";
     list[3].vp = &TV(OPS_PCT_READ);
     list[3].order = 0;
+    list[3].enabled = replay_operation_enabled(READ);
     list[4].name = "ops.pct.write";
     list[4].vp = &TV(OPS_PCT_WRITE);
     list[4].order = 0;
+    list[4].enabled = replay_operation_enabled(UPDATE);
 
     /*
      * Walk the list of operations, checking for an illegal configuration and creating a random
@@ -1182,11 +1307,13 @@ config_pct(TABLE *table)
     pct = 0;
     slot_available = false;
     for (i = 0; i < WT_ELEMENTS(list); ++i)
-        if (config_explicit(table, list[i].name))
-            pct += *list[i].vp;
-        else {
-            list[i].order = mmrand(NULL, 1, WT_THOUSAND);
-            slot_available = true;
+        if (list[i].enabled) {
+            if (config_explicit(table, list[i].name))
+                pct += *list[i].vp;
+            else {
+                list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND);
+                slot_available = true;
+            }
         }
 
     /*
@@ -1197,7 +1324,7 @@ config_pct(TABLE *table)
         WARN("operation percentages %s than 100, resetting to random values",
           pct > 100 ? "greater" : "less");
         for (i = 0; i < WT_ELEMENTS(list); ++i)
-            list[i].order = mmrand(NULL, 1, WT_THOUSAND);
+            list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND);
         pct = 0;
     }
 
@@ -1210,9 +1337,9 @@ config_pct(TABLE *table)
      */
     for (pct = 100 - pct;;) {
         for (i = n = max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) {
-            if (list[i].order != 0)
+            if (list[i].order != 0 && list[i].enabled)
                 ++n;
-            if (list[i].order > max_order) {
+            if (list[i].order > max_order && list[i].enabled) {
                 max_order = list[i].order;
                 max_slot = i;
             }
@@ -1223,7 +1350,7 @@ config_pct(TABLE *table)
             *list[max_slot].vp = pct;
             break;
         }
-        *list[max_slot].vp = mmrand(NULL, 0, pct);
+        *list[max_slot].vp = mmrand(&g.data_rnd, 0, pct);
         list[max_slot].order = 0;
         pct -= *list[max_slot].vp;
     }
@@ -1246,7 +1373,7 @@ config_statistics(void)
 
     if (!config_explicit(NULL, "statistics.mode")) {
         /* 70% of the time set statistics to fast. */
-        if (mmrand(NULL, 1, 10) < 8)
+        if (mmrand(&g.extra_rnd, 1, 10) < 8)
             config_single(NULL, "statistics.mode=fast", false);
         else
             config_single(NULL, "statistics.mode=all", false);
@@ -1254,7 +1381,7 @@ config_statistics(void)
 
     if (!config_explicit(NULL, "statistics_log.sources")) {
         /* 10% of the time use sources if all. */
-        if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(NULL, 1, 10) == 1)
+        if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(&g.extra_rnd, 1, 10) == 1)
             config_single(NULL, "statistics_log.sources=file:", false);
     }
 }
@@ -1266,6 +1393,12 @@ config_statistics(void)
 static void
 config_transaction(void)
 {
+    /* Predictable replay requires timestamps. */
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        config_single(NULL, "transaction.implicit=0", false);
+        config_single(NULL, "transaction.timestamps=on", true);
+    }
+
     /* Transaction prepare requires timestamps and is incompatible with logging. */
     if (GV(OPS_PREPARE) && config_explicit(NULL, "ops.prepare")) {
         if (!GV(TRANSACTION_TIMESTAMPS) && config_explicit(NULL, "transaction.timestamps"))
@@ -1664,6 +1797,7 @@ config_table_extend(u_int ntable)
 void
 config_single(TABLE *table, const char *s, bool explicit)
 {
+    WT_RAND_STATE *rnd;
     enum { RANGE_FIXED, RANGE_NONE, RANGE_WEIGHTED } range;
     CONFIG *cp;
     CONFIGV *v;
@@ -1726,6 +1860,11 @@ config_single(TABLE *table, const char *s, bool explicit)
     ++equalp;
     v = &table->v[cp->off];
 
+    /*
+     * Use the data RNG for these options; that's the conservative choice.
+     */
+    rnd = &g.data_rnd;
+
     if (F_ISSET(cp, C_STRING)) {
         /*
          * Historically, both "none" and "off" were used for turning off string configurations, now
@@ -1822,7 +1961,7 @@ config_single(TABLE *table, const char *s, bool explicit)
             testutil_die(EINVAL, "%s: %s: illegal numeric range", progname, s);
 
         if (range == RANGE_FIXED)
-            v1 = mmrand(NULL, (u_int)v1, (u_int)v2);
+            v1 = mmrand(rnd, (u_int)v1, (u_int)v2);
         else {
             /*
              * Roll dice, 50% chance of proceeding to the next larger value, and 5 steps to the
@@ -1832,7 +1971,7 @@ config_single(TABLE *table, const char *s, bool explicit)
             if (steps == 0)
                 steps = 1;
             for (i = 0; i < 5; ++i, v1 += steps)
-                if (mmrand(NULL, 0, 1) == 0)
+                if (mmrand(rnd, 0, 1) == 0)
                     break;
             v1 = WT_MIN(v1, v2);
         }
@@ -1897,7 +2036,7 @@ config_map_file_type(const char *s, u_int *vp)
      *
      * Variable-length column-store is 90% vs. fixed, 30% vs. fixed and row, and 40% vs row.
      */
-    v = mmrand(NULL, 1, 10);
+    v = mmrand(&g.data_rnd, 1, 10);
     if (fix && v == 1)
         *vp = FIX;
     else if (var && (v < 5 || !row))
diff --git a/src/third_party/wiredtiger/test/format/format_config_def.c b/src/third_party/wiredtiger/test/format/format_config_def.c
index de704a1ac71..399b323d0c0 100644
--- a/src/third_party/wiredtiger/test/format/format_config_def.c
+++ b/src/third_party/wiredtiger/test/format/format_config_def.c
@@ -96,7 +96,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
   {"checkpoint.wait", "seconds to wait if wiredtiger checkpoints configured", 0x0, 5, 100, 3600,
     V_GLOBAL_CHECKPOINT_WAIT},
 
-  {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 128, 1024,
+  {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 10, 1024,
     V_GLOBAL_DEBUG_CHECKPOINT_RETENTION},
 
   {"debug.cursor_reposition",
@@ -109,7 +109,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
     C_BOOL, 2, 0, 0, V_GLOBAL_DEBUG_EVICTION},
 
   {"debug.log_retention", "adjust log removal to retain at least this number of log files", 0x0, 0,
-    128, 1024, V_GLOBAL_DEBUG_LOG_RETENTION},
+    10, 1024, V_GLOBAL_DEBUG_LOG_RETENTION},
 
   {"debug.realloc_exact", "reallocation of memory will only provide the exact amount requested",
     C_BOOL, 0, 0, 0, V_GLOBAL_DEBUG_REALLOC_EXACT},
@@ -236,11 +236,20 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp"
 
   {"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_QUIET},
 
+  {"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX,
+    V_GLOBAL_RANDOM_DATA_SEED},
+
+  {"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX,
+    V_GLOBAL_RANDOM_EXTRA_SEED},
+
   {"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_RUNS_IN_MEMORY},
 
+  {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR},
+
   {"runs.ops", "operations per run", 0x0, 0, M(2), M(100), V_GLOBAL_RUNS_OPS},
 
-  {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR},
+  {"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0,
+    V_GLOBAL_RUNS_PREDICTABLE_REPLAY},
 
   {"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100), V_TABLE_RUNS_ROWS},
 
diff --git a/src/third_party/wiredtiger/test/format/format_inline.h b/src/third_party/wiredtiger/test/format/format_inline.h
index 07f33f5319c..0d6d42befa9 100644
--- a/src/third_party/wiredtiger/test/format/format_inline.h
+++ b/src/third_party/wiredtiger/test/format/format_inline.h
@@ -112,10 +112,7 @@ read_op(WT_CURSOR *cursor, read_operation op, int *exactp)
 static inline uint32_t
 rng(WT_RAND_STATE *rnd)
 {
-    /* Threaded operations have their own RNG information, otherwise we use the default. */
-    if (rnd == NULL)
-        rnd = &g.rnd;
-
+    testutil_assert(rnd != NULL);
     return (__wt_random(rnd));
 }
 
@@ -228,12 +225,25 @@ table_sumv(u_int off)
  *     Randomly select a table.
  */
 static inline TABLE *
-table_select(TINFO *tinfo)
+table_select(TINFO *tinfo, bool modifies_data)
 {
+    WT_RAND_STATE *rnd;
+
     if (ntables == 0)
         return (tables[0]);
 
-    return (tables[mmrand(tinfo == NULL ? NULL : &tinfo->rnd, 1, ntables)]);
+    if (tinfo == NULL) {
+        if (modifies_data)
+            rnd = &g.data_rnd;
+        else
+            rnd = &g.extra_rnd;
+    } else {
+        if (modifies_data)
+            rnd = &tinfo->data_rnd;
+        else
+            rnd = &tinfo->extra_rnd;
+    }
+    return (tables[mmrand(rnd, 1, ntables)]);
 }
 
 /*
@@ -241,14 +251,20 @@ table_select(TINFO *tinfo)
  *     Randomly select a table of a specific type.
  */
 static inline TABLE *
-table_select_type(table_type type)
+table_select_type(table_type type, bool modifies_data)
 {
+    WT_RAND_STATE *rnd;
     u_int i;
 
     if (ntables == 0)
         return (tables[0]->type == type ? tables[0] : NULL);
 
-    for (i = mmrand(NULL, 1, ntables);; ++i) {
+    if (modifies_data)
+        rnd = &g.data_rnd;
+    else
+        rnd = &g.extra_rnd;
+
+    for (i = mmrand(rnd, 1, ntables);; ++i) {
         if (i > ntables)
             i = 1;
         if (tables[i]->type == type)
diff --git a/src/third_party/wiredtiger/test/format/format_salvage.c b/src/third_party/wiredtiger/test/format/format_salvage.c
index 8c23fae5cf9..787ecdade1d 100644
--- a/src/third_party/wiredtiger/test/format/format_salvage.c
+++ b/src/third_party/wiredtiger/test/format/format_salvage.c
@@ -84,7 +84,7 @@ corrupt(TABLE *table)
      * exceeding a megabyte (so we aren't just corrupting the whole file).
      */
     testutil_check(fstat(fd, &sb));
-    offset = mmrand(NULL, 0, (u_int)sb.st_size - 1024);
+    offset = mmrand(&g.data_rnd, 0, (u_int)sb.st_size - 1024);
     len = (size_t)(sb.st_size * 2) / 100;
     len += 4 * 1024;
     len = WT_MIN(len, WT_MEGABYTE);
diff --git a/src/third_party/wiredtiger/test/format/format_timestamp.c b/src/third_party/wiredtiger/test/format/format_timestamp.c
index db243be7335..aacc2c76c91 100644
--- a/src/third_party/wiredtiger/test/format/format_timestamp.c
+++ b/src/third_party/wiredtiger/test/format/format_timestamp.c
@@ -38,6 +38,9 @@ timestamp_maximum_committed(void)
     TINFO **tlp;
     uint64_t commit_ts, ts;
 
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        return replay_maximum_committed();
+
     /* A barrier additionally prevents using cache values here. */
     WT_ORDERED_READ(ts, g.timestamp);
     if (tinfo_list != NULL)
@@ -96,7 +99,7 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final)
     static const char *oldest_timestamp_str = "oldest_timestamp=";
     static const char *stable_timestamp_str = "stable_timestamp=";
     WT_CONNECTION *conn;
-    uint64_t oldest_timestamp, stable_timestamp;
+    uint64_t oldest_timestamp, stable_timestamp, stop_timestamp;
     char buf[WT_TS_HEX_STRING_SIZE * 2 + 64];
 
     conn = g.wts_conn;
@@ -106,7 +109,24 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final)
     if (oldest_timestamp == 0)
         return;
 
-    if (!final) {
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /*
+         * For predictable replay, we need the oldest timestamp to lag when the process exits. That
+         * allows two runs that finish with stable timestamps in the same ballpark to be compared.
+         */
+        if (stable_timestamp > 10 * WT_THOUSAND)
+            oldest_timestamp = stable_timestamp - 10 * WT_THOUSAND;
+        else
+            oldest_timestamp = stable_timestamp / 2;
+
+        /*
+         * For predictable replay, our end state is to have the stable timestamp represent a precise
+         * number of operations.
+         */
+        WT_ORDERED_READ(stop_timestamp, g.stop_timestamp);
+        if (stable_timestamp > stop_timestamp && stop_timestamp != 0)
+            stable_timestamp = stop_timestamp;
+    } else if (!final) {
         /*
          * If lag is permitted, update the oldest timestamp halfway to the largest timestamp that's
          * no longer in use, otherwise update the oldest timestamp to that timestamp. Update stable
@@ -152,11 +172,22 @@ timestamp(void *arg)
     memset(&sap, 0, sizeof(sap));
     wt_wrap_open_session(conn, &sap, NULL, &session);
 
-    /* Update the oldest and stable timestamps at least once every 15 seconds. */
+    /*
+     * Update the oldest and stable timestamps at least once every 15 seconds. For predictable
+     * replay, update at a much faster pace. We can't afford to get behind because that means more
+     * rollback errors, and we don't have the luxury of giving up on an operation that has rolled
+     * back.
+     */
     while (!g.workers_finished) {
-        random_sleep(&g.rnd, 15);
-
-        timestamp_once(session, true, false);
+        if (!GV(RUNS_PREDICTABLE_REPLAY))
+            random_sleep(&g.extra_rnd, 15);
+        else {
+            if ((rng(&g.extra_rnd) & 0x1) == 1)
+                __wt_yield();
+            else
+                __wt_sleep(0, 10 * WT_THOUSAND);
+        }
+        timestamp_once(session, !GV(RUNS_PREDICTABLE_REPLAY), false);
     }
 
     wt_wrap_close_session(session);
diff --git a/src/third_party/wiredtiger/test/format/hs.c b/src/third_party/wiredtiger/test/format/hs.c
index fbad2aeefa6..808ca204aea 100644
--- a/src/third_party/wiredtiger/test/format/hs.c
+++ b/src/third_party/wiredtiger/test/format/hs.c
@@ -78,8 +78,8 @@ hs_cursor(void *arg)
          * cursor, so we should be able to traverse large chunks of the HS store quickly, without
          * blocking normal operations.
          */
-        next = mmrand(NULL, 0, 1) == 1;
-        for (i = mmrand(NULL, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) {
+        next = mmrand(&g.extra_rnd, 0, 1) == 1;
+        for (i = mmrand(&g.extra_rnd, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) {
             if ((ret = (next ? cursor->next(cursor) : cursor->prev(cursor))) != 0) {
                 testutil_assertfmt(ret == WT_NOTFOUND || ret == WT_CACHE_FULL || ret == WT_ROLLBACK,
                   "WT_CURSOR.%s failed: %d", next ? "next" : "prev", ret);
@@ -94,7 +94,7 @@ hs_cursor(void *arg)
         testutil_check(cursor->close(cursor));
 
         /* Sleep for some number of seconds, in short intervals so we don't make the run wait. */
-        for (period = mmrand(NULL, 1, 10); period > 0 && !g.workers_finished; --period)
+        for (period = mmrand(&g.extra_rnd, 1, 10); period > 0 && !g.workers_finished; --period)
             __wt_sleep(1, 0);
         if (g.workers_finished)
             break;
diff --git a/src/third_party/wiredtiger/test/format/import.c b/src/third_party/wiredtiger/test/format/import.c
index 77c42435672..27fc14acee2 100644
--- a/src/third_party/wiredtiger/test/format/import.c
+++ b/src/third_party/wiredtiger/test/format/import.c
@@ -103,7 +103,7 @@ import(void *arg)
         copy_file_into_directory(import_session, "import.wt");
 
         /* Perform import with either repair or file metadata. */
-        import_value = mmrand(NULL, 0, 1);
+        import_value = mmrand(&g.extra_rnd, 0, 1);
         if (import_value == 0)
             testutil_check(__wt_snprintf(buf, sizeof(buf), "import=(enabled,repair=true)"));
         else
@@ -116,7 +116,7 @@ import(void *arg)
         /* Drop import table, so we can import the table again */
         testutil_drop(session, IMPORT_URI, NULL);
 
-        period = mmrand(NULL, 1, 10);
+        period = mmrand(&g.extra_rnd, 1, 10);
         while (period > 0 && !g.workers_finished) {
             --period;
             __wt_sleep(1, 0);
diff --git a/src/third_party/wiredtiger/test/format/kv.c b/src/third_party/wiredtiger/test/format/kv.c
index bf9bd7a689f..1b1ace2ffd4 100644
--- a/src/third_party/wiredtiger/test/format/kv.c
+++ b/src/third_party/wiredtiger/test/format/kv.c
@@ -47,7 +47,7 @@ key_init_random(TABLE *table)
         max = TV(BTREE_KEY_MAX);
         if (i % 20 != 0 && max > TV(BTREE_KEY_MIN) + 20)
             max = TV(BTREE_KEY_MIN) + 20;
-        table->key_rand_len[i] = mmrand(NULL, TV(BTREE_KEY_MIN), max);
+        table->key_rand_len[i] = mmrand(&g.data_rnd, TV(BTREE_KEY_MIN), max);
     }
 }
 
@@ -241,6 +241,7 @@ val_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max)
 void
 val_init(TABLE *table, void *arg)
 {
+    WT_RAND_STATE *rnd;
     size_t i;
     uint32_t len;
 
@@ -263,8 +264,9 @@ val_init(TABLE *table, void *arg)
     for (i = 0; i < len; ++i)
         table->val_base[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26];
 
+    rnd = &g.data_rnd;
     table->val_dup_data_len =
-      val_len(NULL, (uint64_t)mmrand(NULL, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX));
+      val_len(rnd, (uint64_t)mmrand(rnd, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX));
 }
 
 /*
diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c
index 1b59ec9e80c..da1431e0bcb 100644
--- a/src/third_party/wiredtiger/test/format/ops.c
+++ b/src/third_party/wiredtiger/test/format/ops.c
@@ -28,7 +28,7 @@
 
 #include "format.h"
 
-static void apply_bounds(WT_CURSOR *, TABLE *);
+static void apply_bounds(WT_CURSOR *, TABLE *, WT_RAND_STATE *);
 static void clear_bounds(WT_CURSOR *, TABLE *);
 static int col_insert(TINFO *);
 static void col_insert_resolve(TABLE *, void *);
@@ -40,6 +40,7 @@ static int col_update(TINFO *, bool);
 static int nextprev(TINFO *, bool);
 static WT_THREAD_RET ops(void *);
 static int read_row(TINFO *);
+static void rollback_transaction(TINFO *);
 static int row_insert(TINFO *, bool);
 static int row_modify(TINFO *, bool);
 static int row_remove(TINFO *, bool);
@@ -72,16 +73,16 @@ modify_build(TINFO *tinfo)
     int i, nentries;
 
     /* Randomly select a number of byte changes, offsets and lengths. */
-    nentries = (int)mmrand(&tinfo->rnd, 1, MAX_MODIFY_ENTRIES);
+    nentries = (int)mmrand(&tinfo->data_rnd, 1, MAX_MODIFY_ENTRIES);
     for (i = 0; i < nentries; ++i) {
         tinfo->entries[i].data.data =
-          modify_repl + mmrand(&tinfo->rnd, 1, sizeof(modify_repl) - 10);
-        tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->rnd, 0, 10);
+          modify_repl + mmrand(&tinfo->data_rnd, 1, sizeof(modify_repl) - 10);
+        tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->data_rnd, 0, 10);
         /*
          * Start at least 11 bytes into the buffer so we skip leading key information.
          */
-        tinfo->entries[i].offset = (size_t)mmrand(&tinfo->rnd, 20, 40);
-        tinfo->entries[i].size = (size_t)mmrand(&tinfo->rnd, 0, 10);
+        tinfo->entries[i].offset = (size_t)mmrand(&tinfo->data_rnd, 20, 40);
+        tinfo->entries[i].size = (size_t)mmrand(&tinfo->data_rnd, 0, 10);
     }
 
     tinfo->nentries = nentries;
@@ -169,6 +170,25 @@ tinfo_init(void)
 
         tinfo->state = TINFO_RUNNING;
         tinfo->quit = false;
+
+        testutil_random_from_random(&tinfo->data_rnd, &g.data_rnd);
+        testutil_random_from_random(&tinfo->extra_rnd, &g.extra_rnd);
+    }
+}
+
+/*
+ * lanes_init --
+ *     Initialize the lanes structures.
+ */
+static void
+lanes_init(void)
+{
+    uint32_t lane;
+
+    /* Cleanup for each new run. */
+    for (lane = 0; lane < LANE_COUNT; ++lane) {
+        g.lanes[lane].in_use = false;
+        g.lanes[lane].last_commit_ts = 0;
     }
 }
 
@@ -226,6 +246,14 @@ rollback_to_stable(WT_SESSION *session)
 
     /* Check the saved snap operations for consistency. */
     snap_repeat_rollback(session, tinfo_list, GV(RUNS_THREADS));
+
+    /*
+     * For a predictable run, the final stable timestamp is known and fixed, but individual threads
+     * may have gone beyond that. Now that we've rolled back, set the current timestamp to the
+     * stable timestamp so that the next run starts from a known value.
+     */
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        g.timestamp = g.stable_timestamp;
 }
 
 /*
@@ -233,7 +261,7 @@ rollback_to_stable(WT_SESSION *session)
  *     Perform a number of operations in a set of threads.
  */
 void
-operations(u_int ops_seconds, bool lastrun)
+operations(u_int ops_seconds, u_int run_current, u_int run_total)
 {
     SAP sap;
     TINFO *tinfo, total;
@@ -243,9 +271,10 @@ operations(u_int ops_seconds, bool lastrun)
     wt_thread_t timestamp_tid;
     int64_t fourths, quit_fourths, thread_ops;
     uint32_t i;
-    bool running;
+    bool lastrun, running;
 
     conn = g.wts_conn;
+    lastrun = (run_current == run_total);
 
     /* Make the modify pad character printable to simplify debugging and logging. */
     __wt_process.modify_pad_byte = FORMAT_PAD_BYTE;
@@ -266,17 +295,30 @@ operations(u_int ops_seconds, bool lastrun)
      * There are two mechanisms to specify the length of the run, a number of operations and a
      * timer, when either expire the run terminates.
      *
-     * Each thread does an equal share of the total operations (and make sure that it's not 0).
+     * If we have a number of operations with predictable replay, we set a stop timestamp. Without
+     * predictable replay, each thread does an equal share of the total operations (and make sure
+     * that it's not 0).
      *
-     * Calculate how many fourth-of-a-second sleeps until the timer expires. If the timer expires
-     * and threads don't return in 15 minutes, assume there is something hung, and force the quit.
+     * With a timer, calculate how many fourth-of-a-second sleeps until the timer expires. If the
+     * timer expires and threads don't return in 15 minutes, assume there is something hung, and
+     * force the quit.
      */
+    g.stop_timestamp = 0;
     if (GV(RUNS_OPS) == 0)
         thread_ops = -1;
     else {
         if (GV(RUNS_OPS) < GV(RUNS_THREADS))
             GV(RUNS_OPS) = GV(RUNS_THREADS);
-        thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS);
+        if (GV(RUNS_PREDICTABLE_REPLAY)) {
+            /*
+             * If running with an operation count for predictable replay, ignore other ways of
+             * stopping.
+             */
+            thread_ops = -1;
+            ops_seconds = 0;
+            g.stop_timestamp = (GV(RUNS_OPS) * run_current) / run_total;
+        } else
+            thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS);
     }
     if (ops_seconds == 0)
         fourths = quit_fourths = -1;
@@ -290,9 +332,12 @@ operations(u_int ops_seconds, bool lastrun)
     wt_wrap_open_session(conn, &sap, NULL, &session);
 
     /* Initialize and start the worker threads. */
+    lanes_init();
     tinfo_init();
     trace_msg(session, "%s", "=============== thread ops start");
 
+    replay_run_begin(session);
+
     for (i = 0; i < GV(RUNS_THREADS); ++i) {
         tinfo = tinfo_list[i];
         testutil_check(__wt_thread_create(NULL, &tinfo->tid, ops, tinfo));
@@ -356,13 +401,24 @@ operations(u_int ops_seconds, bool lastrun)
                  */
                 if (lastrun && GV(FORMAT_ABORT))
                     random_failure();
-                tinfo->quit = true;
+
+                /*
+                 * Predictable replay cannot independently tag every thread to stop, we would end up
+                 * with a mix of commits at the end of the run. Rather, later in this loop, when we
+                 * see we are finishing, we give all threads a stop timestamp that they must run to,
+                 * but not exceed.
+                 */
+                if (!GV(RUNS_PREDICTABLE_REPLAY))
+                    tinfo->quit = true;
             }
         }
         track_ops(&total);
         if (!running)
             break;
         __wt_sleep(0, 250 * WT_THOUSAND); /* 1/4th of a second */
+
+        if (fourths == 1 && GV(RUNS_PREDICTABLE_REPLAY))
+            replay_end_timed_run();
         if (fourths != -1)
             --fourths;
         if (quit_fourths != -1 && --quit_fourths == 0) {
@@ -417,6 +473,8 @@ operations(u_int ops_seconds, bool lastrun)
      */
     rollback_to_stable(session);
 
+    replay_run_end(session);
+
     if (lastrun) {
         tinfo_teardown();
         timestamp_teardown(session);
@@ -438,14 +496,18 @@ begin_transaction_ts(TINFO *tinfo)
 
     session = tinfo->session;
 
-    /*
-     * Transaction timestamp reads are repeatable, but read timestamps must be before any possible
-     * commit timestamp. Without a read timestamp, reads are based on the transaction snapshot,
-     * which will include the latest values as of when the snapshot is taken. Test in both modes:
-     * 75% of the time, pick a read timestamp before any commit timestamp still in use, 25% of the
-     * time don't set a timestamp at all.
-     */
-    ts = mmrand(&tinfo->rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed();
+    /* Pick a read timestamp. */
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        ts = replay_read_ts(tinfo);
+    else
+        /*
+         * Transaction timestamp reads are repeatable, but read timestamps must be before any
+         * possible commit timestamp. Without a read timestamp, reads are based on the transaction
+         * snapshot, which will include the latest values as of when the snapshot is taken. Test in
+         * both modes: 75% of the time, pick a read timestamp before any commit timestamp still in
+         * use, 25% of the time don't set a timestamp at all.
+         */
+        ts = mmrand(&tinfo->data_rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed();
     if (ts != 0) {
         wt_wrap_begin_transaction(session, NULL);
 
@@ -459,6 +521,13 @@ begin_transaction_ts(TINFO *tinfo)
             trace_uri_op(tinfo, NULL, "begin snapshot read-ts=%" PRIu64 " (repeatable)", ts);
             return;
         }
+
+        /*
+         * It should not be possible for a timestamp to age out of the system with predictable
+         * replay. If a begin transaction were to fail, we'd need to begin the transaction again
+         * with the same replay timestamp; we can never give up on a timestamp.
+         */
+        testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
         testutil_assert(ret == EINVAL);
         testutil_check(session->rollback_transaction(session, NULL));
     }
@@ -505,7 +574,10 @@ commit_transaction(TINFO *tinfo, bool prepared)
         if (prepared)
             lock_readlock(session, &g.prepare_commit_lock);
 
-        ts = __wt_atomic_addv64(&g.timestamp, 1);
+        if (GV(RUNS_PREDICTABLE_REPLAY))
+            ts = replay_commit_ts(tinfo);
+        else
+            ts = __wt_atomic_addv64(&g.timestamp, 1);
         testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_COMMIT, ts));
 
         if (prepared)
@@ -515,6 +587,7 @@ commit_transaction(TINFO *tinfo, bool prepared)
         testutil_check(session->commit_transaction(session, NULL));
         if (prepared)
             lock_readunlock(session, &g.prepare_commit_lock);
+        replay_committed(tinfo);
     } else
         testutil_check(session->commit_transaction(session, NULL));
 
@@ -542,6 +615,7 @@ rollback_transaction(TINFO *tinfo)
     ++tinfo->rollback;
 
     testutil_check(session->rollback_transaction(session, NULL));
+    replay_rollback(tinfo);
 
     trace_uri_op(tinfo, NULL, "abort read-ts=%" PRIu64, tinfo->read_ts);
 }
@@ -561,12 +635,15 @@ prepare_transaction(TINFO *tinfo)
 
     ++tinfo->prepare;
 
-    /*
-     * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
-     * prepare timestamp to whatever the global value is now. The subsequent commit will increment
-     * it, ensuring correctness.
-     */
-    ts = __wt_atomic_fetch_addv64(&g.timestamp, 1);
+    if (GV(RUNS_PREDICTABLE_REPLAY))
+        ts = replay_prepare_ts(tinfo);
+    else
+        /*
+         * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the
+         * prepare timestamp to whatever the global value is now. The subsequent commit will
+         * increment it, ensuring correctness.
+         */
+        ts = __wt_atomic_fetch_addv64(&g.timestamp, 1);
     testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_PREPARE, ts));
     ret = session->prepare_transaction(session, NULL);
 
@@ -623,6 +700,20 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
     tinfo->cursor = table_cursor(tinfo, table->id);
 
     /*
+     * Predictable replay has some restrictions. Someday we may be able to resolve some of these
+     * restrictions, though this may require adding complexity.
+     *
+     * We disallow inserts into column stores, as column stores do inserts by expanding the number
+     * of keys in the table. This has an interplay with other threads that are trying to predictably
+     * generate key numbers since the key space is growing at a random time. Thus column stores are
+     * restricted to accessing keys that were inserted via bulk load.
+     */
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        if (table->type != ROW && op == INSERT)
+            op = READ;
+    }
+
+    /*
      * Truncate has the key set to before/after rows in the table, skip pre-fetch and reserve for
      * simplicity.
      *
@@ -641,7 +732,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
          * position taken from a previous search. If not already doing a read, position the cursor
          * at an existing point in the tree 20% of the time.
          */
-        if (op != READ && mmrand(&tinfo->rnd, 1, 5) == 1) {
+        if (op != READ && mmrand(&tinfo->data_rnd, 1, 5) == 1) {
             ++tinfo->search;
             ret = read_row(tinfo);
             if (ret == 0) {
@@ -657,7 +748,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
          * work, but doesn't make sense. Reserving a row before a read won't be useful but it's not
          * unexpected.
          */
-        if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->rnd, 0, 20) == 1) {
+        if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->data_rnd, 0, 20) == 1) {
             switch (table->type) {
             case ROW:
                 ret = row_reserve(tinfo, positioned);
@@ -718,14 +809,14 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
     case READ:
         ++tinfo->search;
 
-        if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 2) == 1) {
+        if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(&tinfo->extra_rnd, 1, 2) == 1) {
             bound_set = true;
             /*
              * FIXME-WT-9883: It is possible that the underlying cursor is still positioned even
              * though the positioned variable is false. Reset the position through reset for now.
              */
             testutil_check(tinfo->cursor->reset(tinfo->cursor));
-            apply_bounds(tinfo->cursor, tinfo->table);
+            apply_bounds(tinfo->cursor, tinfo->table, &tinfo->extra_rnd);
         }
 
         ret = read_row(tinfo);
@@ -803,8 +894,8 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
      * direction.
      */
     if (positioned) {
-        next = mmrand(&tinfo->rnd, 0, 1) == 1;
-        j = mmrand(&tinfo->rnd, 1, 100);
+        next = mmrand(&tinfo->extra_rnd, 0, 1) == 1;
+        j = mmrand(&tinfo->extra_rnd, 1, 100);
         for (i = 0; i < j; ++i) {
             if ((ret = nextprev(tinfo, next)) == 0)
                 continue;
@@ -818,7 +909,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op)
      * Reset the cursor: there is no reason to keep pages pinned, periodically forcibly evict the
      * underlying page.
      */
-    evict_page = mmrand(&tinfo->rnd, 1, 20) == 1;
+    evict_page = mmrand(&tinfo->extra_rnd, 1, 20) == 1;
     if (evict_page)
         F_SET(tinfo->cursor, WT_CURSTD_DEBUG_RESET_EVICT);
     testutil_check(tinfo->cursor->reset(tinfo->cursor));
@@ -865,7 +956,7 @@ ops(void *arg)
     iso_level_t iso_level;
     thread_op op;
     uint64_t reset_op, session_op, truncate_op;
-    uint32_t max_rows, range, rnd;
+    uint32_t max_rows, ntries, range, rnd;
     u_int i;
     const char *iso_config;
     bool greater_than, intxn, prepared;
@@ -878,25 +969,59 @@ ops(void *arg)
      * pound on the same key/value pairs, that is, by making them traverse the same RNG space. 75%
      * of the time we run in independent RNG space.
      */
-    if (GV(FORMAT_INDEPENDENT_THREAD_RNG))
-        __wt_random_init_seed(NULL, &tinfo->rnd);
-    else
-        __wt_random_init(&tinfo->rnd);
+    if (GV(FORMAT_INDEPENDENT_THREAD_RNG)) {
+        testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED) + (u_int)tinfo->id);
+        testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED) + (u_int)tinfo->id);
+    } else {
+        testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED));
+        testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED));
+    }
 
     iso_level = ISOLATION_SNAPSHOT; /* -Wconditional-uninitialized */
+    tinfo->replay_again = false;
+    tinfo->lane = LANE_NONE;
 
     /* Set the first operation where we'll create a new session and cursors. */
     session = NULL;
     session_op = 0;
+    ntries = 0;
 
     /* Set the first operation where we'll reset the session. */
-    reset_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND);
+    reset_op = mmrand(&tinfo->extra_rnd, 100, 10 * WT_THOUSAND);
     /* Set the first operation where we'll truncate a range. */
-    truncate_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND);
+    truncate_op = mmrand(&tinfo->data_rnd, 100, 10 * WT_THOUSAND);
 
     for (intxn = false; !tinfo->quit;) {
+rollback_retry:
+        if (tinfo->quit)
+            break;
+
         ++tinfo->ops;
 
+        if (!tinfo->replay_again)
+            /*
+             * Number of failures so far for the current operation and key. In predictable replay,
+             * unless we have a read operation, we cannot give up on any operation and maintain the
+             * integrity of the replay.
+             */
+            ntries = 0;
+
+        /* Number of tries only gets incremented during predictable replay. */
+        testutil_assert(ntries == 0 || (!intxn && tinfo->replay_again));
+
+        /*
+         * In predictable replay, put each operation in its own transaction. It's possible we could
+         * make multiple operations work predictably in the future.
+         */
+        if (intxn && GV(RUNS_PREDICTABLE_REPLAY)) {
+            commit_transaction(tinfo, false);
+            intxn = false;
+        }
+
+        replay_loop_begin(tinfo, intxn);
+        if (tinfo->quit)
+            break;
+
         /* Periodically open up a new session and cursors. */
         if (tinfo->ops > session_op) {
             /* Resolve any running transaction. */
@@ -909,7 +1034,7 @@ ops(void *arg)
             session = tinfo->session;
 
             /* Pick the next session/cursor close/open. */
-            session_op += mmrand(&tinfo->rnd, 100, 5 * WT_THOUSAND);
+            session_op += mmrand(&tinfo->extra_rnd, 100, 5 * WT_THOUSAND);
         }
 
         /* If not in a transaction, reset the session periodically so that operation is tested. */
@@ -917,20 +1042,20 @@ ops(void *arg)
             testutil_check(session->reset(session));
 
             /* Pick the next reset operation. */
-            reset_op += mmrand(&tinfo->rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND);
+            reset_op += mmrand(&tinfo->extra_rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND);
         }
 
         /*
          * If not in a transaction and in a timestamp world, occasionally repeat timestamped
          * operations.
          */
-        if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->rnd, 1, 15) == 1) {
+        if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->extra_rnd, 1, 15) == 1) {
             ++tinfo->search;
             snap_repeat_single(tinfo);
         }
 
         /* Select a table. */
-        table = tinfo->table = table_select(tinfo);
+        table = tinfo->table = table_select(tinfo, true);
 
         /*
          * If not in a transaction and in a timestamp world, start a transaction (which is always at
@@ -946,14 +1071,15 @@ ops(void *arg)
             intxn = true;
         }
         if (!intxn) {
+            testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY));
             iso_level = ISOLATION_IMPLICIT;
 
-            if (table->mirror || mmrand(&tinfo->rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) {
+            if (table->mirror || mmrand(&tinfo->data_rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) {
                 iso_level = ISOLATION_SNAPSHOT;
                 iso_config = "isolation=snapshot";
 
                 /* Occasionally do reads at an isolation level lower than snapshot. */
-                switch (mmrand(NULL, 1, 20)) {
+                switch (mmrand(&tinfo->data_rnd, 1, 20)) {
                 case 1:
                     iso_level = ISOLATION_READ_COMMITTED; /* 5% */
                     iso_config = "isolation=read-committed";
@@ -975,7 +1101,7 @@ ops(void *arg)
          */
         op = READ;
         if (iso_level == ISOLATION_IMPLICIT || iso_level == ISOLATION_SNAPSHOT) {
-            i = mmrand(&tinfo->rnd, 1, 100);
+            i = mmrand(&tinfo->data_rnd, 1, 100);
             if (i < TV(OPS_PCT_DELETE)) {
                 op = REMOVE;
                 if (TV(OPS_TRUNCATE) && tinfo->ops > truncate_op) {
@@ -986,7 +1112,7 @@ ops(void *arg)
                         op = TRUNCATE;
 
                     /* Pick the next truncate operation. */
-                    truncate_op += mmrand(&tinfo->rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND);
+                    truncate_op += mmrand(&tinfo->data_rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND);
                 }
             } else if (i < TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT))
                 op = INSERT;
@@ -996,6 +1122,10 @@ ops(void *arg)
               TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT) + TV(OPS_PCT_MODIFY) + TV(OPS_PCT_WRITE))
                 op = UPDATE;
         }
+        tinfo->op = op; /* Keep the op in the thread info for debugging */
+
+        /* Make sure this is an operation that is permitted for this kind of run. */
+        testutil_assert(replay_operation_enabled(op));
 
         /*
          * Get the number of rows. Column-store extends the object, use that extended count if this
@@ -1005,7 +1135,8 @@ ops(void *arg)
         max_rows = TV(RUNS_ROWS);
         if (table->type != ROW && !table->mirror)
             WT_ORDERED_READ(max_rows, table->rows_current);
-        tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows);
+        tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows);
+        replay_adjust_key(tinfo, max_rows);
 
         /*
          * If the operation is a truncate, select a range.
@@ -1020,9 +1151,9 @@ ops(void *arg)
          * from lower keys to higher keys or vice-versa).
          */
         if (op == TRUNCATE) {
-            tinfo->last = tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows);
-            greater_than = mmrand(&tinfo->rnd, 0, 1) == 1;
-            range = max_rows < 20 ? 0 : mmrand(&tinfo->rnd, 0, (u_int)max_rows / 50);
+            tinfo->last = tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows);
+            greater_than = mmrand(&tinfo->data_rnd, 0, 1) == 1;
+            range = max_rows < 20 ? 0 : mmrand(&tinfo->data_rnd, 0, (u_int)max_rows / 50);
             if (greater_than) {
                 if (TV(BTREE_REVERSE)) {
                     if (tinfo->keyno <= range)
@@ -1069,9 +1200,10 @@ ops(void *arg)
          */
         if (op == INSERT || op == UPDATE) {
             if (table->type == FIX && table->mirror)
-                val_gen(g.base_mirror, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+                val_gen(
+                  g.base_mirror, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
             else
-                val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+                val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
         }
 
         /*
@@ -1085,7 +1217,7 @@ ops(void *arg)
             if (table->type != FIX || table->mirror)
                 modify_build(tinfo);
             else
-                val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
+                val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno);
         }
 
         /*
@@ -1106,7 +1238,14 @@ ops(void *arg)
              * skip the operation. This isn't to avoid wasted work: any FLCS table in the mirrored
              * will do an update as FLCS doesn't support modify, and we'll fail when we compare the
              * remove to the FLCS value.
+             *
+             * For predictable replay, if the record doesn't exist (that's predictable) we must
+             * force a rollback: we always finish a loop iteration in a committed or rolled back
+             * state.
              */
+            if (GV(RUNS_PREDICTABLE_REPLAY) && (ret == WT_ROLLBACK || tinfo->op_ret == WT_NOTFOUND))
+                goto rollback;
+
             if (tinfo->op_ret == WT_NOTFOUND)
                 goto skip_operation;
 
@@ -1116,6 +1255,8 @@ ops(void *arg)
             tinfo->table = table;
             ret = table_op(tinfo, intxn, iso_level, op);
             testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+            if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK)
+                goto rollback;
             skip2 = table;
         }
         if (ret == 0 && table->mirror)
@@ -1124,6 +1265,8 @@ ops(void *arg)
                     tinfo->table = tables[i];
                     ret = table_op(tinfo, intxn, iso_level, op);
                     testutil_assert(ret == 0 || ret == WT_ROLLBACK);
+                    if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK)
+                        goto rollback;
                     if (ret == WT_ROLLBACK)
                         break;
                 }
@@ -1144,9 +1287,22 @@ skip_operation:
 
         /*
          * If not in a transaction, we're done with this operation. If in a transaction, add more
-         * operations to the transaction half the time.
+         * operations to the transaction half the time. For predictable replay runs, always complete
+         * the transaction.
          */
-        if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5)
+        if (GV(RUNS_PREDICTABLE_REPLAY)) {
+            rnd = mmrand(&tinfo->data_rnd, 1, 5);
+
+            /*
+             * Note that a random value of 5 would result in a rollback per the switch below. For
+             * predictable replay, only do that once per timestamp. If we didn't have this check, a
+             * retry would start again with the same timestamp and RNG state, and get the same dice
+             * roll. This would happen every time and the thread would get stuck doing continuous
+             * rollbacks.
+             */
+            if (rnd == 5 && ntries != 0)
+                rnd = 4; /* Choose to do a commit this time. */
+        } else if (!intxn || (rnd = mmrand(&tinfo->data_rnd, 1, 10)) > 5)
             continue;
 
         /*
@@ -1168,7 +1324,7 @@ skip_operation:
          * timestamped world, which means we're in a snapshot-isolation transaction by definition.
          */
         prepared = false;
-        if (GV(OPS_PREPARE) && mmrand(&tinfo->rnd, 1, 10) == 1) {
+        if (GV(OPS_PREPARE) && mmrand(&tinfo->data_rnd, 1, 10) == 1) {
             if ((ret = prepare_transaction(tinfo)) != 0) {
                 testutil_assert(ret == WT_ROLLBACK);
                 goto rollback;
@@ -1191,6 +1347,18 @@ skip_operation:
             break;
         case 5: /* 10% */
 rollback:
+            if (GV(RUNS_PREDICTABLE_REPLAY)) {
+                if (tinfo->quit)
+                    goto loop_exit;
+                /* Force a rollback */
+                testutil_assert(intxn);
+                rollback_transaction(tinfo);
+                intxn = false;
+                ++ntries;
+                replay_pause_after_rollback(tinfo, ntries);
+                ret = 0;
+                goto rollback_retry;
+            }
             __wt_yield(); /* Encourage races */
             rollback_transaction(tinfo);
             snap_repeat_update(tinfo, false);
@@ -1200,6 +1368,7 @@ rollback:
         intxn = false;
     }
 
+loop_exit:
     if (session != NULL)
         testutil_check(session->close(session, NULL));
     tinfo->session = NULL;
@@ -1238,7 +1407,11 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W
         break;
     }
 
-    if (sn) {
+    /*
+     * We don't use search near for predictable replay runs, as the return key can be variable
+     * depending on the structure of the Btree.
+     */
+    if (sn && !GV(RUNS_PREDICTABLE_REPLAY)) {
         ret = read_op(cursor, SEARCH_NEAR, &exact);
         if (ret == 0 && exact != 0)
             ret = WT_NOTFOUND;
@@ -1293,7 +1466,7 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W
  *     Apply lower and upper bounds on the cursor. The lower and upper bound is randomly generated.
  */
 static void
-apply_bounds(WT_CURSOR *cursor, TABLE *table)
+apply_bounds(WT_CURSOR *cursor, TABLE *table, WT_RAND_STATE *rnd)
 {
     WT_ITEM key;
     uint32_t lower_keyno, max_rows, upper_keyno;
@@ -1310,7 +1483,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table)
      * Generate a random lower key and apply to the lower bound or upper bound depending on the
      * reverse collator.
      */
-    lower_keyno = mmrand(NULL, 1, max_rows);
+    lower_keyno = mmrand(rnd, 1, max_rows);
     /* Retrieve the key/value pair by key. */
     switch (table->type) {
     case FIX:
@@ -1331,7 +1504,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table)
      * Generate a random upper key and apply to the upper bound or lower bound depending on the
      * reverse collator.
      */
-    upper_keyno = mmrand(NULL, lower_keyno, max_rows);
+    upper_keyno = mmrand(rnd, lower_keyno, max_rows);
 
     /* Retrieve the key/value pair by key. */
     switch (table->type) {
@@ -1371,20 +1544,22 @@ clear_bounds(WT_CURSOR *cursor, TABLE *table)
  *     Read and verify a subset of the elements in a file.
  */
 void
-wts_read_scan(TABLE *table, void *arg)
+wts_read_scan(TABLE *table, void *args)
 {
     SAP sap;
     WT_CONNECTION *conn;
     WT_CURSOR *cursor;
     WT_DECL_RET;
     WT_ITEM key, value;
+    WT_RAND_STATE *rnd;
     WT_SESSION *session;
     uint64_t keyno;
     uint32_t max_rows;
     uint8_t bitv;
 
-    conn = (WT_CONNECTION *)arg;
     testutil_assert(table != NULL);
+    conn = ((READ_SCAN_ARGS *)args)->conn;
+    rnd = ((READ_SCAN_ARGS *)args)->rnd;
 
     /*
      * We're not configuring transactions or read timestamps: if there's a diagnostic check that all
@@ -1406,14 +1581,14 @@ wts_read_scan(TABLE *table, void *arg)
     WT_ORDERED_READ(max_rows, table->rows_current);
     for (keyno = 0; keyno < max_rows;) {
         if (++keyno > 50)
-            keyno += mmrand(NULL, 1, WT_THOUSAND);
+            keyno += mmrand(rnd, 1, WT_THOUSAND);
         if (keyno > max_rows)
             keyno = max_rows;
 
-        if (GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 10) == 1) {
+        if (GV(OPS_BOUND_CURSOR) && mmrand(rnd, 1, 10) == 1) {
             /* Reset the position of the cursor, so that we can apply bounds on the cursor. */
             testutil_check(cursor->reset(cursor));
-            apply_bounds(cursor, table);
+            apply_bounds(cursor, table, rnd);
         }
 
         switch (ret = read_row_worker(NULL, table, cursor, keyno, &key, &value, &bitv, false)) {
@@ -1444,7 +1619,7 @@ read_row(TINFO *tinfo)
 {
     /* 25% of the time we call search-near. */
     return (read_row_worker(tinfo, NULL, tinfo->cursor, tinfo->keyno, tinfo->key, tinfo->value,
-      &tinfo->bitv, mmrand(&tinfo->rnd, 0, 3) == 1));
+      &tinfo->bitv, mmrand(&tinfo->extra_rnd, 0, 3) == 1));
 }
 
 /*
@@ -1563,7 +1738,7 @@ modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned)
     bool modify_check;
 
     /* Periodically verify the WT_CURSOR.modify return. */
-    modify_check = positioned && mmrand(&tinfo->rnd, 1, 20) == 1;
+    modify_check = positioned && mmrand(&tinfo->extra_rnd, 1, 20) == 1;
     if (modify_check) {
         testutil_check(cursor->get_value(cursor, &tinfo->moda));
         testutil_check(
@@ -1794,7 +1969,7 @@ row_insert(TINFO *tinfo, bool positioned)
      * Otherwise, generate a unique key and insert (or update an already inserted record).
      */
     if (!positioned) {
-        key_gen_insert(tinfo->table, &tinfo->rnd, tinfo->key, tinfo->keyno);
+        key_gen_insert(tinfo->table, &tinfo->data_rnd, tinfo->key, tinfo->keyno);
         cursor->set_key(cursor, tinfo->key);
     }
     cursor->set_value(cursor, tinfo->new_value);
diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c
index c88fe8e21f7..859186fa29f 100644
--- a/src/third_party/wiredtiger/test/format/random.c
+++ b/src/third_party/wiredtiger/test/format/random.c
@@ -72,11 +72,11 @@ random_kv(void *arg)
         simple = !simple;
 
         /* Select a table and open a cursor. */
-        table = table_select_type(ROW);
+        table = table_select_type(ROW, false);
         wt_wrap_open_cursor(session, table->uri, config, &cursor);
 
         /* This is just a smoke-test, get some key/value pairs. */
-        for (i = mmrand(NULL, 0, WT_THOUSAND); i > 0; --i) {
+        for (i = mmrand(&g.extra_rnd, 0, WT_THOUSAND); i > 0; --i) {
             switch (ret = cursor->next(cursor)) {
             case 0:
                 break;
@@ -95,7 +95,7 @@ random_kv(void *arg)
         testutil_check(cursor->close(cursor));
 
         /* Sleep for some number of seconds. */
-        period = mmrand(NULL, 1, 10);
+        period = mmrand(&g.extra_rnd, 1, 10);
 
         /* Sleep for short periods so we don't make the run wait. */
         while (period > 0 && !g.workers_finished) {
diff --git a/src/third_party/wiredtiger/test/format/replay.c b/src/third_party/wiredtiger/test/format/replay.c
new file mode 100644
index 00000000000..e730119d70f
--- /dev/null
+++ b/src/third_party/wiredtiger/test/format/replay.c
@@ -0,0 +1,548 @@
+/*-
+ * Public Domain 2014-present MongoDB, Inc.
+ * Public Domain 2008-2014 WiredTiger, Inc.
+ *
+ * This is free and unencumbered software released into the public domain.
+ *
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
+ * distribute this software, either in source code form or as a compiled
+ * binary, for any purpose, commercial or non-commercial, and by any
+ * means.
+ *
+ * In jurisdictions that recognize copyright laws, the author or authors
+ * of this software dedicate any and all copyright interest in the
+ * software to the public domain. We make this dedication for the benefit
+ * of the public at large and to the detriment of our heirs and
+ * successors. We intend this dedication to be an overt act of
+ * relinquishment in perpetuity of all present and future rights to this
+ * software under copyright law.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "format.h"
+
+/*
+ * Predictable replay is the ability to do test runs multiple times and always have predictable
+ * changes made at every timestamp. Two predictable runs with the same starting data seed executed
+ * up to the same timestamp will always have their data compare identically. Predictable replay only
+ * works with timestamped transactions and to avoid complexity, only a single operation is allowed
+ * in a transaction.
+ *
+ * To achieve the predictability we use two random number generators (the data RNG and the extra
+ * RNG) with known start seeds, the data seed and the extra seed. Every single-threaded modification
+ * (like bulk loading) when deciding on a random course, uses the global data RNG, which is seeded
+ * by the data seed. Global decisions that don't affect data, like whether to turn on verbose, or
+ * even the rate of checkpointing, use the global extra RNG, which is seeded by the extra seed.
+ * Changing the extra seed may change some characteristics of how a workload is tested, but should
+ * not change any data on disk. When worker threads run, they have their own data and extra RNGs,
+ * and these are seeded by the timestamp they are working on.
+ *
+ * Before a worker thread can decide on what operation to do on which key in which table, it must
+ * obtain the next timestamp. Timestamps are doled out atomically, so no two worker threads can ever
+ * perform operations using the same timestamp. The timestamp is XOR-ed with the data seed, the
+ * result is the seed of the thread's private data RNG for the duration of that operation. Likewise,
+ * a private extra RNG is seeded from the timestamp and the extra seed. This ensures that all
+ * decisions about what is committed at that timestamp are predictable based on the timestamp. As
+ * you might expect, the thread's data RNG is used to decide what operation to do, which table to
+ * use, and which key within the table. Other random decisions, like whether to reopen a session, or
+ * whether to repeat a read from the snap list, use the extra RNG.
+ *
+ * Note that once a thread has started to work on an operation at a timestamp, it cannot give up on
+ * the effort. If, for example, a rollback error naturally happens, we can rollback the transaction.
+ * However, immediately getting a new timestamp would mean that we would lose the consequences of
+ * the previous timestamp, perhaps a record would not be updated in a particular way. Thus, after a
+ * rollback, a thread starts again, using the same timestamp it had before, and it seeds its RNGs
+ * again using this timestamp. This gives full predictability, even in the face of temporary
+ * failures.
+ *
+ * To avoid the possibility that two threads work on the same key at the same time, we have the
+ * concept of lanes, and only one thread can be working in a lane at once. There are LANE_COUNT
+ * lanes, where LANE_COUNT is 2^k for some k. A thread uses a data RNG to choose the top bits of a
+ * key number, but the bottom k bits of the key number are set to the bottom k bits of the timestamp
+ * being worked. Those bottom k bits also determine the lane we are in. Each lane has a flag that
+ * determines whether the lane is in use by some operation. If thread T1 working an operation at
+ * timestamp X takes a sufficiently long time relative to other operations, it may be that the
+ * current timestamp has advanced to X + LANE_COUNT. If that is the case, a different thread T2 that
+ * gets that larger timestamp will see that the lane is occupied. Rather than using that timestamp
+ * and potentially getting the same key number, T2 leaves that timestamp, knowing that T1 will
+ * do it, and advances to another timestamp to work on. When T1 finishes its long operation, it will
+ * notice if there are other timestamps that have been left for it. If so, it keeps the lane
+ * occupied, and works on the new timestamp. At some point, it will notice that all the timestamps
+ * in the lane have been processed up to that point, and it can release the lane, and go back to
+ * choosing the next available timestamp to process.
+ *
+ * Having some operations lag behind is a natural part of processing. This leads to a stable
+ * timestamp that may lag significantly. Due to the possibility of dependencies between operations,
+ * the more lag, the more chance that a rollback error occurs. Without predictable replay, this is
+ * not a problem, any operation that produces a rollback can be freely abandoned, and threads
+ * generally continue moving quickly ahead with more work. However, with predictable replay, no
+ * operation can be abandoned, and an operation that failed because of a dependency will repeatedly
+ * fail until the stable timestamp advances. For that reason, we keep calculating and moving the
+ * stable timestamp ahead at a much faster pace when predictable replay is configured. We also use
+ * an algorithm that only uses lanes that are in use to calculate the stable timestamp. This is safe
+ * and more responsive than the default calculation. And when there is a rollback error, we try to
+ * be smart about whether we need to yield or pause. These modifications allow predictable
+ * replay performance to be on par with regular performance.
+ */
+
+/*
+ * replay_end_timed_run --
+ *     In a timed run, get everyone to stop.
+ */
+void
+replay_end_timed_run(void)
+{
+    /*
+     * We'll post a stop timestamp that all worker threads should abide by. There's a potential race
+     * between when we read the current timestamp and when we publish the stop timestamp. During
+     * that time, other threads could do work and advance the current timestamp, potentially beyond
+     * the intended stop timestamp. We pick a stop timestamp far enough in the future that it's
+     * rather unlikely to happen.
+     */
+    WT_PUBLISH(g.stop_timestamp, g.timestamp + 0x10000);
+}
+
+/*
+ * replay_maximum_committed --
+ *     For predictable replay runs, return the largest timestamp that's no longer in use.
+ */
+uint64_t
+replay_maximum_committed(void)
+{
+    uint64_t commit_ts, ts;
+    uint32_t lane;
+
+    /*
+     * The calculation is expensive, and does not need to be accurate all the time, and it's okay to
+     * be behind. So we use a cached value most of the time.
+     */
+    ts = g.replay_cached_committed;
+    if (ts == 0 || __wt_atomic_addv32(&g.replay_calculate_committed, 1) % 20 == 0) {
+        WT_ORDERED_READ(ts, g.timestamp);
+        testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+        for (lane = 0; lane < LANE_COUNT; ++lane) {
+            if (g.lanes[lane].in_use) {
+                commit_ts = g.lanes[lane].last_commit_ts;
+                if (commit_ts != 0)
+                    ts = WT_MIN(ts, commit_ts);
+            }
+        }
+        if (ts == 0)
+            ts = 1;
+        g.replay_cached_committed = ts;
+        testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+    }
+    return (ts);
+}
+
+/*
+ * replay_operation_enabled --
+ *     Report whether an operation type may be enabled in the configuration. Every operation is
+ *     allowed unless predictable replay is configured, in which case modify, remove and truncate
+ *     are turned off.
+ */
+bool
+replay_operation_enabled(thread_op op)
+{
+    /* Only predictable replay restricts the set of operations. */
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /*
+         * Modify operations are not permitted with predictable replay.
+         *
+         * The difficulty is the read timestamp. As currently implemented, the read timestamp
+         * selected varies with the state of other threads and their progress on other
+         * timestamped operations. If two changes are made to the same key in a short amount of
+         * time, and the second is performed sometimes with a read timestamp before the first
+         * operation and sometimes with one after it, the results vary from run to run.
+         *
+         * We could track recent operations on a key (in its lane, for instance), but on
+         * discovering that the read timestamp isn't recent enough we would have to wait for the
+         * stable timestamp to move forward (and our waiting can affect or delay other threads'
+         * operations as well). The stable timestamp moving forward is the only way our read
+         * timestamp can progress.
+         *
+         * Another possibility, which also involves tracking recent operations on a key, would be
+         * to disallow modifies that occur within, say, 10000 timestamps of a previous write on
+         * the same key. Those modifies could be silently converted to reads, for instance. If our
+         * read timestamp were more than 10000 timestamps behind, we would still need to wait for
+         * the stable timestamp to catch up.
+         */
+        if (op == MODIFY)
+            return (false);
+
+        /*
+         * FIXME-WT-10570. Remove operations are not permitted with predictable replay.
+         *
+         * This is something we can and should fix. The problem may be similar to the one for
+         * modify, where a varying read timestamp can cause different results in different runs.
+         */
+        if (op == REMOVE)
+            return (false);
+
+        /*
+         * Truncate operations are not permitted with predictable replay.
+         *
+         * Currently, an operation's timestamp helps derive the operation's key. The last N bits
+         * of the timestamp are used as the last bits of the key (where 2^N == LANE_COUNT). Those
+         * N bits give the lane number, and within each lane we track the progress of that lane's
+         * operations. Using lanes, we can track and guarantee that only a single operation is
+         * active in a lane at once, so multiple operations on a single key can't be performed
+         * out of order or simultaneously. A truncate of a small set of keys would reserve
+         * multiple consecutive lanes (probably okay); a truncate of a larger set would reserve
+         * every lane. That would effectively force all threads into a holding state, waiting for
+         * the truncate to start and then complete before continuing with their next operation.
+         * While this could be fudged in certain ways (e.g. operations within 10000 timestamps of
+         * a truncate would be forced to stay out of its table), there would still be a lot of
+         * details, and some rethinking of the lane strategy. Even if it worked, the truncate
+         * would have the whole table to itself, which doesn't seem like an effective test.
+         */
+        if (op == TRUNCATE)
+            return (false);
+    }
+
+    return (true);
+}
+
+/*
+ * replay_pick_timestamp --
+ *     Settle on the timestamp for this thread's next operation. The timestamp is used for any
+ *     commit and its low-order bits select the lane, which keeps operations on a single key from
+ *     racing. The timestamp also seeds the thread's random number generators, so it fully
+ *     determines the nature of the operation. Sets tinfo->quit instead of picking a timestamp
+ *     once the stable timestamp has reached a configured stop timestamp.
+ */
+static void
+replay_pick_timestamp(TINFO *tinfo)
+{
+    uint64_t next_ts, seed, stop_ts;
+    uint32_t lane;
+    bool lane_busy;
+
+    if (!tinfo->replay_again) {
+        /* We hold neither a lane nor a timestamp: choose a unique timestamp for commits. */
+        testutil_assert(tinfo->lane == LANE_NONE);
+
+        /* Stop handing out timestamps once the stable timestamp reaches the stop timestamp. */
+        stop_ts = g.stop_timestamp;
+        if (stop_ts != 0 && g.stable_timestamp >= stop_ts && tinfo->replay_ts == 0) {
+            tinfo->quit = true;
+            return;
+        }
+
+        testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+        for (;;) {
+            /*
+             * In predictable replay this is the only place the timestamp is incremented, and a
+             * copy is kept to verify that. If the timestamp were mistakenly changed elsewhere
+             * (as might happen in non-predictable runs), the predictable run would lose its
+             * integrity.
+             */
+            testutil_assert(g.timestamp_copy == g.timestamp);
+            next_ts = __wt_atomic_addv64(&g.timestamp, 1);
+            g.timestamp_copy = g.timestamp;
+
+            /* Skip any timestamp whose lane is already occupied by another operation. */
+            lane = LANE_NUMBER(next_ts);
+            WT_ORDERED_READ(lane_busy, g.lanes[lane].in_use);
+            if (!lane_busy)
+                break;
+        }
+
+        /* Claim the lane before releasing the lock. */
+        tinfo->replay_ts = next_ts;
+        WT_PUBLISH(g.lanes[lane].in_use, true);
+        testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+        tinfo->lane = lane;
+    } else {
+        /*
+         * The replay-again flag is set, so a timestamp has already been picked for us and we
+         * still hold its lane.
+         */
+        testutil_assert(tinfo->lane == LANE_NUMBER(tinfo->replay_ts));
+        tinfo->replay_again = false;
+    }
+
+    testutil_assert(tinfo->lane != LANE_NONE);
+    testutil_assert(g.lanes[tinfo->lane].in_use);
+
+    /*
+     * Seed this operation's random number generators from the timestamp combined with the global
+     * seeds. The set of actions performed for a commit at this timestamp is then predictable, so
+     * long as the run uses the same global seeds.
+     */
+    seed = tinfo->replay_ts ^ GV(RANDOM_DATA_SEED);
+    testutil_random_from_seed(&tinfo->data_rnd, seed);
+    seed = tinfo->replay_ts ^ GV(RANDOM_EXTRA_SEED);
+    testutil_random_from_seed(&tinfo->extra_rnd, seed);
+}
+
+/*
+ * replay_loop_begin --
+ *     Called at the top of the operation loop. In predictable replay, checks the thread's replay
+ *     state and picks the timestamp for the coming operation; otherwise does nothing.
+ */
+void
+replay_loop_begin(TINFO *tinfo, bool intxn)
+{
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    /* As it works now, predictable replay must not be in a transaction at the top of the loop. */
+    testutil_assert(!intxn);
+
+    /*
+     * There are four ways to arrive at the top of the loop:
+     *   1) The transaction had to be rolled back, so we kept our replay timestamp and set the
+     * again flag.
+     *   2) The last transaction committed, but our lane was behind and was skipped over, so we
+     * are obligated to perform the next timestamp in our lane. We have a replay timestamp and
+     * the again flag is set.
+     *   3) The last transaction committed and our lane was not behind. We have no replay
+     * timestamp and the again flag is off.
+     *   4) This is the first time through the loop, which is equivalent to the previous case.
+     *
+     * In every case the again flag is set exactly when a replay timestamp is held.
+     */
+    testutil_assert(tinfo->replay_again == (tinfo->replay_ts != 0));
+
+    /* Choose a unique timestamp for commits, according to the cases above. */
+    replay_pick_timestamp(tinfo);
+
+    /* Either a timestamp was picked or the thread was told to quit. */
+    testutil_assert(tinfo->quit || tinfo->replay_ts != 0);
+}
+
+/*
+ * replay_run_reset --
+ *     Called at beginning and end of runs to set up the lanes and clear the per-thread replay
+ *     state.
+ */
+static void
+replay_run_reset(void)
+{
+    TINFO *tinfo, **tlp;
+    uint64_t ts;
+    uint32_t lane;
+
+    /* Set every lane's commit timestamp to the current timestamp. */
+    ts = g.timestamp;
+    g.timestamp_copy = ts;
+    for (lane = 0; lane < LANE_COUNT; ++lane)
+        g.lanes[lane].last_commit_ts = ts;
+    g.replay_cached_committed = ts;
+
+    /*
+     * Reset fields in tinfo. The lane must be reset to LANE_NONE rather than zero: zero is a
+     * valid lane number, and replay_pick_timestamp asserts that a thread with no pending replay
+     * timestamp holds no lane.
+     */
+    if (tinfo_list != NULL)
+        for (tlp = tinfo_list; *tlp != NULL; ++tlp) {
+            tinfo = *tlp;
+            tinfo->replay_again = false;
+            tinfo->replay_ts = 0;
+            tinfo->lane = LANE_NONE;
+            tinfo->op = (thread_op)0;
+        }
+}
+
+/*
+ * replay_run_begin --
+ *     Called at the beginning of a run. Resets the replay state when predictable replay is
+ *     configured; the session argument is unused.
+ */
+void
+replay_run_begin(WT_SESSION *session)
+{
+    (void)session;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    replay_run_reset();
+}
+
+/*
+ * replay_run_end --
+ *     Called when finishing processing for a run. Resets the replay state when predictable replay
+ *     is configured; the session argument is unused.
+ */
+void
+replay_run_end(WT_SESSION *session)
+{
+    (void)session;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    replay_run_reset();
+}
+
+/*
+ * replay_read_ts --
+ *     Return a read timestamp for a begin transaction call. Only valid in predictable replay, and
+ *     only while the thread has a lane that is marked in use and a replay timestamp assigned. The
+ *     value returned is the one reported by replay_maximum_committed, and is never zero.
+ */
+uint64_t
+replay_read_ts(TINFO *tinfo)
+{
+    uint64_t commit_ts;
+
+    /* The caller must be in predictable replay with a lane and a replay timestamp in hand. */
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY) && tinfo->lane != LANE_NONE &&
+      g.lanes[tinfo->lane].in_use && tinfo->replay_ts != 0);
+
+    /* Read at the committed timestamp; zero would not be a usable read timestamp. */
+    commit_ts = replay_maximum_committed();
+    testutil_assert(commit_ts != 0);
+    return (commit_ts);
+}
+
+/*
+ * replay_prepare_ts --
+ *     Return a timestamp to be used for prepare. The result is either the operation's commit
+ *     timestamp (tinfo->replay_ts) or a timestamp LANE_COUNT / 2 before it, when that earlier
+ *     timestamp has not aged out behind the oldest timestamp. Only valid in predictable replay.
+ */
+uint64_t
+replay_prepare_ts(TINFO *tinfo)
+{
+    uint64_t prepare_ts, ts;
+
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY));
+
+    /* See if we're just starting a run. */
+    if (tinfo->replay_ts == 0 || tinfo->replay_ts <= g.replay_start_timestamp + LANE_COUNT)
+        /*
+         * When we're starting a run, we'll just use the final commit timestamp for our prepare
+         * timestamp. We know that's safe.
+         */
+        prepare_ts = tinfo->replay_ts;
+    else {
+        /*
+         * Our lane's current operation will have a commit timestamp tinfo->replay_ts. Our lane's
+         * previous commit timestamp was that number minus LANE_COUNT. The global stable timestamp
+         * generally should not be advanced past our lane's previous commit timestamp. So a prepare
+         * timestamp halfway between the lane's previous commit timestamp and the current commit
+         * timestamp should be valid.
+         */
+        ts = tinfo->replay_ts - LANE_COUNT / 2;
+
+        /*
+         * As a sanity check, make sure the timestamp hasn't completely aged out. If the halfway
+         * timestamp is already behind the oldest timestamp it can't be used, so fall back to the
+         * commit timestamp; otherwise the halfway timestamp is the one we want.
+         */
+        if (ts < g.oldest_timestamp)
+            prepare_ts = tinfo->replay_ts;
+        else
+            prepare_ts = ts;
+    }
+    return (prepare_ts);
+}
+
+/*
+ * replay_commit_ts --
+ *     Return the commit timestamp, which is the replay timestamp already assigned to this thread.
+ *     Only valid in predictable replay and only once a replay timestamp has been assigned.
+ */
+uint64_t
+replay_commit_ts(TINFO *tinfo)
+{
+    testutil_assert(GV(RUNS_PREDICTABLE_REPLAY));
+
+    /* A zero replay timestamp means no timestamp has been picked for this operation. */
+    testutil_assert(tinfo->replay_ts != 0);
+    return (tinfo->replay_ts);
+}
+
+/*
+ * replay_committed --
+ *     Called when a transaction was successfully committed. Records the commit in the thread's
+ *     lane, then either gives up the lane or, if the global timestamp has already moved past the
+ *     lane's next timestamp, keeps the lane and schedules that next timestamp for this thread.
+ */
+void
+replay_committed(TINFO *tinfo)
+{
+    uint32_t lane;
+
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        testutil_assert(tinfo->replay_ts != 0);
+        testutil_assert(!tinfo->replay_again);
+
+        lane = tinfo->lane;
+        testutil_check(pthread_rwlock_wrlock(&g.lane_lock));
+
+        /*
+         * Updating the last commit timestamp for a lane in use allows read, oldest and stable
+         * timestamps to advance.
+         */
+        WT_PUBLISH(g.lanes[lane].last_commit_ts, tinfo->replay_ts);
+
+        if (g.timestamp > tinfo->replay_ts + LANE_COUNT) {
+            /* The lane's next timestamp was skipped over: keep the lane and perform it next. */
+            tinfo->replay_ts += LANE_COUNT;
+            tinfo->replay_again = true;
+        } else {
+            /* The lane is not behind: release it. */
+            WT_PUBLISH(g.lanes[lane].in_use, false);
+            tinfo->lane = LANE_NONE;
+            tinfo->replay_ts = 0;
+        }
+
+        testutil_check(pthread_rwlock_unlock(&g.lane_lock));
+    }
+}
+
+/*
+ * replay_adjust_key --
+ *     Given a fully random key number in tinfo->keyno, adjust it so that it falls in our lane.
+ *     Does nothing unless predictable replay is configured.
+ */
+void
+replay_adjust_key(TINFO *tinfo, uint64_t max_rows)
+{
+    uint64_t adjusted;
+    uint32_t lane;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    /* Replace the low-order bits of the key number with our lane number. */
+    lane = tinfo->lane;
+    adjusted = (tinfo->keyno & ~(LANE_COUNT - 1)) | lane;
+
+    /*
+     * Step by a whole LANE_COUNT so the key stays in the same lane: up if the result is zero,
+     * down if it is at or beyond max_rows.
+     */
+    if (adjusted == 0)
+        adjusted = LANE_COUNT;
+    else if (adjusted >= max_rows)
+        adjusted -= LANE_COUNT;
+
+    tinfo->keyno = adjusted;
+}
+
+/*
+ * replay_rollback --
+ *     Called after a rollback. Flags the thread to retry the same replay timestamp; does nothing
+ *     unless predictable replay is configured.
+ */
+void
+replay_rollback(TINFO *tinfo)
+{
+    if (GV(RUNS_PREDICTABLE_REPLAY)) {
+        /*
+         * A rollback gives up neither the timestamp nor the lane: the operation is retried from
+         * the top of the operations loop.
+         */
+        tinfo->replay_again = true;
+
+        /* We must still hold a timestamp and an in-use lane to retry with. */
+        testutil_assert(tinfo->replay_ts != 0);
+        testutil_assert(tinfo->lane != LANE_NONE);
+        testutil_assert(g.lanes[tinfo->lane].in_use);
+    }
+}
+
+/*
+ * replay_pause_after_rollback --
+ *     Called after a rollback, allowing us to yield or pause before retrying. The ntries argument
+ *     is the number of attempts so far and scales the sleep time, which is capped at .1 seconds.
+ *     Does nothing unless predictable replay is configured.
+ */
+void
+replay_pause_after_rollback(TINFO *tinfo, uint32_t ntries)
+{
+    uint64_t high, low, mid;
+
+    if (!GV(RUNS_PREDICTABLE_REPLAY))
+        return;
+
+    /*
+     * Generally, the more behind we are, the less we want to wait. Operations in flight have
+     * timestamps between the committed timestamp and the current global timestamp; find the
+     * midpoint of that range.
+     */
+    low = replay_maximum_committed();
+    high = g.timestamp;
+    mid = (high + low) / 2;
+
+    /* If we're in the furthest group behind, don't wait at all. */
+    if (tinfo->replay_ts <= low + LANE_COUNT)
+        return;
+
+    /*
+     * If we're in the back half (before the midpoint), yield, except on every tenth try where we
+     * sleep. If we're in the front half, always sleep.
+     *
+     * NOTE(review): the intent may instead be that the back half never sleeps and the front half
+     * sleeps only occasionally, which would need "||" rather than "&&" here; confirm.
+     */
+    if (tinfo->replay_ts < mid && ntries % 10 != 0)
+        __wt_yield();
+    else {
+        /* Never sleep more than .1 seconds */
+        __wt_sleep(0, ntries > 100 ? 100 * WT_THOUSAND : ntries * WT_THOUSAND);
+    }
+}
diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c
index 2f1d5a366b9..3855d05f379 100644
--- a/src/third_party/wiredtiger/test/format/snap.c
+++ b/src/third_party/wiredtiger/test/format/snap.c
@@ -677,7 +677,7 @@ snap_repeat_single(TINFO *tinfo)
      * Start at a random spot in the list of operations and look for a read to retry. Stop when
      * we've walked the entire list or found one.
      */
-    v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1;
+    v = mmrand(&tinfo->extra_rnd, 1, SNAP_LIST_SIZE) - 1;
     for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) {
         /* Wrap at the end of the circular buffer. */
         if (snap >= tinfo->snap_end)
diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c
index 632a457d04f..7066dedf6dd 100644
--- a/src/third_party/wiredtiger/test/format/t.c
+++ b/src/third_party/wiredtiger/test/format/t.c
@@ -179,14 +179,15 @@ static bool syntax_check; /* Only checking configuration syntax. */
 
 /*
  * main --
- *     TODO: Add a comment describing this function.
+ *     Run a variety of multithreaded WiredTiger operations based on a set of configurations.
  */
 int
 main(int argc, char *argv[])
 {
+    READ_SCAN_ARGS scan_args;
     uint64_t now, start;
-    u_int ops_seconds;
-    int ch, reps;
+    u_int ops_seconds, reps;
+    int ch;
     const char *config, *home;
     bool is_backup, quiet_flag, verify_only;
 
@@ -252,11 +253,20 @@ main(int argc, char *argv[])
         fflush(stdout);
     }
 
-    __wt_random_init_seed(NULL, &g.rnd); /* Initialize the RNG. */
+    /*
+     * Initialize the RNGs. This is needed early because some random decisions are made while
+     * reading configuration. There may be random seeds in the configuration, however, so we will
+     * reinitialize the RNGs later.
+     */
+    __wt_random_init_seed(NULL, &g.data_rnd);
+    __wt_random_init_seed(NULL, &g.extra_rnd);
 
-    /* Initialize lock to ensure single threading during failure handling */
+    /* Initialize lock to ensure single threading during failure handling. */
     testutil_check(pthread_rwlock_init(&g.death_lock, NULL));
 
+    /* Initialize lock to ensure single threading for lane operations in predictable replay. */
+    testutil_check(pthread_rwlock_init(&g.lane_lock, NULL));
+
     /*
      * Initialize the tables array and default to multi-table testing if not in backward-compatible
      * mode.
@@ -357,7 +367,9 @@ main(int argc, char *argv[])
     TIMED_MAJOR_OP(wts_verify(g.wts_conn, true));
     if (verify_only)
         goto skip_operations;
-    TIMED_MAJOR_OP(tables_apply(wts_read_scan, g.wts_conn));
+    scan_args.conn = g.wts_conn;
+    scan_args.rnd = &g.extra_rnd;
+    TIMED_MAJOR_OP(tables_apply(wts_read_scan, &scan_args));
 
     /* Optionally start checkpoints. */
     wts_checkpoints();
@@ -373,7 +385,7 @@ main(int argc, char *argv[])
      */
     ops_seconds = GV(RUNS_TIMER) == 0 ? 0 : ((GV(RUNS_TIMER) * 60) - 15) / FORMAT_OPERATION_REPS;
     for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps)
-        operations(ops_seconds, reps == FORMAT_OPERATION_REPS);
+        operations(ops_seconds, reps, FORMAT_OPERATION_REPS);
 
     /* Copy out the run's statistics. */
     TIMED_MAJOR_OP(wts_stats());
diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c
index 657c30b202f..4ea4a429be9 100644
--- a/src/third_party/wiredtiger/test/format/wts.c
+++ b/src/third_party/wiredtiger/test/format/wts.c
@@ -387,10 +387,10 @@ create_object(TABLE *table, void *arg)
      * Configure the maximum key/value sizes, but leave it as the default if we come up with
      * something crazy.
      */
-    maxleafkey = mmrand(NULL, table->max_leaf_page / 50, table->max_leaf_page / 40);
+    maxleafkey = mmrand(&g.extra_rnd, table->max_leaf_page / 50, table->max_leaf_page / 40);
     if (maxleafkey > 20)
         CONFIG_APPEND(p, ",leaf_key_max=%" PRIu32, maxleafkey);
-    maxleafvalue = mmrand(NULL, table->max_leaf_page * 10, table->max_leaf_page / 40);
+    maxleafvalue = mmrand(&g.extra_rnd, table->max_leaf_page * 10, table->max_leaf_page / 40);
     if (maxleafvalue > 40 && maxleafvalue < 100 * 1024)
         CONFIG_APPEND(p, ",leaf_value_max=%" PRIu32, maxleafvalue);
 
@@ -408,7 +408,7 @@ create_object(TABLE *table, void *arg)
         if (TV(BTREE_HUFFMAN_VALUE))
             CONFIG_APPEND(p, ",huffman_value=english");
         if (TV(BTREE_DICTIONARY))
-            CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(NULL, 123, 517));
+            CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(&g.extra_rnd, 123, 517));
         break;
     }
 
diff --git a/src/third_party/wiredtiger/tools/wt_cmp_dir b/src/third_party/wiredtiger/tools/wt_cmp_dir
index 7e4e5d87a8b..3cd39557c40 100755
--- a/src/third_party/wiredtiger/tools/wt_cmp_dir
+++ b/src/third_party/wiredtiger/tools/wt_cmp_dir
@@ -145,7 +145,7 @@ cmp_uri_script=$(dirname "$0")/wt_cmp_uri
 ecode=0
 for f in $files1; do
     echo $f
-    if ! python $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then
+    if ! python3 $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then
         ecode=1
     fi
 done
author	Luke Chen <luke.chen@mongodb.com>	2023-02-13 09:36:07 +1100
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-02-12 23:24:00 +0000
commit	2d695bc7f8eb7328efd728ab0419255bbcc5beed (patch)
tree	889c07054168a84eb3c811d90e4693614dbdab4f
parent	ec7653ce39c817c1832fd6238474b1264a31b7b0 (diff)
download	mongo-2d695bc7f8eb7328efd728ab0419255bbcc5beed.tar.gz