Import wiredtiger: 723fc71346d686d39d0eb3eac96e618c95eab928 from branch mongodb-master

ref: b3e1507ca3..723fc71346 for: 7.0.0-rc0 WT-9914 test tiered: Convert test/checkpoint to do predictable replay
author: Sulabh Mahajan <sulabh.mahajan@mongodb.com> 2023-04-17 11:09:14 +1000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-04-17 01:57:50 +0000
commit: 044d9cfb46dfc8a9577ca9e84908aa251cf6e587 (patch)
tree: 56be102eb14ef79096e1cb52adbecfea1d87de9d
parent: b38874f5754958cc44e07b03a6a651df0cdd0f89 (diff)
download: mongo-044d9cfb46dfc8a9577ca9e84908aa251cf6e587.tar.gz
7 files changed, 367 insertions, 81 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index ae85c26c330..10bd3eb55ed 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -100,7 +100,7 @@ Decrypt
 DeleteFileW
 Deterministically
 Dk
-DmpvXx
+DmpRvXx
 Dxxx
 EACCES
 EB
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index f4f7e4276d4..8c90c3bb1d2 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -2,5 +2,5 @@
     "vendor": "wiredtiger",
     "github": "wiredtiger/wiredtiger.git",
     "branch": "mongodb-master",
-    "commit": "b3e1507ca31173e2d19ebfb647096231b7808f01"
+    "commit": "723fc71346d686d39d0eb3eac96e618c95eab928"
 }
diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
index 686fcf1b12b..20dadce9bcd 100644
--- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
+++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c
@@ -32,22 +32,22 @@ static WT_THREAD_RET checkpointer(void *);
 static WT_THREAD_RET clock_thread(void *);
 static int compare_cursors(WT_CURSOR *, table_type, WT_CURSOR *, table_type);
 static int diagnose_key_error(WT_CURSOR *, table_type, int, WT_CURSOR *, table_type, int);
-static int real_checkpointer(void);
+static int real_checkpointer(THREAD_DATA *);
 
 /*
  * set_stable --
- *     Set the stable timestamp from g.ts_stable.
+ *     Set the given timestamp as the stable timestamp.
  */
 static void
-set_stable(void)
+set_stable(uint64_t stable_ts)
 {
     char buf[128];
 
     if (g.race_timestamps)
         testutil_check(__wt_snprintf(buf, sizeof(buf),
-          "stable_timestamp=%" PRIx64 ",oldest_timestamp=%" PRIx64, g.ts_stable, g.ts_stable));
+          "stable_timestamp=%" PRIx64 ",oldest_timestamp=%" PRIx64, stable_ts, stable_ts));
     else
-        testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%" PRIx64, g.ts_stable));
+        testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%" PRIx64, stable_ts));
     testutil_check(g.conn->set_timestamp(g.conn, buf));
 }
 
@@ -58,11 +58,16 @@ set_stable(void)
 void
 start_threads(void)
 {
-    set_stable();
-    testutil_check(__wt_thread_create(NULL, &g.checkpoint_thread, checkpointer, NULL));
+    set_stable(1); /* Let's start with 1 as the stable as 0 is not a valid timestamp. */
+    /*
+     * If there are N worker threads (0 - N-1), the checkpoint thread has an ID of N and the clock
+     * thread an ID of N + 1.
+     */
+    testutil_check(__wt_thread_create(NULL, &g.checkpoint_thread, checkpointer, &g.td[g.nworkers]));
     if (g.use_timestamps) {
         testutil_check(__wt_rwlock_init(NULL, &g.clock_lock));
-        testutil_check(__wt_thread_create(NULL, &g.clock_thread, clock_thread, NULL));
+        testutil_check(
+          __wt_thread_create(NULL, &g.clock_thread, clock_thread, &g.td[g.nworkers + 1]));
     }
 }
 
@@ -87,46 +92,92 @@ end_threads(void)
 }
 
 /*
+ * get_all_committed_ts --
+ *     Returns the least of commit timestamps across all the threads. Returns UINT64_MAX if one of
+ *     the threads has not yet started.
+ */
+static uint64_t
+get_all_committed_ts(void)
+{
+    uint64_t ret;
+    int i;
+
+    ret = UINT64_MAX;
+    for (i = 0; i < g.nworkers; ++i) {
+        if (g.td[i].ts < ret)
+            ret = g.td[i].ts;
+        if (ret == 0)
+            return (UINT64_MAX);
+    }
+
+    return (ret);
+}
+
+/*
  * clock_thread --
  *     Clock thread: ticks up timestamps.
  */
 static WT_THREAD_RET
 clock_thread(void *arg)
 {
-    WT_RAND_STATE rnd;
     WT_SESSION *wt_session;
     WT_SESSION_IMPL *session;
-    uint64_t delay;
+    THREAD_DATA *td;
+    uint64_t delay, last_ts, oldest_ts;
+    char tid[128];
+
+    testutil_check(__wt_thread_str(tid, sizeof(tid)));
+    printf("clock thread starting: tid: %s\n", tid);
+    fflush(stdout);
 
-    WT_UNUSED(arg);
+    td = (THREAD_DATA *)arg;
+    last_ts = 0;
 
-    __wt_random_init(&rnd);
     testutil_check(g.conn->open_session(g.conn, NULL, NULL, &wt_session));
     session = (WT_SESSION_IMPL *)wt_session;
 
     while (g.opts.running) {
-        __wt_writelock(session, &g.clock_lock);
-        if (g.prepare)
-            /*
-             * Leave a gap between timestamps so prepared insert followed by remove don't overlap
-             * with stable timestamp.
-             */
-            g.ts_stable += 5;
-        else
-            ++g.ts_stable;
-        set_stable();
-        if (g.ts_stable % 997 == 0) {
-            /*
-             * Random value between 6 and 10 seconds.
-             */
-            delay = __wt_random(&rnd) % 5;
-            __wt_sleep(delay + 6, 0);
+        if (g.predictable_replay) {
+            oldest_ts = get_all_committed_ts();
+            if (oldest_ts != UINT64_MAX && oldest_ts - last_ts > PRED_REPLAY_STABLE_PERIOD) {
+                /*
+                 * If we are doing a predictable rerun, don't go past the provided stop timestamp.
+                 */
+                if (g.stop_ts > 0 && oldest_ts >= g.stop_ts) {
+                    printf("Clock thread at %" PRIu64
+                           " has reached the provided stop timestamp. "
+                           "Stopping the clock.\n",
+                      g.stop_ts);
+                    set_stable(g.stop_ts);
+                    break;
+                }
+                set_stable(oldest_ts);
+                last_ts = oldest_ts;
+            }
+        } else {
+            __wt_writelock(session, &g.clock_lock);
+            if (g.prepare)
+                /*
+                 * Leave a gap between timestamps so prepared insert followed by remove don't
+                 * overlap with stable timestamp.
+                 */
+                g.ts_stable += 5;
+            else
+                ++g.ts_stable;
+            set_stable(g.ts_stable);
+            if (g.ts_stable % 997 == 0) {
+                /*
+                 * Random value between 6 and 10 seconds.
+                 */
+                delay = __wt_random(&td->extra_rnd) % 5;
+                __wt_sleep(delay + 6, 0);
+            }
+            __wt_writeunlock(session, &g.clock_lock);
         }
-        __wt_writeunlock(session, &g.clock_lock);
         /*
          * Random value between 5000 and 10000.
          */
-        delay = __wt_random(&rnd) % 5001;
+        delay = __wt_random(&td->extra_rnd) % 5001;
         __wt_sleep(0, delay + 5 * WT_THOUSAND);
     }
 
@@ -144,13 +195,11 @@ checkpointer(void *arg)
 {
     char tid[128];
 
-    WT_UNUSED(arg);
-
     testutil_check(__wt_thread_str(tid, sizeof(tid)));
     printf("checkpointer thread starting: tid: %s\n", tid);
     fflush(stdout);
 
-    (void)real_checkpointer();
+    (void)real_checkpointer((THREAD_DATA *)arg);
     return (WT_THREAD_RET_VALUE);
 }
 
@@ -179,12 +228,11 @@ set_flush_tier_delay(WT_RAND_STATE *rnd)
  *     in a timely fashion.
  */
 static int
-real_checkpointer(void)
+real_checkpointer(THREAD_DATA *td)
 {
-    WT_RAND_STATE rnd;
     WT_SESSION *session;
     wt_timestamp_t stable_ts, oldest_ts, verify_ts;
-    uint64_t delay;
+    uint64_t delay, tmp_ts;
     int ret;
     char buf[128], flush_tier_config[128], timestamp_buf[64];
     const char *checkpoint_config, *ts_config;
@@ -197,7 +245,6 @@ real_checkpointer(void)
     if (!g.opts.running)
         return (log_print_err("Checkpoint thread started stopped\n", EINVAL, 1));
 
-    __wt_random_init(&rnd);
     while (g.ntables > g.ntables_created && g.opts.running)
         __wt_yield();
 
@@ -216,7 +263,8 @@ real_checkpointer(void)
     testutil_check(__wt_snprintf(
       flush_tier_config, sizeof(flush_tier_config), "flush_tier=(enabled,force),%s", ts_config));
 
-    set_flush_tier_delay(&rnd);
+    /* Use the extra random generator as the tier delay doesn't affect the actual data content. */
+    set_flush_tier_delay(&td->extra_rnd);
 
     while (g.opts.running) {
         /*
@@ -233,10 +281,25 @@ real_checkpointer(void)
             if (stable_ts <= oldest_ts)
                 verify_ts = stable_ts;
             else
-                verify_ts = __wt_random(&rnd) % (stable_ts - oldest_ts + 1) + oldest_ts;
-            __wt_writelock((WT_SESSION_IMPL *)session, &g.clock_lock);
-            g.ts_oldest = g.ts_stable;
-            __wt_writeunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                /* Use the extra random generator as the data is not getting modified. */
+                verify_ts = __wt_random(&td->extra_rnd) % (stable_ts - oldest_ts + 1) + oldest_ts;
+            if (g.predictable_replay) {
+                tmp_ts = WT_MIN(get_all_committed_ts(), stable_ts);
+                /* Update the oldest timestamp, but do not go past the provided stop timestamp. */
+                if (tmp_ts != UINT64_MAX && (g.stop_ts == 0 || tmp_ts <= g.stop_ts))
+                    g.ts_oldest = tmp_ts;
+                if (g.stop_ts > 0 && stable_ts >= g.stop_ts) {
+                    printf(
+                      "The checkpoint thread has reached the stop timestamp of "
+                      "%" PRIu64 ". Finish the test run.\n",
+                      g.stop_ts);
+                    g.opts.running = false;
+                }
+            } else {
+                __wt_writelock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                g.ts_oldest = g.ts_stable;
+                __wt_writeunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+            }
         }
 
         /* Execute a checkpoint */
@@ -255,7 +318,11 @@ real_checkpointer(void)
             flush_tier = false;
             printf("Finished a flush_tier\n");
 
-            set_flush_tier_delay(&rnd);
+            /*
+             * Use the extra random generator as the tier delay doesn't affect the actual data
+             * content.
+             */
+            set_flush_tier_delay(&td->extra_rnd);
         }
 
         if (!g.opts.running)
@@ -277,8 +344,11 @@ real_checkpointer(void)
         }
 
         if (g.sweep_stress)
-            /* Random value between 4 and 8 seconds. */
-            delay = __wt_random(&rnd) % 5 + 4;
+            /*
+             * Random value between 4 and 8 seconds. Use the extra random generator as the tier
+             * sleep delay doesn't affect the actual data content.
+             */
+            delay = __wt_random(&td->extra_rnd) % 5 + 4;
         else
             /* Just find out if we should flush_tier. */
             delay = 0;
@@ -286,6 +356,12 @@ real_checkpointer(void)
     }
 
 done:
+    /* To be able to replay, print the stable timestamp the test stopped at. */
+    if (g.predictable_replay && g.use_timestamps) {
+        testutil_check(g.conn->query_timestamp(g.conn, timestamp_buf, "get=stable_timestamp"));
+        stable_ts = testutil_timestamp_parse(timestamp_buf);
+        printf("Test stopped at a stable timestamp of %" PRIu64 ".\n", stable_ts);
+    }
     if ((ret = session->close(session, NULL)) != 0)
         return (log_print_err("session.close", ret, 1));
 
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
index 5272b60cbfa..01738ddfda8 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.c
@@ -44,6 +44,36 @@ extern int __wt_optind;
 extern char *__wt_optarg;
 
 /*
+ * init_thread_data --
+ *     Initialize the thread data struct.
+ */
+static void
+init_thread_data(THREAD_DATA *td, int info)
+{
+    td->info = info;
+    /*
+     * For a predictable replay have a non-overlapping key space for each thread. Also divide the
+     * key range between the threads. Otherwise, share the key space among all the threads.
+     */
+    if (g.predictable_replay) {
+        td->start_key = (u_int)info * WT_MILLION + 1;
+        td->key_range = g.nkeys / (u_int)g.nworkers;
+    } else {
+        td->start_key = 1;
+        td->key_range = g.nkeys;
+    }
+
+    /*
+     * For a predictable replay the worker threads use a predetermined set of timestamps. They
+     * publish their most recently used timestamps for the clock thread to read across the workers
+     * to base their decision on.
+     */
+    td->ts = 0;
+    testutil_random_from_random(&td->data_rnd, &g.opts.data_rnd);
+    testutil_random_from_random(&td->extra_rnd, &g.opts.extra_rnd);
+}
+
+/*
  * main --
  *     TODO: Add a comment describing this function.
  */
@@ -51,8 +81,9 @@ int
 main(int argc, char *argv[])
 {
     table_type ttype;
-    int ch, cnt, i, ret, runs;
+    int base, ch, cnt, i, ret, runs;
     const char *config_open;
+    char *end_number, *stop_arg;
     bool verify_only;
 
     (void)testutil_set_progname(argv);
@@ -65,6 +96,7 @@ main(int argc, char *argv[])
     g.home = dmalloc(512);
     g.nkeys = 10 * WT_THOUSAND;
     g.nops = 100 * WT_THOUSAND;
+    g.stop_ts = 0;
     g.ntables = 3;
     g.nworkers = 1;
     g.evict_reposition_timing_stress = false;
@@ -74,13 +106,14 @@ main(int argc, char *argv[])
     g.hs_checkpoint_timing_stress = false;
     g.checkpoint_slow_timing_stress = false;
     g.no_ts_deletes = false;
+    g.predictable_replay = false;
     runs = 1;
     verify_only = false;
 
     testutil_parse_begin_opt(argc, argv, SHARED_PARSE_OPTIONS, &g.opts);
 
     while ((ch = __wt_getopt(
-              progname, argc, argv, "C:c:Dk:l:mn:pr:s:T:t:vW:xX" SHARED_PARSE_OPTIONS)) != EOF)
+              progname, argc, argv, "C:c:Dk:l:mn:pr:Rs:S:T:t:vW:xX" SHARED_PARSE_OPTIONS)) != EOF)
         switch (ch) {
         case 'c':
             g.checkpoint_name = __wt_optarg;
@@ -112,6 +145,9 @@ main(int argc, char *argv[])
         case 'r': /* runs */
             runs = atoi(__wt_optarg);
             break;
+        case 'R': /* predictable replay */
+            g.predictable_replay = true;
+            break;
         case 's':
             switch (__wt_optarg[0]) {
             case '1':
@@ -136,6 +172,17 @@ main(int argc, char *argv[])
                 return (usage());
             }
             break;
+        case 'S': /* run until this stable timestamp */
+            stop_arg = __wt_optarg;
+            if (WT_PREFIX_MATCH(stop_arg, "0x")) {
+                base = 16;
+                stop_arg += 2;
+            } else
+                base = 10;
+            g.stop_ts = (uint64_t)strtoll(stop_arg, &end_number, base);
+            if (*end_number)
+                return (usage());
+            break;
         case 't':
             switch (__wt_optarg[0]) {
             case 'c':
@@ -182,6 +229,14 @@ main(int argc, char *argv[])
     if (argc != 0)
         return (usage());
 
+    if (g.stop_ts > 0 && (!g.predictable_replay || !g.use_timestamps)) {
+        fprintf(stderr, "-S is only valid if specified along with -X and -R.\n");
+        return (EXIT_FAILURE);
+    }
+
+    /*
+     * Among other things, this initializes the random number generators in the option structure.
+     */
     testutil_parse_end_opt(&g.opts);
     /* Clean up on signal. */
     (void)signal(SIGINT, onint);
@@ -193,6 +248,10 @@ main(int argc, char *argv[])
     g.ts_oldest = 1;
 
     printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid());
+    if (g.predictable_replay)
+        printf("Config to seed for replay: " TESTUTIL_SEED_FORMAT "\n", g.opts.data_seed,
+          g.opts.extra_seed);
+
     for (cnt = 1; (runs == 0 || cnt <= runs) && g.status == 0; ++cnt) {
         cleanup(cnt == 1 && !verify_only); /* Clean up previous runs */
 
@@ -217,6 +276,21 @@ main(int argc, char *argv[])
               g.cookies[i].uri, sizeof(g.cookies[i].uri), "%s%04d", URI_BASE, g.cookies[i].id));
         }
 
+        /*
+         * Setup thread data. There are N worker threads, a checkpoint thread and possibly a clock
+         * thread. The workers have ID 0 to N-1, checkpoint thread has N, and the clock thread has N
+         * + 1.
+         */
+        if ((g.td = calloc((size_t)(g.nworkers + 2), sizeof(THREAD_DATA))) == NULL) {
+            (void)log_print_err("No memory", ENOMEM, 1);
+            break;
+        }
+        for (i = 0; i < g.nworkers; ++i)
+            init_thread_data(&g.td[i], i);
+        init_thread_data(&g.td[g.nworkers], g.nworkers); /* Checkpoint thread. */
+        if (g.use_timestamps)
+            init_thread_data(&g.td[g.nworkers + 1], g.nworkers + 1); /* Clock thread. */
+
         g.opts.running = true;
 
         wt_connect(config_open);
@@ -245,6 +319,8 @@ main(int argc, char *argv[])
 run_complete:
         free(g.cookies);
         g.cookies = NULL;
+        free(g.td);
+        g.td = NULL;
         if ((ret = wt_shutdown()) != 0) {
             (void)log_print_err("Shutdown failed", ret, 1);
             break;
@@ -271,7 +347,6 @@ static void
 wt_connect(const char *config_open)
 {
     static WT_EVENT_HANDLER event_handler = {handle_error, handle_message, NULL, NULL, NULL};
-    WT_RAND_STATE rnd;
     char buf[512], config[1024];
     bool fast_eviction;
 
@@ -280,8 +355,7 @@ wt_connect(const char *config_open)
     /*
      * Randomly decide on the eviction rate (fast or default).
      */
-    __wt_random_init_seed(NULL, &rnd);
-    if ((__wt_random(&rnd) % 15) % 2 == 0)
+    if ((__wt_random(&g.opts.extra_rnd) % 15) % 2 == 0)
         fast_eviction = true;
 
     /* Set up the basic configuration string first. */
@@ -331,7 +405,7 @@ wt_connect(const char *config_open)
         g.opts.conn = g.conn;
 
         /* Set up a random delay for the first flush. */
-        set_flush_tier_delay(&rnd);
+        set_flush_tier_delay(&g.opts.extra_rnd);
         testutil_tiered_begin(&g.opts);
     }
 }
@@ -601,8 +675,9 @@ usage(void)
 {
     fprintf(stderr,
       "usage: %s\n"
-      "    [-DmpvXx] [-C wiredtiger-config] [-c checkpoint] [-h home] [-k keys] [-l log]\n"
-      "    [-n ops] [-r runs] [-s 1|2|3|4|5] [-T table-config] [-t f|r|v] [-W workers]\n",
+      "    [-DmpRvXx] [-C wiredtiger-config] [-c checkpoint] [-h home] [-k keys] [-l log]\n"
+      "    [-n ops] [-r runs] [-s 1|2|3|4|5] [-T table-config] [-t f|r|v]\n"
+      "    [-W workers]\n",
       progname);
     fprintf(stderr, "%s",
       "\t-C specify wiredtiger_open configuration arguments\n"
@@ -615,7 +690,9 @@ usage(void)
       "\t-n set number of operations each thread does\n"
       "\t-p use prepare\n"
       "\t-r set number of runs (0 for continuous)\n"
+      "\t-R configure predictable replay\n"
       "\t-s specify which timing stress configuration to use ( 1 | 2 | 3 | 4 | 5 )\n"
+      "\t-S set a stable timestamp to stop the test run\n"
       "\t\t1: sweep_stress\n"
       "\t\t2: failpoint_hs_delete_key_from_ts\n"
       "\t\t3: hs_checkpoint_timing_stress\n"
diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
index ccab3cdf4c2..babe569d0c3 100644
--- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
+++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h
@@ -46,6 +46,18 @@
 typedef enum { MIX = 0, COL, FIX, LSM, ROW } table_type; /* File type */
 
 /*
+ * For a predictable run we reserve timestamps for each thread for the entire run. The timestamp for
+ * the i-th key that a thread writes is given by the macro below. In a given iteration for each
+ * thread, there are three timestamps available. We never use the second and only sometimes use the
+ * third. The first is used as the commit and optionally as the prepared timestamp. The third as the
+ * durable timestamp ahead of the commit timestamp.
+ */
+#define RESERVED_TIMESTAMPS_FOR_ITERATION(threadcount, td, iter) \
+    (((iter) * (uint64_t)(threadcount) + (uint64_t)((td)->info)) * 3 + 1)
+
+#define PRED_REPLAY_STABLE_PERIOD 100
+
+/*
  * Per-table cookie structure.
  */
 typedef struct {
@@ -55,6 +67,15 @@ typedef struct {
 } COOKIE;
 
 typedef struct {
+    int info;
+    u_int start_key;
+    u_int key_range;
+    uint64_t ts; /* Only used for runs with predictable replay. */
+    WT_RAND_STATE data_rnd;
+    WT_RAND_STATE extra_rnd;
+} THREAD_DATA;
+
+typedef struct {
     TEST_OPTS opts;              /* Shared test options */
     char *home;                  /* Home directory */
     const char *checkpoint_name; /* Checkpoint name */
@@ -82,9 +103,12 @@ typedef struct {
     bool prepare;                                      /* Use prepare transactions */
     bool race_timestamps;                              /* Async update to oldest timestamp */
 
-    bool use_timestamps; /* Use txn timestamps. Start clock thread */
+    bool use_timestamps;     /* Use txn timestamps. Start clock thread */
+    bool predictable_replay; /* Run such that a predictable replay is possible. */
+    uint64_t stop_ts; /* Run a replay until the stable timestamp reaches this stop timestamp. */
 
-    COOKIE *cookies;               /* Per-thread info */
+    COOKIE *cookies;               /* Per-table info */
+    THREAD_DATA *td;               /* Per-thread info */
     WT_RWLOCK clock_lock;          /* Clock synchronization */
     wt_thread_t checkpoint_thread; /* Checkpoint thread */
     wt_thread_t clock_thread;      /* Clock thread */
diff --git a/src/third_party/wiredtiger/test/checkpoint/workers.c b/src/third_party/wiredtiger/test/checkpoint/workers.c
index 69fa646616c..20a3427e855 100644
--- a/src/third_party/wiredtiger/test/checkpoint/workers.c
+++ b/src/third_party/wiredtiger/test/checkpoint/workers.c
@@ -31,7 +31,7 @@
 #define MAX_MODIFY_ENTRIES 5
 
 static char modify_repl[256];
-static int real_worker(void);
+static int real_worker(THREAD_DATA *);
 static WT_THREAD_RET worker(void *);
 
 /*
@@ -120,9 +120,9 @@ start_workers(void)
 
     (void)gettimeofday(&start, NULL);
 
-    /* Create threads. */
+    /* Create threads. The N workers have ID 0 to N - 1. */
     for (i = 0; i < g.nworkers; ++i)
-        testutil_check(__wt_thread_create(NULL, &tids[i], worker, &g.cookies[i]));
+        testutil_check(__wt_thread_create(NULL, &tids[i], worker, &g.td[i]));
 
     /* Wait for the threads. */
     for (i = 0; i < g.nworkers; ++i)
@@ -334,15 +334,17 @@ worker_op(WT_CURSOR *cursor, table_type type, uint64_t keyno, u_int new_val)
 static WT_THREAD_RET
 worker(void *arg)
 {
+    THREAD_DATA *td;
     char tid[128];
 
-    WT_UNUSED(arg);
+    td = (THREAD_DATA *)arg;
 
     testutil_check(__wt_thread_str(tid, sizeof(tid)));
-    printf("worker thread starting: tid: %s\n", tid);
+    printf("worker thread starting: tid: %s key-range: %" PRIu32 " - %" PRIu32 "\n", tid,
+      td->start_key, td->start_key + td->key_range);
     fflush(stdout);
 
-    (void)real_worker();
+    (void)real_worker(td);
     return (WT_THREAD_RET_VALUE);
 }
 
@@ -351,11 +353,11 @@ worker(void *arg)
  *     A single worker thread that transactionally updates all tables with consistent values.
  */
 static int
-real_worker(void)
+real_worker(THREAD_DATA *td)
 {
     WT_CURSOR **cursors;
-    WT_RAND_STATE rnd;
     WT_SESSION *session;
+    uint64_t base_ts;
     u_int i, keyno, next_rnd;
     int j, ret, t_ret;
     char buf[128];
@@ -379,19 +381,31 @@ real_worker(void)
     if (g.use_timestamps) {
         if (g.no_ts_deletes)
             begin_cfg = "no_timestamp=true,read_timestamp=1,roundup_timestamps=(read=true)";
-        else
+        else if (!g.predictable_replay)
             begin_cfg = "read_timestamp=1,roundup_timestamps=(read=true)";
+        /*
+         * Note: For predictable replays we do not specify a read timestamp, hence reading the
+         * latest committed values. This is important for a predictable outcome as reading at the
+         * oldest timestamp depends on where the clock and the checkpoint threads have placed the
+         * oldest at this moment.
+         */
     }
 
-    __wt_random_init_seed((WT_SESSION_IMPL *)session, &rnd);
-
     for (j = 0; j < g.ntables; j++)
         if ((ret = session->open_cursor(session, g.cookies[j].uri, NULL, NULL, &cursors[j])) != 0) {
             (void)log_print_err("session.open_cursor", ret, 1);
             goto err;
         }
 
-    for (i = 0; i < g.nops && g.opts.running; ++i, __wt_yield()) {
+    for (i = 0; g.opts.running; ++i, __wt_yield()) {
+        /*
+         * If a stop timestamp has been provided, the workers will continue to run until the clock
+         * thread reaches a stable equal to the stop timestamp. Ignore the provided operation count
+         * in such a case.
+         */
+        if (g.stop_ts == 0 && i >= g.nops)
+            break;
+
         if (i > 0 && i % (5 * WT_THOUSAND) == 0)
             printf("Worker %u of %u ops\n", i, g.nops);
         if (start_txn) {
@@ -402,9 +416,10 @@ real_worker(void)
             new_txn = true;
             start_txn = false;
         }
-        keyno = __wt_random(&rnd) % g.nkeys + 1;
+        keyno = __wt_random(&td->data_rnd) % td->key_range + td->start_key;
         /* If we have specified to run with mix mode deletes we need to do it in it's own txn. */
-        if (g.use_timestamps && g.no_ts_deletes && new_txn && __wt_random(&rnd) % 72 == 0) {
+        if (g.use_timestamps && g.no_ts_deletes && new_txn &&
+          __wt_random(&td->data_rnd) % 72 == 0) {
             new_txn = false;
             for (j = 0; j < g.ntables; j++) {
                 ret = worker_no_ts_delete(cursors[j], keyno);
@@ -436,41 +451,60 @@ real_worker(void)
             (void)log_print_err("worker op failed", ret, 1);
             goto err;
         } else if (ret == 0) {
-            next_rnd = __wt_random(&rnd);
+            next_rnd = __wt_random(&td->data_rnd);
             if (next_rnd % 7 == 0) {
                 if (g.use_timestamps) {
-                    if (__wt_try_readlock((WT_SESSION_IMPL *)session, &g.clock_lock) == 0) {
-                        next_rnd = __wt_random(&rnd);
+                    /*
+                     * For a predictable run, the timestamps for worker's operations are managed by
+                     * reserving them across the threads and the iterations, such that they don't
+                     * overlap. For a regular run, the timestamp thread manages the advance of the
+                     * global clock. The workers synchronize with the clock using a reader - writer
+                     * lock, and decide the operation timestamp based on the global clock.
+                     */
+                    if (g.predictable_replay ||
+                      (__wt_try_readlock((WT_SESSION_IMPL *)session, &g.clock_lock) == 0)) {
+                        if (g.predictable_replay)
+                            /* i + 1 because we don't want a thread to start with commit-ts of 1 */
+                            base_ts = RESERVED_TIMESTAMPS_FOR_ITERATION(g.nworkers, td, i + 1);
+                        else
+                            base_ts = g.ts_stable + 1;
+                        next_rnd = __wt_random(&td->data_rnd);
                         if (g.prepare && next_rnd % 2 == 0) {
                             testutil_check(__wt_snprintf(
-                              buf, sizeof(buf), "prepare_timestamp=%" PRIx64, g.ts_stable + 1));
+                              buf, sizeof(buf), "prepare_timestamp=%" PRIx64, base_ts));
                             if ((ret = session->prepare_transaction(session, buf)) != 0) {
-                                __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                                if (!g.predictable_replay)
+                                    __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
                                 (void)log_print_err("real_worker:prepare_transaction", ret, 1);
                                 goto err;
                             }
                             testutil_check(__wt_snprintf(buf, sizeof(buf),
                               "durable_timestamp=%" PRIx64 ",commit_timestamp=%" PRIx64,
-                              g.ts_stable + 3, g.ts_stable + 1));
+                              base_ts + 2, base_ts));
                         } else
                             testutil_check(__wt_snprintf(
-                              buf, sizeof(buf), "commit_timestamp=%" PRIx64, g.ts_stable + 1));
+                              buf, sizeof(buf), "commit_timestamp=%" PRIx64, base_ts));
 
                         /* Commit majority of times. */
                         if (next_rnd % 49 != 0) {
                             if ((ret = session->commit_transaction(session, buf)) != 0) {
-                                __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                                if (!g.predictable_replay)
+                                    __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
                                 (void)log_print_err("real_worker:commit_transaction", ret, 1);
                                 goto err;
                             }
+                            if (g.predictable_replay)
+                                WT_PUBLISH(td->ts, base_ts);
                         } else {
                             if ((ret = session->rollback_transaction(session, NULL)) != 0) {
-                                __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                                if (!g.predictable_replay)
+                                    __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
                                 (void)log_print_err("real_worker:rollback_transaction", ret, 1);
                                 goto err;
                             }
                         }
-                        __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
+                        if (!g.predictable_replay)
+                            __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock);
                         start_txn = true;
                         /* Occasionally reopen cursors after transaction finish. */
                         if (next_rnd % 13 == 0)
diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml
index 1a9e00ff108..3c7cd2d5220 100755
--- a/src/third_party/wiredtiger/test/evergreen.yml
+++ b/src/third_party/wiredtiger/test/evergreen.yml
@@ -776,6 +776,54 @@ functions:
         set -o verbose
         ${test_env_vars|} ./test_checkpoint ${checkpoint_args} 2>&1
 
+  "checkpoint test predictable":
+    command: shell.exec
+    params:
+      working_dir: "wiredtiger/cmake_build/test/checkpoint"
+      script: |
+        # Get a random value with leading zeroes removed, /bin/sh version.
+        rando() {
+          tr -cd 0-9 </dev/urandom | head -c 5 | sed -e 's/0*\(.\)/\1/'
+        }
+
+        # Run test/checkpoint in a way that can test predictable replay.
+        set -o errexit
+        set -o verbose
+
+        toolsdir=../../../tools
+        wtutil=../../wt
+
+        r=$(rando)$(rando)
+        x0=$(rando)$(rando)
+
+        # Always run with timestamps and in the predictable mode
+        base_args="-x -R"
+
+        rm -rf RUNDIR_0
+        # The first run is for calibration only.  We just want to run for the designated
+        # time and get an approriate stop timestamp that can be used in later runs.
+        calibration_run_args="-PSD$r,E$x0"
+        ${test_env_vars|} ./test_checkpoint -h RUNDIR_0 $base_args ${checkpoint_args} $calibration_run_args || exit 1
+        echo "Finished calibration run"
+        stable_hex=$($toolsdir/wt_timestamps RUNDIR_0 | sed -e '/stable=/!d' -e 's/.*=//')
+        stop_ts=$(echo $((0x$stable_hex)))
+        for i in $(seq ${times}); do
+          echo Iteration $i/${times}
+          x1=$(rando)$(rando)
+          x2=$(rando)$(rando)
+          rm -rf RUNDIR_1 RUNDIR_2
+          # Do two runs up to the stable timestamp, using the same data seed,
+          # but with a different extra seed.  Compare it when done.
+          first_run_args="-PSD$r,E$x1 -S $stop_ts"
+          echo "First run with args $base_args ${checkpoint_args} $first_run_args"
+          ${test_env_vars|} ./test_checkpoint -h RUNDIR_1 $base_args ${checkpoint_args} $first_run_args || exit 1
+          second_run_args="-PSD$r,E$x2 -S $stop_ts"
+          echo "Second run with args $base_args ${checkpoint_args} $second_run_args"
+          ${test_env_vars|} ./test_checkpoint -h RUNDIR_2 $base_args ${checkpoint_args} $second_run_args || exit 1
+          # Compare the runs.
+          $toolsdir/wt_cmp_dir RUNDIR_1 RUNDIR_2 || exit 1
+        done
+
   "checkpoint stress test":
     command: shell.exec
     params:
@@ -4115,6 +4163,31 @@ tasks:
         vars:
           times: 5
 
+  - name: checkpoint-filetypes-predictable-test
+    commands:
+      - func: "get project"
+      - func: "compile wiredtiger"
+        vars:
+          # Don't use diagnostic - this test looks for timing problems that are more likely to occur without it
+          HAVE_DIAGNOSTIC: -DHAVE_DIAGNOSTIC=0
+      # FIXME-WT-10936: Enable once predictable replay supports column store
+      #- func: "checkpoint test predictable"
+      #  vars:
+      #    checkpoint_args: -t m -n 1000000 -k 5000000 -C cache_size=100MB
+      #    times: 5
+      #- func: "checkpoint test predictable"
+      #  vars:
+      #    checkpoint_args: -t c -n 1000000 -k 5000000 -C cache_size=100MB
+      #    times: 5
+      #- func: "checkpoint test predictable"
+      #  vars:
+      #    checkpoint_args: -n 1000000 -k 5000000 -C cache_size=100MB
+      #    times: 5
+      - func: "checkpoint test predictable"
+        vars:
+          checkpoint_args: -t r -n 1000000 -k 5000000 -C cache_size=100MB
+          times: 5
+
   - name: many-collection-test
     commands:
       - command: timeout.update
@@ -5346,6 +5419,8 @@ buildvariants:
     # FIXME-WT-10822
     # - name: format-tiered-test
     - name: schema-abort-predictable-test
+    - name: checkpoint-filetypes-predictable-test
+
 
 # When running the Python tests on this variant tcmalloc must be preloaded otherwise the wiredtiger library
 # fails to load and resolve its dependency.
author	Sulabh Mahajan <sulabh.mahajan@mongodb.com>	2023-04-17 11:09:14 +1000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-04-17 01:57:50 +0000
commit	044d9cfb46dfc8a9577ca9e84908aa251cf6e587 (patch)
tree	56be102eb14ef79096e1cb52adbecfea1d87de9d
parent	b38874f5754958cc44e07b03a6a651df0cdd0f89 (diff)
download	mongo-044d9cfb46dfc8a9577ca9e84908aa251cf6e587.tar.gz