diff options
author | Luke Chen <luke.chen@mongodb.com> | 2023-02-13 09:36:07 +1100 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-02-12 23:24:00 +0000 |
commit | 2d695bc7f8eb7328efd728ab0419255bbcc5beed (patch) | |
tree | 889c07054168a84eb3c811d90e4693614dbdab4f | |
parent | ec7653ce39c817c1832fd6238474b1264a31b7b0 (diff) | |
download | mongo-2d695bc7f8eb7328efd728ab0419255bbcc5beed.tar.gz |
Import wiredtiger: 604dd69988250e1c8698cf7e5ac5dbce4a8f88f7 from branch mongodb-master
ref: bb3421a839..604dd69988
for: 7.0.0-rc0
WT-9915 For tiered storage testing, get predictable outputs for test/format
26 files changed, 1301 insertions, 212 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 17bb59ba27a..332601000e4 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "bb3421a83981c5ece92579e9689e1636db90b559" + "commit": "604dd69988250e1c8698cf7e5ac5dbce4a8f88f7" } diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 07931f47523..1d360bfdd6e 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -473,6 +473,53 @@ functions: for i in $(seq ${times|1}); do ./t -c ${config|../../../test/format/CONFIG.stress} ${trace_args|-T bulk,txn,retain=100} ${extra_args|} || ( [ -f RUNDIR/CONFIG ] && cat RUNDIR/CONFIG ) 2>&1 done + "format test predictable": + command: shell.exec + params: + working_dir: "wiredtiger/cmake_build/test/format" + script: | + # To test predictable replay, we run test/format three times with the same data seed + # each time, and compare the keys and values found in the WT home directories. + # The first run is a timed one. When it's completed, we get the run's stable timestamp, + # and do the subsequent runs up to that stable timestamp. This, along with predictable + # replay using the same data seed, should guarantee we have equivalent data created. 
+ set -o errexit + set -o verbose + fail() { + echo "======= FAILURE ==========" + for file; do + if [ -f "$file" ]; then + echo Contents of "$file": + cat "$file" + echo "================" + fi + done + exit 1 + } + runtime=3 # minutes + config=../../../test/format/CONFIG.replay + for i in $(seq ${times}); do + echo Iteration $i/${times} + x2=$RANDOM$RANDOM + x3=$RANDOM$RANDOM + rm -rf RUNDIR_{1,2,3} + + first_run_args="-c $config runs.timer=$runtime" + ./t -h RUNDIR_1 $first_run_args ${extra_args} || fail RUNDIR_1/CONFIG 2>&1 + stable_hex=$(../../../tools/wt_timestamps RUNDIR_1 | sed -e '/stable=/!d' -e 's/.*=//') + ops=$(echo $((0x$stable_hex))) + + # Do the second run up to the stable timestamp, using the same data seed, + # but with a different extra seed. Compare it when done. + common_args="-c RUNDIR_1/CONFIG runs.timer=0 runs.ops=$ops" + ./t -h RUNDIR_2 $common_args random.extra_seed=$x2 || fail RUNDIR_2/CONFIG 2>&1 + ../../../tools/wt_cmp_dir RUNDIR_1 RUNDIR_2 || fail RUNDIR_1/CONFIG RUNDIR_2/CONFIG 2>&1 + + # Do the third run up to the stable timestamp, using the same data seed, + # but with a different extra seed. Compare it to the second run when done. 
+ ./t -h RUNDIR_3 $common_args random.extra_seed=$x3 || fail RUNDIR_3/CONFIG 2>&1 + ../../../tools/wt_cmp_dir RUNDIR_2 RUNDIR_3 || fail RUNDIR_2/CONFIG RUNDIR_3/CONFIG 2>&1 + done "format test script": command: shell.exec params: @@ -3727,6 +3774,19 @@ tasks: vars: format_test_script_args: -a -t 30 + - name: format-predictable-test + # Set 2.5 hour timeout (60 * 60 * 2.5) + exec_timeout_secs: 9000 + commands: + - func: "get project" + - func: "compile wiredtiger" + vars: + <<: *configure_flags_with_builtins + CMAKE_TOOLCHAIN_FILE: -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/mongodbtoolchain_v4_gcc.cmake + - func: "format test predictable" + vars: + times: 5 + - name: many-collection-test commands: - command: timeout.update @@ -4918,6 +4978,7 @@ buildvariants: - name: ".stress-test-4-nonstandalone" - name: ".stress-test-no-barrier-nonstandalone" - name: format-abort-recovery-stress-test-nonstandalone + - name: format-predictable-test # When running the Python tests on this variant tcmalloc must be preloaded otherwise the wiredtiger library # fails to load and resolve its dependency. diff --git a/src/third_party/wiredtiger/test/format/CONFIG.replay b/src/third_party/wiredtiger/test/format/CONFIG.replay new file mode 100644 index 00000000000..9e42cf00cfd --- /dev/null +++ b/src/third_party/wiredtiger/test/format/CONFIG.replay @@ -0,0 +1,26 @@ +############################################ +# RUN PARAMETERS: V3 +############################################ +# A configuration for predictable replay. +# Some things are locked down at the moment. 
+backup=0 +btree.huffman_value=0 +cache.minimum=20 +format.abort=0 +format.independent_thread_rng=1 +import=0 +ops.alter=0 +ops.compaction=0 +ops.truncate=0 +ops.salvage=0 +quiet=0 +runs.in_memory=0 +runs.mirror=0 +runs.predictable_replay=1 +runs.rows=1000000:5000000 +runs.tables=3:10 +runs.threads=4:32 +runs.timer=6:30 +runs.timer=30 +transaction.implicit=0 +transaction.timestamps=1 diff --git a/src/third_party/wiredtiger/test/format/alter.c b/src/third_party/wiredtiger/test/format/alter.c index 546bfb8a8d5..8f5e0000662 100644 --- a/src/third_party/wiredtiger/test/format/alter.c +++ b/src/third_party/wiredtiger/test/format/alter.c @@ -60,14 +60,14 @@ alter(void *arg) counter = 0; while (!g.workers_finished) { - period = mmrand(NULL, 1, 10); + period = mmrand(&g.extra_rnd, 1, 10); testutil_check(__wt_snprintf( buf, sizeof(buf), "access_pattern_hint=%s", access_value ? "random" : "none")); access_value = !access_value; /* Alter can return EBUSY if concurrent with other operations. */ - table = table_select(NULL); + table = table_select(NULL, false); trace_msg(session, "Alter #%u URI %s start %s", ++counter, table->uri, buf); while ((ret = session->alter(session, table->uri, buf)) != 0 && ret != EBUSY) diff --git a/src/third_party/wiredtiger/test/format/backup.c b/src/third_party/wiredtiger/test/format/backup.c index f463b20d5a4..31f8aa5ee2e 100644 --- a/src/third_party/wiredtiger/test/format/backup.c +++ b/src/third_party/wiredtiger/test/format/backup.c @@ -539,7 +539,7 @@ backup(void *arg) * larger intervals, optionally do incremental backups between full backups. */ this_id = 0; - for (period = mmrand(NULL, 1, 10);; period = mmrand(NULL, 20, 45)) { + for (period = mmrand(&g.extra_rnd, 1, 10);; period = mmrand(&g.extra_rnd, 20, 45)) { /* Sleep for short periods so we don't make the run wait. */ while (period > 0 && !g.workers_finished) { --period; @@ -584,7 +584,7 @@ backup(void *arg) src_id, g.backup_id)); /* Restart a full incremental every once in a while. 
*/ full = false; - incr_full = mmrand(NULL, 1, 8) == 1; + incr_full = mmrand(&g.extra_rnd, 1, 8) == 1; } this_id = g.backup_id++; config = cfg; @@ -600,7 +600,7 @@ backup(void *arg) config = cfg; full = false; /* Restart a full incremental every once in a while. */ - incr_full = mmrand(NULL, 1, 8) == 1; + incr_full = mmrand(&g.extra_rnd, 1, 8) == 1; } } else { config = NULL; @@ -679,9 +679,9 @@ backup(void *arg) if (full) { incremental = 1; if (g.backup_incr_flag == INCREMENTAL_LOG) - incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(NULL, 1, 8); + incremental = GV(LOGGING_REMOVE) ? 1 : mmrand(&g.extra_rnd, 1, 8); else if (g.backup_incr_flag == INCREMENTAL_BLOCK) - incremental = mmrand(NULL, 1, 8); + incremental = mmrand(&g.extra_rnd, 1, 8); } if (--incremental == 0) { check_copy(); diff --git a/src/third_party/wiredtiger/test/format/bulk.c b/src/third_party/wiredtiger/test/format/bulk.c index a811b3866b0..c53f88ab0ab 100644 --- a/src/third_party/wiredtiger/test/format/bulk.c +++ b/src/third_party/wiredtiger/test/format/bulk.c @@ -121,7 +121,7 @@ table_load(TABLE *base, TABLE *table) if (table->type == ROW) key_gen(table, &key, keyno); if (base == NULL) - val_gen(table, NULL, &value, &bitv, keyno); + val_gen(table, &g.data_rnd, &value, &bitv, keyno); else { testutil_check(read_op(base_cursor, NEXT, NULL)); testutil_check(base_cursor->get_value(base_cursor, &value)); @@ -168,6 +168,12 @@ table_load(TABLE *base, TABLE *table) testutil_assertfmt(base == NULL && (ret == WT_CACHE_FULL || ret == WT_ROLLBACK), "WT_CURSOR.insert failed: %d", ret); + /* + * If this occurs with predictable replay, we may need to redo the bulk load with fewer + * keys in each batch. For now, we just don't handle it. 
+ */ + testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY)); + if (g.transaction_timestamps_config) { bulk_rollback_transaction(session); bulk_begin_transaction(session); diff --git a/src/third_party/wiredtiger/test/format/checkpoint.c b/src/third_party/wiredtiger/test/format/checkpoint.c index 7b63eaba648..f7afc24374b 100644 --- a/src/third_party/wiredtiger/test/format/checkpoint.c +++ b/src/third_party/wiredtiger/test/format/checkpoint.c @@ -79,7 +79,7 @@ checkpoint(void *arg) wt_wrap_open_session(conn, &sap, NULL, &session); named_checkpoints = !g.lsm_config; - for (secs = mmrand(NULL, 1, 10); !g.workers_finished;) { + for (secs = mmrand(&g.extra_rnd, 1, 10); !g.workers_finished;) { if (secs > 0) { __wt_sleep(1, 0); --secs; @@ -96,7 +96,7 @@ checkpoint(void *arg) ckpt_vrfy_name = "WiredTigerCheckpoint"; backup_locked = false; if (named_checkpoints) - switch (mmrand(NULL, 1, 20)) { + switch (mmrand(&g.extra_rnd, 1, 20)) { case 1: /* * 5% create a named snapshot. Rotate between a few names to test multiple named @@ -105,8 +105,8 @@ checkpoint(void *arg) ret = lock_try_writelock(session, &g.backup_lock); if (ret == 0) { backup_locked = true; - testutil_check(__wt_snprintf( - config_buf, sizeof(config_buf), "name=mine.%" PRIu32, mmrand(NULL, 1, 4))); + testutil_check(__wt_snprintf(config_buf, sizeof(config_buf), + "name=mine.%" PRIu32, mmrand(&g.extra_rnd, 1, 4))); ckpt_config = config_buf; ckpt_vrfy_name = config_buf + strlen("name="); } else if (ret != EBUSY) @@ -143,7 +143,7 @@ checkpoint(void *arg) /* Verify the checkpoints. 
*/ wts_verify_checkpoint(conn, ckpt_vrfy_name); - secs = mmrand(NULL, 5, 40); + secs = mmrand(&g.extra_rnd, 5, 40); } wt_wrap_open_session(conn, &sap, NULL, &session); diff --git a/src/third_party/wiredtiger/test/format/compact.c b/src/third_party/wiredtiger/test/format/compact.c index feea20fa092..84a467d0734 100644 --- a/src/third_party/wiredtiger/test/format/compact.c +++ b/src/third_party/wiredtiger/test/format/compact.c @@ -54,7 +54,7 @@ compact(void *arg) * Perform compaction at somewhere under 15 seconds (so we get at least one done), and then at * 23 second intervals. */ - for (period = mmrand(NULL, 1, 15);; period = 23) { + for (period = mmrand(&g.extra_rnd, 1, 15);; period = 23) { /* Sleep for short periods so we don't make the run wait. */ while (period > 0 && !g.workers_finished) { --period; @@ -70,7 +70,7 @@ compact(void *arg) * Compact returns ETIMEDOUT if the compaction doesn't finish in some number of seconds. We * don't configure a timeout and occasionally exceed the default of 1200 seconds. 
*/ - table = table_select(NULL); + table = table_select(NULL, false); ret = session->compact(session, table->uri, NULL); testutil_assertfmt(ret == 0 || ret == EBUSY || ret == ETIMEDOUT || ret == WT_CACHE_FULL || ret == WT_ROLLBACK, diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index 201df695c97..bfdc9650699 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -110,40 +110,43 @@ typedef struct { #define V_TABLE_OPS_TRUNCATE 78 #define V_GLOBAL_OPS_VERIFY 79 #define V_GLOBAL_QUIET 80 -#define V_GLOBAL_RUNS_IN_MEMORY 81 -#define V_GLOBAL_RUNS_OPS 82 -#define V_TABLE_RUNS_MIRROR 83 -#define V_TABLE_RUNS_ROWS 84 -#define V_TABLE_RUNS_SOURCE 85 -#define V_GLOBAL_RUNS_TABLES 86 -#define V_GLOBAL_RUNS_THREADS 87 -#define V_GLOBAL_RUNS_TIMER 88 -#define V_TABLE_RUNS_TYPE 89 -#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 90 -#define V_GLOBAL_STATISTICS_MODE 91 -#define V_GLOBAL_STATISTICS_LOG_SOURCES 92 -#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 93 -#define V_GLOBAL_STRESS_CHECKPOINT 94 -#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 95 -#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 96 -#define V_GLOBAL_STRESS_EVICT_REPOSITION 97 -#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 98 -#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 99 -#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 100 -#define V_GLOBAL_STRESS_HS_SEARCH 101 -#define V_GLOBAL_STRESS_HS_SWEEP 102 -#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 103 -#define V_GLOBAL_STRESS_SPLIT_1 104 -#define V_GLOBAL_STRESS_SPLIT_2 105 -#define V_GLOBAL_STRESS_SPLIT_3 106 -#define V_GLOBAL_STRESS_SPLIT_4 107 -#define V_GLOBAL_STRESS_SPLIT_5 108 -#define V_GLOBAL_STRESS_SPLIT_6 109 -#define V_GLOBAL_STRESS_SPLIT_7 110 -#define V_GLOBAL_TRANSACTION_IMPLICIT 111 -#define V_GLOBAL_TRANSACTION_TIMESTAMPS 112 -#define V_GLOBAL_WIREDTIGER_CONFIG 113 -#define 
V_GLOBAL_WIREDTIGER_RWLOCK 114 -#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 115 +#define V_GLOBAL_RANDOM_DATA_SEED 81 +#define V_GLOBAL_RANDOM_EXTRA_SEED 82 +#define V_GLOBAL_RUNS_IN_MEMORY 83 +#define V_TABLE_RUNS_MIRROR 84 +#define V_GLOBAL_RUNS_OPS 85 +#define V_GLOBAL_RUNS_PREDICTABLE_REPLAY 86 +#define V_TABLE_RUNS_ROWS 87 +#define V_TABLE_RUNS_SOURCE 88 +#define V_GLOBAL_RUNS_TABLES 89 +#define V_GLOBAL_RUNS_THREADS 90 +#define V_GLOBAL_RUNS_TIMER 91 +#define V_TABLE_RUNS_TYPE 92 +#define V_GLOBAL_RUNS_VERIFY_FAILURE_DUMP 93 +#define V_GLOBAL_STATISTICS_MODE 94 +#define V_GLOBAL_STATISTICS_LOG_SOURCES 95 +#define V_GLOBAL_STRESS_AGGRESSIVE_SWEEP 96 +#define V_GLOBAL_STRESS_CHECKPOINT 97 +#define V_GLOBAL_STRESS_CHECKPOINT_EVICT_PAGE 98 +#define V_GLOBAL_STRESS_CHECKPOINT_PREPARE 99 +#define V_GLOBAL_STRESS_EVICT_REPOSITION 100 +#define V_GLOBAL_STRESS_FAILPOINT_EVICTION_FAIL_AFTER_RECONCILIATION 101 +#define V_GLOBAL_STRESS_FAILPOINT_HS_DELETE_KEY_FROM_TS 102 +#define V_GLOBAL_STRESS_HS_CHECKPOINT_DELAY 103 +#define V_GLOBAL_STRESS_HS_SEARCH 104 +#define V_GLOBAL_STRESS_HS_SWEEP 105 +#define V_GLOBAL_STRESS_SLEEP_BEFORE_READ_OVERFLOW_ONPAGE 106 +#define V_GLOBAL_STRESS_SPLIT_1 107 +#define V_GLOBAL_STRESS_SPLIT_2 108 +#define V_GLOBAL_STRESS_SPLIT_3 109 +#define V_GLOBAL_STRESS_SPLIT_4 110 +#define V_GLOBAL_STRESS_SPLIT_5 111 +#define V_GLOBAL_STRESS_SPLIT_6 112 +#define V_GLOBAL_STRESS_SPLIT_7 113 +#define V_GLOBAL_TRANSACTION_IMPLICIT 114 +#define V_GLOBAL_TRANSACTION_TIMESTAMPS 115 +#define V_GLOBAL_WIREDTIGER_CONFIG 116 +#define V_GLOBAL_WIREDTIGER_RWLOCK 117 +#define V_GLOBAL_WIREDTIGER_LEAK_MEMORY 118 -#define V_ELEMENT_COUNT 116 +#define V_ELEMENT_COUNT 119 diff --git a/src/third_party/wiredtiger/test/format/config.sh b/src/third_party/wiredtiger/test/format/config.sh index 9f9a68003e8..21a9a6984f5 100755 --- a/src/third_party/wiredtiger/test/format/config.sh +++ b/src/third_party/wiredtiger/test/format/config.sh @@ -238,11 +238,17 @@ CONFIG 
configuration_list[] = { {"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1} +{"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX} + +{"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX} + {"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1} +{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0} + {"runs.ops", "operations per run", 0x0, 0, M(2), M(100)} -{"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0} +{"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0} {"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100)} diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 3b43c304435..1a94278364d 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -84,6 +84,38 @@ #define STR(s) #s #define XSTR(s) STR(s) +#include "config.h" +extern CONFIG configuration_list[]; + +typedef struct { + uint32_t v; /* integral value */ + char *vstr; /* string value */ + bool set; /* value explicitly set */ +} CONFIGV; + +/* + * The LANE data structure is used with predictable replay. With predictable replay, we want to make + * sure that two threads can never act on the same key. The last bits of the timestamp are used to + * determine a lane, so it takes a while (LANE_COUNT operations) to cycle through the lanes. A lane + * only acts on key numbers whose last bits match the lane. We also keep track of lanes via the + * g.lanes array. This guarantees that a lane is only used by one thread at a time, which in turn + * guarantees that a key can only be used by one thread at a time. + * + * A more complete description of how this fits into predictable replay is in replay.c . 
+ */ +typedef struct { + uint64_t last_commit_ts; + bool in_use; +} LANE; +#define LANE_NONE UINT32_MAX /* A lane number guaranteed to be illegal */ +#define LANE_COUNT 1024u + +/* Arguments to the read scanner. */ +typedef struct { + WT_CONNECTION *conn; + WT_RAND_STATE *rnd; +} READ_SCAN_ARGS; + /* * Abstract lock that lets us use either pthread reader-writer locks or WiredTiger's own (likely * faster) implementation. @@ -112,15 +144,6 @@ typedef struct { */ #define FIX_VALUE_WRONG 0xff -#include "config.h" -extern CONFIG configuration_list[]; - -typedef struct { - uint32_t v; /* integral value */ - char *vstr; /* string value */ - bool set; /* value explicitly set */ -} CONFIGV; - typedef enum { FIX, ROW, VAR } table_type; typedef struct { u_int id; /* table ID */ @@ -224,7 +247,8 @@ typedef struct { #define INCREMENTAL_OFF 3 u_int backup_incr_flag; /* Incremental backup configuration */ - WT_RAND_STATE rnd; /* Global RNG state */ + WT_RAND_STATE data_rnd; /* Global RNG state for data operations */ + WT_RAND_STATE extra_rnd; /* Global RNG state for extra operations */ uint64_t timestamp; /* Counter for timestamps */ uint64_t oldest_timestamp; /* Last timestamp used for oldest */ @@ -232,6 +256,12 @@ typedef struct { uint64_t truncate_cnt; /* truncation operation counter */ + uint64_t replay_cached_committed; /* Our committed timestamp, cached */ + uint32_t replay_calculate_committed; /* Times before recalculating cached committed */ + uint64_t replay_start_timestamp; /* Timestamp at the beginning of a run */ + uint64_t stop_timestamp; /* If non-zero, stop when stable reaches this */ + uint64_t timestamp_copy; /* A copy of the timestamp, for safety checks */ + /* * Lock to prevent the stable timestamp from moving during the commit of prepared transactions. 
* Otherwise, it may panic if the stable timestamp is moved to greater than or equal to the @@ -271,9 +301,15 @@ typedef struct { #define CHECKPOINT_ON 2 #define CHECKPOINT_WIREDTIGER 3 u_int checkpoint_config; /* Checkpoint configuration */ + + LANE lanes[LANE_COUNT]; /* The lanes for multithreaded coordination */ + pthread_rwlock_t lane_lock; /* Lock used when modifying lanes */ } GLOBAL; extern GLOBAL g; +/* Timestamp to lane number */ +#define LANE_NUMBER(ts) (ts & (LANE_COUNT - 1)) + /* Worker thread operations. */ typedef enum { INSERT = 1, MODIFY, READ, REMOVE, TRUNCATE, UPDATE } thread_op; @@ -311,7 +347,12 @@ typedef struct { SAP sap; /* Thread's session event handler information */ - WT_RAND_STATE rnd; /* thread RNG state */ + WT_RAND_STATE data_rnd; /* thread RNG state for data operations */ + WT_RAND_STATE extra_rnd; /* thread RNG state for extra operations */ + + uint32_t lane; /* Current lane for replay */ + thread_op op; /* Operation */ + bool replay_again; /* Need to redo an operation at a timestamp. 
*/ volatile bool quit; /* thread should quit */ @@ -348,8 +389,9 @@ typedef struct { bool repeatable_reads; /* if read ops repeatable */ bool repeatable_wrap; /* if circular buffer wrapped */ uint64_t opid; /* Operation ID */ - uint64_t read_ts; /* read timestamp */ uint64_t commit_ts; /* commit timestamp */ + uint64_t read_ts; /* read timestamp */ + uint64_t replay_ts; /* allocated timestamp for predictable replay */ uint64_t stable_ts; /* stable timestamp */ SNAP_STATE snap_states[2]; SNAP_STATE *s; /* points to one of the snap_states */ @@ -398,7 +440,7 @@ void key_gen_teardown(WT_ITEM *); void key_init(TABLE *, void *); void lock_destroy(WT_SESSION *, RWLOCK *); void lock_init(WT_SESSION *, RWLOCK *); -void operations(u_int, bool); +void operations(u_int, u_int, u_int); void path_setup(const char *); void set_alarm(u_int); void set_core(bool); @@ -415,6 +457,19 @@ void table_verify(TABLE *, void *); void timestamp_init(void); uint64_t timestamp_maximum_committed(void); void timestamp_once(WT_SESSION *, bool, bool); +void replay_adjust_key(TINFO *, uint64_t); +uint64_t replay_commit_ts(TINFO *); +void replay_committed(TINFO *); +void replay_end_timed_run(void); +void replay_loop_begin(TINFO *, bool); +uint64_t replay_maximum_committed(void); +bool replay_operation_enabled(thread_op); +void replay_pause_after_rollback(TINFO *, uint32_t); +uint64_t replay_prepare_ts(TINFO *); +uint64_t replay_read_ts(TINFO *); +void replay_rollback(TINFO *); +void replay_run_begin(WT_SESSION *); +void replay_run_end(WT_SESSION *); void timestamp_query(const char *, uint64_t *); void timestamp_set_oldest(void); void timestamp_teardown(WT_SESSION *); diff --git a/src/third_party/wiredtiger/test/format/format_config.c b/src/third_party/wiredtiger/test/format/format_config.c index e3ac21a724b..c28c4a3c976 100644 --- a/src/third_party/wiredtiger/test/format/format_config.c +++ b/src/third_party/wiredtiger/test/format/format_config.c @@ -53,6 +53,63 @@ static void config_off_all(const 
char *); static void config_pct(TABLE *); static void config_statistics(void); static void config_transaction(void); +static bool config_var(TABLE *); + +/* + * config_random_generator -- + * For a given seed/RNG combination, generate a seed if not given, and initialize the RNG. + */ +static void +config_random_generator( + const char *config_name, uint64_t seed, uint32_t rand_count, WT_RAND_STATE *rnd) +{ + char buf[128]; + bool seed_set; + + /* See if the seed is already present in the configuration. */ + seed_set = (seed != 0); + + /* Initialize the RNG, and potentially the seed. */ + testutil_random_init(rnd, &seed, rand_count); + + /* If we generated a seed just now, put it into the configuration file. */ + if (!seed_set) { + testutil_assert(seed != 0); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu64, config_name, seed)); + config_single(NULL, buf, true); + } + + /* Make sure the generator is ready. */ + testutil_assert(rnd->v != 0); +} + +/* + * config_random_generators -- + * Initialize our global random generators using provided seeds. + */ +static void +config_random_generators(void) +{ + config_random_generator("random.data_seed", GV(RANDOM_DATA_SEED), 0, &g.data_rnd); + config_random_generator("random.extra_seed", GV(RANDOM_EXTRA_SEED), 1, &g.extra_rnd); +} + +/* + * config_random_generators_before_run -- + * One use case for predictable replay is to run test/format once with little or no + * configuration values set. test/format rolls the dice and picks the configuration, recording + * it along with the random seeds. If we want to rerun it predictably, we can use the same + * seeds. However, the second run will not need to roll the dice during configuration, so the + * state of the RNG after configuration would be different than after configuration during the + * first run. To make everything line up, we re-seed the generator after the configuration, and + * before execution begins. 
+ */ +static void +config_random_generators_before_run(void) +{ + testutil_random_from_seed(&g.data_rnd, GV(RANDOM_DATA_SEED)); + testutil_random_from_seed(&g.extra_rnd, GV(RANDOM_EXTRA_SEED)); +} /* * config_random -- @@ -85,7 +142,7 @@ config_random(TABLE *table, bool table_only) continue; /* Configure key prefixes only rarely, 5% if the length isn't set explicitly. */ - if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(NULL, 1, 100) > 5) + if (cp->off == V_TABLE_BTREE_PREFIX_LEN && mmrand(&g.extra_rnd, 1, 100) > 5) continue; /* @@ -93,11 +150,11 @@ config_random(TABLE *table, bool table_only) * is "on" (so "on" if random rolled <= N, otherwise "off"). */ if (F_ISSET(cp, C_BOOL)) - testutil_check(__wt_snprintf( - buf, sizeof(buf), "%s=%s", cp->name, mmrand(NULL, 1, 100) <= cp->min ? "on" : "off")); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%s", cp->name, + mmrand(&g.data_rnd, 1, 100) <= cp->min ? "on" : "off")); else - testutil_check(__wt_snprintf( - buf, sizeof(buf), "%s=%" PRIu32, cp->name, mmrand(NULL, cp->min, cp->maxrand))); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s=%" PRIu32, cp->name, + mmrand(&g.data_rnd, cp->min, cp->maxrand))); config_single(table, buf, false); } } @@ -141,12 +198,15 @@ config_table_am(TABLE *table) if (config_explicit(table, "runs.source") && DATASOURCE(table, "lsm")) config_single(table, "runs.type=row", false); else - switch (mmrand(NULL, 1, 10)) { + switch (mmrand(&g.data_rnd, 1, 10)) { case 1: case 2: case 3: /* 30% */ - config_single(table, "runs.type=var", false); - break; + if (config_var(table)) { + config_single(table, "runs.type=var", false); + break; + } + /* FALLTHROUGH */ case 4: /* 10% */ if (config_fix(table)) { config_single(table, "runs.type=fix", false); @@ -165,7 +225,7 @@ config_table_am(TABLE *table) } if (!config_explicit(table, "runs.source")) - switch (mmrand(NULL, 1, 5)) { + switch (mmrand(&g.data_rnd, 1, 5)) { case 1: /* 20% */ config_single(table, "runs.source=file", false); break; @@ 
-335,6 +395,31 @@ config_table(TABLE *table, void *arg) if (TV(BTREE_VALUE_MIN) > TV(BTREE_VALUE_MAX)) testutil_die(EINVAL, "btree.value_min may not be larger than btree.value_max"); + if (GV(RUNS_PREDICTABLE_REPLAY)) { + /* + * In predictable replay, force the number of rows in a table to be a manageable size so we + * can modify key numbers without problems. + */ + TV(RUNS_ROWS) = WT_MAX(TV(RUNS_ROWS), 2 * LANE_COUNT); + + /* + * We don't support some operations in predictable replay. + */ + if (!replay_operation_enabled(MODIFY)) { + if (config_explicit(table, "ops.pct.modify") && TV(OPS_PCT_MODIFY)) + WARN("turning off modify operations for table%" PRIu32 + " to work with predictable replay", + table->id); + config_single(table, "ops.pct.modify=0", false); + } + if (!replay_operation_enabled(TRUNCATE)) { + if (config_explicit(table, "ops.truncate") && TV(OPS_TRUNCATE)) + WARN("turning off truncate for table%" PRIu32 " to work with predictable replay", + table->id); + config_single(table, "ops.truncate=0", false); + } + } + /* * If common key prefixes are configured, add prefix compression if no explicit choice was made * and track the largest common key prefix in the run. @@ -372,6 +457,8 @@ config_table(TABLE *table, void *arg) void config_run(void) { + config_random_generators(); /* Configure the random number generators. */ + config_random(tables[0], false); /* Configure the remaining global name space. */ /* @@ -433,6 +520,8 @@ config_run(void) else config_single(NULL, "runs.timer=360", false); } + + config_random_generators_before_run(); } /* @@ -463,7 +552,7 @@ config_backup_incr(void) * Choose a type of incremental backup, where the log remove setting can eliminate incremental * backup based on log files. */ - switch (mmrand(NULL, 1, 10)) { + switch (mmrand(&g.extra_rnd, 1, 10)) { case 1: /* 30% full backup only */ case 2: case 3: @@ -508,7 +597,7 @@ config_backup_incr_granularity(void) * granularity is in units of KB. 
*/ granularity = 0; - i = mmrand(NULL, 1, 10); + i = mmrand(&g.extra_rnd, 1, 10); switch (i) { case 1: /* 50% small size for stress testing */ case 2: @@ -669,7 +758,7 @@ config_checkpoint(void) { /* Choose a checkpoint mode if nothing was specified. */ if (!config_explicit(NULL, "checkpoint")) - switch (mmrand(NULL, 1, 20)) { + switch (mmrand(&g.extra_rnd, 1, 20)) { case 1: case 2: case 3: @@ -694,7 +783,7 @@ config_checksum(TABLE *table) { /* Choose a checksum mode if nothing was specified. */ if (!config_explicit(table, "disk.checksum")) - switch (mmrand(NULL, 1, 10)) { + switch (mmrand(&g.extra_rnd, 1, 10)) { case 1: case 2: case 3: @@ -746,7 +835,7 @@ config_compression(TABLE *table, const char *conf_name) * correct if all of the possible engines are compiled in. */ cstr = "off"; - switch (mmrand(NULL, 1, 20)) { + switch (mmrand(&g.extra_rnd, 1, 20)) { #ifdef HAVE_BUILTIN_EXTENSION_LZ4 case 1: case 2: @@ -858,7 +947,7 @@ config_encryption(void) return; /* 70% no encryption, 30% rotn */ - if (mmrand(NULL, 1, 10) < 8) + if (mmrand(&g.data_rnd, 1, 10) < 8) config_off(NULL, "disk.encryption"); else config_single(NULL, "disk.encryption=rotn-7", false); @@ -871,8 +960,24 @@ config_encryption(void) static bool config_fix(TABLE *table) { - /* Fixed-length column stores don't support modify operations. */ - return (!config_explicit(table, "ops.pct.modify")); + /* + * Fixed-length column stores don't support modify operations, and can't be used with + * predictable replay. + */ + return (!GV(RUNS_PREDICTABLE_REPLAY) && !config_explicit(table, "ops.pct.modify")); +} + +/* + * config_var -- + * Variable-length column-store configuration. + */ +static bool +config_var(TABLE *table) +{ + /* + * Variable-length column store insertions can't be used with predictable replay. 
+ */ + return (!GV(RUNS_PREDICTABLE_REPLAY) || !config_explicit(table, "ops.pct.insert")); } /* @@ -918,8 +1023,10 @@ config_in_memory(void) return; if (config_explicit(NULL, "runs.mirror")) return; + if (config_explicit(NULL, "runs.predictable_replay")) + return; - if (!config_explicit(NULL, "runs.in_memory") && mmrand(NULL, 1, 20) == 1) { + if (!config_explicit(NULL, "runs.in_memory") && mmrand(&g.extra_rnd, 1, 20) == 1) { config_single(NULL, "runs.in_memory=1", false); /* Use table[0] to access the global value (RUN_ROWS is a table value). */ if (NTV(tables[0], RUNS_ROWS) > WT_MILLION) { @@ -1064,7 +1171,18 @@ config_mirrors(void) * tables. */ explicit_mirror = config_explicit(NULL, "runs.mirror"); - if (!explicit_mirror && mmrand(NULL, 1, 10) < 9) { + if (!explicit_mirror && mmrand(&g.data_rnd, 1, 10) < 9) { + config_off_all("runs.mirror"); + return; + } + + /* + * In theory, mirroring should work with predictable replay, although there's some overlap in + * functionality. That is, we usually do multiple runs with the same key with predictable replay + * and would notice if data was different or missing. We disable it to keep runs simple. + */ + if (GV(RUNS_PREDICTABLE_REPLAY)) { + WARN("%s", "turning off mirroring for predictable replay"); config_off_all("runs.mirror"); return; } @@ -1122,7 +1240,7 @@ config_mirrors(void) * Pick some number of tables to mirror, then turn on mirroring the next (n-1) tables, where * allowed. 
*/ - for (mirrors = mmrand(NULL, 2, ntables) - 1, i = 1; i <= ntables; ++i) { + for (mirrors = mmrand(&g.data_rnd, 2, ntables) - 1, i = 1; i <= ntables; ++i) { if (NT_EXPLICIT_OFF(tables[i], RUNS_MIRROR)) continue; if (tables[i] != g.base_mirror) { @@ -1155,25 +1273,32 @@ config_pct(TABLE *table) const char *name; /* Operation */ uint32_t *vp; /* Value store */ u_int order; /* Order of assignment */ + bool enabled; /* Enabled for this configuration */ } list[5]; u_int i, max_order, max_slot, n, pct; bool slot_available; + /* We explicitly disable modify operations for predictable replay. */ list[0].name = "ops.pct.delete"; list[0].vp = &TV(OPS_PCT_DELETE); list[0].order = 0; + list[0].enabled = replay_operation_enabled(REMOVE); list[1].name = "ops.pct.insert"; list[1].vp = &TV(OPS_PCT_INSERT); list[1].order = 0; + list[1].enabled = replay_operation_enabled(INSERT); list[2].name = "ops.pct.modify"; list[2].vp = &TV(OPS_PCT_MODIFY); list[2].order = 0; + list[2].enabled = replay_operation_enabled(MODIFY); list[3].name = "ops.pct.read"; list[3].vp = &TV(OPS_PCT_READ); list[3].order = 0; + list[3].enabled = replay_operation_enabled(READ); list[4].name = "ops.pct.write"; list[4].vp = &TV(OPS_PCT_WRITE); list[4].order = 0; + list[4].enabled = replay_operation_enabled(UPDATE); /* * Walk the list of operations, checking for an illegal configuration and creating a random @@ -1182,11 +1307,13 @@ config_pct(TABLE *table) pct = 0; slot_available = false; for (i = 0; i < WT_ELEMENTS(list); ++i) - if (config_explicit(table, list[i].name)) - pct += *list[i].vp; - else { - list[i].order = mmrand(NULL, 1, WT_THOUSAND); - slot_available = true; + if (list[i].enabled) { + if (config_explicit(table, list[i].name)) + pct += *list[i].vp; + else { + list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND); + slot_available = true; + } } /* @@ -1197,7 +1324,7 @@ config_pct(TABLE *table) WARN("operation percentages %s than 100, resetting to random values", pct > 100 ? 
"greater" : "less"); for (i = 0; i < WT_ELEMENTS(list); ++i) - list[i].order = mmrand(NULL, 1, WT_THOUSAND); + list[i].order = mmrand(&g.data_rnd, 1, WT_THOUSAND); pct = 0; } @@ -1210,9 +1337,9 @@ config_pct(TABLE *table) */ for (pct = 100 - pct;;) { for (i = n = max_order = max_slot = 0; i < WT_ELEMENTS(list); ++i) { - if (list[i].order != 0) + if (list[i].order != 0 && list[i].enabled) ++n; - if (list[i].order > max_order) { + if (list[i].order > max_order && list[i].enabled) { max_order = list[i].order; max_slot = i; } @@ -1223,7 +1350,7 @@ config_pct(TABLE *table) *list[max_slot].vp = pct; break; } - *list[max_slot].vp = mmrand(NULL, 0, pct); + *list[max_slot].vp = mmrand(&g.data_rnd, 0, pct); list[max_slot].order = 0; pct -= *list[max_slot].vp; } @@ -1246,7 +1373,7 @@ config_statistics(void) if (!config_explicit(NULL, "statistics.mode")) { /* 70% of the time set statistics to fast. */ - if (mmrand(NULL, 1, 10) < 8) + if (mmrand(&g.extra_rnd, 1, 10) < 8) config_single(NULL, "statistics.mode=fast", false); else config_single(NULL, "statistics.mode=all", false); @@ -1254,7 +1381,7 @@ config_statistics(void) if (!config_explicit(NULL, "statistics_log.sources")) { /* 10% of the time use sources if all. */ - if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(NULL, 1, 10) == 1) + if (strcmp(GVS(STATISTICS_MODE), "all") == 0 && mmrand(&g.extra_rnd, 1, 10) == 1) config_single(NULL, "statistics_log.sources=file:", false); } } @@ -1266,6 +1393,12 @@ config_statistics(void) static void config_transaction(void) { + /* Predictable replay requires timestamps. */ + if (GV(RUNS_PREDICTABLE_REPLAY)) { + config_single(NULL, "transaction.implicit=0", false); + config_single(NULL, "transaction.timestamps=on", true); + } + /* Transaction prepare requires timestamps and is incompatible with logging. 
*/ if (GV(OPS_PREPARE) && config_explicit(NULL, "ops.prepare")) { if (!GV(TRANSACTION_TIMESTAMPS) && config_explicit(NULL, "transaction.timestamps")) @@ -1664,6 +1797,7 @@ config_table_extend(u_int ntable) void config_single(TABLE *table, const char *s, bool explicit) { + WT_RAND_STATE *rnd; enum { RANGE_FIXED, RANGE_NONE, RANGE_WEIGHTED } range; CONFIG *cp; CONFIGV *v; @@ -1726,6 +1860,11 @@ config_single(TABLE *table, const char *s, bool explicit) ++equalp; v = &table->v[cp->off]; + /* + * Use the data RNG for these options, that's conservative. + */ + rnd = &g.data_rnd; + if (F_ISSET(cp, C_STRING)) { /* * Historically, both "none" and "off" were used for turning off string configurations, now @@ -1822,7 +1961,7 @@ config_single(TABLE *table, const char *s, bool explicit) testutil_die(EINVAL, "%s: %s: illegal numeric range", progname, s); if (range == RANGE_FIXED) - v1 = mmrand(NULL, (u_int)v1, (u_int)v2); + v1 = mmrand(rnd, (u_int)v1, (u_int)v2); else { /* * Roll dice, 50% chance of proceeding to the next larger value, and 5 steps to the @@ -1832,7 +1971,7 @@ config_single(TABLE *table, const char *s, bool explicit) if (steps == 0) steps = 1; for (i = 0; i < 5; ++i, v1 += steps) - if (mmrand(NULL, 0, 1) == 0) + if (mmrand(rnd, 0, 1) == 0) break; v1 = WT_MIN(v1, v2); } @@ -1897,7 +2036,7 @@ config_map_file_type(const char *s, u_int *vp) * * Variable-length column-store is 90% vs. fixed, 30% vs. fixed and row, and 40% vs row. 
*/ - v = mmrand(NULL, 1, 10); + v = mmrand(&g.data_rnd, 1, 10); if (fix && v == 1) *vp = FIX; else if (var && (v < 5 || !row)) diff --git a/src/third_party/wiredtiger/test/format/format_config_def.c b/src/third_party/wiredtiger/test/format/format_config_def.c index de704a1ac71..399b323d0c0 100644 --- a/src/third_party/wiredtiger/test/format/format_config_def.c +++ b/src/third_party/wiredtiger/test/format/format_config_def.c @@ -96,7 +96,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp" {"checkpoint.wait", "seconds to wait if wiredtiger checkpoints configured", 0x0, 5, 100, 3600, V_GLOBAL_CHECKPOINT_WAIT}, - {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 128, 1024, + {"debug.checkpoint_retention", "adjust log removal to retain the log records", 0x0, 0, 10, 1024, V_GLOBAL_DEBUG_CHECKPOINT_RETENTION}, {"debug.cursor_reposition", @@ -109,7 +109,7 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp" C_BOOL, 2, 0, 0, V_GLOBAL_DEBUG_EVICTION}, {"debug.log_retention", "adjust log removal to retain at least this number of log files", 0x0, 0, - 128, 1024, V_GLOBAL_DEBUG_LOG_RETENTION}, + 10, 1024, V_GLOBAL_DEBUG_LOG_RETENTION}, {"debug.realloc_exact", "reallocation of memory will only provide the exact amount requested", C_BOOL, 0, 0, 0, V_GLOBAL_DEBUG_REALLOC_EXACT}, @@ -236,11 +236,20 @@ CONFIG configuration_list[] = {{"assert.read_timestamp", "assert read_timestamp" {"quiet", "quiet run (same as -q)", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_QUIET}, + {"random.data_seed", "set random seed for data operations", 0x0, 0, 0, UINT_MAX, + V_GLOBAL_RANDOM_DATA_SEED}, + + {"random.extra_seed", "set random seed for extra operations", 0x0, 0, 0, UINT_MAX, + V_GLOBAL_RANDOM_EXTRA_SEED}, + {"runs.in_memory", "configure in-memory", C_BOOL | C_IGNORE, 0, 0, 1, V_GLOBAL_RUNS_IN_MEMORY}, + {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR}, + 
{"runs.ops", "operations per run", 0x0, 0, M(2), M(100), V_GLOBAL_RUNS_OPS}, - {"runs.mirror", "mirror tables", C_BOOL | C_IGNORE | C_TABLE, 0, 0, 0, V_TABLE_RUNS_MIRROR}, + {"runs.predictable_replay", "configure predictable replay", C_BOOL, 0, 0, 0, + V_GLOBAL_RUNS_PREDICTABLE_REPLAY}, {"runs.rows", "number of rows", C_TABLE, 10, M(1), M(100), V_TABLE_RUNS_ROWS}, diff --git a/src/third_party/wiredtiger/test/format/format_inline.h b/src/third_party/wiredtiger/test/format/format_inline.h index 07f33f5319c..0d6d42befa9 100644 --- a/src/third_party/wiredtiger/test/format/format_inline.h +++ b/src/third_party/wiredtiger/test/format/format_inline.h @@ -112,10 +112,7 @@ read_op(WT_CURSOR *cursor, read_operation op, int *exactp) static inline uint32_t rng(WT_RAND_STATE *rnd) { - /* Threaded operations have their own RNG information, otherwise we use the default. */ - if (rnd == NULL) - rnd = &g.rnd; - + testutil_assert(rnd != NULL); return (__wt_random(rnd)); } @@ -228,12 +225,25 @@ table_sumv(u_int off) * Randomly select a table. */ static inline TABLE * -table_select(TINFO *tinfo) +table_select(TINFO *tinfo, bool modifies_data) { + WT_RAND_STATE *rnd; + if (ntables == 0) return (tables[0]); - return (tables[mmrand(tinfo == NULL ? NULL : &tinfo->rnd, 1, ntables)]); + if (tinfo == NULL) { + if (modifies_data) + rnd = &g.data_rnd; + else + rnd = &g.extra_rnd; + } else { + if (modifies_data) + rnd = &tinfo->data_rnd; + else + rnd = &tinfo->extra_rnd; + } + return (tables[mmrand(rnd, 1, ntables)]); } /* @@ -241,14 +251,20 @@ table_select(TINFO *tinfo) * Randomly select a table of a specific type. */ static inline TABLE * -table_select_type(table_type type) +table_select_type(table_type type, bool modifies_data) { + WT_RAND_STATE *rnd; u_int i; if (ntables == 0) return (tables[0]->type == type ? 
tables[0] : NULL); - for (i = mmrand(NULL, 1, ntables);; ++i) { + if (modifies_data) + rnd = &g.data_rnd; + else + rnd = &g.extra_rnd; + + for (i = mmrand(rnd, 1, ntables);; ++i) { if (i > ntables) i = 1; if (tables[i]->type == type) diff --git a/src/third_party/wiredtiger/test/format/format_salvage.c b/src/third_party/wiredtiger/test/format/format_salvage.c index 8c23fae5cf9..787ecdade1d 100644 --- a/src/third_party/wiredtiger/test/format/format_salvage.c +++ b/src/third_party/wiredtiger/test/format/format_salvage.c @@ -84,7 +84,7 @@ corrupt(TABLE *table) * exceeding a megabyte (so we aren't just corrupting the whole file). */ testutil_check(fstat(fd, &sb)); - offset = mmrand(NULL, 0, (u_int)sb.st_size - 1024); + offset = mmrand(&g.data_rnd, 0, (u_int)sb.st_size - 1024); len = (size_t)(sb.st_size * 2) / 100; len += 4 * 1024; len = WT_MIN(len, WT_MEGABYTE); diff --git a/src/third_party/wiredtiger/test/format/format_timestamp.c b/src/third_party/wiredtiger/test/format/format_timestamp.c index db243be7335..aacc2c76c91 100644 --- a/src/third_party/wiredtiger/test/format/format_timestamp.c +++ b/src/third_party/wiredtiger/test/format/format_timestamp.c @@ -38,6 +38,9 @@ timestamp_maximum_committed(void) TINFO **tlp; uint64_t commit_ts, ts; + if (GV(RUNS_PREDICTABLE_REPLAY)) + return replay_maximum_committed(); + /* A barrier additionally prevents using cache values here. 
*/ WT_ORDERED_READ(ts, g.timestamp); if (tinfo_list != NULL) @@ -96,7 +99,7 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final) static const char *oldest_timestamp_str = "oldest_timestamp="; static const char *stable_timestamp_str = "stable_timestamp="; WT_CONNECTION *conn; - uint64_t oldest_timestamp, stable_timestamp; + uint64_t oldest_timestamp, stable_timestamp, stop_timestamp; char buf[WT_TS_HEX_STRING_SIZE * 2 + 64]; conn = g.wts_conn; @@ -106,7 +109,24 @@ timestamp_once(WT_SESSION *session, bool allow_lag, bool final) if (oldest_timestamp == 0) return; - if (!final) { + if (GV(RUNS_PREDICTABLE_REPLAY)) { + /* + * For predictable replay, we need the oldest timestamp to lag when the process exits. That + * allows two runs that finish with stable timestamps in the same ballpark to be compared. + */ + if (stable_timestamp > 10 * WT_THOUSAND) + oldest_timestamp = stable_timestamp - 10 * WT_THOUSAND; + else + oldest_timestamp = stable_timestamp / 2; + + /* + * For predictable replay, our end state is to have the stable timestamp represent a precise + * number of operations. + */ + WT_ORDERED_READ(stop_timestamp, g.stop_timestamp); + if (stable_timestamp > stop_timestamp && stop_timestamp != 0) + stable_timestamp = stop_timestamp; + } else if (!final) { /* * If lag is permitted, update the oldest timestamp halfway to the largest timestamp that's * no longer in use, otherwise update the oldest timestamp to that timestamp. Update stable @@ -152,11 +172,22 @@ timestamp(void *arg) memset(&sap, 0, sizeof(sap)); wt_wrap_open_session(conn, &sap, NULL, &session); - /* Update the oldest and stable timestamps at least once every 15 seconds. */ + /* + * Update the oldest and stable timestamps at least once every 15 seconds. For predictable + * replay, update at a much faster pace. We can't afford to get behind because that means more + * rollback errors, and we don't have the luxury of giving up on an operation that has rolled + * back. 
+ */ while (!g.workers_finished) { - random_sleep(&g.rnd, 15); - - timestamp_once(session, true, false); + if (!GV(RUNS_PREDICTABLE_REPLAY)) + random_sleep(&g.extra_rnd, 15); + else { + if ((rng(&g.extra_rnd) & 0x1) == 1) + __wt_yield(); + else + __wt_sleep(0, 10 * WT_THOUSAND); + } + timestamp_once(session, !GV(RUNS_PREDICTABLE_REPLAY), false); } wt_wrap_close_session(session); diff --git a/src/third_party/wiredtiger/test/format/hs.c b/src/third_party/wiredtiger/test/format/hs.c index fbad2aeefa6..808ca204aea 100644 --- a/src/third_party/wiredtiger/test/format/hs.c +++ b/src/third_party/wiredtiger/test/format/hs.c @@ -78,8 +78,8 @@ hs_cursor(void *arg) * cursor, so we should be able to traverse large chunks of the HS store quickly, without * blocking normal operations. */ - next = mmrand(NULL, 0, 1) == 1; - for (i = mmrand(NULL, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) { + next = mmrand(&g.extra_rnd, 0, 1) == 1; + for (i = mmrand(&g.extra_rnd, WT_THOUSAND, 100 * WT_THOUSAND); i > 0; --i) { if ((ret = (next ? cursor->next(cursor) : cursor->prev(cursor))) != 0) { testutil_assertfmt(ret == WT_NOTFOUND || ret == WT_CACHE_FULL || ret == WT_ROLLBACK, "WT_CURSOR.%s failed: %d", next ? "next" : "prev", ret); @@ -94,7 +94,7 @@ hs_cursor(void *arg) testutil_check(cursor->close(cursor)); /* Sleep for some number of seconds, in short intervals so we don't make the run wait. 
*/ - for (period = mmrand(NULL, 1, 10); period > 0 && !g.workers_finished; --period) + for (period = mmrand(&g.extra_rnd, 1, 10); period > 0 && !g.workers_finished; --period) __wt_sleep(1, 0); if (g.workers_finished) break; diff --git a/src/third_party/wiredtiger/test/format/import.c b/src/third_party/wiredtiger/test/format/import.c index 77c42435672..27fc14acee2 100644 --- a/src/third_party/wiredtiger/test/format/import.c +++ b/src/third_party/wiredtiger/test/format/import.c @@ -103,7 +103,7 @@ import(void *arg) copy_file_into_directory(import_session, "import.wt"); /* Perform import with either repair or file metadata. */ - import_value = mmrand(NULL, 0, 1); + import_value = mmrand(&g.extra_rnd, 0, 1); if (import_value == 0) testutil_check(__wt_snprintf(buf, sizeof(buf), "import=(enabled,repair=true)")); else @@ -116,7 +116,7 @@ import(void *arg) /* Drop import table, so we can import the table again */ testutil_drop(session, IMPORT_URI, NULL); - period = mmrand(NULL, 1, 10); + period = mmrand(&g.extra_rnd, 1, 10); while (period > 0 && !g.workers_finished) { --period; __wt_sleep(1, 0); diff --git a/src/third_party/wiredtiger/test/format/kv.c b/src/third_party/wiredtiger/test/format/kv.c index bf9bd7a689f..1b1ace2ffd4 100644 --- a/src/third_party/wiredtiger/test/format/kv.c +++ b/src/third_party/wiredtiger/test/format/kv.c @@ -47,7 +47,7 @@ key_init_random(TABLE *table) max = TV(BTREE_KEY_MAX); if (i % 20 != 0 && max > TV(BTREE_KEY_MIN) + 20) max = TV(BTREE_KEY_MIN) + 20; - table->key_rand_len[i] = mmrand(NULL, TV(BTREE_KEY_MIN), max); + table->key_rand_len[i] = mmrand(&g.data_rnd, TV(BTREE_KEY_MIN), max); } } @@ -241,6 +241,7 @@ val_len(WT_RAND_STATE *rnd, uint64_t keyno, uint32_t min, uint32_t max) void val_init(TABLE *table, void *arg) { + WT_RAND_STATE *rnd; size_t i; uint32_t len; @@ -263,8 +264,9 @@ val_init(TABLE *table, void *arg) for (i = 0; i < len; ++i) table->val_base[i] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"[i % 26]; + rnd = &g.data_rnd; 
table->val_dup_data_len = - val_len(NULL, (uint64_t)mmrand(NULL, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX)); + val_len(rnd, (uint64_t)mmrand(rnd, 1, 20), TV(BTREE_VALUE_MIN), TV(BTREE_VALUE_MAX)); } /* diff --git a/src/third_party/wiredtiger/test/format/ops.c b/src/third_party/wiredtiger/test/format/ops.c index 1b59ec9e80c..da1431e0bcb 100644 --- a/src/third_party/wiredtiger/test/format/ops.c +++ b/src/third_party/wiredtiger/test/format/ops.c @@ -28,7 +28,7 @@ #include "format.h" -static void apply_bounds(WT_CURSOR *, TABLE *); +static void apply_bounds(WT_CURSOR *, TABLE *, WT_RAND_STATE *); static void clear_bounds(WT_CURSOR *, TABLE *); static int col_insert(TINFO *); static void col_insert_resolve(TABLE *, void *); @@ -40,6 +40,7 @@ static int col_update(TINFO *, bool); static int nextprev(TINFO *, bool); static WT_THREAD_RET ops(void *); static int read_row(TINFO *); +static void rollback_transaction(TINFO *); static int row_insert(TINFO *, bool); static int row_modify(TINFO *, bool); static int row_remove(TINFO *, bool); @@ -72,16 +73,16 @@ modify_build(TINFO *tinfo) int i, nentries; /* Randomly select a number of byte changes, offsets and lengths. */ - nentries = (int)mmrand(&tinfo->rnd, 1, MAX_MODIFY_ENTRIES); + nentries = (int)mmrand(&tinfo->data_rnd, 1, MAX_MODIFY_ENTRIES); for (i = 0; i < nentries; ++i) { tinfo->entries[i].data.data = - modify_repl + mmrand(&tinfo->rnd, 1, sizeof(modify_repl) - 10); - tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->rnd, 0, 10); + modify_repl + mmrand(&tinfo->data_rnd, 1, sizeof(modify_repl) - 10); + tinfo->entries[i].data.size = (size_t)mmrand(&tinfo->data_rnd, 0, 10); /* * Start at least 11 bytes into the buffer so we skip leading key information. 
*/ - tinfo->entries[i].offset = (size_t)mmrand(&tinfo->rnd, 20, 40); - tinfo->entries[i].size = (size_t)mmrand(&tinfo->rnd, 0, 10); + tinfo->entries[i].offset = (size_t)mmrand(&tinfo->data_rnd, 20, 40); + tinfo->entries[i].size = (size_t)mmrand(&tinfo->data_rnd, 0, 10); } tinfo->nentries = nentries; @@ -169,6 +170,25 @@ tinfo_init(void) tinfo->state = TINFO_RUNNING; tinfo->quit = false; + + testutil_random_from_random(&tinfo->data_rnd, &g.data_rnd); + testutil_random_from_random(&tinfo->extra_rnd, &g.extra_rnd); + } +} + +/* + * lanes_init -- + * Initialize the lanes structures. + */ +static void +lanes_init(void) +{ + uint32_t lane; + + /* Cleanup for each new run. */ + for (lane = 0; lane < LANE_COUNT; ++lane) { + g.lanes[lane].in_use = false; + g.lanes[lane].last_commit_ts = 0; } } @@ -226,6 +246,14 @@ rollback_to_stable(WT_SESSION *session) /* Check the saved snap operations for consistency. */ snap_repeat_rollback(session, tinfo_list, GV(RUNS_THREADS)); + + /* + * For a predictable run, the final stable timestamp is known and fixed, but individual threads + * may have gone beyond that. Now that we've rolled back, set the current timestamp to the + * stable so that next run starts from a known value. + */ + if (GV(RUNS_PREDICTABLE_REPLAY)) + g.timestamp = g.stable_timestamp; } /* @@ -233,7 +261,7 @@ rollback_to_stable(WT_SESSION *session) * Perform a number of operations in a set of threads. */ void -operations(u_int ops_seconds, bool lastrun) +operations(u_int ops_seconds, u_int run_current, u_int run_total) { SAP sap; TINFO *tinfo, total; @@ -243,9 +271,10 @@ operations(u_int ops_seconds, bool lastrun) wt_thread_t timestamp_tid; int64_t fourths, quit_fourths, thread_ops; uint32_t i; - bool running; + bool lastrun, running; conn = g.wts_conn; + lastrun = (run_current == run_total); /* Make the modify pad character printable to simplify debugging and logging. 
*/ __wt_process.modify_pad_byte = FORMAT_PAD_BYTE; @@ -266,17 +295,30 @@ operations(u_int ops_seconds, bool lastrun) * There are two mechanisms to specify the length of the run, a number of operations and a * timer, when either expire the run terminates. * - * Each thread does an equal share of the total operations (and make sure that it's not 0). + * If we have a number of operations with predictable replay, we set a stop timestamp. Without + * predictable replay, each thread does an equal share of the total operations (and make sure + * that it's not 0). * - * Calculate how many fourth-of-a-second sleeps until the timer expires. If the timer expires - * and threads don't return in 15 minutes, assume there is something hung, and force the quit. + * With a timer, calculate how many fourth-of-a-second sleeps until the timer expires. If the + * timer expires and threads don't return in 15 minutes, assume there is something hung, and + * force the quit. */ + g.stop_timestamp = 0; if (GV(RUNS_OPS) == 0) thread_ops = -1; else { if (GV(RUNS_OPS) < GV(RUNS_THREADS)) GV(RUNS_OPS) = GV(RUNS_THREADS); - thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS); + if (GV(RUNS_PREDICTABLE_REPLAY)) { + /* + * If running with an operation count for predictable replay, ignore other ways of + * stopping. + */ + thread_ops = -1; + ops_seconds = 0; + g.stop_timestamp = (GV(RUNS_OPS) * run_current) / run_total; + } else + thread_ops = GV(RUNS_OPS) / GV(RUNS_THREADS); } if (ops_seconds == 0) fourths = quit_fourths = -1; @@ -290,9 +332,12 @@ operations(u_int ops_seconds, bool lastrun) wt_wrap_open_session(conn, &sap, NULL, &session); /* Initialize and start the worker threads. 
*/ + lanes_init(); tinfo_init(); trace_msg(session, "%s", "=============== thread ops start"); + replay_run_begin(session); + for (i = 0; i < GV(RUNS_THREADS); ++i) { tinfo = tinfo_list[i]; testutil_check(__wt_thread_create(NULL, &tinfo->tid, ops, tinfo)); @@ -356,13 +401,24 @@ operations(u_int ops_seconds, bool lastrun) */ if (lastrun && GV(FORMAT_ABORT)) random_failure(); - tinfo->quit = true; + + /* + * Predictable replay cannot independently tag every thread to stop, we would end up + * with a mix of commits at the end of the run. Rather, later in this loop, when we + * see we are finishing, we give all threads stop timestamp that they must run to, + * but not exceed. + */ + if (!GV(RUNS_PREDICTABLE_REPLAY)) + tinfo->quit = true; } } track_ops(&total); if (!running) break; __wt_sleep(0, 250 * WT_THOUSAND); /* 1/4th of a second */ + + if (fourths == 1 && GV(RUNS_PREDICTABLE_REPLAY)) + replay_end_timed_run(); if (fourths != -1) --fourths; if (quit_fourths != -1 && --quit_fourths == 0) { @@ -417,6 +473,8 @@ operations(u_int ops_seconds, bool lastrun) */ rollback_to_stable(session); + replay_run_end(session); + if (lastrun) { tinfo_teardown(); timestamp_teardown(session); @@ -438,14 +496,18 @@ begin_transaction_ts(TINFO *tinfo) session = tinfo->session; - /* - * Transaction timestamp reads are repeatable, but read timestamps must be before any possible - * commit timestamp. Without a read timestamp, reads are based on the transaction snapshot, - * which will include the latest values as of when the snapshot is taken. Test in both modes: - * 75% of the time, pick a read timestamp before any commit timestamp still in use, 25% of the - * time don't set a timestamp at all. - */ - ts = mmrand(&tinfo->rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed(); + /* Pick a read timestamp. 
*/ + if (GV(RUNS_PREDICTABLE_REPLAY)) + ts = replay_read_ts(tinfo); + else + /* + * Transaction timestamp reads are repeatable, but read timestamps must be before any + * possible commit timestamp. Without a read timestamp, reads are based on the transaction + * snapshot, which will include the latest values as of when the snapshot is taken. Test in + * both modes: 75% of the time, pick a read timestamp before any commit timestamp still in + * use, 25% of the time don't set a timestamp at all. + */ + ts = mmrand(&tinfo->data_rnd, 1, 4) == 1 ? 0 : timestamp_maximum_committed(); if (ts != 0) { wt_wrap_begin_transaction(session, NULL); @@ -459,6 +521,13 @@ begin_transaction_ts(TINFO *tinfo) trace_uri_op(tinfo, NULL, "begin snapshot read-ts=%" PRIu64 " (repeatable)", ts); return; } + + /* + * It should not be possible for a timestamp to age out of the system with predictable + * replay. If a begin transaction were to fail, we'd need to begin the transaction again + * with the same replay timestamp; we can never give up on a timestamp. 
+ */ + testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY)); testutil_assert(ret == EINVAL); testutil_check(session->rollback_transaction(session, NULL)); } @@ -505,7 +574,10 @@ commit_transaction(TINFO *tinfo, bool prepared) if (prepared) lock_readlock(session, &g.prepare_commit_lock); - ts = __wt_atomic_addv64(&g.timestamp, 1); + if (GV(RUNS_PREDICTABLE_REPLAY)) + ts = replay_commit_ts(tinfo); + else + ts = __wt_atomic_addv64(&g.timestamp, 1); testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_COMMIT, ts)); if (prepared) @@ -515,6 +587,7 @@ commit_transaction(TINFO *tinfo, bool prepared) testutil_check(session->commit_transaction(session, NULL)); if (prepared) lock_readunlock(session, &g.prepare_commit_lock); + replay_committed(tinfo); } else testutil_check(session->commit_transaction(session, NULL)); @@ -542,6 +615,7 @@ rollback_transaction(TINFO *tinfo) ++tinfo->rollback; testutil_check(session->rollback_transaction(session, NULL)); + replay_rollback(tinfo); trace_uri_op(tinfo, NULL, "abort read-ts=%" PRIu64, tinfo->read_ts); } @@ -561,12 +635,15 @@ prepare_transaction(TINFO *tinfo) ++tinfo->prepare; - /* - * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the - * prepare timestamp to whatever the global value is now. The subsequent commit will increment - * it, ensuring correctness. - */ - ts = __wt_atomic_fetch_addv64(&g.timestamp, 1); + if (GV(RUNS_PREDICTABLE_REPLAY)) + ts = replay_prepare_ts(tinfo); + else + /* + * Prepare timestamps must be less than or equal to the eventual commit timestamp. Set the + * prepare timestamp to whatever the global value is now. The subsequent commit will + * increment it, ensuring correctness. 
+ */ + ts = __wt_atomic_fetch_addv64(&g.timestamp, 1); testutil_check(session->timestamp_transaction_uint(session, WT_TS_TXN_TYPE_PREPARE, ts)); ret = session->prepare_transaction(session, NULL); @@ -623,6 +700,20 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) tinfo->cursor = table_cursor(tinfo, table->id); /* + * Predictable replay has some restrictions. Someday we may be able to resolve some of these + * restrictions, this may require adding complexity. + * + * We disallow inserts into column stores, as column stores do inserts by expanding the number + * of keys in the table. This has an interplay with other threads that are trying to predictably + * generate key numbers since the key space is growing at a random time. Thus column stores are + * restricted to accessing keys that were inserted via bulk load. + */ + if (GV(RUNS_PREDICTABLE_REPLAY)) { + if (table->type != ROW && op == INSERT) + op = READ; + } + + /* * Truncate has the key set to before/after rows in the table, skip pre-fetch and reserve for * simplicity. * @@ -641,7 +732,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) * position taken from a previous search. If not already doing a read, position the cursor * at an existing point in the tree 20% of the time. */ - if (op != READ && mmrand(&tinfo->rnd, 1, 5) == 1) { + if (op != READ && mmrand(&tinfo->data_rnd, 1, 5) == 1) { ++tinfo->search; ret = read_row(tinfo); if (ret == 0) { @@ -657,7 +748,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) * work, but doesn't make sense. Reserving a row before a read won't be useful but it's not * unexpected. 
*/ - if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->rnd, 0, 20) == 1) { + if (intxn && iso_level == ISOLATION_SNAPSHOT && mmrand(&tinfo->data_rnd, 0, 20) == 1) { switch (table->type) { case ROW: ret = row_reserve(tinfo, positioned); @@ -718,14 +809,14 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) case READ: ++tinfo->search; - if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 2) == 1) { + if (!positioned && GV(OPS_BOUND_CURSOR) && mmrand(&tinfo->extra_rnd, 1, 2) == 1) { bound_set = true; /* * FIXME-WT-9883: It is possible that the underlying cursor is still positioned even * though the positioned variable is false. Reset the position through reset for now. */ testutil_check(tinfo->cursor->reset(tinfo->cursor)); - apply_bounds(tinfo->cursor, tinfo->table); + apply_bounds(tinfo->cursor, tinfo->table, &tinfo->extra_rnd); } ret = read_row(tinfo); @@ -803,8 +894,8 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) * direction. */ if (positioned) { - next = mmrand(&tinfo->rnd, 0, 1) == 1; - j = mmrand(&tinfo->rnd, 1, 100); + next = mmrand(&tinfo->extra_rnd, 0, 1) == 1; + j = mmrand(&tinfo->extra_rnd, 1, 100); for (i = 0; i < j; ++i) { if ((ret = nextprev(tinfo, next)) == 0) continue; @@ -818,7 +909,7 @@ table_op(TINFO *tinfo, bool intxn, iso_level_t iso_level, thread_op op) * Reset the cursor: there is no reason to keep pages pinned, periodically forcibly evict the * underlying page. 
*/ - evict_page = mmrand(&tinfo->rnd, 1, 20) == 1; + evict_page = mmrand(&tinfo->extra_rnd, 1, 20) == 1; if (evict_page) F_SET(tinfo->cursor, WT_CURSTD_DEBUG_RESET_EVICT); testutil_check(tinfo->cursor->reset(tinfo->cursor)); @@ -865,7 +956,7 @@ ops(void *arg) iso_level_t iso_level; thread_op op; uint64_t reset_op, session_op, truncate_op; - uint32_t max_rows, range, rnd; + uint32_t max_rows, ntries, range, rnd; u_int i; const char *iso_config; bool greater_than, intxn, prepared; @@ -878,25 +969,59 @@ ops(void *arg) * pound on the same key/value pairs, that is, by making them traverse the same RNG space. 75% * of the time we run in independent RNG space. */ - if (GV(FORMAT_INDEPENDENT_THREAD_RNG)) - __wt_random_init_seed(NULL, &tinfo->rnd); - else - __wt_random_init(&tinfo->rnd); + if (GV(FORMAT_INDEPENDENT_THREAD_RNG)) { + testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED) + (u_int)tinfo->id); + testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED) + (u_int)tinfo->id); + } else { + testutil_random_from_seed(&tinfo->data_rnd, GV(RANDOM_DATA_SEED)); + testutil_random_from_seed(&tinfo->extra_rnd, GV(RANDOM_EXTRA_SEED)); + } iso_level = ISOLATION_SNAPSHOT; /* -Wconditional-uninitialized */ + tinfo->replay_again = false; + tinfo->lane = LANE_NONE; /* Set the first operation where we'll create a new session and cursors. */ session = NULL; session_op = 0; + ntries = 0; /* Set the first operation where we'll reset the session. */ - reset_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND); + reset_op = mmrand(&tinfo->extra_rnd, 100, 10 * WT_THOUSAND); /* Set the first operation where we'll truncate a range. */ - truncate_op = mmrand(&tinfo->rnd, 100, 10 * WT_THOUSAND); + truncate_op = mmrand(&tinfo->data_rnd, 100, 10 * WT_THOUSAND); for (intxn = false; !tinfo->quit;) { +rollback_retry: + if (tinfo->quit) + break; + ++tinfo->ops; + if (!tinfo->replay_again) + /* + * Number of failures so far for the current operation and key. 
In predictable replay, + * unless we have a read operation, we cannot give up on any operation and maintain the + * integrity of the replay. + */ + ntries = 0; + + /* Number of tries only gets incremented during predictable replay. */ + testutil_assert(ntries == 0 || (!intxn && tinfo->replay_again)); + + /* + * In predictable replay, put each operation in its own transaction. It's possible we could + * make multiple operations work predictably in the future. + */ + if (intxn && GV(RUNS_PREDICTABLE_REPLAY)) { + commit_transaction(tinfo, false); + intxn = false; + } + + replay_loop_begin(tinfo, intxn); + if (tinfo->quit) + break; + /* Periodically open up a new session and cursors. */ if (tinfo->ops > session_op) { /* Resolve any running transaction. */ @@ -909,7 +1034,7 @@ ops(void *arg) session = tinfo->session; /* Pick the next session/cursor close/open. */ - session_op += mmrand(&tinfo->rnd, 100, 5 * WT_THOUSAND); + session_op += mmrand(&tinfo->extra_rnd, 100, 5 * WT_THOUSAND); } /* If not in a transaction, reset the session periodically so that operation is tested. */ @@ -917,20 +1042,20 @@ ops(void *arg) testutil_check(session->reset(session)); /* Pick the next reset operation. */ - reset_op += mmrand(&tinfo->rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND); + reset_op += mmrand(&tinfo->extra_rnd, 40 * WT_THOUSAND, 60 * WT_THOUSAND); } /* * If not in a transaction and in a timestamp world, occasionally repeat timestamped * operations. */ - if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->rnd, 1, 15) == 1) { + if (!intxn && g.transaction_timestamps_config && mmrand(&tinfo->extra_rnd, 1, 15) == 1) { ++tinfo->search; snap_repeat_single(tinfo); } /* Select a table. 
*/ - table = tinfo->table = table_select(tinfo); + table = tinfo->table = table_select(tinfo, true); /* * If not in a transaction and in a timestamp world, start a transaction (which is always at @@ -946,14 +1071,15 @@ ops(void *arg) intxn = true; } if (!intxn) { + testutil_assert(!GV(RUNS_PREDICTABLE_REPLAY)); iso_level = ISOLATION_IMPLICIT; - if (table->mirror || mmrand(&tinfo->rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) { + if (table->mirror || mmrand(&tinfo->data_rnd, 1, 100) < GV(TRANSACTION_IMPLICIT)) { iso_level = ISOLATION_SNAPSHOT; iso_config = "isolation=snapshot"; /* Occasionally do reads at an isolation level lower than snapshot. */ - switch (mmrand(NULL, 1, 20)) { + switch (mmrand(&tinfo->data_rnd, 1, 20)) { case 1: iso_level = ISOLATION_READ_COMMITTED; /* 5% */ iso_config = "isolation=read-committed"; @@ -975,7 +1101,7 @@ ops(void *arg) */ op = READ; if (iso_level == ISOLATION_IMPLICIT || iso_level == ISOLATION_SNAPSHOT) { - i = mmrand(&tinfo->rnd, 1, 100); + i = mmrand(&tinfo->data_rnd, 1, 100); if (i < TV(OPS_PCT_DELETE)) { op = REMOVE; if (TV(OPS_TRUNCATE) && tinfo->ops > truncate_op) { @@ -986,7 +1112,7 @@ ops(void *arg) op = TRUNCATE; /* Pick the next truncate operation. */ - truncate_op += mmrand(&tinfo->rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND); + truncate_op += mmrand(&tinfo->data_rnd, 20 * WT_THOUSAND, 100 * WT_THOUSAND); } } else if (i < TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT)) op = INSERT; @@ -996,6 +1122,10 @@ ops(void *arg) TV(OPS_PCT_DELETE) + TV(OPS_PCT_INSERT) + TV(OPS_PCT_MODIFY) + TV(OPS_PCT_WRITE)) op = UPDATE; } + tinfo->op = op; /* Keep the op in the thread info for debugging */ + + /* Make sure this is an operation that is permitted for this kind of run. */ + testutil_assert(replay_operation_enabled(op)); /* * Get the number of rows. 
Column-store extends the object, use that extended count if this @@ -1005,7 +1135,8 @@ ops(void *arg) max_rows = TV(RUNS_ROWS); if (table->type != ROW && !table->mirror) WT_ORDERED_READ(max_rows, table->rows_current); - tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows); + tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows); + replay_adjust_key(tinfo, max_rows); /* * If the operation is a truncate, select a range. @@ -1020,9 +1151,9 @@ ops(void *arg) * from lower keys to higher keys or vice-versa). */ if (op == TRUNCATE) { - tinfo->last = tinfo->keyno = mmrand(&tinfo->rnd, 1, (u_int)max_rows); - greater_than = mmrand(&tinfo->rnd, 0, 1) == 1; - range = max_rows < 20 ? 0 : mmrand(&tinfo->rnd, 0, (u_int)max_rows / 50); + tinfo->last = tinfo->keyno = mmrand(&tinfo->data_rnd, 1, (u_int)max_rows); + greater_than = mmrand(&tinfo->data_rnd, 0, 1) == 1; + range = max_rows < 20 ? 0 : mmrand(&tinfo->data_rnd, 0, (u_int)max_rows / 50); if (greater_than) { if (TV(BTREE_REVERSE)) { if (tinfo->keyno <= range) @@ -1069,9 +1200,10 @@ ops(void *arg) */ if (op == INSERT || op == UPDATE) { if (table->type == FIX && table->mirror) - val_gen(g.base_mirror, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); + val_gen( + g.base_mirror, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); else - val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); + val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); } /* @@ -1085,7 +1217,7 @@ ops(void *arg) if (table->type != FIX || table->mirror) modify_build(tinfo); else - val_gen(table, &tinfo->rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); + val_gen(table, &tinfo->data_rnd, tinfo->new_value, &tinfo->bitv, tinfo->keyno); } /* @@ -1106,7 +1238,14 @@ ops(void *arg) * skip the operation. This isn't to avoid wasted work: any FLCS table in the mirrored * will do an update as FLCS doesn't support modify, and we'll fail when we compare the * remove to the FLCS value. 
+ * + * For predictable replay if the record doesn't exist (that's predictable), and we must + * force a rollback, we always finish a loop iteration in a committed or rolled back + * state. */ + if (GV(RUNS_PREDICTABLE_REPLAY) && (ret == WT_ROLLBACK || tinfo->op_ret == WT_NOTFOUND)) + goto rollback; + if (tinfo->op_ret == WT_NOTFOUND) goto skip_operation; @@ -1116,6 +1255,8 @@ ops(void *arg) tinfo->table = table; ret = table_op(tinfo, intxn, iso_level, op); testutil_assert(ret == 0 || ret == WT_ROLLBACK); + if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK) + goto rollback; skip2 = table; } if (ret == 0 && table->mirror) @@ -1124,6 +1265,8 @@ ops(void *arg) tinfo->table = tables[i]; ret = table_op(tinfo, intxn, iso_level, op); testutil_assert(ret == 0 || ret == WT_ROLLBACK); + if (GV(RUNS_PREDICTABLE_REPLAY) && ret == WT_ROLLBACK) + goto rollback; if (ret == WT_ROLLBACK) break; } @@ -1144,9 +1287,22 @@ skip_operation: /* * If not in a transaction, we're done with this operation. If in a transaction, add more - * operations to the transaction half the time. + * operations to the transaction half the time. For predictable replay runs, always complete + * the transaction. */ - if (!intxn || (rnd = mmrand(&tinfo->rnd, 1, 10)) > 5) + if (GV(RUNS_PREDICTABLE_REPLAY)) { + rnd = mmrand(&tinfo->data_rnd, 1, 5); + + /* + * Note that a random value of 5 would result in a rollback per the switch below. For + * predictable replay, only do that once per timestamp. If we didn't have this check, a + * retry would start again with the same timestamp and RNG state, and get the same dice + * roll. This would happen every time and the thread will be get stuck doing continuous + * rollbacks. + */ + if (rnd == 5 && ntries != 0) + rnd = 4; /* Choose to do a commit this time. */ + } else if (!intxn || (rnd = mmrand(&tinfo->data_rnd, 1, 10)) > 5) continue; /* @@ -1168,7 +1324,7 @@ skip_operation: * timestamped world, which means we're in a snapshot-isolation transaction by definition. 
*/ prepared = false; - if (GV(OPS_PREPARE) && mmrand(&tinfo->rnd, 1, 10) == 1) { + if (GV(OPS_PREPARE) && mmrand(&tinfo->data_rnd, 1, 10) == 1) { if ((ret = prepare_transaction(tinfo)) != 0) { testutil_assert(ret == WT_ROLLBACK); goto rollback; @@ -1191,6 +1347,18 @@ skip_operation: break; case 5: /* 10% */ rollback: + if (GV(RUNS_PREDICTABLE_REPLAY)) { + if (tinfo->quit) + goto loop_exit; + /* Force a rollback */ + testutil_assert(intxn); + rollback_transaction(tinfo); + intxn = false; + ++ntries; + replay_pause_after_rollback(tinfo, ntries); + ret = 0; + goto rollback_retry; + } __wt_yield(); /* Encourage races */ rollback_transaction(tinfo); snap_repeat_update(tinfo, false); @@ -1200,6 +1368,7 @@ rollback: intxn = false; } +loop_exit: if (session != NULL) testutil_check(session->close(session, NULL)); tinfo->session = NULL; @@ -1238,7 +1407,11 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W break; } - if (sn) { + /* + * We don't use search near for predictable replay runs, as the return key can be variable + * depending on the structure of the Btree. + */ + if (sn && !GV(RUNS_PREDICTABLE_REPLAY)) { ret = read_op(cursor, SEARCH_NEAR, &exact); if (ret == 0 && exact != 0) ret = WT_NOTFOUND; @@ -1293,7 +1466,7 @@ read_row_worker(TINFO *tinfo, TABLE *table, WT_CURSOR *cursor, uint64_t keyno, W * Apply lower and upper bounds on the cursor. The lower and upper bound is randomly generated. */ static void -apply_bounds(WT_CURSOR *cursor, TABLE *table) +apply_bounds(WT_CURSOR *cursor, TABLE *table, WT_RAND_STATE *rnd) { WT_ITEM key; uint32_t lower_keyno, max_rows, upper_keyno; @@ -1310,7 +1483,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table) * Generate a random lower key and apply to the lower bound or upper bound depending on the * reverse collator. */ - lower_keyno = mmrand(NULL, 1, max_rows); + lower_keyno = mmrand(rnd, 1, max_rows); /* Retrieve the key/value pair by key. 
*/ switch (table->type) { case FIX: @@ -1331,7 +1504,7 @@ apply_bounds(WT_CURSOR *cursor, TABLE *table) * Generate a random upper key and apply to the upper bound or lower bound depending on the * reverse collator. */ - upper_keyno = mmrand(NULL, lower_keyno, max_rows); + upper_keyno = mmrand(rnd, lower_keyno, max_rows); /* Retrieve the key/value pair by key. */ switch (table->type) { @@ -1371,20 +1544,22 @@ clear_bounds(WT_CURSOR *cursor, TABLE *table) * Read and verify a subset of the elements in a file. */ void -wts_read_scan(TABLE *table, void *arg) +wts_read_scan(TABLE *table, void *args) { SAP sap; WT_CONNECTION *conn; WT_CURSOR *cursor; WT_DECL_RET; WT_ITEM key, value; + WT_RAND_STATE *rnd; WT_SESSION *session; uint64_t keyno; uint32_t max_rows; uint8_t bitv; - conn = (WT_CONNECTION *)arg; testutil_assert(table != NULL); + conn = ((READ_SCAN_ARGS *)args)->conn; + rnd = ((READ_SCAN_ARGS *)args)->rnd; /* * We're not configuring transactions or read timestamps: if there's a diagnostic check that all @@ -1406,14 +1581,14 @@ wts_read_scan(TABLE *table, void *arg) WT_ORDERED_READ(max_rows, table->rows_current); for (keyno = 0; keyno < max_rows;) { if (++keyno > 50) - keyno += mmrand(NULL, 1, WT_THOUSAND); + keyno += mmrand(rnd, 1, WT_THOUSAND); if (keyno > max_rows) keyno = max_rows; - if (GV(OPS_BOUND_CURSOR) && mmrand(NULL, 1, 10) == 1) { + if (GV(OPS_BOUND_CURSOR) && mmrand(rnd, 1, 10) == 1) { /* Reset the position of the cursor, so that we can apply bounds on the cursor. */ testutil_check(cursor->reset(cursor)); - apply_bounds(cursor, table); + apply_bounds(cursor, table, rnd); } switch (ret = read_row_worker(NULL, table, cursor, keyno, &key, &value, &bitv, false)) { @@ -1444,7 +1619,7 @@ read_row(TINFO *tinfo) { /* 25% of the time we call search-near. 
*/ return (read_row_worker(tinfo, NULL, tinfo->cursor, tinfo->keyno, tinfo->key, tinfo->value, - &tinfo->bitv, mmrand(&tinfo->rnd, 0, 3) == 1)); + &tinfo->bitv, mmrand(&tinfo->extra_rnd, 0, 3) == 1)); } /* @@ -1563,7 +1738,7 @@ modify(TINFO *tinfo, WT_CURSOR *cursor, bool positioned) bool modify_check; /* Periodically verify the WT_CURSOR.modify return. */ - modify_check = positioned && mmrand(&tinfo->rnd, 1, 20) == 1; + modify_check = positioned && mmrand(&tinfo->extra_rnd, 1, 20) == 1; if (modify_check) { testutil_check(cursor->get_value(cursor, &tinfo->moda)); testutil_check( @@ -1794,7 +1969,7 @@ row_insert(TINFO *tinfo, bool positioned) * Otherwise, generate a unique key and insert (or update an already inserted record). */ if (!positioned) { - key_gen_insert(tinfo->table, &tinfo->rnd, tinfo->key, tinfo->keyno); + key_gen_insert(tinfo->table, &tinfo->data_rnd, tinfo->key, tinfo->keyno); cursor->set_key(cursor, tinfo->key); } cursor->set_value(cursor, tinfo->new_value); diff --git a/src/third_party/wiredtiger/test/format/random.c b/src/third_party/wiredtiger/test/format/random.c index c88fe8e21f7..859186fa29f 100644 --- a/src/third_party/wiredtiger/test/format/random.c +++ b/src/third_party/wiredtiger/test/format/random.c @@ -72,11 +72,11 @@ random_kv(void *arg) simple = !simple; /* Select a table and open a cursor. */ - table = table_select_type(ROW); + table = table_select_type(ROW, false); wt_wrap_open_cursor(session, table->uri, config, &cursor); /* This is just a smoke-test, get some key/value pairs. */ - for (i = mmrand(NULL, 0, WT_THOUSAND); i > 0; --i) { + for (i = mmrand(&g.extra_rnd, 0, WT_THOUSAND); i > 0; --i) { switch (ret = cursor->next(cursor)) { case 0: break; @@ -95,7 +95,7 @@ random_kv(void *arg) testutil_check(cursor->close(cursor)); /* Sleep for some number of seconds. */ - period = mmrand(NULL, 1, 10); + period = mmrand(&g.extra_rnd, 1, 10); /* Sleep for short periods so we don't make the run wait. 
*/ while (period > 0 && !g.workers_finished) { diff --git a/src/third_party/wiredtiger/test/format/replay.c b/src/third_party/wiredtiger/test/format/replay.c new file mode 100644 index 00000000000..e730119d70f --- /dev/null +++ b/src/third_party/wiredtiger/test/format/replay.c @@ -0,0 +1,548 @@ +/*- + * Public Domain 2014-present MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "format.h" + +/* + * Predictable replay is the ability to do test runs multiple times and always have predictable + * changes made at every timestamp. Two predictable runs with the same starting data seed executed + * up to the same timestamp will always have their data compare identically. 
Predictable replay only + * works with timestamped transactions and to avoid complexity, only a single operation is allowed + * in a transaction. + * + * To achieve the predictability we use two random number generators (the data RNG and the extra + * RNG) with known start seeds, the data seed and the extra seed. Every single-threaded modification + * (like bulk loading) when deciding on a random course, uses the global data RNG, which is seeded + * by the data seed. Global decisions that don't affect data, like whether to turn on verbose, or + * even the rate of checkpointing, use the global extra RNG, which is seeded by the extra seed. + * Changing the extra seed may change some characteristics of how a workload is tested, but should + * not change any data on disk. When worker threads run, they have their own data and extra RNGs, + * and these are seeded by the timestamp they are working on. + * + * Before a worker thread can decide on what operation to do on which key in which table, it must + * obtain the next timestamp. Timestamps are doled out atomically, so no two worker threads can ever + * perform operations using the same timestamp. The timestamp is XOR-ed with the data seed, the + * result is the seed of the thread's private data RNG for the duration of that operation. Likewise, + * a private extra RNG is seeded from the timestamp and the extra seed. This ensures that all + * decisions about what is committed at that timestamp are predictable based on the timestamp. As + * you might expect, the thread's data RNG is used to decide what operation to do, which table to + * use, and which key within the table. Other random decisions, like whether to reopen a session, or + * whether to repeat a read from the snap list, use the extra RNG. + * + * Note that once a thread has started to work on an operation at a timestamp, it cannot give up on + * the effort. If, for example, a rollback error naturally happens, we can rollback the transaction. 
+ * However, immediately getting a new timestamp would mean that we would lose the consequences of + * the previous timestamp, perhaps a record would not be updated in a particular way. Thus, after a + * rollback, a thread starts again, using the same timestamp it had before, and it seeds its RNGs + * again using this timestamp. This gives full predictability, even in the face of temporary + * failures. + * + * To avoid the possibility that two threads work on the same key at the same time, we have the + * concept of lanes, and only one thread can be working in a lane at once. There are LANE_COUNT + * lanes, where LANE_COUNT is 2^k for some k. A thread uses a data RNG to choose the top bits of a + * key number, but the bottom k bits of the key number are set to the bottom k bits of the timestamp + * being worked on. Those bottom k bits also determine the lane we are in. Each lane has a flag that + * indicates whether the lane is in use by some operation. If thread T1 working on an operation at + * timestamp X takes a sufficiently long time relative to other operations, it may be that the + * current timestamp has advanced to X + LANE_COUNT. If that is the case, a different thread T2 that + * gets that larger timestamp will see that the lane is occupied. Rather than using that timestamp + * and potentially getting the same key number, T2 leaves that timestamp, knowing that T1 will + * do it, and advances to another timestamp to work on. When T1 finishes its long operation, it will + * notice if there are other timestamps that have been left for it. If so, it keeps the lane + * occupied, and works on the new timestamp. At some point, it will notice that all the timestamps + * in the lane have been processed up to that point, and it can release the lane, and go back to + * choosing the next available timestamp to process. + * + * Having some operations lag behind is a natural part of processing. This leads to a stable + * timestamp that may lag significantly.
Due to the possibility of dependencies between operations, + * the more lag, the more chance that a rollback error occurs. Without predictable replay, this is + * not a problem: any operation that produces a rollback can be freely abandoned, and threads + * generally continue moving quickly ahead with more work. However, with predictable replay, no + * operation can be abandoned, and an operation that failed because of a dependency will repeatedly + * fail until the stable timestamp advances. For that reason, we keep calculating and moving the + * stable timestamp ahead at a much faster pace when predictable replay is configured. We also use + * an algorithm that only uses lanes that are in use to calculate the stable timestamp. This is safe + * and more responsive than the default calculation. And when there is a rollback error, we try to + * be smart about whether we need to yield or pause. These modifications allow predictable replay performance to + * be on par with regular performance. + */ + +/* + * replay_end_timed_run -- + * In a timed run, get everyone to stop by publishing a stop timestamp. + */ +void +replay_end_timed_run(void) +{ + /* + * We'll post a stop timestamp that all worker threads should abide by. There's a potential race + * between when we read the current timestamp and before we publish the stop timestamp. During + * that time, other threads could do work and advance the current timestamp, potentially beyond + * the intended stop timestamp. We pick a stop timestamp far enough in the future (0x10000 timestamps ahead) that it's + * rather unlikely to happen. + */ + WT_PUBLISH(g.stop_timestamp, g.timestamp + 0x10000); +} + +/* + * replay_maximum_committed -- + * For predictable replay runs, return the largest timestamp that's no longer in use. The value may be a cached, older one. + */ +uint64_t +replay_maximum_committed(void) +{ + uint64_t commit_ts, ts; + uint32_t lane; + + /* + * The calculation is expensive, and does not need to be accurate all the time, and it's okay to + * be behind. So we use a cached value most of the time.
+ */ + ts = g.replay_cached_committed; + if (ts == 0 || __wt_atomic_addv32(&g.replay_calculate_committed, 1) % 20 == 0) { + WT_ORDERED_READ(ts, g.timestamp); /* Start from the current timestamp and lower it below. */ + testutil_check(pthread_rwlock_wrlock(&g.lane_lock)); + for (lane = 0; lane < LANE_COUNT; ++lane) { + if (g.lanes[lane].in_use) { + commit_ts = g.lanes[lane].last_commit_ts; + if (commit_ts != 0) + ts = WT_MIN(ts, commit_ts); + } + } + if (ts == 0) + ts = 1; /* A cached zero would force a recalculation on every call (see the check above). */ + g.replay_cached_committed = ts; + testutil_check(pthread_rwlock_unlock(&g.lane_lock)); + } + return (ts); +} + +/* + * replay_operation_enabled -- + * Return whether an operation type should be enabled in the configuration. + */ +bool +replay_operation_enabled(thread_op op) +{ + if (!GV(RUNS_PREDICTABLE_REPLAY)) + return (true); + + /* + * We don't permit modify operations with predictable replay. + * + * The problem is read timestamps. As currently implemented, the read timestamp selected is + * variable, based on the state of other threads and their progress with other timestamped + * operations. And if two changes are made to the same key in a short amount of time, if the + * second operation were to be performed sometimes with a read timestamp before the first + * operation, and sometimes with a read timestamp after the first operation, then the results + * would be variable. + * + * We could track recent operations on a key (in its lane, for instance), but when we realize + * the read timestamp isn't recent enough, we would need to wait for the stable timestamp to + * move forward (and our waiting can affect/delay other threads' operations as well). Having the + * stable timestamp move forward is the only way our read timestamp can progress. + * + * Another possibility that also involves tracking recent operations on a key would be to + * disallow modifies that occur within, say, 10000 timestamps of a previous write operation on + * the same key. Those modifies could be silently converted to reads, for instance.
If our read + * timestamp was more than 10000 timestamps behind, we'd still need to wait for the stable + * timestamp to catch up. + */ + if (op == MODIFY) + return (false); + + /* + * FIXME-WT-10570. We don't permit remove operations with predictable replay. + * + * This should be something we can and should fix. The problem may be similar to the problem + * with modify, where having a varying read timestamp can cause different results for different + * runs. + */ + if (op == REMOVE) + return (false); + + /* + * We don't permit truncate operations with predictable replay. + * + * Currently, we use an operation's timestamp to help derive the operation's key. + * The last N bits of the timestamp are used as the last bits of the key (where + * 2^N == LANE_COUNT). These last N bits give the lane number, and within each + * lane we track the progress of operations for that lane. Using lanes, we can + * track and guarantee that only a single operation is active in a lane at once, + * and therefore we can't have multiple operations on a single key performed out + * of order or simultaneously. The truncate operation, for a small set of keys, + * would reserve multiple consecutive lanes (probably okay) and for larger sets, + * would reserve the entire set of lanes. This would effectively require all + * threads to get into a holding state, waiting for the truncate to start and then + * complete before continuing with their next operation. While we could fudge this + * in certain ways (e.g. operations within 10000 timestamps of a truncate would be + * forced to stay out of its table), there still would be a lot of details, and + * some rethink of our lane strategy. Even if we got this to work, we would have + * a truncate that had the whole table to itself, which doesn't seem like an + * effective test. + */ + if (op == TRUNCATE) + return (false); + + return (true); +} + +/* + * replay_pick_timestamp -- + * Pick the next timestamp for this operation.
That timestamp is used for any commits and also + * determines which lane we are in, to prevent races from occurring on operations on a single + * key. By using the timestamp to seed the random number generators, it also determines + * precisely the nature of the operation. + */ +static void +replay_pick_timestamp(TINFO *tinfo) +{ + uint64_t replay_seed, stop_ts, ts; + uint32_t lane; + bool in_use; + + /* + * Choose a unique timestamp for commits when we do predictable replay. If the field for + * replaying again is set, we already have a timestamp picked for us. + */ + if (tinfo->replay_again) { + /* + * Timestamp is already picked for us, and we still hold its lane. + */ + testutil_assert(tinfo->lane == LANE_NUMBER(tinfo->replay_ts)); + tinfo->replay_again = false; + } else { + testutil_assert(tinfo->lane == LANE_NONE); + + stop_ts = g.stop_timestamp; /* Zero means no stop timestamp has been posted. */ + if (stop_ts != 0 && g.stable_timestamp >= stop_ts && tinfo->replay_ts == 0) { + tinfo->quit = true; /* The run is over for this thread, no timestamp or lane is taken. */ + return; + } + + testutil_check(pthread_rwlock_wrlock(&g.lane_lock)); + do { + /* + * For predictable replay, this is the only place we increment the timestamp. We keep a + * copy to check that assumption. If we were to mistakenly change the timestamp + * elsewhere (as might be done in non-predictable runs), we would lose the integrity of + * the predictable run. + */ + testutil_assert(g.timestamp_copy == g.timestamp); + ts = __wt_atomic_addv64(&g.timestamp, 1); + g.timestamp_copy = g.timestamp; + lane = LANE_NUMBER(ts); + WT_ORDERED_READ(in_use, g.lanes[lane].in_use); + } while (in_use); /* Skip timestamps whose lane is busy, the thread holding the lane performs them later. */ + + tinfo->replay_ts = ts; + WT_PUBLISH(g.lanes[lane].in_use, true); + testutil_check(pthread_rwlock_unlock(&g.lane_lock)); + tinfo->lane = lane; + } + + testutil_assert(tinfo->lane != LANE_NONE); + testutil_assert(g.lanes[tinfo->lane].in_use); + + /* + * For this operation, seed the RNG used for data operations according to the timestamp and the + * global data seed.
This allows us to have a predictable set of actions related to commits at + * this timestamp, so long as we are running with the same global data seed. The extra RNG is seeded the same way from the global extra seed. + */ + replay_seed = tinfo->replay_ts ^ GV(RANDOM_DATA_SEED); + testutil_random_from_seed(&tinfo->data_rnd, replay_seed); + replay_seed = tinfo->replay_ts ^ GV(RANDOM_EXTRA_SEED); + testutil_random_from_seed(&tinfo->extra_rnd, replay_seed); +} + +/* + * replay_loop_begin -- + * Called at the top of the operation loop; for predictable replay runs, pick the timestamp for the next operation. + */ +void +replay_loop_begin(TINFO *tinfo, bool intxn) +{ + if (GV(RUNS_PREDICTABLE_REPLAY)) { + /* + * Predictable replay, as it works now, requires that we're not in a transaction when we start + * the loop. + */ + testutil_assert(!intxn); + + /* + * We're here at the start of the loop for one of four reasons: + * 1) We needed to roll back the transaction, so we didn't give up our replay timestamp, + * and we set the again flag. + * 2) We successfully committed the last transaction, but our lane was behind, + * and was skipped over, so we're obligated to perform the next timestamp in our lane. + * In that case, we have a replay timestamp and the again flag is set. + * 3) We successfully committed the last transaction, and our lane was not behind. + * We don't have a replay timestamp and the again flag is off. + * 4) It's our first time through the loop; this is equivalent to the previous case. + */ + testutil_assert(tinfo->replay_again == (tinfo->replay_ts != 0)); + /* + * Choose a unique timestamp for commits, based on the conditions above. + */ + replay_pick_timestamp(tinfo); + + testutil_assert(tinfo->quit || tinfo->replay_ts != 0); + } +} + +/* + * replay_run_reset -- + * Called at beginning and end of runs to reset the lanes and the per-thread replay state. + */ +static void +replay_run_reset(void) +{ + TINFO *tinfo, **tlp; + uint64_t ts; + uint32_t lane; + + /* Set every lane's commit timestamp to the current timestamp.
*/ + ts = g.timestamp; + g.timestamp_copy = ts; + for (lane = 0; lane < LANE_COUNT; ++lane) + g.lanes[lane].last_commit_ts = ts; + g.replay_cached_committed = ts; + + /* Reset fields in tinfo. No thread holds a lane between runs, and lane 0 is a real lane, so use LANE_NONE. */ + if (tinfo_list != NULL) + for (tlp = tinfo_list; *tlp != NULL; ++tlp) { + tinfo = *tlp; + tinfo->replay_again = false; + tinfo->replay_ts = 0; + tinfo->lane = LANE_NONE; + tinfo->op = (thread_op)0; + } +} + +/* + * replay_run_begin -- + * Called at the beginning of a run; resets the replay state for predictable replay runs. + */ +void +replay_run_begin(WT_SESSION *session) +{ + (void)session; + + if (GV(RUNS_PREDICTABLE_REPLAY)) + replay_run_reset(); +} + +/* + * replay_run_end -- + * Called when finishing processing for a run; resets the replay state for predictable replay runs. + */ +void +replay_run_end(WT_SESSION *session) +{ + (void)session; + + if (GV(RUNS_PREDICTABLE_REPLAY)) + replay_run_reset(); +} + +/* + * replay_read_ts -- + * Return a read timestamp for a begin transaction call: the maximum committed timestamp. + */ +uint64_t +replay_read_ts(TINFO *tinfo) +{ + uint64_t commit_ts; + + testutil_assert(GV(RUNS_PREDICTABLE_REPLAY) && tinfo->lane != LANE_NONE && + g.lanes[tinfo->lane].in_use && tinfo->replay_ts != 0); + + commit_ts = replay_maximum_committed(); + testutil_assert(commit_ts != 0); + return (commit_ts); +} + +/* + * replay_prepare_ts -- + * Return a timestamp to be used for prepare. + */ +uint64_t +replay_prepare_ts(TINFO *tinfo) +{ + uint64_t prepare_ts, ts; + + testutil_assert(GV(RUNS_PREDICTABLE_REPLAY)); + + /* See if we're just starting a run. */ + if (tinfo->replay_ts == 0 || tinfo->replay_ts <= g.replay_start_timestamp + LANE_COUNT) + /* + * When we're starting a run, we'll just use the final commit timestamp for our prepare + * timestamp. We know that's safe. + */ + prepare_ts = tinfo->replay_ts; + else { + /* + * Our lane's current operation will have a commit timestamp tinfo->replay_ts. Our lane's + * previous commit timestamp was that number minus LANE_COUNT. The global stable timestamp + * generally should not be advanced past our lane's previous commit timestamp.
So a prepare + * timestamp halfway between the lane's previous commit timestamp and the current commit + * timestamp should be valid. + */ + ts = tinfo->replay_ts - LANE_COUNT / 2; + + /* As a sanity check, make sure the timestamp hasn't completely aged out. */ + if (ts < g.oldest_timestamp) + prepare_ts = ts; + else + prepare_ts = tinfo->replay_ts; + } + return (prepare_ts); +} + +/* + * replay_commit_ts -- + * Return the commit timestamp. + */ +uint64_t +replay_commit_ts(TINFO *tinfo) +{ + testutil_assert(GV(RUNS_PREDICTABLE_REPLAY)); + + testutil_assert(tinfo->replay_ts != 0); + return (tinfo->replay_ts); +} + +/* + * replay_committed -- + * Called when a transaction was successfully committed. We can give up a lane if appropriate. + */ +void +replay_committed(TINFO *tinfo) +{ + uint32_t lane; + + if (!GV(RUNS_PREDICTABLE_REPLAY)) + return; + + testutil_assert(tinfo->replay_ts != 0); + + lane = tinfo->lane; + testutil_assert(!tinfo->replay_again); + testutil_check(pthread_rwlock_wrlock(&g.lane_lock)); + + /* + * Updating the last commit timestamp for a lane in use allows read, oldest and stable + * timestamps to advance. + */ + WT_PUBLISH(g.lanes[lane].last_commit_ts, tinfo->replay_ts); + if (g.timestamp <= tinfo->replay_ts + LANE_COUNT) { + WT_PUBLISH(g.lanes[lane].in_use, false); + tinfo->lane = LANE_NONE; + tinfo->replay_ts = 0; + } else { + tinfo->replay_ts += LANE_COUNT; + tinfo->replay_again = true; + } + testutil_check(pthread_rwlock_unlock(&g.lane_lock)); +} + +/* + * replay_adjust_key -- + * Given a fully random key number, modify the key that is in our lane. + */ +void +replay_adjust_key(TINFO *tinfo, uint64_t max_rows) +{ + uint64_t keyno; + uint32_t lane; + + if (GV(RUNS_PREDICTABLE_REPLAY)) { + lane = tinfo->lane; + keyno = (tinfo->keyno & ~(LANE_COUNT - 1)) | lane; + + if (keyno == 0) + keyno = LANE_COUNT; + else if (keyno >= max_rows) + keyno -= LANE_COUNT; + + tinfo->keyno = keyno; + } +} + +/* + * replay_rollback -- + * Called after a rollback. 
+ */ +void +replay_rollback(TINFO *tinfo) +{ + if (!GV(RUNS_PREDICTABLE_REPLAY)) + return; + + /* + * After a rollback, we don't give up our timestamp or our lane, we need to retry at the top of + * the operations loop. + */ + tinfo->replay_again = true; + + testutil_assert(tinfo->replay_ts != 0); + testutil_assert(tinfo->lane != LANE_NONE); + testutil_assert(g.lanes[tinfo->lane].in_use); +} + +/* + * replay_pause_after_rollback -- + * Called after a rollback, allowing us to yield or pause. The ntries argument scales the sleep time. + */ +void +replay_pause_after_rollback(TINFO *tinfo, uint32_t ntries) +{ + uint64_t high, low, mid; + + if (!GV(RUNS_PREDICTABLE_REPLAY)) + return; + + /* Generally, the more behind we are, the less we want to wait. Find the midpoint of the range of timestamps in progress. */ + low = replay_maximum_committed(); + high = g.timestamp; + mid = (high + low) / 2; + + /* If we're in the furthest group behind (within one lane cycle of the oldest commit), don't wait at all. */ + if (tinfo->replay_ts <= low + LANE_COUNT) + return; + + /* + * If we're in the last half, don't sleep. If we're in the front half, occasionally sleep (every tenth try). + */ + if (tinfo->replay_ts < mid || ntries % 10 != 0) + __wt_yield(); + else { + /* Never sleep more than .1 seconds */ + __wt_sleep(0, ntries > 100 ? 100 * WT_THOUSAND : ntries * WT_THOUSAND); + } +} diff --git a/src/third_party/wiredtiger/test/format/snap.c b/src/third_party/wiredtiger/test/format/snap.c index 2f1d5a366b9..3855d05f379 100644 --- a/src/third_party/wiredtiger/test/format/snap.c +++ b/src/third_party/wiredtiger/test/format/snap.c @@ -677,7 +677,7 @@ snap_repeat_single(TINFO *tinfo) * Start at a random spot in the list of operations and look for a read to retry. Stop when * we've walked the entire list or found one. */ - v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1; + v = mmrand(&tinfo->extra_rnd, 1, SNAP_LIST_SIZE) - 1; for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) { /* Wrap at the end of the circular buffer.
*/ if (snap >= tinfo->snap_end) diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index 632a457d04f..7066dedf6dd 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -179,14 +179,15 @@ static bool syntax_check; /* Only checking configuration syntax. */ /* * main -- - * TODO: Add a comment describing this function. + * Run a variety of multithreaded WiredTiger operations based on a set of configurations. */ int main(int argc, char *argv[]) { + READ_SCAN_ARGS scan_args; uint64_t now, start; - u_int ops_seconds; - int ch, reps; + u_int ops_seconds, reps; + int ch; const char *config, *home; bool is_backup, quiet_flag, verify_only; @@ -252,11 +253,20 @@ main(int argc, char *argv[]) fflush(stdout); } - __wt_random_init_seed(NULL, &g.rnd); /* Initialize the RNG. */ + /* + * Initialize the RNGs. This is needed early because some random decisions are made while + * reading configuration. There may be random seeds in the configuration, however, so we will + * reinitialize the RNGs later. + */ + __wt_random_init_seed(NULL, &g.data_rnd); + __wt_random_init_seed(NULL, &g.extra_rnd); - /* Initialize lock to ensure single threading during failure handling */ + /* Initialize lock to ensure single threading during failure handling. */ testutil_check(pthread_rwlock_init(&g.death_lock, NULL)); + /* Initialize lock to ensure single threading for lane operations in predictable replay. */ + testutil_check(pthread_rwlock_init(&g.lane_lock, NULL)); + /* * Initialize the tables array and default to multi-table testing if not in backward-compatible * mode. 
@@ -357,7 +367,9 @@ main(int argc, char *argv[]) TIMED_MAJOR_OP(wts_verify(g.wts_conn, true)); if (verify_only) goto skip_operations; - TIMED_MAJOR_OP(tables_apply(wts_read_scan, g.wts_conn)); + scan_args.conn = g.wts_conn; + scan_args.rnd = &g.extra_rnd; + TIMED_MAJOR_OP(tables_apply(wts_read_scan, &scan_args)); /* Optionally start checkpoints. */ wts_checkpoints(); @@ -373,7 +385,7 @@ main(int argc, char *argv[]) */ ops_seconds = GV(RUNS_TIMER) == 0 ? 0 : ((GV(RUNS_TIMER) * 60) - 15) / FORMAT_OPERATION_REPS; for (reps = 1; reps <= FORMAT_OPERATION_REPS; ++reps) - operations(ops_seconds, reps == FORMAT_OPERATION_REPS); + operations(ops_seconds, reps, FORMAT_OPERATION_REPS); /* Copy out the run's statistics. */ TIMED_MAJOR_OP(wts_stats()); diff --git a/src/third_party/wiredtiger/test/format/wts.c b/src/third_party/wiredtiger/test/format/wts.c index 657c30b202f..4ea4a429be9 100644 --- a/src/third_party/wiredtiger/test/format/wts.c +++ b/src/third_party/wiredtiger/test/format/wts.c @@ -387,10 +387,10 @@ create_object(TABLE *table, void *arg) * Configure the maximum key/value sizes, but leave it as the default if we come up with * something crazy. 
*/ - maxleafkey = mmrand(NULL, table->max_leaf_page / 50, table->max_leaf_page / 40); + maxleafkey = mmrand(&g.extra_rnd, table->max_leaf_page / 50, table->max_leaf_page / 40); if (maxleafkey > 20) CONFIG_APPEND(p, ",leaf_key_max=%" PRIu32, maxleafkey); - maxleafvalue = mmrand(NULL, table->max_leaf_page * 10, table->max_leaf_page / 40); + maxleafvalue = mmrand(&g.extra_rnd, table->max_leaf_page * 10, table->max_leaf_page / 40); if (maxleafvalue > 40 && maxleafvalue < 100 * 1024) CONFIG_APPEND(p, ",leaf_value_max=%" PRIu32, maxleafvalue); @@ -408,7 +408,7 @@ create_object(TABLE *table, void *arg) if (TV(BTREE_HUFFMAN_VALUE)) CONFIG_APPEND(p, ",huffman_value=english"); if (TV(BTREE_DICTIONARY)) - CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(NULL, 123, 517)); + CONFIG_APPEND(p, ",dictionary=%" PRIu32, mmrand(&g.extra_rnd, 123, 517)); break; } diff --git a/src/third_party/wiredtiger/tools/wt_cmp_dir b/src/third_party/wiredtiger/tools/wt_cmp_dir index 7e4e5d87a8b..3cd39557c40 100755 --- a/src/third_party/wiredtiger/tools/wt_cmp_dir +++ b/src/third_party/wiredtiger/tools/wt_cmp_dir @@ -145,7 +145,7 @@ cmp_uri_script=$(dirname "$0")/wt_cmp_uri ecode=0 for f in $files1; do echo $f - if ! python $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then + if ! python3 $cmp_uri_script $timestamp_opt1 "$dir1"/$f $timestamp_opt2 "$dir2"/$f; then ecode=1 fi done |