diff options
author | Alison Felizzi <alison.felizzi@mongodb.com> | 2021-09-09 04:24:05 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2021-09-09 04:47:02 +0000 |
commit | 171f588acc51178080f0c400bc00a190c67c50e1 (patch) | |
tree | 6bab7c1ac1aab0d7abcfbf4daeef4dff5e72db29 | |
parent | 196a6b6dfddb557f76b106a30e7f3638b34be9d4 (diff) | |
download | mongo-171f588acc51178080f0c400bc00a190c67c50e1.tar.gz |
Import wiredtiger: 43e2450c47f0a37185439b3e685bfbbd762cfe5f from branch mongodb-master
ref: 724b0d87f1..43e2450c47
for: 5.1.0
WT-7844 Add tiered_abort stress test for tiered storage.
-rw-r--r-- | src/third_party/wiredtiger/dist/s_string.ok | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/docs/arch-btree.dox | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/csuite/Makefile.am | 4 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/csuite/tiered_abort/main.c | 909 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/csuite/tiered_abort/smoke.sh | 24 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/evergreen.yml | 23 |
7 files changed, 963 insertions, 2 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 45db6f3b10a..8cf22d64e6c 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -1486,6 +1486,7 @@ vsnprintf vtype vunpack vxr +vz waitpid waker wakeup diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 9f686aa42a7..bbe7e507a75 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-master", - "commit": "724b0d87f13fccf82355bdbdfc470d890f502c30" + "commit": "43e2450c47f0a37185439b3e685bfbbd762cfe5f" } diff --git a/src/third_party/wiredtiger/src/docs/arch-btree.dox b/src/third_party/wiredtiger/src/docs/arch-btree.dox index b0055ced8ff..c416ff22759 100644 --- a/src/third_party/wiredtiger/src/docs/arch-btree.dox +++ b/src/third_party/wiredtiger/src/docs/arch-btree.dox @@ -84,4 +84,4 @@ before the page got truncated. The page must first be re-instantiated into memor in-memory version. Then as part of the page read process, we create a \c WT_UPDATE with a tombstone in the same transaction the truncate happened. -*/
\ No newline at end of file +*/ diff --git a/src/third_party/wiredtiger/test/csuite/Makefile.am b/src/third_party/wiredtiger/test/csuite/Makefile.am index 9c503f1a0a2..587c64be727 100644 --- a/src/third_party/wiredtiger/test/csuite/Makefile.am +++ b/src/third_party/wiredtiger/test/csuite/Makefile.am @@ -31,6 +31,10 @@ test_scope_SOURCES = scope/main.c noinst_PROGRAMS += test_scope all_TESTS += test_scope +test_tiered_abort_SOURCES = tiered_abort/main.c +noinst_PROGRAMS += test_tiered_abort +all_TESTS += tiered_abort/smoke.sh + test_timestamp_abort_SOURCES = timestamp_abort/main.c noinst_PROGRAMS += test_timestamp_abort all_TESTS += timestamp_abort/smoke.sh diff --git a/src/third_party/wiredtiger/test/csuite/tiered_abort/main.c b/src/third_party/wiredtiger/test/csuite/tiered_abort/main.c new file mode 100644 index 00000000000..f8c30ac7b91 --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/tiered_abort/main.c @@ -0,0 +1,909 @@ +/*- + * Public Domain 2014-present MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "test_util.h" + +#include <sys/wait.h> +#include <signal.h> + +static char home[1024]; /* Program working dir */ + +/* + * Create three tables that we will write the same data to and verify that + * all the types of usage have the expected data in them after a crash and + * recovery. We want: + * 1. A table that is logged and is not involved in timestamps. This table + * simulates a user local table. + * 2. A table that is logged and involved in timestamps. This simulates + * the oplog. + * 3. A table that is not logged and involved in timestamps. This simulates + * a typical collection file. We also insert identical data into a shadow table + * with a different timestamp that simulates insertion on a secondary. + * + * We also create another table that is not logged and not involved directly + * in timestamps to store the stable timestamp. That way we can know what the + * latest stable timestamp is on checkpoint. + * + * We also create several files that are not WiredTiger tables. The flush tier thread creates + * one such file indicating that the specified number of flush_tier calls have completed. The parent + * process uses this to know when that threshold is met and it can start the timer to abort. + * Also each worker thread creates its own textual records file that records the data it + * inserted and it records the timestamp that was used for that insertion. + */ +#define BUCKET "bucket" +#define INVALID_KEY UINT64_MAX +#define MAX_CKPT_INVL 5 /* Maximum interval between checkpoints */ +#define MAX_FLUSH_INVL 5 /* Maximum interval between flush_tier calls */ +#define MAX_TH 20 /* Maximum configurable threads */ +#define MAX_TIME 40 +#define MAX_VAL 1024 +#define MIN_TH 5 +#define MIN_TIME 10 +#define NUM_INT_THREADS 3 +#define RECORDS_FILE "records-%" PRIu32 +/* Include worker threads and extra sessions */ +#define SESSION_MAX (MAX_TH + 4) +#define WT_STORAGE_LIB "ext/storage_sources/local_store/.libs/libwiredtiger_local_store.so" + +static const char *table_pfx = "table"; +static const char *const uri_collection = "collection"; +static const char *const uri_local = "local"; +static const char *const uri_oplog = "oplog"; +static const char *const uri_shadow = "shadow"; + +static const char *const sentinel_file = "sentinel_ready"; + +static bool use_ts; +static volatile uint64_t global_ts = 1; +static uint32_t flush_calls = 1; + +/* + * The configuration sets the eviction update and dirty targets at 20% so that on average, each + * thread can have a couple of dirty pages before eviction threads kick in. See below where these + * symbols are used for cache sizing - we'll have about 10 pages allocated per thread. On the other + * side, the eviction update and dirty triggers are 90%, so application threads aren't involved in + * eviction until we're close to running out of cache. + */ +#define ENV_CONFIG_DEF \ + "cache_size=%" PRIu32 \ + "M,create," \ + "debug_mode=(table_logging=true,checkpoint_retention=5)," \ + "eviction_updates_target=20,eviction_updates_trigger=90," \ + "log=(archive=true,file_max=10M,enabled),session_max=%d," \ + "statistics=(fast),statistics_log=(wait=1,json=true)," \ + "tiered_storage=(bucket=%s,bucket_prefix=pfx,name=local_store)" +#define ENV_CONFIG_TXNSYNC \ + ENV_CONFIG_DEF \ + ",eviction_dirty_target=20,eviction_dirty_trigger=90" \ + ",transaction_sync=(enabled,method=none)" +#define ENV_CONFIG_REC "log=(archive=false,recover=on)" + +/* + * A minimum width of 10, along with zero filling, means that all the keys sort according to their + * integer value, making each thread's key space distinct. + */ +#define KEY_FORMAT ("%010" PRIu64) + +typedef struct { + uint64_t absent_key; /* Last absent key */ + uint64_t exist_key; /* First existing key after miss */ + uint64_t first_key; /* First key in range */ + uint64_t first_miss; /* First missing key */ + uint64_t last_key; /* Last key in range */ +} REPORT; + +typedef struct { + WT_CONNECTION *conn; + uint64_t start; + uint32_t info; +} THREAD_DATA; + +/* + * TODO: WT-7833 Lock to coordinate inserts and flush_tier. This lock should be removed when that + * ticket is fixed. Flush_tier should be able to run with ongoing operations. + */ +static pthread_rwlock_t flush_lock; +/* Lock for transactional ops that set or query a timestamp. */ +static pthread_rwlock_t ts_lock; + +static void handler(int) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void usage(void) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void +usage(void) +{ + fprintf(stderr, "usage: %s [-h dir] [-T threads] [-t time] [-vz]\n", progname); + exit(EXIT_FAILURE); +} + +/* + * thread_ts_run -- + * Runner function for a timestamp thread. + */ +static WT_THREAD_RET +thread_ts_run(void *arg) +{ + WT_DECL_RET; + WT_SESSION *session; + THREAD_DATA *td; + char tscfg[64], ts_string[WT_TS_HEX_STRING_SIZE]; + + td = (THREAD_DATA *)arg; + + testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session)); + /* Update the oldest timestamp every 1 millisecond. */ + for (;;) { + /* + * We get the last committed timestamp periodically in order to update the oldest timestamp, + * that requires locking out transactional ops that set or query a timestamp. + */ + testutil_check(pthread_rwlock_wrlock(&ts_lock)); + ret = td->conn->query_timestamp(td->conn, ts_string, "get=all_durable"); + testutil_check(pthread_rwlock_unlock(&ts_lock)); + testutil_assert(ret == 0 || ret == WT_NOTFOUND); + if (ret == 0) { + /* + * Set both the oldest and stable timestamp so that we don't need to maintain read + * availability at older timestamps. + */ + testutil_check(__wt_snprintf(tscfg, sizeof(tscfg), + "oldest_timestamp=%s,stable_timestamp=%s", ts_string, ts_string)); + testutil_check(td->conn->set_timestamp(td->conn, tscfg)); + } + __wt_sleep(0, 1000); + } + /* NOTREACHED */ +} + +/* + * thread_ckpt_run -- + * Runner function for the checkpoint thread. + */ +static WT_THREAD_RET +thread_ckpt_run(void *arg) +{ + WT_RAND_STATE rnd; + WT_SESSION *session; + THREAD_DATA *td; + uint64_t stable; + uint32_t sleep_time; + int i; + char ts_string[WT_TS_HEX_STRING_SIZE]; + + __wt_random_init(&rnd); + + td = (THREAD_DATA *)arg; + /* + * Keep a separate file with the records we wrote for checking. + */ + testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session)); + for (i = 0;; ++i) { + sleep_time = __wt_random(&rnd) % MAX_CKPT_INVL; + sleep(sleep_time); + /* + * Since this is the default, send in this string even if running without timestamps. + */ + testutil_check(session->checkpoint(session, "use_timestamp=true")); + testutil_check(td->conn->query_timestamp(td->conn, ts_string, "get=last_checkpoint")); + testutil_assert(sscanf(ts_string, "%" SCNx64, &stable) == 1); + printf("Checkpoint %d complete at stable %" PRIu64 ".\n", i, stable); + fflush(stdout); + } + /* NOTREACHED */ +} + +/* + * thread_flush_run -- + * Runner function for the flush_tier thread. + */ +static WT_THREAD_RET +thread_flush_run(void *arg) +{ + FILE *fp; + WT_RAND_STATE rnd; + WT_SESSION *session; + THREAD_DATA *td; + uint64_t stable; + uint32_t i, sleep_time; + char ts_string[WT_TS_HEX_STRING_SIZE]; + + __wt_random_init(&rnd); + + td = (THREAD_DATA *)arg; + /* + * Keep a separate file with the records we wrote for checking. + */ + (void)unlink(sentinel_file); + testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session)); + for (i = 0;; ++i) { + sleep_time = __wt_random(&rnd) % MAX_FLUSH_INVL; + sleep(sleep_time); + testutil_check(td->conn->query_timestamp(td->conn, ts_string, "get=last_checkpoint")); + testutil_assert(sscanf(ts_string, "%" SCNx64, &stable) == 1); + /* Effectively wait for the first checkpoint to complete. */ + if (use_ts && stable == WT_TS_NONE) + continue; + /* + * Currently not testing any of the flush tier configuration strings other than defaults. We + * expect the defaults are what MongoDB wants for now. + */ + testutil_check(pthread_rwlock_wrlock(&flush_lock)); + testutil_check(session->flush_tier(session, NULL)); + testutil_check(pthread_rwlock_unlock(&flush_lock)); + printf("Flush tier %" PRIu32 " completed.\n", i); + fflush(stdout); + /* + * Create the sentinel file so that the parent process knows the desired number of + * flush_tier calls have finished and can start its timer. + */ + if (i == flush_calls) { + testutil_checksys((fp = fopen(sentinel_file, "w")) == NULL); + testutil_checksys(fclose(fp) != 0); + } + } + /* NOTREACHED */ +} + +/* + * thread_run -- + * Runner function for the worker threads. + */ +static WT_THREAD_RET +thread_run(void *arg) +{ + FILE *fp; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow; + WT_DECL_RET; + WT_ITEM data; + WT_RAND_STATE rnd; + WT_SESSION *session; + THREAD_DATA *td; + uint64_t i, active_ts; + char cbuf[MAX_VAL], lbuf[MAX_VAL], obuf[MAX_VAL]; + char kname[64], tscfg[64], uri[128]; + bool durable_ahead_commit, locked; + + __wt_random_init(&rnd); + memset(cbuf, 0, sizeof(cbuf)); + memset(lbuf, 0, sizeof(lbuf)); + memset(obuf, 0, sizeof(obuf)); + memset(kname, 0, sizeof(kname)); + locked = false; + + td = (THREAD_DATA *)arg; + /* + * Set up the separate file for checking. + */ + testutil_check(__wt_snprintf(cbuf, sizeof(cbuf), RECORDS_FILE, td->info)); + (void)unlink(cbuf); + testutil_checksys((fp = fopen(cbuf, "w")) == NULL); + /* + * Set to line buffering. But that is advisory only. We've seen cases where the result files end + * up with partial lines. + */ + __wt_stream_set_line_buffer(fp); + + durable_ahead_commit = false; + + testutil_check(td->conn->open_session(td->conn, NULL, "isolation=snapshot", &session)); + /* + * Open a cursor to each table. + */ + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_collection)); + testutil_check(session->open_cursor(session, uri, NULL, NULL, &cur_coll)); + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow)); + testutil_check(session->open_cursor(session, uri, NULL, NULL, &cur_shadow)); + + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_local)); + testutil_check(session->open_cursor(session, uri, NULL, NULL, &cur_local)); + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog)); + testutil_check(session->open_cursor(session, uri, NULL, NULL, &cur_oplog)); + + /* + * Write our portion of the key space until we're killed. + */ + printf("Thread %" PRIu32 " starts at %" PRIu64 "\n", td->info, td->start); + active_ts = 0; + for (i = td->start;; ++i) { + testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, i)); + + testutil_check(session->begin_transaction(session, NULL)); + + if (use_ts) { + testutil_check(pthread_rwlock_rdlock(&ts_lock)); + active_ts = __wt_atomic_addv64(&global_ts, 2); + testutil_check( + __wt_snprintf(tscfg, sizeof(tscfg), "commit_timestamp=%" PRIx64, active_ts)); + /* + * Set the transaction's timestamp now before performing the operation. + */ + testutil_check(session->timestamp_transaction(session, tscfg)); + testutil_check(pthread_rwlock_unlock(&ts_lock)); + } + + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); + /* + * Put an informative string into the value so that it can be viewed well in a binary dump. + */ + testutil_check(__wt_snprintf(cbuf, sizeof(cbuf), + "COLL: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, active_ts, i)); + testutil_check(__wt_snprintf(lbuf, sizeof(lbuf), + "LOCAL: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, active_ts, i)); + testutil_check(__wt_snprintf(obuf, sizeof(obuf), + "OPLOG: thread:%" PRIu32 " ts:%" PRIu64 " key: %" PRIu64, td->info, active_ts, i)); + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = cbuf; + cur_coll->set_value(cur_coll, &data); + testutil_check(pthread_rwlock_rdlock(&flush_lock)); + locked = true; + if ((ret = cur_coll->insert(cur_coll)) == WT_ROLLBACK) + goto rollback; + testutil_check(ret); + cur_shadow->set_value(cur_shadow, &data); + if (use_ts) { + /* + * Change the timestamp in the middle of the transaction so that we simulate a + * secondary. + */ + ++active_ts; + testutil_check( + __wt_snprintf(tscfg, sizeof(tscfg), "commit_timestamp=%" PRIx64, active_ts)); + testutil_check(session->timestamp_transaction(session, tscfg)); + } + if ((ret = cur_shadow->insert(cur_shadow)) == WT_ROLLBACK) + goto rollback; + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = obuf; + cur_oplog->set_value(cur_oplog, &data); + if ((ret = cur_oplog->insert(cur_oplog)) == WT_ROLLBACK) + goto rollback; + testutil_check(session->commit_transaction(session, NULL)); + /* + * Insert into the local table outside the timestamp txn. This must occur after the + * timestamp transaction, not before, because of the possibility of rollback in the + * transaction. The local table must stay in sync with the other tables. + */ + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = lbuf; + cur_local->set_value(cur_local, &data); + testutil_check(cur_local->insert(cur_local)); + testutil_check(pthread_rwlock_unlock(&flush_lock)); + locked = false; + + /* Save the timestamps and key separately for checking later. */ + if (fprintf(fp, "%" PRIu64 " %" PRIu64 " %" PRIu64 "\n", active_ts, + durable_ahead_commit ? active_ts + 4 : active_ts, i) < 0) + testutil_die(EIO, "fprintf"); + + if (0) { +rollback: + testutil_check(session->rollback_transaction(session, NULL)); + if (locked) { + testutil_check(pthread_rwlock_unlock(&flush_lock)); + locked = false; + } + } + } + /* NOTREACHED */ +} + +/* + * Child process creates the database and table, and then creates worker threads to add data until + * it is killed by the parent. + */ +static void run_workload(uint32_t, const char *) WT_GCC_FUNC_DECL_ATTRIBUTE((noreturn)); +static void +run_workload(uint32_t nth, const char *buf) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + THREAD_DATA *td; + wt_thread_t *thr; + uint32_t cache_mb, ckpt_id, flush_id, i, ts_id; + char envconf[1024], extconf[512], uri[128]; + + thr = dcalloc(nth + NUM_INT_THREADS, sizeof(*thr)); + td = dcalloc(nth + NUM_INT_THREADS, sizeof(THREAD_DATA)); + + /* + * Size the cache appropriately for the number of threads. Each thread adds keys sequentially to + * its own portion of the key space, so each thread will be dirtying one page at a time. By + * default, a leaf page grows to 32K in size before it splits and the thread begins to fill + * another page. We'll budget for 10 full size leaf pages per thread in the cache plus a little + * extra in the total for overhead. + */ + cache_mb = ((32 * WT_KILOBYTE * 10) * nth) / WT_MEGABYTE + 20; + + if (chdir(home) != 0) + testutil_die(errno, "Child chdir: %s", home); + testutil_check( + __wt_snprintf(envconf, sizeof(envconf), ENV_CONFIG_TXNSYNC, cache_mb, SESSION_MAX, BUCKET)); + + testutil_check(__wt_snprintf( + extconf, sizeof(extconf), ",extensions=(%s/%s=(early_load=true))", buf, WT_STORAGE_LIB)); + + strcat(envconf, extconf); + printf("wiredtiger_open configuration: %s\n", envconf); + testutil_check(wiredtiger_open(NULL, NULL, envconf, &conn)); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + /* + * Create all the tables. + */ + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_collection)); + testutil_check( + session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_shadow)); + testutil_check( + session->create(session, uri, "key_format=S,value_format=u,log=(enabled=false)")); + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_local)); + testutil_check(session->create(session, uri, "key_format=S,value_format=u")); + testutil_check(__wt_snprintf(uri, sizeof(uri), "%s:%s", table_pfx, uri_oplog)); + testutil_check(session->create(session, uri, "key_format=S,value_format=u")); + /* + * Don't log the stable timestamp table so that we know what timestamp was stored at the + * checkpoint. + */ + testutil_check(session->close(session, NULL)); + + /* + * The checkpoint thread and the timestamp threads are added at the end of the array. + */ + ckpt_id = nth; + td[ckpt_id].conn = conn; + td[ckpt_id].info = nth; + printf("Create checkpoint thread\n"); + testutil_check(__wt_thread_create(NULL, &thr[ckpt_id], thread_ckpt_run, &td[ckpt_id])); + flush_id = nth + 1; + td[flush_id].conn = conn; + td[flush_id].info = nth; + printf("Create flush thread\n"); + testutil_check(__wt_thread_create(NULL, &thr[flush_id], thread_flush_run, &td[flush_id])); + ts_id = nth + 2; + if (use_ts) { + td[ts_id].conn = conn; + td[ts_id].info = nth; + printf("Create timestamp thread\n"); + testutil_check(__wt_thread_create(NULL, &thr[ts_id], thread_ts_run, &td[ts_id])); + } + printf("Create %" PRIu32 " writer threads\n", nth); + printf("Create %" PRIu32 " writer threads\n", nth); + for (i = 0; i < nth; ++i) { + td[i].conn = conn; + td[i].start = WT_BILLION * (uint64_t)i; + td[i].info = i; + testutil_check(__wt_thread_create(NULL, &thr[i], thread_run, &td[i])); + } + /* + * The threads never exit, so the child will just wait here until it is killed. + */ + fflush(stdout); + for (i = 0; i <= ts_id; ++i) + testutil_check(__wt_thread_join(NULL, &thr[i])); + /* + * NOTREACHED + */ + free(thr); + free(td); + exit(EXIT_SUCCESS); +} + +extern int __wt_optind; +extern char *__wt_optarg; + +/* + * Initialize a report structure. Since zero is a valid key we cannot just clear it. + */ +static void +initialize_rep(REPORT *r) +{ + r->first_key = r->first_miss = INVALID_KEY; + r->absent_key = r->exist_key = r->last_key = INVALID_KEY; +} + +/* + * Print out information if we detect missing records in the middle of the data of a report + * structure. + */ +static void +print_missing(REPORT *r, const char *fname, const char *msg) +{ + if (r->exist_key != INVALID_KEY) + printf("%s: %s error %" PRIu64 " absent records %" PRIu64 "-%" PRIu64 ". Then keys %" PRIu64 + "-%" PRIu64 " exist. Key range %" PRIu64 "-%" PRIu64 "\n", + fname, msg, (r->exist_key - r->first_miss) - 1, r->first_miss, r->exist_key - 1, + r->exist_key, r->last_key, r->first_key, r->last_key); +} + +/* + * Signal handler to catch if the child died unexpectedly. + */ +static void +handler(int sig) +{ + pid_t pid; + + WT_UNUSED(sig); + pid = wait(NULL); + /* The core file will indicate why the child exited. Choose EINVAL here. */ + testutil_die(EINVAL, "Child process %" PRIu64 " abnormally exited", (uint64_t)pid); +} + +int +main(int argc, char *argv[]) +{ + struct sigaction sa; + struct stat sb; + FILE *fp; + REPORT c_rep[MAX_TH], l_rep[MAX_TH], o_rep[MAX_TH]; + TEST_OPTS *opts, _opts; + WT_CONNECTION *conn; + WT_CURSOR *cur_coll, *cur_local, *cur_oplog, *cur_shadow; + WT_RAND_STATE rnd; + WT_SESSION *session; + pid_t pid; + uint64_t absent_coll, absent_local, absent_oplog, absent_shadow, count, key, last_key; + uint64_t commit_fp, durable_fp, stable_val; + uint32_t i, nth, timeout; + int ch, status, ret; + const char *working_dir; + char buf[512], bucket[512], fname[64], kname[64]; + char envconf[1024], extconf[512]; + char ts_string[WT_TS_HEX_STRING_SIZE]; + bool fatal, rand_th, rand_time, verify_only; + + (void)testutil_set_progname(argv); + + use_ts = true; + nth = MIN_TH; + rand_th = rand_time = true; + timeout = MIN_TIME; + verify_only = false; + working_dir = "WT_TEST.tiered-abort"; + + while ((ch = __wt_getopt(progname, argc, argv, "f:h:T:t:vz")) != EOF) + switch (ch) { + case 'f': + flush_calls = (uint32_t)atoi(__wt_optarg); + break; + case 'h': + working_dir = __wt_optarg; + break; + case 'T': + rand_th = false; + nth = (uint32_t)atoi(__wt_optarg); + if (nth > MAX_TH) { + fprintf( + stderr, "Number of threads is larger than the maximum %" PRId32 "\n", MAX_TH); + return (EXIT_FAILURE); + } + break; + case 't': + rand_time = false; + timeout = (uint32_t)atoi(__wt_optarg); + break; + case 'v': + verify_only = true; + break; + case 'z': + use_ts = false; + break; + default: + usage(); + } + argc -= __wt_optind; + if (argc != 0) + usage(); + + /* + * Build the directory path needed for the extension after parsing the args. We are not using + * the opts variable other than for building the directory. We have already parsed the args + * we're interested in above. + */ + opts = &_opts; + memset(opts, 0, sizeof(*opts)); + testutil_check(testutil_parse_opts(argc, argv, opts)); + testutil_build_dir(opts, buf, 512); + + testutil_check(pthread_rwlock_init(&flush_lock, NULL)); + testutil_check(pthread_rwlock_init(&ts_lock, NULL)); + + testutil_work_dir_from_path(home, sizeof(home), working_dir); + /* + * If the user wants to verify they need to tell us how many threads there were so we can find + * the old record files. + */ + if (verify_only && rand_th) { + fprintf(stderr, "Verify option requires specifying number of threads\n"); + exit(EXIT_FAILURE); + } + if (!verify_only) { + /* Make both the home directory and the bucket directory under the home. */ + testutil_make_work_dir(home); + testutil_check(__wt_snprintf(bucket, sizeof(bucket), "%s/%s", working_dir, BUCKET)); + testutil_make_work_dir(bucket); + + __wt_random_init_seed(NULL, &rnd); + if (rand_time) { + timeout = __wt_random(&rnd) % MAX_TIME; + if (timeout < MIN_TIME) + timeout = MIN_TIME; + } + if (rand_th) { + nth = __wt_random(&rnd) % MAX_TH; + if (nth < MIN_TH) + nth = MIN_TH; + } + + printf("Parent: timestamp in use: %s\n", use_ts ? "true" : "false"); + printf("Parent: Create %" PRIu32 " threads; sleep %" PRIu32 " seconds\n", nth, timeout); + printf("CONFIG: %s%s -h %s -T %" PRIu32 " -t %" PRIu32 "\n", progname, !use_ts ? " -z" : "", + working_dir, nth, timeout); + /* + * Fork a child to insert as many items. We will then randomly kill the child, run recovery + * and make sure all items we wrote exist after recovery runs. + */ + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = handler; + testutil_checksys(sigaction(SIGCHLD, &sa, NULL)); + testutil_checksys((pid = fork()) < 0); + + strcpy(bucket, buf); + if (pid == 0) { /* child */ + run_workload(nth, bucket); + return (EXIT_SUCCESS); + } + + /* parent */ + /* + * Sleep for the configured amount of time before killing the child. Start the timeout from + * the time we notice that the file has been created. That allows the test to run correctly + * on really slow machines. + */ + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s/%s", home, sentinel_file)); + while (stat(buf, &sb) != 0) + testutil_sleep_wait(1, pid); + sleep(timeout); + sa.sa_handler = SIG_DFL; + testutil_checksys(sigaction(SIGCHLD, &sa, NULL)); + + /* + * !!! It should be plenty long enough to make sure more than + * one log file exists. If wanted, that check would be added + * here. + */ + printf("Kill child\n"); + testutil_checksys(kill(pid, SIGKILL) != 0); + testutil_checksys(waitpid(pid, &status, 0) == -1); + } + + /* + * !!! If we wanted to take a copy of the directory before recovery, + * this is the place to do it. Don't do it all the time because + * it can use a lot of disk space, which can cause test machine + * issues. + */ + if (chdir(home) != 0) + testutil_die(errno, "parent chdir: %s", home); + + /* Copy the data to a separate folder for debugging purpose. */ + testutil_copy_data(home); + + printf("Open database, run recovery and verify content\n"); + + /* Open the connection which forces recovery to be run. */ + testutil_check(__wt_snprintf(envconf, sizeof(envconf), ENV_CONFIG_REC)); + + testutil_check(__wt_snprintf( + extconf, sizeof(extconf), ",extensions=(%s/%s=(early_load=true))", bucket, WT_STORAGE_LIB)); + + strcat(envconf, extconf); + testutil_check(wiredtiger_open(NULL, NULL, envconf, &conn)); + testutil_check(conn->open_session(conn, NULL, NULL, &session)); + /* Open a cursor on all the tables. */ + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s:%s", table_pfx, uri_collection)); + testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_coll)); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s:%s", table_pfx, uri_shadow)); + testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_shadow)); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s:%s", table_pfx, uri_local)); + testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_local)); + testutil_check(__wt_snprintf(buf, sizeof(buf), "%s:%s", table_pfx, uri_oplog)); + testutil_check(session->open_cursor(session, buf, NULL, NULL, &cur_oplog)); + + /* Find the biggest stable timestamp value that was saved. */ + stable_val = 0; + if (use_ts) { + testutil_check(conn->query_timestamp(conn, ts_string, "get=recovery")); + testutil_assert(sscanf(ts_string, "%" SCNx64, &stable_val) == 1); + printf("Got stable_val %" PRIu64 "\n", stable_val); + } + + count = 0; + absent_coll = absent_local = absent_oplog = absent_shadow = 0; + fatal = false; + for (i = 0; i < nth; ++i) { + initialize_rep(&c_rep[i]); + initialize_rep(&l_rep[i]); + initialize_rep(&o_rep[i]); + testutil_check(__wt_snprintf(fname, sizeof(fname), RECORDS_FILE, i)); + if ((fp = fopen(fname, "r")) == NULL) + testutil_die(errno, "fopen: %s", fname); + + /* + * For every key in the saved file, verify that the key exists in the table after recovery. + * If we're doing in-memory log buffering we never expect a record missing in the middle, + * but records may be missing at the end. If we did write-no-sync, we expect every key to + * have been recovered. + */ + for (last_key = INVALID_KEY;; ++count, last_key = key) { + ret = fscanf(fp, "%" SCNu64 "%" SCNu64 "%" SCNu64 "\n", &commit_fp, &durable_fp, &key); + if (last_key == INVALID_KEY) { + c_rep[i].first_key = key; + l_rep[i].first_key = key; + o_rep[i].first_key = key; + } + if (ret != EOF && ret != 3) { + /* If we find a partial line, consider it like an EOF. */ + if (ret == 2 || ret == 1 || ret == 0) + break; + testutil_die(errno, "fscanf"); + } + if (ret == EOF) + break; + /* + * If we're unlucky, the last line may be a partially written key at the end that can + * result in a false negative error for a missing record. Detect it. + */ + if (last_key != INVALID_KEY && key != last_key + 1) { + printf("%s: Ignore partial record %" PRIu64 " last valid key %" PRIu64 "\n", fname, + key, last_key); + break; + } + testutil_check(__wt_snprintf(kname, sizeof(kname), KEY_FORMAT, key)); + cur_coll->set_key(cur_coll, kname); + cur_local->set_key(cur_local, kname); + cur_oplog->set_key(cur_oplog, kname); + cur_shadow->set_key(cur_shadow, kname); + /* + * The collection table should always only have the data as of the checkpoint. The + * shadow table should always have the exact same data (or not) as the collection table, + * except for the last key that may be committed after the stable timestamp. + */ + if ((ret = cur_coll->search(cur_coll)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + if ((ret = cur_shadow->search(cur_shadow)) == 0) + testutil_die(ret, "shadow search success"); + + /* + * If we don't find a record, the durable timestamp written to our file better be + * larger than the saved one. + */ + if (durable_fp != 0 && durable_fp <= stable_val) { + printf("%s: COLLECTION no record with key %" PRIu64 + " record durable ts %" PRIu64 " <= stable ts %" PRIu64 "\n", + fname, key, durable_fp, stable_val); + absent_coll++; + } + if (c_rep[i].first_miss == INVALID_KEY) + c_rep[i].first_miss = key; + c_rep[i].absent_key = key; + } else if ((ret = cur_shadow->search(cur_shadow)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "shadow search"); + /* + * We respectively insert the record to the collection table at timestamp t and to + * the shadow table at t + 1. If the checkpoint finishes at timestamp t, the last + * shadow table record will be removed by rollback to stable after restart. + */ + if (durable_fp <= stable_val) { + printf("%s: SHADOW no record with key %" PRIu64 "\n", fname, key); + absent_shadow++; + } + } else if (c_rep[i].absent_key != INVALID_KEY && c_rep[i].exist_key == INVALID_KEY) { + /* + * If we get here we found a record that exists after absent records, a hole in our + * data. + */ + c_rep[i].exist_key = key; + fatal = true; + } else if (commit_fp != 0 && commit_fp > stable_val) { + /* + * If we found a record, the commit timestamp written to our file better be no + * larger than the checkpoint one. + */ + printf("%s: COLLECTION record with key %" PRIu64 " commit record ts %" PRIu64 + " > stable ts %" PRIu64 "\n", + fname, key, commit_fp, stable_val); + fatal = true; + } else if ((ret = cur_shadow->search(cur_shadow)) != 0) + /* Collection and shadow both have the data. */ + testutil_die(ret, "shadow search failure"); + + /* The local table should always have all data. */ + if ((ret = cur_local->search(cur_local)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + printf("%s: LOCAL no record with key %" PRIu64 "\n", fname, key); + absent_local++; + if (l_rep[i].first_miss == INVALID_KEY) + l_rep[i].first_miss = key; + l_rep[i].absent_key = key; + } else if (l_rep[i].absent_key != INVALID_KEY && l_rep[i].exist_key == INVALID_KEY) { + /* We should never find an existing key after we have detected one missing. */ + l_rep[i].exist_key = key; + fatal = true; + } + /* The oplog table should always have all data. */ + if ((ret = cur_oplog->search(cur_oplog)) != 0) { + if (ret != WT_NOTFOUND) + testutil_die(ret, "search"); + printf("%s: OPLOG no record with key %" PRIu64 "\n", fname, key); + absent_oplog++; + if (o_rep[i].first_miss == INVALID_KEY) + o_rep[i].first_miss = key; + o_rep[i].absent_key = key; + } else if (o_rep[i].absent_key != INVALID_KEY && o_rep[i].exist_key == INVALID_KEY) { + /* We should never find an existing key after we have detected one missing. */ + o_rep[i].exist_key = key; + fatal = true; + } + } + c_rep[i].last_key = last_key; + l_rep[i].last_key = last_key; + o_rep[i].last_key = last_key; + testutil_checksys(fclose(fp) != 0); + print_missing(&c_rep[i], fname, "COLLECTION"); + print_missing(&l_rep[i], fname, "LOCAL"); + print_missing(&o_rep[i], fname, "OPLOG"); + } + testutil_check(conn->close(conn, NULL)); + if (absent_coll) { + printf("COLLECTION: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_coll, count); + fatal = true; + } + if (absent_shadow) { + printf("SHADOW: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_shadow, count); + fatal = true; + } + if (absent_local) { + printf("LOCAL: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_local, count); + fatal = true; + } + if (absent_oplog) { + printf("OPLOG: %" PRIu64 " record(s) absent from %" PRIu64 "\n", absent_oplog, count); + fatal = true; + } + testutil_check(pthread_rwlock_destroy(&flush_lock)); + testutil_check(pthread_rwlock_destroy(&ts_lock)); + if (fatal) + return (EXIT_FAILURE); + printf("%" PRIu64 " records verified\n", count); + return (EXIT_SUCCESS); +} diff --git a/src/third_party/wiredtiger/test/csuite/tiered_abort/smoke.sh b/src/third_party/wiredtiger/test/csuite/tiered_abort/smoke.sh new file mode 100755 index 00000000000..b853c4fcd5e --- /dev/null +++ b/src/third_party/wiredtiger/test/csuite/tiered_abort/smoke.sh @@ -0,0 +1,24 @@ +#! /bin/sh + +set -e + +# Return success from the script because the test itself does not yet work. +# Do this in the script so that we can manually run the program on the command line. +exit 0 +# Smoke-test tiered-abort as part of running "make check". + +if [ -n "$1" ] +then + # If the test binary is passed in manually. + test_bin=$1 +else + # If $top_builddir/$top_srcdir aren't set, default to building in build_posix + # and running in test/csuite. + top_builddir=${top_builddir:-../../build_posix} + top_srcdir=${top_srcdir:-../..} + test_bin=$top_builddir/test/csuite/test_tiered_abort +fi +$TEST_WRAPPER $test_bin -t 10 -T 5 +$TEST_WRAPPER $test_bin -m -t 10 -T 5 +$TEST_WRAPPER $test_bin -C -t 10 -T 5 +$TEST_WRAPPER $test_bin -C -m -t 10 -T 5 diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 1558e681cd8..0a9642ed335 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -385,6 +385,14 @@ functions: set -o errexit set -o verbose ${test_env_vars|} ./test_random_abort ${random_abort_args|} 2>&1 + "tiered abort test": + command: shell.exec + params: + working_dir: "wiredtiger/build_posix/test/csuite" + script: | + set -o errexit + set -o verbose + ${test_env_vars|} ./test_tiered_abort ${tiered_abort_args|} 2>&1 "timestamp abort test": command: shell.exec params: @@ -1051,6 +1059,21 @@ tasks: ${test_env_vars|} $(pwd)/../test/csuite/schema_abort/smoke.sh 2>&1 + - name: csuite-tiered-abort-test + tags: ["pull_request"] + depends_on: + - name: compile + commands: + - func: "fetch artifacts" + - command: shell.exec + params: + working_dir: "wiredtiger/build_posix" + script: | + set -o errexit + set -o verbose + + ${test_env_vars|} $(pwd)/../test/csuite/tiered_abort/smoke.sh 2>&1 + - name: csuite-timestamp-abort-test tags: ["pull_request"] depends_on: |