diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-11-15 16:53:56 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-11-15 16:53:56 +1100 |
commit | ad3f146905f28e3d8a63811dbd1bf1c7a85409f3 (patch) | |
tree | 620dcf3ec047df184179ccbe53f5b950b381d0b6 | |
parent | 64b274e02c63ace15bd60cd47c754967339c586a (diff) | |
download | mongo-ad3f146905f28e3d8a63811dbd1bf1c7a85409f3.tar.gz |
Import wiredtiger: 187707a5c18897fa0087b1abab695fb115de53b9 from branch mongodb-3.2
ref: 2e2d5fe23b..187707a5c1
for: 3.2.11
WT-3000 Missing log records in recovery when crashing after a log file switch
-rw-r--r-- | src/third_party/wiredtiger/dist/s_string.ok | 1 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/log/log.c | 78 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/recovery/random-abort.c | 82 | ||||
-rwxr-xr-x | src/third_party/wiredtiger/test/recovery/smoke.sh | 1 |
4 files changed, 125 insertions, 37 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 65c74b61995..d19c05e802b 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -863,6 +863,7 @@ lu lwsync lz lzo +mT madvise majorp malloc diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 00e4ea5f441..96b593ec706 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -21,6 +21,60 @@ static int __log_write_internal( #define WT_LOG_OPEN_VERIFY 0x02 /* + * __log_wait_for_earlier_slot -- + * Wait for write_lsn to catch up to this slot. + */ +static void +__log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) +{ + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int yield_count; + + conn = S2C(session); + log = conn->log; + yield_count = 0; + + while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { + /* + * If we're on a locked path and the write LSN is not advancing, + * unlock in case an earlier thread is trying to switch its + * slot and complete its operation. + */ + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) + __wt_spin_unlock(session, &log->log_slot_lock); + __wt_cond_auto_signal(session, conn->log_wrlsn_cond); + if (++yield_count < WT_THOUSAND) + __wt_yield(); + else + __wt_cond_wait(session, log->log_write_cond, 200); + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) + __wt_spin_lock(session, &log->log_slot_lock); + } +} + +/* + * __log_fs_write -- + * Wrapper when writing to a log file. If we're writing to a new log + * file for the first time wait for writes to the previous log file. + */ +static int +__log_fs_write(WT_SESSION_IMPL *session, + WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) +{ + /* + * If we're writing into a new log file, we have to wait for all + * writes to the previous log file to complete otherwise there could + * be a hole at the end of the previous log file that we cannot detect. + */ + if (slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { + __log_wait_for_earlier_slot(session, slot); + WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); + } + return (__wt_write(session, slot->slot_fh, offset, len, buf)); +} + +/* * __wt_log_ckpt -- * Record the given LSN as the checkpoint LSN and signal the archive * thread as needed. @@ -576,7 +630,7 @@ __log_fill(WT_SESSION_IMPL *session, /* * If this is a force or unbuffered write, write it now. */ - WT_ERR(__wt_write(session, myslot->slot->slot_fh, + WT_ERR(__log_fs_write(session, myslot->slot, myslot->offset + myslot->slot->slot_start_offset, record->size, record->mem)); @@ -1352,13 +1406,11 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) WT_LSN sync_lsn; int64_t release_buffered, release_bytes; uint64_t fsync_duration_usecs; - int yield_count; bool locked; conn = S2C(session); log = conn->log; locked = false; - yield_count = 0; if (freep != NULL) *freep = 1; release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); @@ -1379,8 +1431,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) /* Write the buffered records */ if (release_buffered != 0) - WT_ERR(__wt_write(session, - slot->slot_fh, slot->slot_start_offset, + WT_ERR(__log_fs_write(session, slot, slot->slot_start_offset, (size_t)release_buffered, slot->slot_buf.mem)); /* @@ -1411,22 +1462,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * be holes in the log file. */ WT_STAT_CONN_INCR(session, log_release_write_lsn); - while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { - /* - * If we're on a locked path and the write LSN is not advancing, - * unlock in case an earlier thread is trying to switch its - * slot and complete its operation. - */ - if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) - __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_auto_signal(session, conn->log_wrlsn_cond); - if (++yield_count < WT_THOUSAND) - __wt_yield(); - else - __wt_cond_wait(session, log->log_write_cond, 200); - if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) - __wt_spin_lock(session, &log->log_slot_lock); - } + __log_wait_for_earlier_slot(session, slot); log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; diff --git a/src/third_party/wiredtiger/test/recovery/random-abort.c b/src/third_party/wiredtiger/test/recovery/random-abort.c index 03e67e2f723..21460646017 100644 --- a/src/third_party/wiredtiger/test/recovery/random-abort.c +++ b/src/third_party/wiredtiger/test/recovery/random-abort.c @@ -34,6 +34,7 @@ static char home[512]; /* Program working dir */ static const char *progname; /* Program name */ static const char * const uri = "table:main"; +bool inmem; #define MAX_TH 12 #define MIN_TH 5 @@ -41,7 +42,9 @@ static const char * const uri = "table:main"; #define MIN_TIME 10 #define RECORDS_FILE "records-%" PRIu32 -#define ENV_CONFIG \ +#define ENV_CONFIG_DEF \ + "create,log=(file_max=10M,archive=false,enabled)" +#define ENV_CONFIG_TXNSYNC \ "create,log=(file_max=10M,archive=false,enabled)," \ "transaction_sync=(enabled,method=none)" #define ENV_CONFIG_REC "log=(recover=on)" @@ -73,11 +76,15 @@ thread_run(void *arg) WT_THREAD_DATA *td; uint64_t i; int ret; - char buf[MAX_VAL], kname[64]; + size_t lsize; + char buf[MAX_VAL], kname[64], lgbuf[8]; + char large[128*1024]; __wt_random_init(&rnd); memset(buf, 0, sizeof(buf)); memset(kname, 0, sizeof(kname)); + lsize = sizeof(large); + memset(large, 0, lsize); td = (WT_THREAD_DATA *)arg; /* @@ -85,6 +92,13 @@ thread_run(void *arg) */ snprintf(buf, sizeof(buf), RECORDS_FILE, td->id); /* + * Set up a large value putting our id in it. Write it in there a + * bunch of times, but the rest of the buffer can just be zero. + */ + snprintf(lgbuf, sizeof(lgbuf), "th-%" PRIu32, td->id); + for (i = 0; i < 128; i += strlen(lgbuf)) + snprintf(&large[i], lsize - i, "%s", lgbuf); + /* * Keep a separate file with the records we wrote for checking. */ (void)unlink(buf); @@ -107,8 +121,18 @@ thread_run(void *arg) */ for (i = td->start; ; ++i) { snprintf(kname, sizeof(kname), "%" PRIu64, i); - data.size = __wt_random(&rnd) % MAX_VAL; cursor->set_key(cursor, kname); + /* + * Every 30th record write a very large record that exceeds the + * log buffer size. This forces us to use the unbuffered path. + */ + if (i % 30 == 0) { + data.size = 128 * 1024; + data.data = large; + } else { + data.size = __wt_random(&rnd) % MAX_VAL; + data.data = buf; + } cursor->set_value(cursor, &data); if ((ret = cursor->insert(cursor)) != 0) testutil_die(ret, "WT_CURSOR.insert"); @@ -136,12 +160,17 @@ fill_db(uint32_t nth) WT_THREAD_DATA *td; uint32_t i; int ret; + const char *envconf; thr = dcalloc(nth, sizeof(pthread_t)); td = dcalloc(nth, sizeof(WT_THREAD_DATA)); if (chdir(home) != 0) testutil_die(errno, "Child chdir: %s", home); - if ((ret = wiredtiger_open(NULL, NULL, ENV_CONFIG, &conn)) != 0) + if (inmem) + envconf = ENV_CONFIG_DEF; + else + envconf = ENV_CONFIG_TXNSYNC; + if ((ret = wiredtiger_open(NULL, NULL, envconf, &conn)) != 0) testutil_die(ret, "wiredtiger_open"); if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) testutil_die(ret, "WT_CONNECTION:open_session"); @@ -187,11 +216,11 @@ main(int argc, char *argv[]) WT_CURSOR *cursor; WT_SESSION *session; WT_RAND_STATE rnd; - uint64_t key, last_key; - uint32_t absent, count, i, nth, timeout; + uint64_t absent, count, key, last_key, middle; + uint32_t i, nth, timeout; int ch, status, ret; pid_t pid; - bool rand_th, rand_time, verify_only; + bool fatal, rand_th, rand_time, verify_only; const char *working_dir; char fname[64], kname[64]; @@ -200,17 +229,21 @@ main(int argc, char *argv[]) else ++progname; + inmem = false; nth = MIN_TH; rand_th = rand_time = true; timeout = MIN_TIME; verify_only = false; working_dir = "WT_TEST.random-abort"; - while ((ch = __wt_getopt(progname, argc, argv, "h:T:t:v")) != EOF) + while ((ch = __wt_getopt(progname, argc, argv, "h:mT:t:v")) != EOF) switch (ch) { case 'h': working_dir = __wt_optarg; break; + case 'm': + inmem = true; + break; case 'T': rand_th = false; nth = (uint32_t)atoi(__wt_optarg); @@ -303,7 +336,9 @@ main(int argc, char *argv[]) testutil_die(ret, "WT_SESSION.open_cursor: %s", uri); absent = count = 0; + fatal = false; for (i = 0; i < nth; ++i) { + middle = 0; snprintf(fname, sizeof(fname), RECORDS_FILE, i); if ((fp = fopen(fname, "r")) == NULL) { fprintf(stderr, @@ -313,8 +348,10 @@ main(int argc, char *argv[]) /* * For every key in the saved file, verify that the key exists - * in the table after recovery. Since we did write-no-sync, we - * expect every key to have been recovered. + * in the table after recovery. If we're doing in-memory + * log buffering we never expect a record missing in the middle, + * but records may be missing at the end. If we did + * write-no-sync, we expect every key to have been recovered. */ for (last_key = UINT64_MAX;; ++count, last_key = key) { ret = fscanf(fp, "%" SCNu64 "\n", &key); @@ -338,9 +375,20 @@ main(int argc, char *argv[]) if ((ret = cursor->search(cursor)) != 0) { if (ret != WT_NOTFOUND) testutil_die(ret, "search"); - printf("%s: no record with key %" PRIu64 "\n", - fname, key); - ++absent; + if (!inmem) + printf("%s: no record with key %" + PRIu64 "\n", fname, key); + absent++; + middle = key; + } else if (middle != 0) { + /* + * We should never find an existing key after + * we have detected one missing. + */ + printf("%s: after absent record at %" PRIu64 + " key %" PRIu64 " exists\n", + fname, middle, key); + fatal = true; } } if (fclose(fp) != 0) @@ -348,11 +396,13 @@ main(int argc, char *argv[]) } if ((ret = conn->close(conn, NULL)) != 0) testutil_die(ret, "WT_CONNECTION:close"); - if (absent) { - printf("%" PRIu32 " record(s) absent from %" PRIu32 "\n", + if (fatal) + return (EXIT_FAILURE); + if (!inmem && absent) { + printf("%" PRIu64 " record(s) absent from %" PRIu64 "\n", absent, count); return (EXIT_FAILURE); } - printf("%" PRIu32 " records verified\n", count); + printf("%" PRIu64 " records verified\n", count); return (EXIT_SUCCESS); } diff --git a/src/third_party/wiredtiger/test/recovery/smoke.sh b/src/third_party/wiredtiger/test/recovery/smoke.sh index c7677b64503..ce0662d3b2b 100755 --- a/src/third_party/wiredtiger/test/recovery/smoke.sh +++ b/src/third_party/wiredtiger/test/recovery/smoke.sh @@ -5,4 +5,5 @@ set -e # Smoke-test recovery as part of running "make check". $TEST_WRAPPER ./random-abort -t 10 -T 5 +$TEST_WRAPPER ./random-abort -m -t 10 -T 5 $TEST_WRAPPER ./truncated-log |