diff options
author | Luke Chen <luke.chen@mongodb.com> | 2019-08-21 05:23:37 +0000 |
---|---|---|
committer | evergreen <evergreen@mongodb.com> | 2019-08-21 05:23:37 +0000 |
commit | ac41c65f6355f83aac70136324c98561ac79daa1 (patch) | |
tree | a7c3f7ef090b59c6a06838a02c96bd1d49e1c729 /src/third_party/wiredtiger/src/log/log.c | |
parent | f54709196711c63a429b71f47c584661286d675f (diff) | |
download | mongo-ac41c65f6355f83aac70136324c98561ac79daa1.tar.gz |
Import wiredtiger: 7dfd9391862bc9a6d84868c4dc51689c45a3aacf from branch mongodb-4.4
ref: c809757d8b..7dfd939186
for: 4.3.1
WT-4658 Apply Clang Format
WT-4810 Adding WT_ERR_ASSERT and WT_RET_ASSERT macros
WT-5046 Prepared transactions aren't properly cleared from global table with WT_CONN_LOG_DEBUG_MODE enabled
Diffstat (limited to 'src/third_party/wiredtiger/src/log/log.c')
-rw-r--r-- | src/third_party/wiredtiger/src/log/log.c | 5116 |
1 files changed, 2429 insertions, 2687 deletions
diff --git a/src/third_party/wiredtiger/src/log/log.c b/src/third_party/wiredtiger/src/log/log.c index 6361177c193..d6f18f82bb9 100644 --- a/src/third_party/wiredtiger/src/log/log.c +++ b/src/third_party/wiredtiger/src/log/log.c @@ -11,3079 +11,2821 @@ static int __log_newfile(WT_SESSION_IMPL *, bool, bool *); static int __log_openfile(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_FH **); static int __log_truncate(WT_SESSION_IMPL *, WT_LSN *, bool, bool); -static int __log_write_internal( - WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t); +static int __log_write_internal(WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t); -#define WT_LOG_COMPRESS_SKIP (offsetof(WT_LOG_RECORD, record)) -#define WT_LOG_ENCRYPT_SKIP (offsetof(WT_LOG_RECORD, record)) +#define WT_LOG_COMPRESS_SKIP (offsetof(WT_LOG_RECORD, record)) +#define WT_LOG_ENCRYPT_SKIP (offsetof(WT_LOG_RECORD, record)) /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_LOG_OPEN_CREATE_OK 0x1u /* Flag to __log_openfile() */ -/* AUTOMATIC FLAG VALUE GENERATION STOP */ +#define WT_LOG_OPEN_CREATE_OK 0x1u /* Flag to __log_openfile() */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ /* * __wt_log_printf -- - * Write a text message to the log. + * Write a text message to the log. */ int __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) { - WT_DECL_RET; - va_list ap; + WT_DECL_RET; + va_list ap; - va_start(ap, format); - ret = __wt_log_vprintf(session, format, ap); - va_end(ap); - return (ret); + va_start(ap, format); + ret = __wt_log_vprintf(session, format, ap); + va_end(ap); + return (ret); } /* * __log_checksum_match -- - * Given a log record, return whether the checksum matches. + * Given a log record, return whether the checksum matches. */ static bool __log_checksum_match(WT_ITEM *buf, uint32_t reclen) { - WT_LOG_RECORD *logrec; - uint32_t checksum_saved, checksum_tmp; - bool checksum_matched; + WT_LOG_RECORD *logrec; + uint32_t checksum_saved, checksum_tmp; + bool checksum_matched; - logrec = buf->mem; - checksum_saved = checksum_tmp = logrec->checksum; + logrec = buf->mem; + checksum_saved = checksum_tmp = logrec->checksum; #ifdef WORDS_BIGENDIAN - checksum_tmp = __wt_bswap32(checksum_tmp); + checksum_tmp = __wt_bswap32(checksum_tmp); #endif - logrec->checksum = 0; - checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp); - logrec->checksum = checksum_saved; - return (checksum_matched); + logrec->checksum = 0; + checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp); + logrec->checksum = checksum_saved; + return (checksum_matched); } /* * __log_get_files -- - * Retrieve the list of all log-related files of the given prefix type. + * Retrieve the list of all log-related files of the given prefix type. */ static int -__log_get_files(WT_SESSION_IMPL *session, - const char *file_prefix, char ***filesp, u_int *countp) +__log_get_files(WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp) { - WT_CONNECTION_IMPL *conn; - const char *log_path; - - *countp = 0; - *filesp = NULL; - - conn = S2C(session); - log_path = conn->log_path; - if (log_path == NULL) - log_path = ""; - return (__wt_fs_directory_list( - session, log_path, file_prefix, filesp, countp)); + WT_CONNECTION_IMPL *conn; + const char *log_path; + + *countp = 0; + *filesp = NULL; + + conn = S2C(session); + log_path = conn->log_path; + if (log_path == NULL) + log_path = ""; + return (__wt_fs_directory_list(session, log_path, file_prefix, filesp, countp)); } /* * __log_get_files_single -- - * Retrieve a single log-related file of the given prefix type. + * Retrieve a single log-related file of the given prefix type. */ static int -__log_get_files_single(WT_SESSION_IMPL *session, - const char *file_prefix, char ***filesp, u_int *countp) +__log_get_files_single( + WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp) { - WT_CONNECTION_IMPL *conn; - const char *log_path; - - *countp = 0; - *filesp = NULL; - - conn = S2C(session); - log_path = conn->log_path; - if (log_path == NULL) - log_path = ""; - return (__wt_fs_directory_list_single( - session, log_path, file_prefix, filesp, countp)); + WT_CONNECTION_IMPL *conn; + const char *log_path; + + *countp = 0; + *filesp = NULL; + + conn = S2C(session); + log_path = conn->log_path; + if (log_path == NULL) + log_path = ""; + return (__wt_fs_directory_list_single(session, log_path, file_prefix, filesp, countp)); } /* * __log_prealloc_remove -- - * Remove all previously created pre-allocated files. + * Remove all previously created pre-allocated files. */ static int __log_prealloc_remove(WT_SESSION_IMPL *session) { - WT_DECL_RET; - WT_LOG *log; - uint32_t lognum; - u_int i, logcount; - char **logfiles; - - logfiles = NULL; - logcount = 0; - log = S2C(session)->log; - __wt_spin_lock(session, &log->log_fs_lock); - /* - * Clean up any old interim pre-allocated files. We clean - * up these files because settings may have changed upon reboot - * and we want those settings to take effect right away. - */ - WT_ERR(__log_get_files(session, - WT_LOG_TMPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum)); - } - WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); - WT_ERR(__log_get_files(session, - WT_LOG_PREPNAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum)); - } -err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - __wt_spin_unlock(session, &log->log_fs_lock); - return (ret); + WT_DECL_RET; + WT_LOG *log; + uint32_t lognum; + u_int i, logcount; + char **logfiles; + + logfiles = NULL; + logcount = 0; + log = S2C(session)->log; + __wt_spin_lock(session, &log->log_fs_lock); + /* + * Clean up any old interim pre-allocated files. We clean up these files because settings may + * have changed upon reboot and we want those settings to take effect right away. + */ + WT_ERR(__log_get_files(session, WT_LOG_TMPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum)); + } + WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); + WT_ERR(__log_get_files(session, WT_LOG_PREPNAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum)); + } +err: + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + __wt_spin_unlock(session, &log->log_fs_lock); + return (ret); } /* * __log_wait_for_earlier_slot -- - * Wait for write_lsn to catch up to this slot. + * Wait for write_lsn to catch up to this slot. */ static void __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - int yield_count; - - conn = S2C(session); - log = conn->log; - yield_count = 0; - - while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { - /* - * If we're on a locked path and the write LSN is not advancing, - * unlock in case an earlier thread is trying to switch its - * slot and complete its operation. - */ - if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) - __wt_spin_unlock(session, &log->log_slot_lock); - /* - * This may not be initialized if we are starting at an - * older log file version. So only signal if valid. - */ - if (conn->log_wrlsn_cond != NULL) - __wt_cond_signal(session, conn->log_wrlsn_cond); - if (++yield_count < WT_THOUSAND) - __wt_yield(); - else - __wt_cond_wait(session, log->log_write_cond, 200, NULL); - if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) - __wt_spin_lock(session, &log->log_slot_lock); - } + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int yield_count; + + conn = S2C(session); + log = conn->log; + yield_count = 0; + + while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { + /* + * If we're on a locked path and the write LSN is not advancing, unlock in case an earlier + * thread is trying to switch its slot and complete its operation. + */ + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) + __wt_spin_unlock(session, &log->log_slot_lock); + /* + * This may not be initialized if we are starting at an older log file version. So only + * signal if valid. + */ + if (conn->log_wrlsn_cond != NULL) + __wt_cond_signal(session, conn->log_wrlsn_cond); + if (++yield_count < WT_THOUSAND) + __wt_yield(); + else + __wt_cond_wait(session, log->log_write_cond, 200, NULL); + if (F_ISSET(session, WT_SESSION_LOCKED_SLOT)) + __wt_spin_lock(session, &log->log_slot_lock); + } } /* * __log_fs_read -- - * Wrapper when reading from a log file. + * Wrapper when reading from a log file. */ static int -__log_fs_read(WT_SESSION_IMPL *session, - WT_FH *fh, wt_off_t offset, size_t len, void *buf) +__log_fs_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) { - WT_DECL_RET; + WT_DECL_RET; - __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); - if ((ret = __wt_read(session, fh, offset, len, buf)) != 0) - WT_RET_MSG(session, ret, "%s: log read failure", fh->name); - return (ret); + __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); + if ((ret = __wt_read(session, fh, offset, len, buf)) != 0) + WT_RET_MSG(session, ret, "%s: log read failure", fh->name); + return (ret); } /* * __log_fs_write -- - * Wrapper when writing to a log file. If we're writing to a new log - * file for the first time wait for writes to the previous log file. + * Wrapper when writing to a log file. If we're writing to a new log file for the first time + * wait for writes to the previous log file. */ static int -__log_fs_write(WT_SESSION_IMPL *session, - WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) +__log_fs_write( + WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { - WT_DECL_RET; - - /* - * If we're writing into a new log file and we're running in - * compatibility mode to an older release, we have to wait for all - * writes to the previous log file to complete otherwise there could - * be a hole at the end of the previous log file that we cannot detect. - * - * NOTE: Check for a version less than the one writing the system - * record since we've had a log version change without any actual - * file format changes. - */ - if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM && - slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { - __log_wait_for_earlier_slot(session, slot); - WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); - } - __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); - if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) - WT_PANIC_RET(session, ret, - "%s: fatal log failure", slot->slot_fh->name); - return (ret); + WT_DECL_RET; + + /* + * If we're writing into a new log file and we're running in + * compatibility mode to an older release, we have to wait for all + * writes to the previous log file to complete otherwise there could + * be a hole at the end of the previous log file that we cannot detect. + * + * NOTE: Check for a version less than the one writing the system + * record since we've had a log version change without any actual + * file format changes. + */ + if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM && + slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { + __log_wait_for_earlier_slot(session, slot); + WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); + } + __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); + if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) + WT_PANIC_RET(session, ret, "%s: fatal log failure", slot->slot_fh->name); + return (ret); } /* * __wt_log_ckpt -- - * Record the given LSN as the checkpoint LSN and signal the archive - * thread as needed. + * Record the given LSN as the checkpoint LSN and signal the archive thread as needed. */ void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - int i; - - conn = S2C(session); - log = conn->log; - log->ckpt_lsn = *ckpt_lsn; - if (conn->log_cond != NULL) - __wt_cond_signal(session, conn->log_cond); - /* - * If we are storing debugging LSNs to retain additional log files - * from archiving, then rotate the newest LSN into the array. - */ - if (conn->debug_ckpt_cnt != 0) { - for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i) - conn->debug_ckpt[i] = conn->debug_ckpt[i - 1]; - conn->debug_ckpt[0] = *ckpt_lsn; - } + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + int i; + + conn = S2C(session); + log = conn->log; + log->ckpt_lsn = *ckpt_lsn; + if (conn->log_cond != NULL) + __wt_cond_signal(session, conn->log_cond); + /* + * If we are storing debugging LSNs to retain additional log files from archiving, then rotate + * the newest LSN into the array. + */ + if (conn->debug_ckpt_cnt != 0) { + for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i) + conn->debug_ckpt[i] = conn->debug_ckpt[i - 1]; + conn->debug_ckpt[0] = *ckpt_lsn; + } } /* * __wt_log_flush_lsn -- - * Force out buffered records and return the LSN, either the - * write_start_lsn or write_lsn depending on the argument. + * Force out buffered records and return the LSN, either the write_start_lsn or write_lsn + * depending on the argument. */ int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - WT_RET(__wt_log_force_write(session, 1, NULL)); - __wt_log_wrlsn(session, NULL); - if (start) - *lsn = log->write_start_lsn; - else - *lsn = log->write_lsn; - return (0); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + WT_RET(__wt_log_force_write(session, 1, NULL)); + __wt_log_wrlsn(session, NULL); + if (start) + *lsn = log->write_start_lsn; + else + *lsn = log->write_lsn; + return (0); } /* * __wt_log_background -- - * Record the given LSN as the background LSN and signal the - * thread as needed. + * Record the given LSN as the background LSN and signal the thread as needed. */ void __wt_log_background(WT_SESSION_IMPL *session, WT_LSN *lsn) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - /* - * If a thread already set the LSN to a bigger LSN, we're done. - */ - if (__wt_log_cmp(&session->bg_sync_lsn, lsn) > 0) - return; - session->bg_sync_lsn = *lsn; - - /* - * Advance the logging subsystem background sync LSN if - * needed. - */ - __wt_spin_lock(session, &log->log_sync_lock); - if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0) - log->bg_sync_lsn = *lsn; - __wt_spin_unlock(session, &log->log_sync_lock); - __wt_cond_signal(session, conn->log_file_cond); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + /* + * If a thread already set the LSN to a bigger LSN, we're done. + */ + if (__wt_log_cmp(&session->bg_sync_lsn, lsn) > 0) + return; + session->bg_sync_lsn = *lsn; + + /* + * Advance the logging subsystem background sync LSN if needed. + */ + __wt_spin_lock(session, &log->log_sync_lock); + if (__wt_log_cmp(lsn, &log->bg_sync_lsn) > 0) + log->bg_sync_lsn = *lsn; + __wt_spin_unlock(session, &log->log_sync_lock); + __wt_cond_signal(session, conn->log_file_cond); } /* * __wt_log_force_sync -- - * Force a sync of the log and files. + * Force a sync of the log and files. */ int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) { - WT_DECL_RET; - WT_FH *log_fh; - WT_LOG *log; - uint64_t fsync_duration_usecs, time_start, time_stop; - - log = S2C(session)->log; - log_fh = NULL; - - /* - * We need to wait for the previous log file to get written - * to disk before we sync out the current one and advance - * the LSN. Signal the worker thread because we know the - * LSN has moved into a later log file and there should be a - * log file ready to close. - */ - while (log->sync_lsn.l.file < min_lsn->l.file) { - __wt_cond_signal(session, S2C(session)->log_file_cond); - __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); - } - __wt_spin_lock(session, &log->log_sync_lock); - WT_ASSERT(session, log->log_dir_fh != NULL); - /* - * Sync the directory if the log file entry hasn't been written - * into the directory. - */ - if (log->sync_dir_lsn.l.file < min_lsn->l.file) { - __wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync directory %s to LSN %" PRIu32 - "/%" PRIu32, - log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset); - time_start = __wt_clock(session); - WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - time_stop = __wt_clock(session); - fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); - log->sync_dir_lsn = *min_lsn; - WT_STAT_CONN_INCR(session, log_sync_dir); - WT_STAT_CONN_INCRV(session, - log_sync_dir_duration, fsync_duration_usecs); - } - /* - * Sync the log file if needed. - */ - if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { - /* - * Get our own file handle to the log file. It is possible - * for the file handle in the log structure to change out - * from under us and either be NULL or point to a different - * file than we want. - */ - WT_ERR(__log_openfile(session, min_lsn->l.file, 0, &log_fh)); - __wt_verbose(session, WT_VERB_LOG, - "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, - log_fh->name, min_lsn->l.file, min_lsn->l.offset); - time_start = __wt_clock(session); - WT_ERR(__wt_fsync(session, log_fh, true)); - time_stop = __wt_clock(session); - fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); - log->sync_lsn = *min_lsn; - WT_STAT_CONN_INCR(session, log_sync); - WT_STAT_CONN_INCRV(session, - log_sync_duration, fsync_duration_usecs); - __wt_cond_signal(session, log->log_sync_cond); - } + WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; + uint64_t fsync_duration_usecs, time_start, time_stop; + + log = S2C(session)->log; + log_fh = NULL; + + /* + * We need to wait for the previous log file to get written to disk before we sync out the + * current one and advance the LSN. Signal the worker thread because we know the LSN has moved + * into a later log file and there should be a log file ready to close. + */ + while (log->sync_lsn.l.file < min_lsn->l.file) { + __wt_cond_signal(session, S2C(session)->log_file_cond); + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); + } + __wt_spin_lock(session, &log->log_sync_lock); + WT_ASSERT(session, log->log_dir_fh != NULL); + /* + * Sync the directory if the log file entry hasn't been written into the directory. + */ + if (log->sync_dir_lsn.l.file < min_lsn->l.file) { + __wt_verbose(session, WT_VERB_LOG, + "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, + min_lsn->l.file, min_lsn->l.offset); + time_start = __wt_clock(session); + WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); + log->sync_dir_lsn = *min_lsn; + WT_STAT_CONN_INCR(session, log_sync_dir); + WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs); + } + /* + * Sync the log file if needed. + */ + if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { + /* + * Get our own file handle to the log file. It is possible for the file handle in the log + * structure to change out from under us and either be NULL or point to a different file + * than we want. + */ + WT_ERR(__log_openfile(session, min_lsn->l.file, 0, &log_fh)); + __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, + log_fh->name, min_lsn->l.file, min_lsn->l.offset); + time_start = __wt_clock(session); + WT_ERR(__wt_fsync(session, log_fh, true)); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); + log->sync_lsn = *min_lsn; + WT_STAT_CONN_INCR(session, log_sync); + WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs); + __wt_cond_signal(session, log->log_sync_cond); + } err: - __wt_spin_unlock(session, &log->log_sync_lock); - if (log_fh != NULL) - WT_TRET(__wt_close(session, &log_fh)); - return (ret); + __wt_spin_unlock(session, &log->log_sync_lock); + if (log_fh != NULL) + WT_TRET(__wt_close(session, &log_fh)); + return (ret); } /* * __wt_log_needs_recovery -- - * Return 0 if we encounter a clean shutdown and 1 if recovery - * must be run in the given variable. + * Return 0 if we encounter a clean shutdown and 1 if recovery must be run in the given + * variable. */ int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp) { - WT_CONNECTION_IMPL *conn; - WT_CURSOR *c; - WT_DECL_RET; - WT_ITEM dummy_key, dummy_value; - WT_LOG *log; - uint64_t dummy_txnid; - uint32_t dummy_fileid, dummy_optype, rectype; - - /* - * Default is to run recovery always (regardless of whether this - * connection has logging enabled). - */ - *recp = true; - - conn = S2C(session); - log = conn->log; - - if (log == NULL) - return (0); - - /* - * See if there are any data modification records between the - * checkpoint LSN and the end of the log. If there are none then - * we can skip recovery. - */ - WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); - c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0); - if ((ret = c->search(c)) == 0) { - while ((ret = c->next(c)) == 0) { - /* - * The only thing we care about is the rectype. - */ - WT_ERR(c->get_value(c, &dummy_txnid, &rectype, - &dummy_optype, &dummy_fileid, - &dummy_key, &dummy_value)); - if (rectype == WT_LOGREC_COMMIT) - break; - } - /* - * If we get to the end of the log, we can skip recovery. - */ - if (ret == WT_NOTFOUND) { - *recp = false; - ret = 0; - } - } else if (ret == WT_NOTFOUND) - /* - * We should always find the checkpoint LSN as it now points - * to the beginning of a written log record. But if we're - * running recovery on an earlier database we may not. In - * that case, we need to run recovery, don't return an error. - */ - ret = 0; - else - WT_ERR(ret); - -err: WT_TRET(c->close(c)); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_CURSOR *c; + WT_DECL_RET; + WT_ITEM dummy_key, dummy_value; + WT_LOG *log; + uint64_t dummy_txnid; + uint32_t dummy_fileid, dummy_optype, rectype; + + /* + * Default is to run recovery always (regardless of whether this connection has logging + * enabled). + */ + *recp = true; + + conn = S2C(session); + log = conn->log; + + if (log == NULL) + return (0); + + /* + * See if there are any data modification records between the checkpoint LSN and the end of the + * log. If there are none then we can skip recovery. + */ + WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); + c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0); + if ((ret = c->search(c)) == 0) { + while ((ret = c->next(c)) == 0) { + /* + * The only thing we care about is the rectype. + */ + WT_ERR(c->get_value( + c, &dummy_txnid, &rectype, &dummy_optype, &dummy_fileid, &dummy_key, &dummy_value)); + if (rectype == WT_LOGREC_COMMIT) + break; + } + /* + * If we get to the end of the log, we can skip recovery. + */ + if (ret == WT_NOTFOUND) { + *recp = false; + ret = 0; + } + } else if (ret == WT_NOTFOUND) + /* + * We should always find the checkpoint LSN as it now points to the beginning of a written + * log record. But if we're running recovery on an earlier database we may not. In that + * case, we need to run recovery, don't return an error. + */ + ret = 0; + else + WT_ERR(ret); + +err: + WT_TRET(c->close(c)); + return (ret); } /* * __wt_log_written_reset -- - * Interface to reset the amount of log written during this - * checkpoint period. Called from the checkpoint code. + * Interface to reset the amount of log written during this checkpoint period. Called from the + * checkpoint code. */ void __wt_log_written_reset(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; + WT_CONNECTION_IMPL *conn; - conn = S2C(session); + conn = S2C(session); - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - conn->log->log_written = 0; + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + conn->log->log_written = 0; } /* * __wt_log_get_backup_files -- - * Retrieve the list of log files for taking a backup, either all of them - * or only the active ones (those that are not candidates for archiving). - * The caller is responsible for freeing the directory list returned. + * Retrieve the list of log files for taking a backup, either all of them or only the active + * ones (those that are not candidates for archiving). The caller is responsible for freeing the + * directory list returned. */ int -__wt_log_get_backup_files(WT_SESSION_IMPL *session, - char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) +__wt_log_get_backup_files( + WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) { - WT_DECL_RET; - WT_LOG *log; - uint32_t id, max, max_file, min_file; - u_int count, i; - char **files; - - *filesp = NULL; - *countp = 0; - *maxid = 0; - - id = 0; - log = S2C(session)->log; - - /* - * Capture the next file utilized for writing to the log, before forcing - * a new log file. This represents the latest journal file that needs to - * be copied. Note the checkpoint selected for backup may be writing to - * an even later log file. In that case, copying the journal files is - * correct, but wasteful. - */ - max_file = log->alloc_lsn.l.file; - - /* - * Capture the journal file the current checkpoint started in. The - * current checkpoint or a later one may be selected for backing up, - * requiring log files as early as this file. Together with max_file, - * this defines the range of journal files to include. - */ - min_file = log->ckpt_lsn.l.file; - - /* - * Force the current slot to get written to the file. Also switch to - * using a new log file. That log file will be removed from the list of - * files returned. New writes will not be included in the backup. - */ - if (active_only) - F_SET(log, WT_LOG_FORCE_NEWFILE); - WT_RET(__wt_log_force_write(session, 1, NULL)); - WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); - - for (max = 0, i = 0; i < count; ) { - WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); - if (active_only && - (id < min_file || id > max_file)) { - /* - * Any files not being returned are individually freed - * and the array adjusted. - */ - __wt_free(session, files[i]); - files[i] = files[count - 1]; - files[--count] = NULL; - } else { - if (id > max) - max = id; - i++; - } - } - - *maxid = max; - *filesp = files; - *countp = count; - - /* - * Only free on error. The caller is responsible for calling free - * once it is done using the returned list. - */ - if (0) { -err: WT_TRET(__wt_fs_directory_list_free(session, &files, count)); - } - return (ret); + WT_DECL_RET; + WT_LOG *log; + uint32_t id, max, max_file, min_file; + u_int count, i; + char **files; + + *filesp = NULL; + *countp = 0; + *maxid = 0; + + id = 0; + log = S2C(session)->log; + + /* + * Capture the next file utilized for writing to the log, before forcing a new log file. This + * represents the latest journal file that needs to be copied. Note the checkpoint selected for + * backup may be writing to an even later log file. In that case, copying the journal files is + * correct, but wasteful. + */ + max_file = log->alloc_lsn.l.file; + + /* + * Capture the journal file the current checkpoint started in. The current checkpoint or a later + * one may be selected for backing up, requiring log files as early as this file. Together with + * max_file, this defines the range of journal files to include. + */ + min_file = log->ckpt_lsn.l.file; + + /* + * Force the current slot to get written to the file. Also switch to using a new log file. That + * log file will be removed from the list of files returned. New writes will not be included in + * the backup. + */ + if (active_only) + F_SET(log, WT_LOG_FORCE_NEWFILE); + WT_RET(__wt_log_force_write(session, 1, NULL)); + WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); + + for (max = 0, i = 0; i < count;) { + WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); + if (active_only && (id < min_file || id > max_file)) { + /* + * Any files not being returned are individually freed and the array adjusted. + */ + __wt_free(session, files[i]); + files[i] = files[count - 1]; + files[--count] = NULL; + } else { + if (id > max) + max = id; + i++; + } + } + + *maxid = max; + *filesp = files; + *countp = count; + + /* + * Only free on error. The caller is responsible for calling free once it is done using the + * returned list. + */ + if (0) { +err: + WT_TRET(__wt_fs_directory_list_free(session, &files, count)); + } + return (ret); } /* * __log_filename -- - * Given a log number, return a WT_ITEM of a generated log file name - * of the given prefix type. + * Given a log number, return a WT_ITEM of a generated log file name of the given prefix type. */ static int -__log_filename(WT_SESSION_IMPL *session, - uint32_t id, const char *file_prefix, WT_ITEM *buf) +__log_filename(WT_SESSION_IMPL *session, uint32_t id, const char *file_prefix, WT_ITEM *buf) { - return (__wt_filename_construct(session, - S2C(session)->log_path, file_prefix, UINTMAX_MAX, id, buf)); + return ( + __wt_filename_construct(session, S2C(session)->log_path, file_prefix, UINTMAX_MAX, id, buf)); } /* * __wt_log_extract_lognum -- - * Given a log file name, extract out the log number. + * Given a log file name, extract out the log number. */ int -__wt_log_extract_lognum( - WT_SESSION_IMPL *session, const char *name, uint32_t *id) +__wt_log_extract_lognum(WT_SESSION_IMPL *session, const char *name, uint32_t *id) { - const char *p; - - if (id == NULL || name == NULL) - WT_RET_MSG(session, EINVAL, - "unexpected usage: no id or no name"); - if ((p = strrchr(name, '.')) == NULL || - /* NOLINTNEXTLINE(cert-err34-c) */ - sscanf(++p, "%" SCNu32, id) != 1) - WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name); - return (0); + const char *p; + + if (id == NULL || name == NULL) + WT_RET_MSG(session, EINVAL, "unexpected usage: no id or no name"); + if ((p = strrchr(name, '.')) == NULL || + /* NOLINTNEXTLINE(cert-err34-c) */ + sscanf(++p, "%" SCNu32, id) != 1) + WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name); + return (0); } /* * __wt_log_reset -- - * Reset the existing log file to after the given file number. - * Called from recovery when toggling logging back on, it was off - * the previous open but it was on earlier before that toggle. + * Reset the existing log file to after the given file number. Called from recovery when + * toggling logging back on, it was off the previous open but it was on earlier before that + * toggle. */ int __wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - uint32_t old_lognum; - u_int i, logcount; - char **logfiles; - - conn = S2C(session); - log = conn->log; - - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || - log->fileid > lognum) - return (0); - - WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING)); - WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); - /* - * We know we're single threaded and called from recovery only when - * toggling logging back on. Therefore the only log files we have are - * old and outdated and the new one created when logging opened before - * recovery. We have to remove all old log files first and then create - * the new one so that log file numbers are contiguous in the file - * system. - */ - WT_RET(__wt_close(session, &log->log_fh)); - WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum( - session, logfiles[i], &old_lognum)); - WT_ASSERT(session, old_lognum < lognum || lognum == 1); - WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum)); - } - log->fileid = lognum; - - /* Send in true to update connection creation LSNs. */ - WT_WITH_SLOT_LOCK(session, log, - ret = __log_newfile(session, true, NULL)); - WT_ERR(__wt_log_slot_init(session, false)); -err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + uint32_t old_lognum; + u_int i, logcount; + char **logfiles; + + conn = S2C(session); + log = conn->log; + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || log->fileid > lognum) + return (0); + + WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING)); + WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); + /* + * We know we're single threaded and called from recovery only when toggling logging back on. + * Therefore the only log files we have are old and outdated and the new one created when + * logging opened before recovery. We have to remove all old log files first and then create the + * new one so that log file numbers are contiguous in the file system. + */ + WT_RET(__wt_close(session, &log->log_fh)); + WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &old_lognum)); + WT_ASSERT(session, old_lognum < lognum || lognum == 1); + WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum)); + } + log->fileid = lognum; + + /* Send in true to update connection creation LSNs. */ + WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL)); + WT_ERR(__wt_log_slot_init(session, false)); +err: + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + return (ret); } /* * __log_prealloc -- - * Pre-allocate a log file. + * Pre-allocate a log file. */ static int __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - - /* - * If the user configured zero filling, pre-allocate the log file - * manually. Otherwise use the file extension method to create - * and zero the log file based on what is available. - */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL)) - return (__wt_file_zero(session, fh, - log->first_record, conn->log_file_max)); - - /* If configured to not extend the file, we're done. */ - if (conn->log_extend_len == 0) - return (0); - - /* - * We have exclusive access to the log file and there are no other - * writes happening concurrently, so there are no locking issues. - */ - ret = __wt_fextend(session, fh, conn->log_extend_len); - return (ret == EBUSY || ret == ENOTSUP ? 0 : ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + + /* + * If the user configured zero filling, pre-allocate the log file manually. Otherwise use the + * file extension method to create and zero the log file based on what is available. + */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL)) + return (__wt_file_zero(session, fh, log->first_record, conn->log_file_max)); + + /* If configured to not extend the file, we're done. */ + if (conn->log_extend_len == 0) + return (0); + + /* + * We have exclusive access to the log file and there are no other writes happening + * concurrently, so there are no locking issues. + */ + ret = __wt_fextend(session, fh, conn->log_extend_len); + return (ret == EBUSY || ret == ENOTSUP ? 0 : ret); } /* * __log_size_fit -- - * Return whether or not recsize will fit in the log file. + * Return whether or not recsize will fit in the log file. */ static int __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; + WT_CONNECTION_IMPL *conn; + WT_LOG *log; - conn = S2C(session); - log = conn->log; - return (lsn->l.offset == log->first_record || - lsn->l.offset + (wt_off_t)recsize < conn->log_file_max); + conn = S2C(session); + log = conn->log; + return ( + lsn->l.offset == log->first_record || lsn->l.offset + (wt_off_t)recsize < conn->log_file_max); } /* * __log_decompress -- - * Decompress a log record. + * Decompress a log record. */ static int __log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) { - WT_COMPRESSOR *compressor; - WT_CONNECTION_IMPL *conn; - WT_LOG_RECORD *logrec; - size_t result_len, skip; - uint32_t uncompressed_size; - - conn = S2C(session); - logrec = (WT_LOG_RECORD *)in->mem; - skip = WT_LOG_COMPRESS_SKIP; - compressor = conn->log_compressor; - if (compressor == NULL || compressor->decompress == NULL) - WT_RET_MSG(session, WT_ERROR, - "Compressed record with no configured compressor"); - uncompressed_size = logrec->mem_len; - WT_RET(__wt_buf_initsize(session, out, uncompressed_size)); - memcpy(out->mem, in->mem, skip); - WT_RET(compressor->decompress(compressor, &session->iface, - (uint8_t *)in->mem + skip, in->size - skip, - (uint8_t *)out->mem + skip, - uncompressed_size - skip, &result_len)); - - /* - * If checksums were turned off because we're depending on the - * decompression to fail on any corrupted data, we'll end up - * here after corruption happens. If we're salvaging the file, - * it's OK, otherwise it's really, really bad. - */ - if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP) - WT_RET_MSG(session, WT_ERROR, - "decompression failed with incorrect size"); - - return (0); + WT_COMPRESSOR *compressor; + WT_CONNECTION_IMPL *conn; + WT_LOG_RECORD *logrec; + size_t result_len, skip; + uint32_t uncompressed_size; + + conn = S2C(session); + logrec = (WT_LOG_RECORD *)in->mem; + skip = WT_LOG_COMPRESS_SKIP; + compressor = conn->log_compressor; + if (compressor == NULL || compressor->decompress == NULL) + WT_RET_MSG(session, WT_ERROR, "Compressed record with no configured compressor"); + uncompressed_size = logrec->mem_len; + WT_RET(__wt_buf_initsize(session, out, uncompressed_size)); + memcpy(out->mem, in->mem, skip); + WT_RET(compressor->decompress(compressor, &session->iface, (uint8_t *)in->mem + skip, + in->size - skip, (uint8_t *)out->mem + skip, uncompressed_size - skip, &result_len)); + + /* + * If checksums were turned off because we're depending on the decompression to fail on any + * corrupted data, we'll end up here after corruption happens. If we're salvaging the file, it's + * OK, otherwise it's really, really bad. + */ + if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP) + WT_RET_MSG(session, WT_ERROR, "decompression failed with incorrect size"); + + return (0); } /* * __log_decrypt -- - * Decrypt a log record. + * Decrypt a log record. */ static int __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) { - WT_CONNECTION_IMPL *conn; - WT_ENCRYPTOR *encryptor; - WT_KEYED_ENCRYPTOR *kencryptor; - - conn = S2C(session); - kencryptor = conn->kencryptor; - if (kencryptor == NULL || - (encryptor = kencryptor->encryptor) == NULL || - encryptor->decrypt == NULL) - WT_RET_MSG(session, WT_ERROR, - "Encrypted record with no configured decrypt method"); - - return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out)); + WT_CONNECTION_IMPL *conn; + WT_ENCRYPTOR *encryptor; + WT_KEYED_ENCRYPTOR *kencryptor; + + conn = S2C(session); + kencryptor = conn->kencryptor; + if (kencryptor == NULL || (encryptor = kencryptor->encryptor) == NULL || + encryptor->decrypt == NULL) + WT_RET_MSG(session, WT_ERROR, "Encrypted record with no configured decrypt method"); + + return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out)); } /* * __wt_log_fill -- - * Copy a thread's log records into the assigned slot. + * Copy a thread's log records into the assigned slot. */ int -__wt_log_fill(WT_SESSION_IMPL *session, - WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp) +__wt_log_fill( + WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp) { - WT_DECL_RET; - - /* - * Call write or copy into the buffer. For now the offset is the - * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this - * is where we would multiply by WT_LOG_ALIGN to get the real file byte - * offset for write(). - */ - if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) - memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, - record->mem, record->size); - else - /* - * If this is a force or unbuffered write, write it now. - */ - WT_ERR(__log_fs_write(session, myslot->slot, - myslot->offset + myslot->slot->slot_start_offset, - record->size, record->mem)); - - WT_STAT_CONN_INCRV(session, log_bytes_written, record->size); - if (lsnp != NULL) { - *lsnp = myslot->slot->slot_start_lsn; - lsnp->l.offset += (uint32_t)myslot->offset; - } + WT_DECL_RET; + + /* + * Call write or copy into the buffer. For now the offset is the real byte offset. If the offset + * becomes a unit of WT_LOG_ALIGN this is where we would multiply by WT_LOG_ALIGN to get the + * real file byte offset for write(). + */ + if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) + memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, record->mem, record->size); + else + /* + * If this is a force or unbuffered write, write it now. + */ + WT_ERR(__log_fs_write(session, myslot->slot, + myslot->offset + myslot->slot->slot_start_offset, record->size, record->mem)); + + WT_STAT_CONN_INCRV(session, log_bytes_written, record->size); + if (lsnp != NULL) { + *lsnp = myslot->slot->slot_start_lsn; + lsnp->l.offset += (uint32_t)myslot->offset; + } err: - if (ret != 0 && myslot->slot->slot_error == 0) - myslot->slot->slot_error = ret; - return (ret); + if (ret != 0 && myslot->slot->slot_error == 0) + myslot->slot->slot_error = ret; + return (ret); } /* * __log_file_header -- - * Create and write a log file header into a file handle. If writing - * into the main log, it will be called locked. If writing into a - * pre-allocated log, it will be called unlocked. + * Create and write a log file header into a file handle. If writing into the main log, it will + * be called locked. If writing into a pre-allocated log, it will be called unlocked. */ static int -__log_file_header( - WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc) +__log_file_header(WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(buf); - WT_DECL_RET; - WT_LOG *log; - WT_LOGSLOT tmp; - WT_LOG_DESC *desc; - WT_LOG_RECORD *logrec; - WT_MYSLOT myslot; - - conn = S2C(session); - log = conn->log; - - /* - * Set up the log descriptor record. Use a scratch buffer to - * get correct alignment for direct I/O. - */ - WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize); - WT_RET(__wt_scr_alloc(session, log->allocsize, &buf)); - memset(buf->mem, 0, log->allocsize); - buf->size = log->allocsize; - - logrec = (WT_LOG_RECORD *)buf->mem; - desc = (WT_LOG_DESC *)logrec->record; - desc->log_magic = WT_LOG_MAGIC; - desc->version = log->log_version; - desc->log_size = (uint64_t)conn->log_file_max; - __wt_log_desc_byteswap(desc); - - /* - * Now that the record is set up, initialize the record header. - * - * Checksum a little-endian version of the header, and write everything - * in little-endian format. The checksum is (potentially) returned in a - * big-endian format, swap it into place in a separate step. - */ - logrec->len = log->allocsize; - logrec->checksum = 0; - __wt_log_record_byteswap(logrec); - logrec->checksum = __wt_checksum(logrec, log->allocsize); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_LOG *log; + WT_LOGSLOT tmp; + WT_LOG_DESC *desc; + WT_LOG_RECORD *logrec; + WT_MYSLOT myslot; + + conn = S2C(session); + log = conn->log; + + /* + * Set up the log descriptor record. Use a scratch buffer to get correct alignment for direct + * I/O. + */ + WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize); + WT_RET(__wt_scr_alloc(session, log->allocsize, &buf)); + memset(buf->mem, 0, log->allocsize); + buf->size = log->allocsize; + + logrec = (WT_LOG_RECORD *)buf->mem; + desc = (WT_LOG_DESC *)logrec->record; + desc->log_magic = WT_LOG_MAGIC; + desc->version = log->log_version; + desc->log_size = (uint64_t)conn->log_file_max; + __wt_log_desc_byteswap(desc); + + /* + * Now that the record is set up, initialize the record header. + * + * Checksum a little-endian version of the header, and write everything + * in little-endian format. The checksum is (potentially) returned in a + * big-endian format, swap it into place in a separate step. + */ + logrec->len = log->allocsize; + logrec->checksum = 0; + __wt_log_record_byteswap(logrec); + logrec->checksum = __wt_checksum(logrec, log->allocsize); #ifdef WORDS_BIGENDIAN - logrec->checksum = __wt_bswap32(logrec->checksum); + logrec->checksum = __wt_bswap32(logrec->checksum); #endif - WT_CLEAR(tmp); - memset(&myslot, 0, sizeof(myslot)); - myslot.slot = &tmp; - - /* - * We may recursively call __wt_log_acquire to allocate log space for - * the log descriptor record. Call __wt_log_fill to write it, but we - * do not need to call __wt_log_release because we're not waiting for - * any earlier operations to complete. - */ - if (prealloc) { - WT_ASSERT(session, fh != NULL); - tmp.slot_fh = fh; - } else { - WT_ASSERT(session, fh == NULL); - WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp)); - } - WT_ERR(__wt_log_fill(session, &myslot, true, buf, NULL)); - /* - * Make sure the header gets to disk. - */ - WT_ERR(__wt_fsync(session, tmp.slot_fh, true)); - if (end_lsn != NULL) - *end_lsn = tmp.slot_end_lsn; - -err: __wt_scr_free(session, &buf); - return (ret); + WT_CLEAR(tmp); + memset(&myslot, 0, sizeof(myslot)); + myslot.slot = &tmp; + + /* + * We may recursively call __wt_log_acquire to allocate log space for the log descriptor record. + * Call __wt_log_fill to write it, but we do not need to call __wt_log_release because we're not + * waiting for any earlier operations to complete. + */ + if (prealloc) { + WT_ASSERT(session, fh != NULL); + tmp.slot_fh = fh; + } else { + WT_ASSERT(session, fh == NULL); + WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp)); + } + WT_ERR(__wt_log_fill(session, &myslot, true, buf, NULL)); + /* + * Make sure the header gets to disk. + */ + WT_ERR(__wt_fsync(session, tmp.slot_fh, true)); + if (end_lsn != NULL) + *end_lsn = tmp.slot_end_lsn; + +err: + __wt_scr_free(session, &buf); + return (ret); } /* * __log_openfile -- - * Open a log file with the given log file number and return the WT_FH. + * Open a log file with the given log file number and return the WT_FH. */ static int -__log_openfile( - WT_SESSION_IMPL *session, uint32_t id, uint32_t flags, WT_FH **fhp) +__log_openfile(WT_SESSION_IMPL *session, uint32_t id, uint32_t flags, WT_FH **fhp) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(buf); - WT_DECL_RET; - u_int wtopen_flags; - - conn = S2C(session); - WT_RET(__wt_scr_alloc(session, 0, &buf)); - /* - * If we are creating the file then we use a temporary file name. - * Otherwise it is a log file name. - */ - if (LF_ISSET(WT_LOG_OPEN_CREATE_OK)) { - wtopen_flags = WT_FS_OPEN_CREATE; - WT_ERR(__log_filename(session, id, WT_LOG_TMPNAME, buf)); - } else { - wtopen_flags = 0; - WT_ERR(__log_filename(session, id, WT_LOG_FILENAME, buf)); - } - __wt_verbose(session, WT_VERB_LOG, - "opening log %s", (const char *)buf->data); - if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) - FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO); - WT_ERR(__wt_open( - session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp)); -err: __wt_scr_free(session, &buf); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(buf); + WT_DECL_RET; + u_int wtopen_flags; + + conn = S2C(session); + WT_RET(__wt_scr_alloc(session, 0, &buf)); + /* + * If we are creating the file then we use a temporary file name. Otherwise it is a log file + * name. + */ + if (LF_ISSET(WT_LOG_OPEN_CREATE_OK)) { + wtopen_flags = WT_FS_OPEN_CREATE; + WT_ERR(__log_filename(session, id, WT_LOG_TMPNAME, buf)); + } else { + wtopen_flags = 0; + WT_ERR(__log_filename(session, id, WT_LOG_FILENAME, buf)); + } + __wt_verbose(session, WT_VERB_LOG, "opening log %s", (const char *)buf->data); + if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) + FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO); + WT_ERR(__wt_open(session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp)); +err: + __wt_scr_free(session, &buf); + return (ret); } /* * __log_open_verify -- - * Open a log file with the given log file number, verify its - * header and return various pieces of system information about - * this log file. + * Open a log file with the given log file number, verify its header and return various pieces + * of system information about this log file. */ static int -__log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, - WT_LSN *lsnp, uint16_t *versionp, bool *need_salvagep) +__log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, WT_LSN *lsnp, + uint16_t *versionp, bool *need_salvagep) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(buf); - WT_DECL_RET; - WT_FH *fh; - WT_LOG *log; - WT_LOG_DESC *desc; - WT_LOG_RECORD *logrec; - uint32_t allocsize, rectype; - const uint8_t *end, *p; - bool need_salvage, salvage_mode; - - conn = S2C(session); - fh = NULL; - log = conn->log; - need_salvage = false; - WT_RET(__wt_scr_alloc(session, 0, &buf)); - salvage_mode = (need_salvagep != NULL && - F_ISSET(conn, WT_CONN_SALVAGE)); - - if (log == NULL) - allocsize = WT_LOG_ALIGN; - else - allocsize = log->allocsize; - if (lsnp != NULL) - WT_ZERO_LSN(lsnp); - WT_ERR(__wt_buf_grow(session, buf, allocsize)); - memset(buf->mem, 0, allocsize); - - /* - * Any operation that fails from here on out indicates corruption - * that could be salvaged. - */ - need_salvage = true; - - /* - * Read in the log file header and verify it. - */ - WT_ERR(__log_openfile(session, id, 0, &fh)); - WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem)); - logrec = (WT_LOG_RECORD *)buf->mem; - __wt_log_record_byteswap(logrec); - desc = (WT_LOG_DESC *)logrec->record; - __wt_log_desc_byteswap(desc); - if (desc->log_magic != WT_LOG_MAGIC) { - if (salvage_mode) - WT_ERR_MSG(session, WT_ERROR, - "log file %s corrupted: Bad magic number %" PRIu32, - fh->name, desc->log_magic); - else - WT_PANIC_RET(session, WT_ERROR, - "log file %s corrupted: Bad magic number %" PRIu32, - fh->name, desc->log_magic); - } - /* - * We cannot read future log file formats. - */ - if (desc->version > WT_LOG_VERSION) - WT_ERR_MSG(session, WT_ERROR, - "unsupported WiredTiger file version: this build" - " only supports versions up to %d," - " and the file is version %" PRIu16, - WT_LOG_VERSION, desc->version); - - /* - * We error if the log version is less than the required minimum or - * larger than the required maximum. - */ - if (conn->req_max_major != WT_CONN_COMPAT_NONE && - desc->version > conn->log_req_max) - WT_ERR_MSG(session, WT_ERROR, - WT_COMPAT_MSG_PREFIX - "unsupported WiredTiger file version: this build" - " requires a maximum version of %" PRIu16 "," - " and the file is version %" PRIu16, - conn->log_req_max, desc->version); - - if (conn->req_min_major != WT_CONN_COMPAT_NONE && - desc->version < conn->log_req_min) - WT_ERR_MSG(session, WT_ERROR, - WT_COMPAT_MSG_PREFIX - "unsupported WiredTiger file version: this build" - " requires a minimum version of %" PRIu16 "," - " and the file is version %" PRIu16, - conn->log_req_min, desc->version); - - /* - * Set up the return values since the header is valid. - */ - if (versionp != NULL) - *versionp = desc->version; - - /* - * Skip reading in the previous LSN if log file is an old version - * or if the caller doesn't care about the LSN. Otherwise read that - * record in and set up the LSN. We already have a buffer that is - * the correct size. Reuse it. - */ - if (lsnp == NULL || - (desc->version < WT_LOG_VERSION_SYSTEM)) - goto err; - - memset(buf->mem, 0, allocsize); - WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem)); - logrec = (WT_LOG_RECORD *)buf->mem; - /* - * We have a valid header but the system record is not there. - * The log ends here. Return without setting the LSN. - */ - if (logrec->len == 0) { - __wt_verbose(session, WT_VERB_LOG, - "Log %s found empty log after header", fh->name); - goto err; - } - - if (!__log_checksum_match(buf, allocsize)) - WT_ERR_MSG(session, WT_ERROR, - "%s: System log record checksum mismatch", fh->name); - __wt_log_record_byteswap(logrec); - p = WT_LOG_SKIP_HEADER(buf->data); - end = (const uint8_t *)buf->data + allocsize; - WT_ERR(__wt_logrec_read(session, &p, end, &rectype)); - if (rectype != WT_LOGREC_SYSTEM) - WT_ERR_MSG(session, WT_ERROR, "System log record missing"); - WT_ERR(__wt_log_recover_system(session, &p, end, lsnp)); - -err: __wt_scr_free(session, &buf); - - /* - * Return the file handle if needed, otherwise close it. - */ - if (fhp != NULL && ret == 0) - *fhp = fh; - else if (ret != 0 && need_salvage && salvage_mode) { - /* Let the caller know this file must be salvaged. */ - ret = 0; - WT_TRET(__wt_close(session, &fh)); - if (fhp != NULL) - *fhp = NULL; - *need_salvagep = true; - } else - WT_TRET(__wt_close(session, &fh)); - - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(buf); + WT_DECL_RET; + WT_FH *fh; + WT_LOG *log; + WT_LOG_DESC *desc; + WT_LOG_RECORD *logrec; + uint32_t allocsize, rectype; + const uint8_t *end, *p; + bool need_salvage, salvage_mode; + + conn = S2C(session); + fh = NULL; + log = conn->log; + need_salvage = false; + WT_RET(__wt_scr_alloc(session, 0, &buf)); + salvage_mode = (need_salvagep != NULL && F_ISSET(conn, WT_CONN_SALVAGE)); + + if (log == NULL) + allocsize = WT_LOG_ALIGN; + else + allocsize = log->allocsize; + if (lsnp != NULL) + WT_ZERO_LSN(lsnp); + WT_ERR(__wt_buf_grow(session, buf, allocsize)); + memset(buf->mem, 0, allocsize); + + /* + * Any operation that fails from here on out indicates corruption that could be salvaged. + */ + need_salvage = true; + + /* + * Read in the log file header and verify it. + */ + WT_ERR(__log_openfile(session, id, 0, &fh)); + WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem)); + logrec = (WT_LOG_RECORD *)buf->mem; + __wt_log_record_byteswap(logrec); + desc = (WT_LOG_DESC *)logrec->record; + __wt_log_desc_byteswap(desc); + if (desc->log_magic != WT_LOG_MAGIC) { + if (salvage_mode) + WT_ERR_MSG(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32, + fh->name, desc->log_magic); + else + WT_PANIC_RET(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32, + fh->name, desc->log_magic); + } + /* + * We cannot read future log file formats. + */ + if (desc->version > WT_LOG_VERSION) + WT_ERR_MSG(session, WT_ERROR, + "unsupported WiredTiger file version: this build" + " only supports versions up to %d," + " and the file is version %" PRIu16, + WT_LOG_VERSION, desc->version); + + /* + * We error if the log version is less than the required minimum or larger than the required + * maximum. + */ + if (conn->req_max_major != WT_CONN_COMPAT_NONE && desc->version > conn->log_req_max) + WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX + "unsupported WiredTiger file version: this build" + " requires a maximum version of %" PRIu16 + "," + " and the file is version %" PRIu16, + conn->log_req_max, desc->version); + + if (conn->req_min_major != WT_CONN_COMPAT_NONE && desc->version < conn->log_req_min) + WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX + "unsupported WiredTiger file version: this build" + " requires a minimum version of %" PRIu16 + "," + " and the file is version %" PRIu16, + conn->log_req_min, desc->version); + + /* + * Set up the return values since the header is valid. + */ + if (versionp != NULL) + *versionp = desc->version; + + /* + * Skip reading in the previous LSN if log file is an old version or if the caller doesn't care + * about the LSN. Otherwise read that record in and set up the LSN. We already have a buffer + * that is the correct size. Reuse it. + */ + if (lsnp == NULL || (desc->version < WT_LOG_VERSION_SYSTEM)) + goto err; + + memset(buf->mem, 0, allocsize); + WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem)); + logrec = (WT_LOG_RECORD *)buf->mem; + /* + * We have a valid header but the system record is not there. The log ends here. Return without + * setting the LSN. + */ + if (logrec->len == 0) { + __wt_verbose(session, WT_VERB_LOG, "Log %s found empty log after header", fh->name); + goto err; + } + + if (!__log_checksum_match(buf, allocsize)) + WT_ERR_MSG(session, WT_ERROR, "%s: System log record checksum mismatch", fh->name); + __wt_log_record_byteswap(logrec); + p = WT_LOG_SKIP_HEADER(buf->data); + end = (const uint8_t *)buf->data + allocsize; + WT_ERR(__wt_logrec_read(session, &p, end, &rectype)); + if (rectype != WT_LOGREC_SYSTEM) + WT_ERR_MSG(session, WT_ERROR, "System log record missing"); + WT_ERR(__wt_log_recover_system(session, &p, end, lsnp)); + +err: + __wt_scr_free(session, &buf); + + /* + * Return the file handle if needed, otherwise close it. + */ + if (fhp != NULL && ret == 0) + *fhp = fh; + else if (ret != 0 && need_salvage && salvage_mode) { + /* Let the caller know this file must be salvaged. */ + ret = 0; + WT_TRET(__wt_close(session, &fh)); + if (fhp != NULL) + *fhp = NULL; + *need_salvagep = true; + } else + WT_TRET(__wt_close(session, &fh)); + + return (ret); } /* * __log_record_verify -- - * Check that values of the log record header are valid. - * No byteswap of the header has been done at this point. + * Check that values of the log record header are valid. No byteswap of the header has been done + * at this point. */ static int -__log_record_verify(WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset, - WT_LOG_RECORD *logrecp, bool *corrupt) +__log_record_verify( + WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset, WT_LOG_RECORD *logrecp, bool *corrupt) { - WT_LOG_RECORD logrec; - size_t i; - - *corrupt = false; - - /* - * Make our own copy of the header so we can get the bytes in the - * proper order. - */ - logrec = *logrecp; - __wt_log_record_byteswap(&logrec); - - if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) { - WT_RET(__wt_msg(session, - "%s: log record at position %" PRIu32 - " has flag corruption 0x%" PRIx16, log_fh->name, offset, - logrec.flags)); - *corrupt = true; - } - for (i = 0; i < sizeof(logrec.unused); i++) - if (logrec.unused[i] != 0) { - WT_RET(__wt_msg(session, - "%s: log record at position %" PRIu32 - " has unused[%" WT_SIZET_FMT "] corruption 0x%" - PRIx8, log_fh->name, offset, i, logrec.unused[i])); - *corrupt = true; - } - if (logrec.mem_len != 0 && !F_ISSET(&logrec, - WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) { - WT_RET(__wt_msg(session, - "%s: log record at position %" PRIu32 - " has memory len corruption 0x%" PRIx32, log_fh->name, - offset, logrec.mem_len)); - *corrupt = true; - } - if (logrec.len <= offsetof(WT_LOG_RECORD, record)) { - WT_RET(__wt_msg(session, - "%s: log record at position %" PRIu32 - " has record len corruption 0x%" PRIx32, log_fh->name, - offset, logrec.len)); - *corrupt = true; - } - return (0); + WT_LOG_RECORD logrec; + size_t i; + + *corrupt = false; + + /* + * Make our own copy of the header so we can get the bytes in the proper order. + */ + logrec = *logrecp; + __wt_log_record_byteswap(&logrec); + + if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) { + WT_RET( + __wt_msg(session, "%s: log record at position %" PRIu32 " has flag corruption 0x%" PRIx16, + log_fh->name, offset, logrec.flags)); + *corrupt = true; + } + for (i = 0; i < sizeof(logrec.unused); i++) + if (logrec.unused[i] != 0) { + WT_RET(__wt_msg(session, "%s: log record at position %" PRIu32 + " has unused[%" WT_SIZET_FMT "] corruption 0x%" PRIx8, + log_fh->name, offset, i, logrec.unused[i])); + *corrupt = true; + } + if (logrec.mem_len != 0 && + !F_ISSET(&logrec, WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) { + WT_RET(__wt_msg(session, + "%s: log record at position %" PRIu32 " has memory len corruption 0x%" PRIx32, + log_fh->name, offset, logrec.mem_len)); + *corrupt = true; + } + if (logrec.len <= offsetof(WT_LOG_RECORD, record)) { + WT_RET(__wt_msg(session, + "%s: log record at position %" PRIu32 " has record len corruption 0x%" PRIx32, + log_fh->name, offset, logrec.len)); + *corrupt = true; + } + return (0); } /* * __log_alloc_prealloc -- - * Look for a pre-allocated log file and rename it to use as the next - * real log file. Called locked. + * Look for a pre-allocated log file and rename it to use as the next real log file. Called + * locked. */ static int __log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(from_path); - WT_DECL_ITEM(to_path); - WT_DECL_RET; - WT_LOG *log; - uint32_t from_num; - u_int logcount; - char **logfiles; - bool locked; - - conn = S2C(session); - log = conn->log; - logfiles = NULL; - locked = false; - - /* - * If there are no pre-allocated files, return WT_NOTFOUND. - */ - WT_RET(__log_get_files_single( - session, WT_LOG_PREPNAME, &logfiles, &logcount)); - if (logcount == 0) - return (WT_NOTFOUND); - - /* We have a file to use. */ - WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num)); - - WT_ERR(__wt_scr_alloc(session, 0, &from_path)); - WT_ERR(__wt_scr_alloc(session, 0, &to_path)); - WT_ERR(__log_filename(session, from_num, WT_LOG_PREPNAME, from_path)); - WT_ERR(__log_filename(session, to_num, WT_LOG_FILENAME, to_path)); - __wt_spin_lock(session, &log->log_fs_lock); - locked = true; - __wt_verbose(session, WT_VERB_LOG, - "log_alloc_prealloc: rename log %s to %s", - (const char *)from_path->data, (const char *)to_path->data); - WT_STAT_CONN_INCR(session, log_prealloc_used); - /* - * All file setup, writing the header and pre-allocation was done - * before. We only need to rename it. - */ - WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); - -err: __wt_scr_free(session, &from_path); - __wt_scr_free(session, &to_path); - if (locked) - __wt_spin_unlock(session, &log->log_fs_lock); - WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(from_path); + WT_DECL_ITEM(to_path); + WT_DECL_RET; + WT_LOG *log; + uint32_t from_num; + u_int logcount; + char **logfiles; + bool locked; + + conn = S2C(session); + log = conn->log; + logfiles = NULL; + locked = false; + + /* + * If there are no pre-allocated files, return WT_NOTFOUND. + */ + WT_RET(__log_get_files_single(session, WT_LOG_PREPNAME, &logfiles, &logcount)); + if (logcount == 0) + return (WT_NOTFOUND); + + /* We have a file to use. */ + WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num)); + + WT_ERR(__wt_scr_alloc(session, 0, &from_path)); + WT_ERR(__wt_scr_alloc(session, 0, &to_path)); + WT_ERR(__log_filename(session, from_num, WT_LOG_PREPNAME, from_path)); + WT_ERR(__log_filename(session, to_num, WT_LOG_FILENAME, to_path)); + __wt_spin_lock(session, &log->log_fs_lock); + locked = true; + __wt_verbose(session, WT_VERB_LOG, "log_alloc_prealloc: rename log %s to %s", + (const char *)from_path->data, (const char *)to_path->data); + WT_STAT_CONN_INCR(session, log_prealloc_used); + /* + * All file setup, writing the header and pre-allocation was done before. We only need to rename + * it. + */ + WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); + +err: + __wt_scr_free(session, &from_path); + __wt_scr_free(session, &to_path); + if (locked) + __wt_spin_unlock(session, &log->log_fs_lock); + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + return (ret); } /* * __log_newfile -- - * Create the next log file and write the file header record into it. + * Create the next log file and write the file header record into it. */ static int __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_FH *log_fh; - WT_LOG *log; - WT_LSN end_lsn, logrec_lsn; - u_int yield_cnt; - bool create_log, skipp; - - conn = S2C(session); - log = conn->log; - - /* - * Set aside the log file handle to be closed later. Other threads - * may still be using it to write to the log. If the log file size - * is small we could fill a log file before the previous one is closed. - * Wait for that to close. - */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - for (yield_cnt = 0; log->log_close_fh != NULL;) { - WT_STAT_CONN_INCR(session, log_close_yields); - /* - * Processing slots will conditionally signal the file close - * server thread. But if we've tried a while, signal the - * thread directly here. - */ - __wt_log_wrlsn(session, NULL); - if (++yield_cnt % WT_THOUSAND == 0) { - __wt_spin_unlock(session, &log->log_slot_lock); - __wt_cond_signal(session, conn->log_file_cond); - __wt_spin_lock(session, &log->log_slot_lock); - } - if (++yield_cnt > WT_THOUSAND * 10) - return (__wt_set_return(session, EBUSY)); - __wt_yield(); - } - /* - * Note, the file server worker thread requires the LSN be set once the - * close file handle is set, force that ordering. - */ - if (log->log_fh == NULL) - log->log_close_fh = NULL; - else { - log->log_close_lsn = log->alloc_lsn; - WT_PUBLISH(log->log_close_fh, log->log_fh); - } - log->fileid++; - - /* - * If pre-allocating log files look for one; otherwise, or if we don't - * find one, create a log file. We can't use pre-allocated log files - * while a hot backup is in progress: applications can copy the files - * in any way they choose, and a log file rename might confuse things. - */ - create_log = true; - if (conn->log_prealloc > 0 && !conn->hot_backup) { - WT_WITH_HOTBACKUP_READ_LOCK(session, - ret = __log_alloc_prealloc(session, log->fileid), - &skipp); - - if (!skipp) { - /* - * If ret is 0 it means we found a pre-allocated file. - * If ret is WT_NOTFOUND, create the new log file and - * signal the server, we missed our pre-allocation. - * If ret is non-zero but not WT_NOTFOUND, return the - * error. - */ - WT_RET_NOTFOUND_OK(ret); - if (ret == 0) - create_log = false; - else { - WT_STAT_CONN_INCR(session, log_prealloc_missed); - if (conn->log_cond != NULL) - __wt_cond_signal( - session, conn->log_cond); - } - } - } - /* - * If we need to create the log file, do so now. - */ - if (create_log) { - /* - * Increment the missed pre-allocated file counter only - * if a hot backup is not in progress. We are deliberately - * not using pre-allocated log files during backup - * (see comment above). - */ - if (!conn->hot_backup) - log->prep_missed++; - WT_RET(__wt_log_allocfile( - session, log->fileid, WT_LOG_FILENAME)); - } - /* - * Since the file system clears the output file handle pointer before - * searching the handle list and filling in the new file handle, - * we must pass in a local file handle. Otherwise there is a wide - * window where another thread could see a NULL log file handle. - */ - WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL, - NULL)); - /* - * Write the LSN at the end of the last record in the previous log file - * as the first record in this log file. - */ - if (log->fileid == 1) - WT_INIT_LSN(&logrec_lsn); - else - logrec_lsn = log->alloc_lsn; - /* - * We need to setup the LSNs. Set the end LSN and alloc LSN to - * the end of the header. - */ - WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_END_HEADER); - /* - * If we're running the version where we write a system record - * do so now and update the alloc_lsn. - */ - if (log->log_version >= WT_LOG_VERSION_SYSTEM) { - WT_RET(__wt_log_system_record(session, - log_fh, &logrec_lsn)); - WT_SET_LSN(&log->alloc_lsn, log->fileid, log->first_record); - } - end_lsn = log->alloc_lsn; - WT_PUBLISH(log->log_fh, log_fh); - - /* - * If we're called from connection creation code, we need to update - * the LSNs since we're the only write in progress. - */ - if (conn_open) { - WT_RET(__wt_fsync(session, log->log_fh, true)); - log->sync_lsn = end_lsn; - log->write_lsn = end_lsn; - log->write_start_lsn = end_lsn; - } - log->dirty_lsn = log->alloc_lsn; - if (created != NULL) - *created = create_log; - return (0); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; + WT_LSN end_lsn, logrec_lsn; + u_int yield_cnt; + bool create_log, skipp; + + conn = S2C(session); + log = conn->log; + + /* + * Set aside the log file handle to be closed later. Other threads may still be using it to + * write to the log. If the log file size is small we could fill a log file before the previous + * one is closed. Wait for that to close. + */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + for (yield_cnt = 0; log->log_close_fh != NULL;) { + WT_STAT_CONN_INCR(session, log_close_yields); + /* + * Processing slots will conditionally signal the file close server thread. But if we've + * tried a while, signal the thread directly here. + */ + __wt_log_wrlsn(session, NULL); + if (++yield_cnt % WT_THOUSAND == 0) { + __wt_spin_unlock(session, &log->log_slot_lock); + __wt_cond_signal(session, conn->log_file_cond); + __wt_spin_lock(session, &log->log_slot_lock); + } + if (++yield_cnt > WT_THOUSAND * 10) + return (__wt_set_return(session, EBUSY)); + __wt_yield(); + } + /* + * Note, the file server worker thread requires the LSN be set once the close file handle is + * set, force that ordering. + */ + if (log->log_fh == NULL) + log->log_close_fh = NULL; + else { + log->log_close_lsn = log->alloc_lsn; + WT_PUBLISH(log->log_close_fh, log->log_fh); + } + log->fileid++; + + /* + * If pre-allocating log files look for one; otherwise, or if we don't find one, create a log + * file. We can't use pre-allocated log files while a hot backup is in progress: applications + * can copy the files in any way they choose, and a log file rename might confuse things. + */ + create_log = true; + if (conn->log_prealloc > 0 && !conn->hot_backup) { + WT_WITH_HOTBACKUP_READ_LOCK( + session, ret = __log_alloc_prealloc(session, log->fileid), &skipp); + + if (!skipp) { + /* + * If ret is 0 it means we found a pre-allocated file. If ret is WT_NOTFOUND, create the + * new log file and signal the server, we missed our pre-allocation. If ret is non-zero + * but not WT_NOTFOUND, return the error. + */ + WT_RET_NOTFOUND_OK(ret); + if (ret == 0) + create_log = false; + else { + WT_STAT_CONN_INCR(session, log_prealloc_missed); + if (conn->log_cond != NULL) + __wt_cond_signal(session, conn->log_cond); + } + } + } + /* + * If we need to create the log file, do so now. + */ + if (create_log) { + /* + * Increment the missed pre-allocated file counter only + * if a hot backup is not in progress. We are deliberately + * not using pre-allocated log files during backup + * (see comment above). + */ + if (!conn->hot_backup) + log->prep_missed++; + WT_RET(__wt_log_allocfile(session, log->fileid, WT_LOG_FILENAME)); + } + /* + * Since the file system clears the output file handle pointer before searching the handle list + * and filling in the new file handle, we must pass in a local file handle. Otherwise there is a + * wide window where another thread could see a NULL log file handle. + */ + WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL, NULL)); + /* + * Write the LSN at the end of the last record in the previous log file as the first record in + * this log file. + */ + if (log->fileid == 1) + WT_INIT_LSN(&logrec_lsn); + else + logrec_lsn = log->alloc_lsn; + /* + * We need to setup the LSNs. Set the end LSN and alloc LSN to the end of the header. + */ + WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_END_HEADER); + /* + * If we're running the version where we write a system record do so now and update the + * alloc_lsn. + */ + if (log->log_version >= WT_LOG_VERSION_SYSTEM) { + WT_RET(__wt_log_system_record(session, log_fh, &logrec_lsn)); + WT_SET_LSN(&log->alloc_lsn, log->fileid, log->first_record); + } + end_lsn = log->alloc_lsn; + WT_PUBLISH(log->log_fh, log_fh); + + /* + * If we're called from connection creation code, we need to update the LSNs since we're the + * only write in progress. + */ + if (conn_open) { + WT_RET(__wt_fsync(session, log->log_fh, true)); + log->sync_lsn = end_lsn; + log->write_lsn = end_lsn; + log->write_start_lsn = end_lsn; + } + log->dirty_lsn = log->alloc_lsn; + if (created != NULL) + *created = create_log; + return (0); } /* * __log_set_version -- - * Set version related information under lock. + * Set version related information under lock. */ static int -__log_set_version(WT_SESSION_IMPL *session, uint16_t version, - uint32_t first_rec, bool live_chg, bool downgrade) +__log_set_version( + WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool live_chg, bool downgrade) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - - log->log_version = version; - log->first_record = first_rec; - if (downgrade) - FLD_SET(conn->log_flags, WT_CONN_LOG_DOWNGRADED); - else - FLD_CLR(conn->log_flags, WT_CONN_LOG_DOWNGRADED); - if (live_chg) - F_SET(log, WT_LOG_FORCE_NEWFILE); - if (!F_ISSET(conn, WT_CONN_READONLY)) - return (__log_prealloc_remove(session)); - - return (0); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + + log->log_version = version; + log->first_record = first_rec; + if (downgrade) + FLD_SET(conn->log_flags, WT_CONN_LOG_DOWNGRADED); + else + FLD_CLR(conn->log_flags, WT_CONN_LOG_DOWNGRADED); + if (live_chg) + F_SET(log, WT_LOG_FORCE_NEWFILE); + if (!F_ISSET(conn, WT_CONN_READONLY)) + return (__log_prealloc_remove(session)); + + return (0); } /* * __wt_log_set_version -- - * Change the version number in logging. Will be done with locking. - * We need to force the log file to advance and remove all old - * pre-allocated files. + * Change the version number in logging. Will be done with locking. We need to force the log + * file to advance and remove all old pre-allocated files. */ int -__wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version, - uint32_t first_rec, bool downgrade, bool live_chg, uint32_t *lognump) +__wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool downgrade, + bool live_chg, uint32_t *lognump) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - - /* - * The steps are: - * - Set up versions and remove files under lock. - * - Set a flag so that the next slot change forces a file change. - * - Force out the slot that is currently active in the current log. - * - Write a log record to force a record into the new log file. - */ - WT_WITH_SLOT_LOCK(session, log, - ret = __log_set_version(session, - version, first_rec, live_chg, downgrade)); - if (!live_chg) - return (ret); - WT_ERR(ret); - /* - * A new log file will be used when we force out the earlier slot. - */ - WT_ERR(__wt_log_force_write(session, 1, NULL)); - - /* - * We need to write a record to the new version log file so that - * a potential checkpoint finds LSNs in that new log file and - * an archive correctly removes all earlier logs. - * Write an internal printf record. - */ - WT_ERR(__wt_log_printf(session, - "COMPATIBILITY: Version now %" PRIu16, log->log_version)); - if (lognump != NULL) - *lognump = log->alloc_lsn.l.file; + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + + /* + * The steps are: + * - Set up versions and remove files under lock. + * - Set a flag so that the next slot change forces a file change. + * - Force out the slot that is currently active in the current log. + * - Write a log record to force a record into the new log file. + */ + WT_WITH_SLOT_LOCK( + session, log, ret = __log_set_version(session, version, first_rec, live_chg, downgrade)); + if (!live_chg) + return (ret); + WT_ERR(ret); + /* + * A new log file will be used when we force out the earlier slot. + */ + WT_ERR(__wt_log_force_write(session, 1, NULL)); + + /* + * We need to write a record to the new version log file so that a potential checkpoint finds + * LSNs in that new log file and an archive correctly removes all earlier logs. Write an + * internal printf record. + */ + WT_ERR(__wt_log_printf(session, "COMPATIBILITY: Version now %" PRIu16, log->log_version)); + if (lognump != NULL) + *lognump = log->alloc_lsn.l.file; err: - return (ret); + return (ret); } /* * __wt_log_acquire -- - * Called serially when switching slots. Can be called recursively - * from __log_newfile when we change log files. + * Called serially when switching slots. Can be called recursively from __log_newfile when we + * change log files. */ int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - bool created_log; - - conn = S2C(session); - log = conn->log; - created_log = true; - /* - * Add recsize to alloc_lsn. Save our starting LSN - * where the previous allocation finished for the release LSN. - * That way when log files switch, we're waiting for the correct LSN - * from outstanding writes. - */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); - /* - * We need to set the release LSN earlier, before a log file change. - */ - slot->slot_release_lsn = log->alloc_lsn; - /* - * Make sure that the size can fit in the file. Proactively switch - * if it cannot. This reduces, but does not eliminate, log files - * that exceed the maximum file size. We want to minimize the risk - * of an error due to no space. - */ - if (F_ISSET(log, WT_LOG_FORCE_NEWFILE) || - !__log_size_fit(session, &log->alloc_lsn, recsize)) { - WT_RET(__log_newfile(session, false, &created_log)); - F_CLR(log, WT_LOG_FORCE_NEWFILE); - if (log->log_close_fh != NULL) - F_SET(slot, WT_SLOT_CLOSEFH); - } - - /* - * Pre-allocate on the first real write into the log file, if it - * was just created (i.e. not pre-allocated). - */ - if (log->alloc_lsn.l.offset == log->first_record && created_log) - WT_RET(__log_prealloc(session, log->log_fh)); - /* - * Initialize the slot for activation. - */ - __wt_log_slot_activate(session, slot); - - return (0); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + bool created_log; + + conn = S2C(session); + log = conn->log; + created_log = true; + /* + * Add recsize to alloc_lsn. Save our starting LSN where the previous allocation finished for + * the release LSN. That way when log files switch, we're waiting for the correct LSN from + * outstanding writes. + */ + WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SLOT)); + /* + * We need to set the release LSN earlier, before a log file change. + */ + slot->slot_release_lsn = log->alloc_lsn; + /* + * Make sure that the size can fit in the file. Proactively switch if it cannot. This reduces, + * but does not eliminate, log files that exceed the maximum file size. We want to minimize the + * risk of an error due to no space. + */ + if (F_ISSET(log, WT_LOG_FORCE_NEWFILE) || !__log_size_fit(session, &log->alloc_lsn, recsize)) { + WT_RET(__log_newfile(session, false, &created_log)); + F_CLR(log, WT_LOG_FORCE_NEWFILE); + if (log->log_close_fh != NULL) + F_SET(slot, WT_SLOT_CLOSEFH); + } + + /* + * Pre-allocate on the first real write into the log file, if it was just created (i.e. not + * pre-allocated). + */ + if (log->alloc_lsn.l.offset == log->first_record && created_log) + WT_RET(__log_prealloc(session, log->log_fh)); + /* + * Initialize the slot for activation. + */ + __wt_log_slot_activate(session, slot); + + return (0); } /* * __log_truncate_file -- - * Truncate a log file to the specified offset. - * - * If the underlying file system doesn't support truncate then we need to - * zero out the rest of the file, doing an effective truncate. + * Truncate a log file to the specified offset. If the underlying file system doesn't support + * truncate then we need to zero out the rest of the file, doing an effective truncate. */ static int __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - bool skipp; - - conn = S2C(session); - log = conn->log; - - if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { - WT_WITH_HOTBACKUP_READ_LOCK(session, - ret = __wt_ftruncate( - session, log_fh, offset), &skipp); - if (!skipp) { - if (ret != ENOTSUP) - return (ret); - F_SET(log, WT_LOG_TRUNCATE_NOTSUP); - } - } - - return (__wt_file_zero(session, log_fh, offset, conn->log_file_max)); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + bool skipp; + + conn = S2C(session); + log = conn->log; + + if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && !conn->hot_backup) { + WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, log_fh, offset), &skipp); + if (!skipp) { + if (ret != ENOTSUP) + return (ret); + F_SET(log, WT_LOG_TRUNCATE_NOTSUP); + } + } + + return (__wt_file_zero(session, log_fh, offset, conn->log_file_max)); } /* * __log_truncate -- - * Truncate the log to the given LSN. If this_log is set, it will only - * truncate the log file indicated in the given LSN. If not set, - * it will truncate between the given LSN and the trunc_lsn. That is, - * since we pre-allocate log files, it will free that space and allow the - * log to be traversed. We use the trunc_lsn because logging has already - * opened the new/next log file before recovery ran. If salvage_mode is - * set, we verify headers of log files visited and recreate them if they - * are damaged. This function assumes we are in recovery or other - * dedicated time and not during live running. + * Truncate the log to the given LSN. If this_log is set, it will only truncate the log file + * indicated in the given LSN. If not set, it will truncate between the given LSN and the + * trunc_lsn. That is, since we pre-allocate log files, it will free that space and allow the + * log to be traversed. We use the trunc_lsn because logging has already opened the new/next log + * file before recovery ran. If salvage_mode is set, we verify headers of log files visited and + * recreate them if they are damaged. This function assumes we are in recovery or other + * dedicated time and not during live running. */ static int -__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, - bool salvage_mode) +__log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, bool salvage_mode) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_FH *log_fh; - WT_LOG *log; - uint32_t lognum, salvage_first, salvage_last; - u_int i, logcount; - char **logfiles; - bool need_salvage, opened; - - conn = S2C(session); - log = conn->log; - log_fh = NULL; - logcount = 0; - logfiles = NULL; - salvage_first = salvage_last = 0; - need_salvage = false; - - /* - * Truncate the log file to the given LSN. - * - * It's possible the underlying file system doesn't support truncate - * (there are existing examples), which is fine, but we don't want to - * repeatedly do the setup work just to find that out every time. Check - * before doing work, and if there's a not-supported error, turn off - * future truncates. - */ - WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh)); - WT_ERR(__log_truncate_file(session, log_fh, lsn->l.offset)); - WT_ERR(__wt_fsync(session, log_fh, true)); - WT_ERR(__wt_close(session, &log_fh)); - - if (salvage_mode) - WT_ERR(__wt_msg(session, - "salvage: log file %" PRIu32 " truncated", lsn->l.file)); - - /* - * If we just want to truncate the current log, return and skip - * looking for intervening logs. - */ - if (this_log) - goto err; - WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) { - opened = false; - if (salvage_mode) { - /* - * When salvaging, we verify that the - * header of the log file is valid. - * If not, create a new, empty one. - */ - need_salvage = false; - WT_ERR(__log_open_verify(session, lognum, - &log_fh, NULL, NULL, &need_salvage)); - if (need_salvage) { - WT_ASSERT(session, log_fh == NULL); - WT_ERR(__wt_log_remove(session, - WT_LOG_FILENAME, lognum)); - WT_ERR(__wt_log_allocfile(session, - lognum, WT_LOG_FILENAME)); - } else - opened = true; - - if (salvage_first == 0) - salvage_first = lognum; - salvage_last = lognum; - } - if (!opened) - WT_ERR(__log_openfile(session, lognum, 0, - &log_fh)); - /* - * If there are intervening files pre-allocated, - * truncate them to the end of the log file header. - */ - WT_ERR(__log_truncate_file( - session, log_fh, log->first_record)); - WT_ERR(__wt_fsync(session, log_fh, true)); - WT_ERR(__wt_close(session, &log_fh)); - } - } -err: WT_TRET(__wt_close(session, &log_fh)); - WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - if (salvage_first != 0) { - if (salvage_last > salvage_first) - WT_TRET(__wt_msg(session, - "salvage: log files %" PRIu32 "-%" PRIu32 - " truncated at beginning", salvage_first, - salvage_last)); - else - WT_TRET(__wt_msg(session, - "salvage: log file %" PRIu32 - " truncated at beginning", salvage_first)); - } - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; + uint32_t lognum, salvage_first, salvage_last; + u_int i, logcount; + char **logfiles; + bool need_salvage, opened; + + conn = S2C(session); + log = conn->log; + log_fh = NULL; + logcount = 0; + logfiles = NULL; + salvage_first = salvage_last = 0; + need_salvage = false; + + /* + * Truncate the log file to the given LSN. + * + * It's possible the underlying file system doesn't support truncate + * (there are existing examples), which is fine, but we don't want to + * repeatedly do the setup work just to find that out every time. Check + * before doing work, and if there's a not-supported error, turn off + * future truncates. + */ + WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh)); + WT_ERR(__log_truncate_file(session, log_fh, lsn->l.offset)); + WT_ERR(__wt_fsync(session, log_fh, true)); + WT_ERR(__wt_close(session, &log_fh)); + + if (salvage_mode) + WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " truncated", lsn->l.file)); + + /* + * If we just want to truncate the current log, return and skip looking for intervening logs. + */ + if (this_log) + goto err; + WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) { + opened = false; + if (salvage_mode) { + /* + * When salvaging, we verify that the header of the log file is valid. If not, + * create a new, empty one. + */ + need_salvage = false; + WT_ERR(__log_open_verify(session, lognum, &log_fh, NULL, NULL, &need_salvage)); + if (need_salvage) { + WT_ASSERT(session, log_fh == NULL); + WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lognum)); + WT_ERR(__wt_log_allocfile(session, lognum, WT_LOG_FILENAME)); + } else + opened = true; + + if (salvage_first == 0) + salvage_first = lognum; + salvage_last = lognum; + } + if (!opened) + WT_ERR(__log_openfile(session, lognum, 0, &log_fh)); + /* + * If there are intervening files pre-allocated, truncate them to the end of the log + * file header. + */ + WT_ERR(__log_truncate_file(session, log_fh, log->first_record)); + WT_ERR(__wt_fsync(session, log_fh, true)); + WT_ERR(__wt_close(session, &log_fh)); + } + } +err: + WT_TRET(__wt_close(session, &log_fh)); + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + if (salvage_first != 0) { + if (salvage_last > salvage_first) + WT_TRET( + __wt_msg(session, "salvage: log files %" PRIu32 "-%" PRIu32 " truncated at beginning", + salvage_first, salvage_last)); + else + WT_TRET(__wt_msg( + session, "salvage: log file %" PRIu32 " truncated at beginning", salvage_first)); + } + return (ret); } /* * __wt_log_allocfile -- - * Given a log number, create a new log file by writing the header, - * pre-allocating the file and moving it to the destination name. + * Given a log number, create a new log file by writing the header, pre-allocating the file and + * moving it to the destination name. */ int -__wt_log_allocfile( - WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) +__wt_log_allocfile(WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(from_path); - WT_DECL_ITEM(to_path); - WT_DECL_RET; - WT_FH *log_fh; - WT_LOG *log; - uint32_t tmp_id; - - conn = S2C(session); - log = conn->log; - log_fh = NULL; - - /* - * Preparing a log file entails creating a temporary file: - * - Writing the header. - * - Truncating to the offset of the first record. - * - Pre-allocating the file if needed. - * - Renaming it to the desired file name. - */ - WT_RET(__wt_scr_alloc(session, 0, &from_path)); - WT_ERR(__wt_scr_alloc(session, 0, &to_path)); - tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1); - WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path)); - WT_ERR(__log_filename(session, lognum, dest, to_path)); - __wt_spin_lock(session, &log->log_fs_lock); - /* - * Set up the temporary file. - */ - WT_ERR(__log_openfile(session, tmp_id, WT_LOG_OPEN_CREATE_OK, &log_fh)); - WT_ERR(__log_file_header(session, log_fh, NULL, true)); - WT_ERR(__log_prealloc(session, log_fh)); - WT_ERR(__wt_fsync(session, log_fh, true)); - WT_ERR(__wt_close(session, &log_fh)); - __wt_verbose(session, WT_VERB_LOG, - "log_allocfile: rename %s to %s", - (const char *)from_path->data, (const char *)to_path->data); - /* - * Rename it into place and make it available. - */ - WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); - -err: __wt_scr_free(session, &from_path); - __wt_scr_free(session, &to_path); - __wt_spin_unlock(session, &log->log_fs_lock); - WT_TRET(__wt_close(session, &log_fh)); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(from_path); + WT_DECL_ITEM(to_path); + WT_DECL_RET; + WT_FH *log_fh; + WT_LOG *log; + uint32_t tmp_id; + + conn = S2C(session); + log = conn->log; + log_fh = NULL; + + /* + * Preparing a log file entails creating a temporary file: + * - Writing the header. + * - Truncating to the offset of the first record. + * - Pre-allocating the file if needed. + * - Renaming it to the desired file name. + */ + WT_RET(__wt_scr_alloc(session, 0, &from_path)); + WT_ERR(__wt_scr_alloc(session, 0, &to_path)); + tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1); + WT_ERR(__log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path)); + WT_ERR(__log_filename(session, lognum, dest, to_path)); + __wt_spin_lock(session, &log->log_fs_lock); + /* + * Set up the temporary file. + */ + WT_ERR(__log_openfile(session, tmp_id, WT_LOG_OPEN_CREATE_OK, &log_fh)); + WT_ERR(__log_file_header(session, log_fh, NULL, true)); + WT_ERR(__log_prealloc(session, log_fh)); + WT_ERR(__wt_fsync(session, log_fh, true)); + WT_ERR(__wt_close(session, &log_fh)); + __wt_verbose(session, WT_VERB_LOG, "log_allocfile: rename %s to %s", + (const char *)from_path->data, (const char *)to_path->data); + /* + * Rename it into place and make it available. + */ + WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); + +err: + __wt_scr_free(session, &from_path); + __wt_scr_free(session, &to_path); + __wt_spin_unlock(session, &log->log_fs_lock); + WT_TRET(__wt_close(session, &log_fh)); + return (ret); } /* * __wt_log_remove -- - * Given a log number, remove that log file. + * Given a log number, remove that log file. */ int -__wt_log_remove(WT_SESSION_IMPL *session, - const char *file_prefix, uint32_t lognum) +__wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum) { - WT_DECL_ITEM(path); - WT_DECL_RET; - - WT_RET(__wt_scr_alloc(session, 0, &path)); - WT_ERR(__log_filename(session, lognum, file_prefix, path)); - __wt_verbose(session, WT_VERB_LOG, - "log_remove: remove log %s", (const char *)path->data); - WT_ERR(__wt_fs_remove(session, path->data, false)); -err: __wt_scr_free(session, &path); - return (ret); + WT_DECL_ITEM(path); + WT_DECL_RET; + + WT_RET(__wt_scr_alloc(session, 0, &path)); + WT_ERR(__log_filename(session, lognum, file_prefix, path)); + __wt_verbose(session, WT_VERB_LOG, "log_remove: remove log %s", (const char *)path->data); + WT_ERR(__wt_fs_remove(session, path->data, false)); +err: + __wt_scr_free(session, &path); + return (ret); } /* * __wt_log_open -- - * Open the appropriate log file for the connection. The purpose is - * to find the last log file that exists, open it and set our initial - * LSNs to the end of that file. If none exist, call __log_newfile - * to create it. + * Open the appropriate log file for the connection. The purpose is to find the last log file + * that exists, open it and set our initial LSNs to the end of that file. If none exist, call + * __log_newfile to create it. */ int __wt_log_open(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - uint32_t firstlog, lastlog, lognum; - uint16_t version; - u_int i, logcount; - char **logfiles; - bool need_salvage; - - conn = S2C(session); - log = conn->log; - logfiles = NULL; - logcount = 0; - - /* - * Open up a file handle to the log directory if we haven't. - */ - if (log->log_dir_fh == NULL) { - __wt_verbose(session, WT_VERB_LOG, - "log_open: open fh to directory %s", conn->log_path); - WT_RET(__wt_open(session, conn->log_path, - WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh)); - } - - if (!F_ISSET(conn, WT_CONN_READONLY)) - WT_ERR(__log_prealloc_remove(session)); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + uint32_t firstlog, lastlog, lognum; + uint16_t version; + u_int i, logcount; + char **logfiles; + bool need_salvage; + + conn = S2C(session); + log = conn->log; + logfiles = NULL; + logcount = 0; + + /* + * Open up a file handle to the log directory if we haven't. + */ + if (log->log_dir_fh == NULL) { + __wt_verbose(session, WT_VERB_LOG, "log_open: open fh to directory %s", conn->log_path); + WT_RET( + __wt_open(session, conn->log_path, WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh)); + } + + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_ERR(__log_prealloc_remove(session)); again: - /* - * Now look at the log files and set our LSNs. - */ - lastlog = 0; - firstlog = UINT32_MAX; - need_salvage = false; - - WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - lastlog = WT_MAX(lastlog, lognum); - firstlog = WT_MIN(firstlog, lognum); - } - log->fileid = lastlog; - __wt_verbose(session, WT_VERB_LOG, - "log_open: first log %" PRIu32 " last log %" PRIu32, - firstlog, lastlog); - if (firstlog == UINT32_MAX) { - WT_ASSERT(session, logcount == 0); - WT_INIT_LSN(&log->first_lsn); - } else { - WT_SET_LSN(&log->first_lsn, firstlog, 0); - /* - * If we have existing log files, check the last log now before - * we create a new log file so that we can detect an unsupported - * version before modifying the file space. - */ - WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, - &version, &need_salvage)); - - /* - * If we were asked to salvage and the last log file was - * indeed corrupt, remove it and try all over again. - */ - if (need_salvage) { - WT_ERR(__wt_log_remove( - session, WT_LOG_FILENAME, lastlog)); - WT_ERR(__wt_msg(session, - "salvage: log file %" PRIu32 " removed", lastlog)); - WT_ERR(__wt_fs_directory_list_free(session, &logfiles, - logcount)); - logfiles = NULL; - goto again; - } - } - - /* - * Start logging at the beginning of the next log file, no matter - * where the previous log file ends. - */ - if (!F_ISSET(conn, WT_CONN_READONLY)) { - WT_WITH_SLOT_LOCK(session, log, - ret = __log_newfile(session, true, NULL)); - WT_ERR(ret); - } - - /* If we found log files, save the new state. */ - if (logcount > 0) { - /* - * If we're running in a downgraded mode and there are earlier - * logs detect if they're at a higher version. If so, we need - * to force recovery (to write a full checkpoint) and force - * archiving to remove all higher version logs. - */ - if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED)) { - for (i = 0; i < logcount; ++i) { - WT_ERR(__wt_log_extract_lognum( - session, logfiles[i], &lognum)); - /* - * By sending in a NULL file handle, we don't - * have to close the file. - */ - WT_ERR(__log_open_verify(session, - lognum, NULL, NULL, &version, NULL)); - /* - * If we find any log file at the wrong version - * set the flag and we're done. - */ - if (log->log_version != version) { - FLD_SET(conn->log_flags, - WT_CONN_LOG_FORCE_DOWNGRADE); - break; - } - } - } - log->trunc_lsn = log->alloc_lsn; - FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED); - } - -err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - if (ret == 0) - F_SET(log, WT_LOG_OPENED); - return (ret); + /* + * Now look at the log files and set our LSNs. + */ + lastlog = 0; + firstlog = UINT32_MAX; + need_salvage = false; + + WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + lastlog = WT_MAX(lastlog, lognum); + firstlog = WT_MIN(firstlog, lognum); + } + log->fileid = lastlog; + __wt_verbose( + session, WT_VERB_LOG, "log_open: first log %" PRIu32 " last log %" PRIu32, firstlog, lastlog); + if (firstlog == UINT32_MAX) { + WT_ASSERT(session, logcount == 0); + WT_INIT_LSN(&log->first_lsn); + } else { + WT_SET_LSN(&log->first_lsn, firstlog, 0); + /* + * If we have existing log files, check the last log now before we create a new log file so + * that we can detect an unsupported version before modifying the file space. + */ + WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, &version, &need_salvage)); + + /* + * If we were asked to salvage and the last log file was indeed corrupt, remove it and try + * all over again. + */ + if (need_salvage) { + WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lastlog)); + WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " removed", lastlog)); + WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); + logfiles = NULL; + goto again; + } + } + + /* + * Start logging at the beginning of the next log file, no matter where the previous log file + * ends. + */ + if (!F_ISSET(conn, WT_CONN_READONLY)) { + WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL)); + WT_ERR(ret); + } + + /* If we found log files, save the new state. */ + if (logcount > 0) { + /* + * If we're running in a downgraded mode and there are earlier logs detect if they're at a + * higher version. If so, we need to force recovery (to write a full checkpoint) and force + * archiving to remove all higher version logs. + */ + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED)) { + for (i = 0; i < logcount; ++i) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + /* + * By sending in a NULL file handle, we don't have to close the file. + */ + WT_ERR(__log_open_verify(session, lognum, NULL, NULL, &version, NULL)); + /* + * If we find any log file at the wrong version set the flag and we're done. + */ + if (log->log_version != version) { + FLD_SET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE); + break; + } + } + } + log->trunc_lsn = log->alloc_lsn; + FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED); + } + +err: + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + if (ret == 0) + F_SET(log, WT_LOG_OPENED); + return (ret); } /* * __wt_log_close -- - * Close the log file. + * Close the log file. */ int __wt_log_close(WT_SESSION_IMPL *session) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - - conn = S2C(session); - log = conn->log; - - if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { - __wt_verbose(session, WT_VERB_LOG, - "closing old log %s", log->log_close_fh->name); - if (!F_ISSET(conn, WT_CONN_READONLY)) - WT_RET(__wt_fsync(session, log->log_close_fh, true)); - WT_RET(__wt_close(session, &log->log_close_fh)); - } - if (log->log_fh != NULL) { - __wt_verbose(session, WT_VERB_LOG, - "closing log %s", log->log_fh->name); - if (!F_ISSET(conn, WT_CONN_READONLY)) - WT_RET(__wt_fsync(session, log->log_fh, true)); - WT_RET(__wt_close(session, &log->log_fh)); - log->log_fh = NULL; - } - if (log->log_dir_fh != NULL) { - __wt_verbose(session, WT_VERB_LOG, - "closing log directory %s", log->log_dir_fh->name); - if (!F_ISSET(conn, WT_CONN_READONLY)) - WT_RET(__wt_fsync(session, log->log_dir_fh, true)); - WT_RET(__wt_close(session, &log->log_dir_fh)); - log->log_dir_fh = NULL; - } - F_CLR(log, WT_LOG_OPENED); - return (0); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + + conn = S2C(session); + log = conn->log; + + if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { + __wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_close_fh, true)); + WT_RET(__wt_close(session, &log->log_close_fh)); + } + if (log->log_fh != NULL) { + __wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_fh, true)); + WT_RET(__wt_close(session, &log->log_fh)); + log->log_fh = NULL; + } + if (log->log_dir_fh != NULL) { + __wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name); + if (!F_ISSET(conn, WT_CONN_READONLY)) + WT_RET(__wt_fsync(session, log->log_dir_fh, true)); + WT_RET(__wt_close(session, &log->log_dir_fh)); + log->log_dir_fh = NULL; + } + F_CLR(log, WT_LOG_OPENED); + return (0); } /* * __log_has_hole -- - * Determine if the current offset represents a hole in the log - * file (i.e. there is valid data somewhere after the hole), or - * if this is the end of this log file and the remainder of the - * file is zeroes. + * Determine if the current offset represents a hole in the log file (i.e. there is valid data + * somewhere after the hole), or if this is the end of this log file and the remainder of the + * file is zeroes. */ static int -__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, - wt_off_t offset, wt_off_t *error_offset, bool *hole) +__log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, wt_off_t offset, + wt_off_t *error_offset, bool *hole) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LOG_RECORD *logrec; - wt_off_t off, remainder; - size_t allocsize, buf_left, bufsz, rdlen; - char *buf, *p, *zerobuf; - bool corrupt; - - *error_offset = 0; - corrupt = *hole = false; - - conn = S2C(session); - log = conn->log; - remainder = log_size - offset; - - /* - * It can be very slow looking for the last real record in the log - * in very small chunks. Walk a megabyte at a time. If we find a - * part of the log that is not just zeroes we know this log file - * has a hole in it. - */ - buf = zerobuf = NULL; - if (log == NULL || log->allocsize < WT_MEGABYTE) - bufsz = WT_MEGABYTE; - else - bufsz = log->allocsize; - - if ((size_t)remainder < bufsz) - bufsz = (size_t)remainder; - WT_RET(__wt_calloc_def(session, bufsz, &buf)); - WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf)); - - /* - * Read in a chunk starting at the given offset. - * Compare against a known zero byte chunk. - */ - for (off = offset; remainder > 0; - remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) { - rdlen = WT_MIN(bufsz, (size_t)remainder); - WT_ERR(__log_fs_read(session, fh, off, rdlen, buf)); - allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize); - if (memcmp(buf, zerobuf, rdlen) != 0) { - /* - * Find where the next log record starts after the - * hole. - */ - for (p = buf, buf_left = rdlen; buf_left > 0; - buf_left -= rdlen, p += rdlen) { - rdlen = WT_MIN(allocsize, buf_left); - if (memcmp(p, zerobuf, rdlen) != 0) - break; - } - /* - * A presumed log record begins here where the buffer - * becomes non-zero. If we have enough of a log record - * present in the buffer, we either have a valid header - * or corruption. Verify the header of this record to - * determine whether it is just a hole or corruption. - * - * We don't bother making this check for backup copies, - * as records may have their beginning zeroed, hence - * the part after a hole may in fact be the middle of - * the record. - */ - if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) { - logrec = (WT_LOG_RECORD *)p; - if (buf_left >= sizeof(WT_LOG_RECORD)) { - off += p - buf; - WT_ERR(__log_record_verify(session, fh, - (uint32_t)off, logrec, &corrupt)); - if (corrupt) - *error_offset = off; - } - } - *hole = true; - break; - } - } - -err: __wt_free(session, buf); - __wt_free(session, zerobuf); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOG_RECORD *logrec; + wt_off_t off, remainder; + size_t allocsize, buf_left, bufsz, rdlen; + char *buf, *p, *zerobuf; + bool corrupt; + + *error_offset = 0; + corrupt = *hole = false; + + conn = S2C(session); + log = conn->log; + remainder = log_size - offset; + + /* + * It can be very slow looking for the last real record in the log in very small chunks. Walk a + * megabyte at a time. If we find a part of the log that is not just zeroes we know this log + * file has a hole in it. + */ + buf = zerobuf = NULL; + if (log == NULL || log->allocsize < WT_MEGABYTE) + bufsz = WT_MEGABYTE; + else + bufsz = log->allocsize; + + if ((size_t)remainder < bufsz) + bufsz = (size_t)remainder; + WT_RET(__wt_calloc_def(session, bufsz, &buf)); + WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf)); + + /* + * Read in a chunk starting at the given offset. Compare against a known zero byte chunk. + */ + for (off = offset; remainder > 0; remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) { + rdlen = WT_MIN(bufsz, (size_t)remainder); + WT_ERR(__log_fs_read(session, fh, off, rdlen, buf)); + allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize); + if (memcmp(buf, zerobuf, rdlen) != 0) { + /* + * Find where the next log record starts after the hole. + */ + for (p = buf, buf_left = rdlen; buf_left > 0; buf_left -= rdlen, p += rdlen) { + rdlen = WT_MIN(allocsize, buf_left); + if (memcmp(p, zerobuf, rdlen) != 0) + break; + } + /* + * A presumed log record begins here where the buffer + * becomes non-zero. If we have enough of a log record + * present in the buffer, we either have a valid header + * or corruption. Verify the header of this record to + * determine whether it is just a hole or corruption. + * + * We don't bother making this check for backup copies, + * as records may have their beginning zeroed, hence + * the part after a hole may in fact be the middle of + * the record. + */ + if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) { + logrec = (WT_LOG_RECORD *)p; + if (buf_left >= sizeof(WT_LOG_RECORD)) { + off += p - buf; + WT_ERR(__log_record_verify(session, fh, (uint32_t)off, logrec, &corrupt)); + if (corrupt) + *error_offset = off; + } + } + *hole = true; + break; + } + } + +err: + __wt_free(session, buf); + __wt_free(session, zerobuf); + return (ret); } /* * __log_check_partial_write -- - * Determine if the log record may be a partial write. If that's - * possible, return true, otherwise false. - * - * Since the log file is initially zeroed up to a predetermined size, - * any record that falls within that boundary that ends in one or - * more zeroes may be partial (or the initial record may have been - * padded with zeroes before writing). The only way we have any certainty - * is if the last byte is non-zero, when that happens, we know that - * the write cannot be partial. + * Determine if the log record may be a partial write. If that's possible, return true, + * otherwise false. Since the log file is initially zeroed up to a predetermined size, any + * record that falls within that boundary that ends in one or more zeroes may be partial (or the + * initial record may have been padded with zeroes before writing). The only way we have any + * certainty is if the last byte is non-zero, when that happens, we know that the write cannot + * be partial. */ static bool -__log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf, - uint32_t reclen) +__log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t reclen) { - uint8_t *rec; - - WT_UNUSED(session); - - /* - * We only check the final byte since that's the only way have any - * certainty. Even if the second to last byte is non-zero and the - * last byte is zero, that could still technically be the result of - * a partial write, however unlikely it may be. - */ - rec = buf->mem; - return (reclen > 0 && rec[reclen - 1] == 0); + uint8_t *rec; + + WT_UNUSED(session); + + /* + * We only check the final byte since that's the only way have any certainty. Even if the second + * to last byte is non-zero and the last byte is zero, that could still technically be the + * result of a partial write, however unlikely it may be. + */ + rec = buf->mem; + return (reclen > 0 && rec[reclen - 1] == 0); } /* * __wt_log_release -- - * Release a log slot. + * Release a log slot. */ int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LSN sync_lsn; - uint64_t fsync_duration_usecs, time_start, time_stop; - int64_t release_buffered, release_bytes; - bool locked; - - conn = S2C(session); - log = conn->log; - locked = false; - if (freep != NULL) - *freep = 1; - release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); - release_bytes = release_buffered + slot->slot_unbuffered; - - /* - * Checkpoints can be configured based on amount of log written. - * Add in this log record to the sum and if needed, signal the - * checkpoint condition. The logging subsystem manages the - * accumulated field. There is a bit of layering violation - * here checking the connection ckpt field and using its - * condition. - */ - if (WT_CKPT_LOGSIZE(conn)) { - log->log_written += (wt_off_t)release_bytes; - __wt_checkpoint_signal(session, log->log_written); - } - - /* Write the buffered records */ - if (release_buffered != 0) - WT_ERR(__log_fs_write(session, slot, slot->slot_start_offset, - (size_t)release_buffered, slot->slot_buf.mem)); - - /* - * If we have to wait for a synchronous operation, we do not pass - * handling of this slot off to the worker thread. The caller is - * responsible for freeing the slot in that case. Otherwise the - * worker thread will free it. - */ - if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC_FLAGS)) { - if (freep != NULL) - *freep = 0; - slot->slot_state = WT_LOG_SLOT_WRITTEN; - /* - * After this point the worker thread owns the slot. There - * is nothing more to do but return. - */ - /* - * !!! Signalling the wrlsn_cond condition here results in - * worse performance because it causes more scheduling churn - * and more walking of the slot pool for a very small number - * of slots to process. Don't signal here. - */ - return (0); - } - - /* - * Wait for earlier groups to finish, otherwise there could - * be holes in the log file. - */ - WT_STAT_CONN_INCR(session, log_release_write_lsn); - __log_wait_for_earlier_slot(session, slot); - - log->write_start_lsn = slot->slot_start_lsn; - log->write_lsn = slot->slot_end_lsn; - - WT_ASSERT(session, slot != log->active_slot); - __wt_cond_signal(session, log->log_write_cond); - F_CLR(slot, WT_SLOT_FLUSH); - - /* - * Signal the close thread if needed. - */ - if (F_ISSET(slot, WT_SLOT_CLOSEFH)) - __wt_cond_signal(session, conn->log_file_cond); - - if (F_ISSET(slot, WT_SLOT_SYNC_DIRTY) && !F_ISSET(slot, WT_SLOT_SYNC) && - (ret = __wt_fsync(session, log->log_fh, false)) != 0) { - /* - * Ignore ENOTSUP, but don't try again. - */ - if (ret != ENOTSUP) - WT_ERR(ret); - conn->log_dirty_max = 0; - } - - /* - * Try to consolidate calls to fsync to wait less. Acquire a spin lock - * so that threads finishing writing to the log will wait while the - * current fsync completes and advance log->sync_lsn. - */ - while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { - /* - * We have to wait until earlier log files have finished their - * sync operations. The most recent one will set the LSN to the - * beginning of our file. - */ - if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || - __wt_spin_trylock(session, &log->log_sync_lock) != 0) { - __wt_cond_wait( - session, log->log_sync_cond, 10000, NULL); - continue; - } - locked = true; - - /* - * Record the current end of our update after the lock. - * That is how far our calls can guarantee. - */ - sync_lsn = slot->slot_end_lsn; - /* - * Check if we have to sync the parent directory. Some - * combinations of sync flags may result in the log file - * not yet stable in its parent directory. Do that - * now if needed. - */ - if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && - (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { - WT_ASSERT(session, log->log_dir_fh != NULL); - __wt_verbose(session, WT_VERB_LOG, - "log_release: sync directory %s to LSN %" PRIu32 - "/%" PRIu32, - log->log_dir_fh->name, - sync_lsn.l.file, sync_lsn.l.offset); - time_start = __wt_clock(session); - WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); - time_stop = __wt_clock(session); - fsync_duration_usecs = - WT_CLOCKDIFF_US(time_stop, time_start); - log->sync_dir_lsn = sync_lsn; - WT_STAT_CONN_INCR(session, log_sync_dir); - WT_STAT_CONN_INCRV(session, - log_sync_dir_duration, fsync_duration_usecs); - } - - /* - * Sync the log file if needed. - */ - if (F_ISSET(slot, WT_SLOT_SYNC) && - __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { - __wt_verbose(session, WT_VERB_LOG, - "log_release: sync log %s to LSN %" PRIu32 - "/%" PRIu32, - log->log_fh->name, - sync_lsn.l.file, sync_lsn.l.offset); - WT_STAT_CONN_INCR(session, log_sync); - time_start = __wt_clock(session); - WT_ERR(__wt_fsync(session, log->log_fh, true)); - time_stop = __wt_clock(session); - fsync_duration_usecs = - WT_CLOCKDIFF_US(time_stop, time_start); - WT_STAT_CONN_INCRV(session, - log_sync_duration, fsync_duration_usecs); - log->sync_lsn = sync_lsn; - __wt_cond_signal(session, log->log_sync_cond); - } - /* - * Clear the flags before leaving the loop. - */ - F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR); - locked = false; - __wt_spin_unlock(session, &log->log_sync_lock); - } -err: if (locked) - __wt_spin_unlock(session, &log->log_sync_lock); - if (ret != 0 && slot->slot_error == 0) - slot->slot_error = ret; - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LSN sync_lsn; + uint64_t fsync_duration_usecs, time_start, time_stop; + int64_t release_buffered, release_bytes; + bool locked; + + conn = S2C(session); + log = conn->log; + locked = false; + if (freep != NULL) + *freep = 1; + release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); + release_bytes = release_buffered + slot->slot_unbuffered; + + /* + * Checkpoints can be configured based on amount of log written. Add in this log record to the + * sum and if needed, signal the checkpoint condition. The logging subsystem manages the + * accumulated field. There is a bit of layering violation here checking the connection ckpt + * field and using its condition. + */ + if (WT_CKPT_LOGSIZE(conn)) { + log->log_written += (wt_off_t)release_bytes; + __wt_checkpoint_signal(session, log->log_written); + } + + /* Write the buffered records */ + if (release_buffered != 0) + WT_ERR(__log_fs_write( + session, slot, slot->slot_start_offset, (size_t)release_buffered, slot->slot_buf.mem)); + + /* + * If we have to wait for a synchronous operation, we do not pass handling of this slot off to + * the worker thread. The caller is responsible for freeing the slot in that case. Otherwise the + * worker thread will free it. + */ + if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC_FLAGS)) { + if (freep != NULL) + *freep = 0; + slot->slot_state = WT_LOG_SLOT_WRITTEN; + /* + * After this point the worker thread owns the slot. There is nothing more to do but return. + */ + /* + * !!! Signalling the wrlsn_cond condition here results in + * worse performance because it causes more scheduling churn + * and more walking of the slot pool for a very small number + * of slots to process. Don't signal here. + */ + return (0); + } + + /* + * Wait for earlier groups to finish, otherwise there could be holes in the log file. + */ + WT_STAT_CONN_INCR(session, log_release_write_lsn); + __log_wait_for_earlier_slot(session, slot); + + log->write_start_lsn = slot->slot_start_lsn; + log->write_lsn = slot->slot_end_lsn; + + WT_ASSERT(session, slot != log->active_slot); + __wt_cond_signal(session, log->log_write_cond); + F_CLR(slot, WT_SLOT_FLUSH); + + /* + * Signal the close thread if needed. + */ + if (F_ISSET(slot, WT_SLOT_CLOSEFH)) + __wt_cond_signal(session, conn->log_file_cond); + + if (F_ISSET(slot, WT_SLOT_SYNC_DIRTY) && !F_ISSET(slot, WT_SLOT_SYNC) && + (ret = __wt_fsync(session, log->log_fh, false)) != 0) { + /* + * Ignore ENOTSUP, but don't try again. + */ + if (ret != ENOTSUP) + WT_ERR(ret); + conn->log_dirty_max = 0; + } + + /* + * Try to consolidate calls to fsync to wait less. Acquire a spin lock so that threads finishing + * writing to the log will wait while the current fsync completes and advance log->sync_lsn. + */ + while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { + /* + * We have to wait until earlier log files have finished their sync operations. The most + * recent one will set the LSN to the beginning of our file. + */ + if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || + __wt_spin_trylock(session, &log->log_sync_lock) != 0) { + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); + continue; + } + locked = true; + + /* + * Record the current end of our update after the lock. That is how far our calls can + * guarantee. + */ + sync_lsn = slot->slot_end_lsn; + /* + * Check if we have to sync the parent directory. Some combinations of sync flags may result + * in the log file not yet stable in its parent directory. Do that now if needed. + */ + if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { + WT_ASSERT(session, log->log_dir_fh != NULL); + __wt_verbose(session, WT_VERB_LOG, + "log_release: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, + sync_lsn.l.file, sync_lsn.l.offset); + time_start = __wt_clock(session); + WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); + log->sync_dir_lsn = sync_lsn; + WT_STAT_CONN_INCR(session, log_sync_dir); + WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs); + } + + /* + * Sync the log file if needed. + */ + if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { + __wt_verbose(session, WT_VERB_LOG, + "log_release: sync log %s to LSN %" PRIu32 "/%" PRIu32, log->log_fh->name, + sync_lsn.l.file, sync_lsn.l.offset); + WT_STAT_CONN_INCR(session, log_sync); + time_start = __wt_clock(session); + WT_ERR(__wt_fsync(session, log->log_fh, true)); + time_stop = __wt_clock(session); + fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); + WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs); + log->sync_lsn = sync_lsn; + __wt_cond_signal(session, log->log_sync_cond); + } + /* + * Clear the flags before leaving the loop. + */ + F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR); + locked = false; + __wt_spin_unlock(session, &log->log_sync_lock); + } +err: + if (locked) + __wt_spin_unlock(session, &log->log_sync_lock); + if (ret != 0 && slot->slot_error == 0) + slot->slot_error = ret; + return (ret); } /* * __log_salvage_message -- - * Show messages consistently for a salvageable error. + * Show messages consistently for a salvageable error. */ static int -__log_salvage_message(WT_SESSION_IMPL *session, const char *log_name, - const char *extra_msg, wt_off_t offset) +__log_salvage_message( + WT_SESSION_IMPL *session, const char *log_name, const char *extra_msg, wt_off_t offset) { - WT_RET(__wt_msg(session, - "log file %s corrupted%s at position %" PRIuMAX - ", truncated", log_name, extra_msg, (uintmax_t)offset)); - F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); - return (WT_ERROR); + WT_RET(__wt_msg(session, "log file %s corrupted%s at position %" PRIuMAX ", truncated", + log_name, extra_msg, (uintmax_t)offset)); + F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); + return (WT_ERROR); } /* * __wt_log_scan -- - * Scan the logs, calling a function on each record found. + * Scan the logs, calling a function on each record found. */ int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, - int (*func)(WT_SESSION_IMPL *session, - WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, - void *cookie, int firstrecord), void *cookie) + int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, + void *cookie, int firstrecord), + void *cookie) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(buf); - WT_DECL_ITEM(decryptitem); - WT_DECL_ITEM(uncitem); - WT_DECL_RET; - WT_FH *log_fh; - WT_ITEM *cbbuf; - WT_LOG *log; - WT_LOG_RECORD *logrec; - WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn; - wt_off_t bad_offset, log_size; - uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen; - uint16_t version; - u_int i, logcount; - int firstrecord; - char **logfiles; - bool corrupt, eol, need_salvage, partial_record; - - conn = S2C(session); - log = conn->log; - log_fh = NULL; - logcount = 0; - logfiles = NULL; - corrupt = eol = false; - firstrecord = 1; - need_salvage = false; - - /* - * If the caller did not give us a callback function there is nothing - * to do. - */ - if (func == NULL) - return (0); - - if (lsnp != NULL && - LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP)) - WT_RET_MSG(session, WT_ERROR, - "choose either a start LSN or a start flag"); - /* - * Set up the allocation size, starting and ending LSNs. The values - * for those depend on whether logging is currently enabled or not. - */ - lastlog = 0; - if (log != NULL) { - allocsize = log->allocsize; - end_lsn = log->alloc_lsn; - start_lsn = log->first_lsn; - if (lsnp == NULL) { - if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) - start_lsn = log->ckpt_lsn; - else if (!LF_ISSET(WT_LOGSCAN_FIRST)) - WT_RET_MSG(session, WT_ERROR, - "WT_LOGSCAN_FIRST not set"); - } - lastlog = log->fileid; - } else { - /* - * If logging is not configured, we can still print out the log - * if log files exist. We just need to set the LSNs from what - * is in the files versus what is in the live connection. - */ - /* - * Set allocsize to the minimum alignment it could be. Larger - * records and larger allocation boundaries should always be - * a multiple of this. - */ - allocsize = WT_LOG_ALIGN; - firstlog = UINT32_MAX; - WT_RET(__log_get_files(session, - WT_LOG_FILENAME, &logfiles, &logcount)); - if (logcount == 0) - WT_RET_MSG(session, ENOTSUP, "no log files found"); - for (i = 0; i < logcount; i++) { - WT_ERR(__wt_log_extract_lognum(session, logfiles[i], - &lognum)); - lastlog = WT_MAX(lastlog, lognum); - firstlog = WT_MIN(firstlog, lognum); - } - WT_SET_LSN(&start_lsn, firstlog, 0); - WT_SET_LSN(&end_lsn, lastlog, 0); - WT_ERR( - __wt_fs_directory_list_free(session, &logfiles, logcount)); - } - if (lsnp != NULL) { - /* - * Offsets must be on allocation boundaries. - * An invalid LSN from a user should just return - * WT_NOTFOUND. It is not an error. But if it is - * from recovery, we expect valid LSNs so give more - * information about that. - */ - if (lsnp->l.offset % allocsize != 0) { - if (LF_ISSET(WT_LOGSCAN_RECOVER | - WT_LOGSCAN_RECOVER_METADATA)) - WT_ERR_MSG(session, WT_NOTFOUND, - "__wt_log_scan unaligned LSN %" - PRIu32 "/%" PRIu32, - lsnp->l.file, lsnp->l.offset); - else - WT_ERR(WT_NOTFOUND); - } - /* - * If the file is in the future it doesn't exist. - * An invalid LSN from a user should just return - * WT_NOTFOUND. It is not an error. But if it is - * from recovery, we expect valid LSNs so give more - * information about that. - */ - if (lsnp->l.file > lastlog) { - if (LF_ISSET(WT_LOGSCAN_RECOVER | - WT_LOGSCAN_RECOVER_METADATA)) - WT_ERR_MSG(session, WT_NOTFOUND, - "__wt_log_scan LSN %" PRIu32 "/%" PRIu32 - " larger than biggest log file %" PRIu32, - lsnp->l.file, lsnp->l.offset, lastlog); - else - WT_ERR(WT_NOTFOUND); - } - /* - * Log cursors may not know the starting LSN. If an - * LSN is passed in that it is equal to the smallest - * LSN, start from the beginning of the log. - */ - if (!WT_IS_INIT_LSN(lsnp)) - start_lsn = *lsnp; - } - WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn, - NULL, &need_salvage)); - if (need_salvage) - WT_ERR_MSG(session, WT_ERROR, "log file requires salvage"); - WT_ERR(__wt_filesize(session, log_fh, &log_size)); - rd_lsn = start_lsn; - if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) - __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, - "Recovering log %" PRIu32 " through %" PRIu32, - rd_lsn.l.file, end_lsn.l.file); - - WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); - WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); - WT_ERR(__wt_scr_alloc(session, 0, &uncitem)); - for (;;) { - if (rd_lsn.l.offset + allocsize > log_size) { + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(buf); + WT_DECL_ITEM(decryptitem); + WT_DECL_ITEM(uncitem); + WT_DECL_RET; + WT_FH *log_fh; + WT_ITEM *cbbuf; + WT_LOG *log; + WT_LOG_RECORD *logrec; + WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn; + wt_off_t bad_offset, log_size; + uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen; + uint16_t version; + u_int i, logcount; + int firstrecord; + char **logfiles; + bool corrupt, eol, need_salvage, partial_record; + + conn = S2C(session); + log = conn->log; + log_fh = NULL; + logcount = 0; + logfiles = NULL; + corrupt = eol = false; + firstrecord = 1; + need_salvage = false; + + /* + * If the caller did not give us a callback function there is nothing to do. + */ + if (func == NULL) + return (0); + + if (lsnp != NULL && LF_ISSET(WT_LOGSCAN_FIRST | WT_LOGSCAN_FROM_CKP)) + WT_RET_MSG(session, WT_ERROR, "choose either a start LSN or a start flag"); + /* + * Set up the allocation size, starting and ending LSNs. The values for those depend on whether + * logging is currently enabled or not. + */ + lastlog = 0; + if (log != NULL) { + allocsize = log->allocsize; + end_lsn = log->alloc_lsn; + start_lsn = log->first_lsn; + if (lsnp == NULL) { + if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) + start_lsn = log->ckpt_lsn; + else if (!LF_ISSET(WT_LOGSCAN_FIRST)) + WT_RET_MSG(session, WT_ERROR, "WT_LOGSCAN_FIRST not set"); + } + lastlog = log->fileid; + } else { + /* + * If logging is not configured, we can still print out the log if log files exist. We just + * need to set the LSNs from what is in the files versus what is in the live connection. + */ + /* + * Set allocsize to the minimum alignment it could be. Larger records and larger allocation + * boundaries should always be a multiple of this. + */ + allocsize = WT_LOG_ALIGN; + firstlog = UINT32_MAX; + WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); + if (logcount == 0) + WT_RET_MSG(session, ENOTSUP, "no log files found"); + for (i = 0; i < logcount; i++) { + WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); + lastlog = WT_MAX(lastlog, lognum); + firstlog = WT_MIN(firstlog, lognum); + } + WT_SET_LSN(&start_lsn, firstlog, 0); + WT_SET_LSN(&end_lsn, lastlog, 0); + WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); + } + if (lsnp != NULL) { + /* + * Offsets must be on allocation boundaries. An invalid LSN from a user should just return + * WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs so give + * more information about that. + */ + if (lsnp->l.offset % allocsize != 0) { + if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) + WT_ERR_MSG(session, WT_NOTFOUND, "__wt_log_scan unaligned LSN %" PRIu32 "/%" PRIu32, + lsnp->l.file, lsnp->l.offset); + else + WT_ERR(WT_NOTFOUND); + } + /* + * If the file is in the future it doesn't exist. An invalid LSN from a user should just + * return WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs + * so give more information about that. + */ + if (lsnp->l.file > lastlog) { + if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) + WT_ERR_MSG(session, WT_NOTFOUND, + "__wt_log_scan LSN %" PRIu32 "/%" PRIu32 " larger than biggest log file %" PRIu32, + lsnp->l.file, lsnp->l.offset, lastlog); + else + WT_ERR(WT_NOTFOUND); + } + /* + * Log cursors may not know the starting LSN. If an LSN is passed in that it is equal to the + * smallest LSN, start from the beginning of the log. + */ + if (!WT_IS_INIT_LSN(lsnp)) + start_lsn = *lsnp; + } + WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn, NULL, &need_salvage)); + if (need_salvage) + WT_ERR_MSG(session, WT_ERROR, "log file requires salvage"); + WT_ERR(__wt_filesize(session, log_fh, &log_size)); + rd_lsn = start_lsn; + if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file); + + WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); + WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); + WT_ERR(__wt_scr_alloc(session, 0, &uncitem)); + for (;;) { + if (rd_lsn.l.offset + allocsize > log_size) { advance: - if (rd_lsn.l.offset == log_size) - partial_record = false; - else { - /* - * See if there is anything non-zero at the - * end of this log file. - */ - WT_ERR(__log_has_hole( - session, log_fh, log_size, - rd_lsn.l.offset, &bad_offset, - &partial_record)); - if (bad_offset != 0) { - need_salvage = true; - WT_ERR(__log_salvage_message(session, - log_fh->name, "", bad_offset)); - } - } - /* - * If we read the last record, go to the next file. - */ - WT_ERR(__wt_close(session, &log_fh)); - log_fh = NULL; - eol = true; - /* - * Truncate this log file before we move to the next. - */ - if (LF_ISSET(WT_LOGSCAN_RECOVER) && - __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { - __wt_verbose(session, WT_VERB_LOG, - "Truncate end of log %" PRIu32 "/%" PRIu32, - rd_lsn.l.file, rd_lsn.l.offset); - WT_ERR(__log_truncate(session, &rd_lsn, true, - false)); - } - /* - * If we had a partial record, we'll want to break - * now after closing and truncating. Although for now - * log_truncate does not modify the LSN passed in, - * this code does not assume it is unmodified after that - * call which is why it uses the boolean set earlier. - */ - if (partial_record) - break; - /* - * Avoid an error message when we reach end of log - * by checking here. - */ - prev_eof = rd_lsn; - WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0); - if (rd_lsn.l.file > end_lsn.l.file) - break; - if (LF_ISSET(WT_LOGSCAN_RECOVER | - WT_LOGSCAN_RECOVER_METADATA)) - __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, - "Recovering log %" PRIu32 - " through %" PRIu32, - rd_lsn.l.file, end_lsn.l.file); - WT_ERR(__log_open_verify(session, - rd_lsn.l.file, &log_fh, &prev_lsn, &version, - &need_salvage)); - if (need_salvage) - WT_ERR_MSG(session, WT_ERROR, - "log file requires salvage"); - /* - * Opening the log file reads with verify sets up the - * previous LSN from the first record. This detects - * a "hole" at the end of the previous log file. - */ - if (LF_ISSET(WT_LOGSCAN_RECOVER) && - !WT_IS_INIT_LSN(&prev_lsn) && - !WT_IS_ZERO_LSN(&prev_lsn) && - prev_lsn.l.offset != prev_eof.l.offset) { - WT_ASSERT(session, - prev_eof.l.file == prev_lsn.l.file); - break; - } - /* - * If we read a current version log file without a - * previous LSN record the log ended after writing - * that header. We're done. - */ - if (LF_ISSET(WT_LOGSCAN_RECOVER) && - version == WT_LOG_VERSION_SYSTEM && - WT_IS_ZERO_LSN(&prev_lsn)) { - __wt_verbose(session, WT_VERB_LOG, - "log_scan: Stopping, no system " - "record detected in %s.", log_fh->name); - break; - } - WT_ERR(__wt_filesize(session, log_fh, &log_size)); - eol = false; - continue; - } - /* - * Read the minimum allocation size a record could be. - * Conditionally set the need_salvage flag so that if the - * read fails, we know this is an situation we can salvage. - */ - WT_ASSERT(session, buf->memsize >= allocsize); - need_salvage = F_ISSET(conn, WT_CONN_SALVAGE); - WT_ERR(__log_fs_read(session, - log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem)); - need_salvage = false; - /* - * See if we need to read more than the allocation size. We - * expect that we rarely will have to read more. Most log - * records will be fairly small. - */ - reclen = ((WT_LOG_RECORD *)buf->mem)->len; + if (rd_lsn.l.offset == log_size) + partial_record = false; + else { + /* + * See if there is anything non-zero at the end of this log file. + */ + WT_ERR(__log_has_hole( + session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &partial_record)); + if (bad_offset != 0) { + need_salvage = true; + WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset)); + } + } + /* + * If we read the last record, go to the next file. + */ + WT_ERR(__wt_close(session, &log_fh)); + log_fh = NULL; + eol = true; + /* + * Truncate this log file before we move to the next. + */ + if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { + __wt_verbose(session, WT_VERB_LOG, "Truncate end of log %" PRIu32 "/%" PRIu32, + rd_lsn.l.file, rd_lsn.l.offset); + WT_ERR(__log_truncate(session, &rd_lsn, true, false)); + } + /* + * If we had a partial record, we'll want to break now after closing and truncating. + * Although for now log_truncate does not modify the LSN passed in, this code does not + * assume it is unmodified after that call which is why it uses the boolean set earlier. + */ + if (partial_record) + break; + /* + * Avoid an error message when we reach end of log by checking here. + */ + prev_eof = rd_lsn; + WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0); + if (rd_lsn.l.file > end_lsn.l.file) + break; + if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) + __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, + "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file); + WT_ERR(__log_open_verify( + session, rd_lsn.l.file, &log_fh, &prev_lsn, &version, &need_salvage)); + if (need_salvage) + WT_ERR_MSG(session, WT_ERROR, "log file requires salvage"); + /* + * Opening the log file reads with verify sets up the previous LSN from the first + * record. This detects a "hole" at the end of the previous log file. + */ + if (LF_ISSET(WT_LOGSCAN_RECOVER) && !WT_IS_INIT_LSN(&prev_lsn) && + !WT_IS_ZERO_LSN(&prev_lsn) && prev_lsn.l.offset != prev_eof.l.offset) { + WT_ASSERT(session, prev_eof.l.file == prev_lsn.l.file); + break; + } + /* + * If we read a current version log file without a previous LSN record the log ended + * after writing that header. We're done. + */ + if (LF_ISSET(WT_LOGSCAN_RECOVER) && version == WT_LOG_VERSION_SYSTEM && + WT_IS_ZERO_LSN(&prev_lsn)) { + __wt_verbose(session, WT_VERB_LOG, + "log_scan: Stopping, no system " + "record detected in %s.", + log_fh->name); + break; + } + WT_ERR(__wt_filesize(session, log_fh, &log_size)); + eol = false; + continue; + } + /* + * Read the minimum allocation size a record could be. Conditionally set the need_salvage + * flag so that if the read fails, we know this is an situation we can salvage. + */ + WT_ASSERT(session, buf->memsize >= allocsize); + need_salvage = F_ISSET(conn, WT_CONN_SALVAGE); + WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem)); + need_salvage = false; + /* + * See if we need to read more than the allocation size. We expect that we rarely will have + * to read more. Most log records will be fairly small. + */ + reclen = ((WT_LOG_RECORD *)buf->mem)->len; #ifdef WORDS_BIGENDIAN - reclen = __wt_bswap32(reclen); + reclen = __wt_bswap32(reclen); #endif - /* - * Log files are pre-allocated. We need to detect the - * difference between a hole in the file (where this location - * would be considered the end of log) and the last record - * in the log and we're at the zeroed part of the file. - * If we find a zeroed record, scan forward in the log looking - * for any data. If we detect any we have a hole and stop. - * Otherwise if the rest is all zeroes advance to the next file. - * When recovery finds the end of the log, truncate the file - * and remove any later log files that may exist. - */ - if (reclen == 0) { - WT_ERR(__log_has_hole( - session, log_fh, log_size, rd_lsn.l.offset, - &bad_offset, &eol)); - if (bad_offset != 0) { - need_salvage = true; - WT_ERR(__log_salvage_message(session, - log_fh->name, "", bad_offset)); - } - if (eol) - /* Found a hole. This LSN is the end. */ - break; - /* Last record in log. Look for more. */ - goto advance; - } - rdup_len = __wt_rduppo2(reclen, allocsize); - if (reclen > allocsize) { - /* - * The log file end could be the middle of this - * log record. If we have a partially written record - * then this is considered the end of the log. - */ - if (rd_lsn.l.offset + rdup_len > log_size) { - eol = true; - break; - } - /* - * We need to round up and read in the full padded - * record, especially for direct I/O. - */ - WT_ERR(__wt_buf_grow(session, buf, rdup_len)); - WT_ERR(__log_fs_read(session, log_fh, - rd_lsn.l.offset, (size_t)rdup_len, buf->mem)); - WT_STAT_CONN_INCR(session, log_scan_rereads); - } - /* - * We read in the record, now verify the checksum. A failed - * checksum does not imply corruption, it may be the result - * of a partial write. - */ - buf->size = reclen; - logrec = (WT_LOG_RECORD *)buf->mem; - if (!__log_checksum_match(buf, reclen)) { - /* - * A checksum mismatch means we have reached the end of - * the useful part of the log. This should be found on - * the first pass through recovery. In the second pass - * where we truncate the log, this is where it should - * end. - * Continue processing where possible, so remember any - * error returns, but don't skip to the error handler. - */ - if (log != NULL) - log->trunc_lsn = rd_lsn; - /* - * If the user asked for a specific LSN and it is not - * a valid LSN, return WT_NOTFOUND. - */ - if (LF_ISSET(WT_LOGSCAN_ONE)) - ret = WT_NOTFOUND; - - /* - * When we have a checksum mismatch, we would like - * to determine whether it may be the result of: - * 1) some expected corruption that can occur during - * backups - * 2) a partial write that can naturally occur when - * an application crashes - * 3) some other corruption - * so that we can (in case 3) flag cases of file system - * or hardware failures. Unfortunately, we have found - * on some systems that file system writes may in fact - * be lost, and this can readily be triggered with - * normal operations. Rather than force users to - * salvage in these situations, we merely truncate the - * log at this point and issue a message. - */ - if (F_ISSET(conn, WT_CONN_WAS_BACKUP)) - break; - - if (!__log_check_partial_write(session, buf, reclen)) { - /* - * It's not a partial write, and we have a bad - * checksum. We treat it as a corruption that - * must be salvaged. - */ - need_salvage = true; - WT_TRET(__log_salvage_message(session, - log_fh->name, ", bad checksum", - rd_lsn.l.offset)); - } else { - /* - * It may be a partial write, or it's possible - * that the header is corrupt. Make a sanity - * check of the log record header. - */ - WT_TRET(__log_record_verify(session, log_fh, - rd_lsn.l.offset, logrec, &corrupt)); - if (corrupt) { - need_salvage = true; - WT_TRET(__log_salvage_message(session, - log_fh->name, "", rd_lsn.l.offset)); - } - } - break; - } - __wt_log_record_byteswap(logrec); - - /* - * We have a valid log record. If it is not the log file - * header, invoke the callback. - */ - WT_STAT_CONN_INCR(session, log_scan_records); - next_lsn = rd_lsn; - next_lsn.l.offset += rdup_len; - if (rd_lsn.l.offset != 0) { - /* - * We need to manage the different buffers here. - * Buf is the buffer this function uses to read from - * the disk. The callback buffer may change based - * on whether encryption and compression are used. - * - * We want to free any buffers from compression and - * encryption but keep the one we use for reading. - */ - cbbuf = buf; - if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) { - WT_ERR(__log_decrypt( - session, cbbuf, decryptitem)); - cbbuf = decryptitem; - } - if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) { - WT_ERR(__log_decompress( - session, cbbuf, uncitem)); - cbbuf = uncitem; - } - WT_ERR((*func)(session, - cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord)); - - firstrecord = 0; - - if (LF_ISSET(WT_LOGSCAN_ONE)) - break; - } - rd_lsn = next_lsn; - } - - /* Truncate if we're in recovery. */ - if (LF_ISSET(WT_LOGSCAN_RECOVER) && - __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { - __wt_verbose(session, WT_VERB_LOG, - "End of recovery truncate end of log %" PRIu32 "/%" PRIu32, - rd_lsn.l.file, rd_lsn.l.offset); - /* Preserve prior error and fall through to error handling. */ - WT_TRET(__log_truncate(session, &rd_lsn, false, false)); - } - -err: WT_STAT_CONN_INCR(session, log_scans); - /* - * If we are salvaging and failed a salvageable operation, then - * truncate the log at the fail point. - */ - if (ret != 0 && ret != WT_PANIC && need_salvage) { - WT_TRET(__wt_close(session, &log_fh)); - log_fh = NULL; - WT_TRET(__log_truncate(session, &rd_lsn, false, true)); - ret = 0; - } - - /* - * If the first attempt to read a log record results in - * an error recovery is likely going to fail. Try to provide - * a helpful failure message. - */ - if (ret != 0 && firstrecord && LF_ISSET(WT_LOGSCAN_RECOVER | - WT_LOGSCAN_RECOVER_METADATA)) { - __wt_err(session, ret, - "WiredTiger is unable to read the recovery log."); - __wt_err(session, ret, "This may be due to the log" - " files being encrypted, being from an older" - " version or due to corruption on disk"); - __wt_err(session, ret, "You should confirm that you have" - " opened the database with the correct options including" - " all encryption and compression options"); - } - - WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); - - __wt_scr_free(session, &buf); - __wt_scr_free(session, &decryptitem); - __wt_scr_free(session, &uncitem); - - /* - * If the caller wants one record and it is at the end of log, - * return WT_NOTFOUND. - */ - if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) - ret = WT_NOTFOUND; - WT_TRET(__wt_close(session, &log_fh)); - return (ret); + /* + * Log files are pre-allocated. We need to detect the difference between a hole in the file + * (where this location would be considered the end of log) and the last record in the log + * and we're at the zeroed part of the file. If we find a zeroed record, scan forward in the + * log looking for any data. If we detect any we have a hole and stop. Otherwise if the rest + * is all zeroes advance to the next file. When recovery finds the end of the log, truncate + * the file and remove any later log files that may exist. + */ + if (reclen == 0) { + WT_ERR(__log_has_hole(session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &eol)); + if (bad_offset != 0) { + need_salvage = true; + WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset)); + } + if (eol) + /* Found a hole. This LSN is the end. */ + break; + /* Last record in log. Look for more. */ + goto advance; + } + rdup_len = __wt_rduppo2(reclen, allocsize); + if (reclen > allocsize) { + /* + * The log file end could be the middle of this log record. If we have a partially + * written record then this is considered the end of the log. + */ + if (rd_lsn.l.offset + rdup_len > log_size) { + eol = true; + break; + } + /* + * We need to round up and read in the full padded record, especially for direct I/O. + */ + WT_ERR(__wt_buf_grow(session, buf, rdup_len)); + WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)rdup_len, buf->mem)); + WT_STAT_CONN_INCR(session, log_scan_rereads); + } + /* + * We read in the record, now verify the checksum. A failed checksum does not imply + * corruption, it may be the result of a partial write. + */ + buf->size = reclen; + logrec = (WT_LOG_RECORD *)buf->mem; + if (!__log_checksum_match(buf, reclen)) { + /* + * A checksum mismatch means we have reached the end of the useful part of the log. This + * should be found on the first pass through recovery. In the second pass where we + * truncate the log, this is where it should end. Continue processing where possible, so + * remember any error returns, but don't skip to the error handler. + */ + if (log != NULL) + log->trunc_lsn = rd_lsn; + /* + * If the user asked for a specific LSN and it is not a valid LSN, return WT_NOTFOUND. + */ + if (LF_ISSET(WT_LOGSCAN_ONE)) + ret = WT_NOTFOUND; + + /* + * When we have a checksum mismatch, we would like + * to determine whether it may be the result of: + * 1) some expected corruption that can occur during + * backups + * 2) a partial write that can naturally occur when + * an application crashes + * 3) some other corruption + * so that we can (in case 3) flag cases of file system + * or hardware failures. Unfortunately, we have found + * on some systems that file system writes may in fact + * be lost, and this can readily be triggered with + * normal operations. Rather than force users to + * salvage in these situations, we merely truncate the + * log at this point and issue a message. + */ + if (F_ISSET(conn, WT_CONN_WAS_BACKUP)) + break; + + if (!__log_check_partial_write(session, buf, reclen)) { + /* + * It's not a partial write, and we have a bad checksum. We treat it as a corruption + * that must be salvaged. + */ + need_salvage = true; + WT_TRET( + __log_salvage_message(session, log_fh->name, ", bad checksum", rd_lsn.l.offset)); + } else { + /* + * It may be a partial write, or it's possible that the header is corrupt. Make a + * sanity check of the log record header. + */ + WT_TRET(__log_record_verify(session, log_fh, rd_lsn.l.offset, logrec, &corrupt)); + if (corrupt) { + need_salvage = true; + WT_TRET(__log_salvage_message(session, log_fh->name, "", rd_lsn.l.offset)); + } + } + break; + } + __wt_log_record_byteswap(logrec); + + /* + * We have a valid log record. If it is not the log file header, invoke the callback. + */ + WT_STAT_CONN_INCR(session, log_scan_records); + next_lsn = rd_lsn; + next_lsn.l.offset += rdup_len; + if (rd_lsn.l.offset != 0) { + /* + * We need to manage the different buffers here. + * Buf is the buffer this function uses to read from + * the disk. The callback buffer may change based + * on whether encryption and compression are used. + * + * We want to free any buffers from compression and + * encryption but keep the one we use for reading. + */ + cbbuf = buf; + if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) { + WT_ERR(__log_decrypt(session, cbbuf, decryptitem)); + cbbuf = decryptitem; + } + if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) { + WT_ERR(__log_decompress(session, cbbuf, uncitem)); + cbbuf = uncitem; + } + WT_ERR((*func)(session, cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord)); + + firstrecord = 0; + + if (LF_ISSET(WT_LOGSCAN_ONE)) + break; + } + rd_lsn = next_lsn; + } + + /* Truncate if we're in recovery. */ + if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { + __wt_verbose(session, WT_VERB_LOG, + "End of recovery truncate end of log %" PRIu32 "/%" PRIu32, rd_lsn.l.file, + rd_lsn.l.offset); + /* Preserve prior error and fall through to error handling. */ + WT_TRET(__log_truncate(session, &rd_lsn, false, false)); + } + +err: + WT_STAT_CONN_INCR(session, log_scans); + /* + * If we are salvaging and failed a salvageable operation, then truncate the log at the fail + * point. + */ + if (ret != 0 && ret != WT_PANIC && need_salvage) { + WT_TRET(__wt_close(session, &log_fh)); + log_fh = NULL; + WT_TRET(__log_truncate(session, &rd_lsn, false, true)); + ret = 0; + } + + /* + * If the first attempt to read a log record results in an error recovery is likely going to + * fail. Try to provide a helpful failure message. + */ + if (ret != 0 && firstrecord && LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) { + __wt_err(session, ret, "WiredTiger is unable to read the recovery log"); + __wt_err(session, ret, + "This may be due to the log" + " files being encrypted, being from an older" + " version or due to corruption on disk"); + __wt_err(session, ret, + "You should confirm that you have" + " opened the database with the correct options including" + " all encryption and compression options"); + } + + WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); + + __wt_scr_free(session, &buf); + __wt_scr_free(session, &decryptitem); + __wt_scr_free(session, &uncitem); + + /* + * If the caller wants one record and it is at the end of log, return WT_NOTFOUND. + */ + if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) + ret = WT_NOTFOUND; + WT_TRET(__wt_close(session, &log_fh)); + return (ret); } /* * __wt_log_force_write -- - * Force a switch and release and write of the current slot. - * Wrapper function that takes the lock. + * Force a switch and release and write of the current slot. Wrapper function that takes the + * lock. */ int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { - WT_LOG *log; - WT_MYSLOT myslot; - - log = S2C(session)->log; - memset(&myslot, 0, sizeof(myslot)); - WT_STAT_CONN_INCR(session, log_force_write); - if (did_work != NULL) - *did_work = true; - myslot.slot = log->active_slot; - return (__wt_log_slot_switch(session, &myslot, retry, true, did_work)); + WT_LOG *log; + WT_MYSLOT myslot; + + log = S2C(session)->log; + memset(&myslot, 0, sizeof(myslot)); + WT_STAT_CONN_INCR(session, log_force_write); + if (did_work != NULL) + *did_work = true; + myslot.slot = log->active_slot; + return (__wt_log_slot_switch(session, &myslot, retry, true, did_work)); } /* * __wt_log_write -- - * Write a record into the log, compressing as necessary. + * Write a record into the log, compressing as necessary. */ int -__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, - uint32_t flags) +__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { - WT_COMPRESSOR *compressor; - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(citem); - WT_DECL_ITEM(eitem); - WT_DECL_RET; - WT_ITEM *ip; - WT_KEYED_ENCRYPTOR *kencryptor; - WT_LOG *log; - WT_LOG_RECORD *newlrp; - size_t dst_len, len, new_size, result_len, src_len; - uint8_t *dst, *src; - int compression_failed; - - conn = S2C(session); - log = conn->log; - /* - * An error during opening the logging subsystem can result in it - * being enabled, but without an open log file. In that case, - * just return. We can also have logging opened for reading in a - * read-only database and attempt to write a record on close. - */ - if (!F_ISSET(log, WT_LOG_OPENED) || F_ISSET(conn, WT_CONN_READONLY)) - return (0); - ip = record; - if ((compressor = conn->log_compressor) != NULL && - record->size < log->allocsize) { - WT_STAT_CONN_INCR(session, log_compress_small); - } else if (compressor != NULL) { - /* Skip the log header */ - src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP; - src_len = record->size - WT_LOG_COMPRESS_SKIP; - - /* - * Compute the size needed for the destination buffer. We only - * allocate enough memory for a copy of the original by default, - * if any compressed version is bigger than the original, we - * won't use it. However, some compression engines (snappy is - * one example), may need more memory because they don't stop - * just because there's no more memory into which to compress. - */ - if (compressor->pre_size == NULL) - len = src_len; - else - WT_ERR(compressor->pre_size(compressor, - &session->iface, src, src_len, &len)); - - new_size = len + WT_LOG_COMPRESS_SKIP; - WT_ERR(__wt_scr_alloc(session, new_size, &citem)); - - /* Skip the header bytes of the destination data. */ - dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP; - dst_len = len; - - compression_failed = 0; - WT_ERR(compressor->compress(compressor, &session->iface, - src, src_len, dst, dst_len, &result_len, - &compression_failed)); - result_len += WT_LOG_COMPRESS_SKIP; - - /* - * If compression fails, or doesn't gain us at least one unit of - * allocation, fallback to the original version. This isn't - * unexpected: if compression doesn't work for some chunk of - * data for some reason (noting likely additional format/header - * information which compressed output requires), it just means - * the uncompressed version is as good as it gets, and that's - * what we use. - */ - if (compression_failed || - result_len / log->allocsize >= - record->size / log->allocsize) - WT_STAT_CONN_INCR(session, log_compress_write_fails); - else { - WT_STAT_CONN_INCR(session, log_compress_writes); - WT_STAT_CONN_INCRV(session, log_compress_mem, - record->size); - WT_STAT_CONN_INCRV(session, log_compress_len, - result_len); - - /* - * Copy in the skipped header bytes, set the final data - * size. - */ - memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP); - citem->size = result_len; - ip = citem; - newlrp = (WT_LOG_RECORD *)citem->mem; - F_SET(newlrp, WT_LOG_RECORD_COMPRESSED); - WT_ASSERT(session, result_len < UINT32_MAX && - record->size < UINT32_MAX); - newlrp->mem_len = WT_STORE_SIZE(record->size); - } - } - if ((kencryptor = conn->kencryptor) != NULL) { - /* - * Allocate enough space for the original record plus the - * encryption size constant plus the length we store. - */ - __wt_encrypt_size(session, kencryptor, ip->size, &new_size); - WT_ERR(__wt_scr_alloc(session, new_size, &eitem)); - - WT_ERR(__wt_encrypt(session, kencryptor, - WT_LOG_ENCRYPT_SKIP, ip, eitem)); - - /* - * Final setup of new buffer. Set the flag for - * encryption in the record header. - */ - ip = eitem; - newlrp = (WT_LOG_RECORD *)eitem->mem; - F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED); - WT_ASSERT(session, new_size < UINT32_MAX && - ip->size < UINT32_MAX); - } - ret = __log_write_internal(session, ip, lsnp, flags); - -err: __wt_scr_free(session, &citem); - __wt_scr_free(session, &eitem); - return (ret); + WT_COMPRESSOR *compressor; + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(citem); + WT_DECL_ITEM(eitem); + WT_DECL_RET; + WT_ITEM *ip; + WT_KEYED_ENCRYPTOR *kencryptor; + WT_LOG *log; + WT_LOG_RECORD *newlrp; + size_t dst_len, len, new_size, result_len, src_len; + uint8_t *dst, *src; + int compression_failed; + + conn = S2C(session); + log = conn->log; + /* + * An error during opening the logging subsystem can result in it being enabled, but without an + * open log file. In that case, just return. We can also have logging opened for reading in a + * read-only database and attempt to write a record on close. + */ + if (!F_ISSET(log, WT_LOG_OPENED) || F_ISSET(conn, WT_CONN_READONLY)) + return (0); + ip = record; + if ((compressor = conn->log_compressor) != NULL && record->size < log->allocsize) { + WT_STAT_CONN_INCR(session, log_compress_small); + } else if (compressor != NULL) { + /* Skip the log header */ + src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP; + src_len = record->size - WT_LOG_COMPRESS_SKIP; + + /* + * Compute the size needed for the destination buffer. We only allocate enough memory for a + * copy of the original by default, if any compressed version is bigger than the original, + * we won't use it. However, some compression engines (snappy is one example), may need more + * memory because they don't stop just because there's no more memory into which to + * compress. + */ + if (compressor->pre_size == NULL) + len = src_len; + else + WT_ERR(compressor->pre_size(compressor, &session->iface, src, src_len, &len)); + + new_size = len + WT_LOG_COMPRESS_SKIP; + WT_ERR(__wt_scr_alloc(session, new_size, &citem)); + + /* Skip the header bytes of the destination data. */ + dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP; + dst_len = len; + + compression_failed = 0; + WT_ERR(compressor->compress(compressor, &session->iface, src, src_len, dst, dst_len, + &result_len, &compression_failed)); + result_len += WT_LOG_COMPRESS_SKIP; + + /* + * If compression fails, or doesn't gain us at least one unit of allocation, fallback to the + * original version. This isn't unexpected: if compression doesn't work for some chunk of + * data for some reason (noting likely additional format/header information which compressed + * output requires), it just means the uncompressed version is as good as it gets, and + * that's what we use. + */ + if (compression_failed || result_len / log->allocsize >= record->size / log->allocsize) + WT_STAT_CONN_INCR(session, log_compress_write_fails); + else { + WT_STAT_CONN_INCR(session, log_compress_writes); + WT_STAT_CONN_INCRV(session, log_compress_mem, record->size); + WT_STAT_CONN_INCRV(session, log_compress_len, result_len); + + /* + * Copy in the skipped header bytes, set the final data size. + */ + memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP); + citem->size = result_len; + ip = citem; + newlrp = (WT_LOG_RECORD *)citem->mem; + F_SET(newlrp, WT_LOG_RECORD_COMPRESSED); + WT_ASSERT(session, result_len < UINT32_MAX && record->size < UINT32_MAX); + newlrp->mem_len = WT_STORE_SIZE(record->size); + } + } + if ((kencryptor = conn->kencryptor) != NULL) { + /* + * Allocate enough space for the original record plus the encryption size constant plus the + * length we store. + */ + __wt_encrypt_size(session, kencryptor, ip->size, &new_size); + WT_ERR(__wt_scr_alloc(session, new_size, &eitem)); + + WT_ERR(__wt_encrypt(session, kencryptor, WT_LOG_ENCRYPT_SKIP, ip, eitem)); + + /* + * Final setup of new buffer. Set the flag for encryption in the record header. + */ + ip = eitem; + newlrp = (WT_LOG_RECORD *)eitem->mem; + F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED); + WT_ASSERT(session, new_size < UINT32_MAX && ip->size < UINT32_MAX); + } + ret = __log_write_internal(session, ip, lsnp, flags); + +err: + __wt_scr_free(session, &citem); + __wt_scr_free(session, &eitem); + return (ret); } /* * __log_write_internal -- - * Write a record into the log. + * Write a record into the log. */ static int -__log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, - uint32_t flags) +__log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { - WT_CONNECTION_IMPL *conn; - WT_DECL_RET; - WT_LOG *log; - WT_LOG_RECORD *logrec; - WT_LSN lsn; - WT_MYSLOT myslot; - int64_t release_size; - uint32_t fill_size, force, rdup_len; - bool free_slot; - - conn = S2C(session); - log = conn->log; - if (record->size > UINT32_MAX) - WT_RET_MSG(session, EFBIG, - "Log record size of %" WT_SIZET_FMT " exceeds the maximum " - "supported size of %" PRIu32, - record->size, UINT32_MAX); - WT_INIT_LSN(&lsn); - myslot.slot = NULL; - memset(&myslot, 0, sizeof(myslot)); - /* - * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a - * header at the beginning for us to fill in. - * - * If using direct_io, the caller should pass us an aligned record. - * But we need to make sure it is big enough and zero-filled so - * that we can write the full amount. Do this whether or not - * direct_io is in use because it makes the reading code cleaner. - */ - WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size); - rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); - WT_ERR(__wt_buf_grow(session, record, rdup_len)); - WT_ASSERT(session, record->data == record->mem); - /* - * If the caller's record only partially fills the necessary - * space, we need to zero-fill the remainder. - * - * The cast is safe, we've already checked to make sure it's in range. - */ - fill_size = rdup_len - (uint32_t)record->size; - if (fill_size != 0) { - memset((uint8_t *)record->mem + record->size, 0, fill_size); - /* - * Set the last byte of the log record to a non-zero value, - * that allows us, on the input side, to tell that a log - * record was completely written; there couldn't have been - * a partial write. That means that any checksum mismatch - * in those conditions is a log corruption. - * - * Without this changed byte, when we see a zeroed last byte, - * we must always treat a checksum error as a possible partial - * write. Since partial writes can happen as a result of an - * interrupted process (for example, a shutdown), we must - * treat a checksum error as a normal occurrence, and merely - * the place where the log must be truncated. So any real - * corruption within log records is hard to detect as such. - * - * However, we can only make this modification if there is - * more than one byte being filled, as the first zero byte - * past the actual record is needed to terminate the loop - * in txn_commit_apply. - * - * This is not a log format change, as we only are changing a - * byte in the padding portion of a record, and no logging code - * has ever checked that it is any particular value up to now. - */ - if (fill_size > 1) - *((uint8_t *)record->mem + rdup_len - 1) = - WT_DEBUG_BYTE; - record->size = rdup_len; - } - /* - * Checksum a little-endian version of the header, and write everything - * in little-endian format. The checksum is (potentially) returned in a - * big-endian format, swap it into place in a separate step. - */ - logrec = (WT_LOG_RECORD *)record->mem; - logrec->len = (uint32_t)record->size; - logrec->checksum = 0; - __wt_log_record_byteswap(logrec); - logrec->checksum = __wt_checksum(logrec, record->size); + WT_CONNECTION_IMPL *conn; + WT_DECL_RET; + WT_LOG *log; + WT_LOG_RECORD *logrec; + WT_LSN lsn; + WT_MYSLOT myslot; + int64_t release_size; + uint32_t fill_size, force, rdup_len; + bool free_slot; + + conn = S2C(session); + log = conn->log; + if (record->size > UINT32_MAX) + WT_RET_MSG(session, EFBIG, "Log record size of %" WT_SIZET_FMT + " exceeds the maximum " + "supported size of %" PRIu32, + record->size, UINT32_MAX); + WT_INIT_LSN(&lsn); + myslot.slot = NULL; + memset(&myslot, 0, sizeof(myslot)); + /* + * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a + * header at the beginning for us to fill in. + * + * If using direct_io, the caller should pass us an aligned record. + * But we need to make sure it is big enough and zero-filled so + * that we can write the full amount. Do this whether or not + * direct_io is in use because it makes the reading code cleaner. + */ + WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size); + rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); + WT_ERR(__wt_buf_grow(session, record, rdup_len)); + WT_ASSERT(session, record->data == record->mem); + /* + * If the caller's record only partially fills the necessary + * space, we need to zero-fill the remainder. + * + * The cast is safe, we've already checked to make sure it's in range. + */ + fill_size = rdup_len - (uint32_t)record->size; + if (fill_size != 0) { + memset((uint8_t *)record->mem + record->size, 0, fill_size); + /* + * Set the last byte of the log record to a non-zero value, + * that allows us, on the input side, to tell that a log + * record was completely written; there couldn't have been + * a partial write. That means that any checksum mismatch + * in those conditions is a log corruption. + * + * Without this changed byte, when we see a zeroed last byte, + * we must always treat a checksum error as a possible partial + * write. Since partial writes can happen as a result of an + * interrupted process (for example, a shutdown), we must + * treat a checksum error as a normal occurrence, and merely + * the place where the log must be truncated. So any real + * corruption within log records is hard to detect as such. + * + * However, we can only make this modification if there is + * more than one byte being filled, as the first zero byte + * past the actual record is needed to terminate the loop + * in txn_commit_apply. + * + * This is not a log format change, as we only are changing a + * byte in the padding portion of a record, and no logging code + * has ever checked that it is any particular value up to now. + */ + if (fill_size > 1) + *((uint8_t *)record->mem + rdup_len - 1) = WT_DEBUG_BYTE; + record->size = rdup_len; + } + /* + * Checksum a little-endian version of the header, and write everything in little-endian format. + * The checksum is (potentially) returned in a big-endian format, swap it into place in a + * separate step. + */ + logrec = (WT_LOG_RECORD *)record->mem; + logrec->len = (uint32_t)record->size; + logrec->checksum = 0; + __wt_log_record_byteswap(logrec); + logrec->checksum = __wt_checksum(logrec, record->size); #ifdef WORDS_BIGENDIAN - logrec->checksum = __wt_bswap32(logrec->checksum); + logrec->checksum = __wt_bswap32(logrec->checksum); #endif - WT_STAT_CONN_INCR(session, log_writes); - - /* - * The only time joining a slot should ever return an error is if it - * detects a panic. - */ - __wt_log_slot_join(session, rdup_len, flags, &myslot); - /* - * If the addition of this record crosses the buffer boundary, - * switch in a new slot. - */ - force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC); - ret = 0; - if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || - F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) - ret = __wt_log_slot_switch(session, &myslot, true, false, NULL); - if (ret == 0) - ret = __wt_log_fill(session, &myslot, false, record, &lsn); - release_size = __wt_log_slot_release(&myslot, (int64_t)rdup_len); - /* - * If we get an error we still need to do proper accounting in - * the slot fields. - * XXX On error we may still need to call release and free. - */ - if (ret != 0) - myslot.slot->slot_error = ret; - WT_ASSERT(session, ret == 0); - if (WT_LOG_SLOT_DONE(release_size)) { - WT_ERR(__wt_log_release(session, myslot.slot, &free_slot)); - if (free_slot) - __wt_log_slot_free(session, myslot.slot); - } else if (force) { - /* - * If we are going to wait for this slot to get written, - * signal the wrlsn thread. - * - * XXX I've seen times when conditions are NULL. - */ - if (conn->log_cond != NULL) { - __wt_cond_signal(session, conn->log_cond); - __wt_yield(); - } else - WT_ERR(__wt_log_force_write(session, 1, NULL)); - } - if (LF_ISSET(WT_LOG_FLUSH)) { - /* Wait for our writes to reach the OS */ - while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait( - session, log->log_write_cond, 10000, NULL); - } else if (LF_ISSET(WT_LOG_FSYNC)) { - /* Wait for our writes to reach disk */ - while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && - myslot.slot->slot_error == 0) - __wt_cond_wait( - session, log->log_sync_cond, 10000, NULL); - } - - /* - * Advance the background sync LSN if needed. - */ - if (LF_ISSET(WT_LOG_BACKGROUND)) - __wt_log_background(session, &lsn); + WT_STAT_CONN_INCR(session, log_writes); + + /* + * The only time joining a slot should ever return an error is if it detects a panic. + */ + __wt_log_slot_join(session, rdup_len, flags, &myslot); + /* + * If the addition of this record crosses the buffer boundary, switch in a new slot. + */ + force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC); + ret = 0; + if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) + ret = __wt_log_slot_switch(session, &myslot, true, false, NULL); + if (ret == 0) + ret = __wt_log_fill(session, &myslot, false, record, &lsn); + release_size = __wt_log_slot_release(&myslot, (int64_t)rdup_len); + /* + * If we get an error we still need to do proper accounting in the slot fields. XXX On error we + * may still need to call release and free. + */ + if (ret != 0) + myslot.slot->slot_error = ret; + WT_ASSERT(session, ret == 0); + if (WT_LOG_SLOT_DONE(release_size)) { + WT_ERR(__wt_log_release(session, myslot.slot, &free_slot)); + if (free_slot) + __wt_log_slot_free(session, myslot.slot); + } else if (force) { + /* + * If we are going to wait for this slot to get written, + * signal the wrlsn thread. + * + * XXX I've seen times when conditions are NULL. + */ + if (conn->log_cond != NULL) { + __wt_cond_signal(session, conn->log_cond); + __wt_yield(); + } else + WT_ERR(__wt_log_force_write(session, 1, NULL)); + } + if (LF_ISSET(WT_LOG_FLUSH)) { + /* Wait for our writes to reach the OS */ + while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) + __wt_cond_wait(session, log->log_write_cond, 10000, NULL); + } else if (LF_ISSET(WT_LOG_FSYNC)) { + /* Wait for our writes to reach disk */ + while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) + __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); + } + + /* + * Advance the background sync LSN if needed. + */ + if (LF_ISSET(WT_LOG_BACKGROUND)) + __wt_log_background(session, &lsn); err: - if (ret == 0 && lsnp != NULL) - *lsnp = lsn; - /* - * If we're synchronous and some thread had an error, we don't know - * if our write made it out to the file or not. The error could be - * before or after us. So, if anyone got an error, we report it. - * If we're not synchronous, only report if our own operation got - * an error. - */ - if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && - myslot.slot != NULL) - ret = myslot.slot->slot_error; - - /* - * If one of the sync flags is set, assert the proper LSN has moved to - * match on success. - */ - WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || - __wt_log_cmp(&log->write_lsn, &lsn) >= 0); - WT_ASSERT(session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || - __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); - return (ret); + if (ret == 0 && lsnp != NULL) + *lsnp = lsn; + /* + * If we're synchronous and some thread had an error, we don't know if our write made it out to + * the file or not. The error could be before or after us. So, if anyone got an error, we report + * it. If we're not synchronous, only report if our own operation got an error. + */ + if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL) + ret = myslot.slot->slot_error; + + /* + * If one of the sync flags is set, assert the proper LSN has moved to match on success. + */ + WT_ASSERT( + session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); + WT_ASSERT( + session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); + return (ret); } /* * __wt_log_vprintf -- - * Write a message into the log. + * Write a message into the log. */ int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) { - WT_CONNECTION_IMPL *conn; - WT_DECL_ITEM(logrec); - WT_DECL_RET; - size_t header_size, len; - uint32_t rectype; - const char *rec_fmt; - va_list ap_copy; - - conn = S2C(session); - rectype = WT_LOGREC_MESSAGE; - rec_fmt = WT_UNCHECKED_STRING(I); - - if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) - return (0); - - va_copy(ap_copy, ap); - len = 1; - ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy); - va_end(ap_copy); - WT_RET(ret); - - WT_RET( - __wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); - - /* - * We're writing a record with the type (an integer) followed by a - * string (NUL-terminated data). To avoid writing the string into - * a buffer before copying it, we write the header first, then the - * raw bytes of the string. - */ - WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); - WT_ERR(__wt_struct_pack(session, - (uint8_t *)logrec->data + logrec->size, header_size, - rec_fmt, rectype)); - logrec->size += (uint32_t)header_size; - - WT_ERR(__wt_vsnprintf( - (char *)logrec->data + logrec->size, len, fmt, ap)); - - __wt_verbose(session, WT_VERB_LOG, - "log_printf: %s", (char *)logrec->data + logrec->size); - - logrec->size += len; - WT_ERR(__wt_log_write(session, logrec, NULL, 0)); -err: __wt_scr_free(session, &logrec); - return (ret); + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(logrec); + WT_DECL_RET; + size_t header_size, len; + uint32_t rectype; + const char *rec_fmt; + va_list ap_copy; + + conn = S2C(session); + rectype = WT_LOGREC_MESSAGE; + rec_fmt = WT_UNCHECKED_STRING(I); + + if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) + return (0); + + va_copy(ap_copy, ap); + len = 1; + ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy); + va_end(ap_copy); + WT_RET(ret); + + WT_RET(__wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); + + /* + * We're writing a record with the type (an integer) followed by a string (NUL-terminated data). + * To avoid writing the string into a buffer before copying it, we write the header first, then + * the raw bytes of the string. + */ + WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); + WT_ERR(__wt_struct_pack( + session, (uint8_t *)logrec->data + logrec->size, header_size, rec_fmt, rectype)); + logrec->size += (uint32_t)header_size; + + WT_ERR(__wt_vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap)); + + __wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size); + + logrec->size += len; + WT_ERR(__wt_log_write(session, logrec, NULL, 0)); +err: + __wt_scr_free(session, &logrec); + return (ret); } /* * __wt_log_flush -- - * Forcibly flush the log to the synchronization level specified. - * Wait until it has been completed. + * Forcibly flush the log to the synchronization level specified. Wait until it has been + * completed. */ int __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) { - WT_CONNECTION_IMPL *conn; - WT_LOG *log; - WT_LSN last_lsn, lsn; - - conn = S2C(session); - WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); - log = conn->log; - /* - * We need to flush out the current slot first to get the real - * end of log LSN in log->alloc_lsn. - */ - WT_RET(__wt_log_flush_lsn(session, &lsn, false)); - last_lsn = log->alloc_lsn; - - /* - * If the last write caused a switch to a new log file, we should only - * wait for the last write to be flushed. Otherwise, if the workload - * is single-threaded we could wait here forever because the write LSN - * doesn't switch into the new file until it contains a record. - */ - if (last_lsn.l.offset == log->first_record) - last_lsn = log->log_close_lsn; - - /* - * Wait until all current outstanding writes have been written - * to the file system. - */ - while (__wt_log_cmp(&last_lsn, &lsn) > 0) { - __wt_sleep(0, WT_THOUSAND); - WT_RET(__wt_log_flush_lsn(session, &lsn, false)); - } - - __wt_verbose(session, WT_VERB_LOG, - "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32, - flags, lsn.l.file, lsn.l.offset); - /* - * If the user wants write-no-sync, there is nothing more to do. - * If the user wants background sync, set the LSN and we're done. - * If the user wants sync, force it now. - */ - if (LF_ISSET(WT_LOG_BACKGROUND)) - __wt_log_background(session, &lsn); - else if (LF_ISSET(WT_LOG_FSYNC)) - WT_RET(__wt_log_force_sync(session, &lsn)); - return (0); + WT_CONNECTION_IMPL *conn; + WT_LOG *log; + WT_LSN last_lsn, lsn; + + conn = S2C(session); + WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); + log = conn->log; + /* + * We need to flush out the current slot first to get the real end of log LSN in log->alloc_lsn. + */ + WT_RET(__wt_log_flush_lsn(session, &lsn, false)); + last_lsn = log->alloc_lsn; + + /* + * If the last write caused a switch to a new log file, we should only wait for the last write + * to be flushed. Otherwise, if the workload is single-threaded we could wait here forever + * because the write LSN doesn't switch into the new file until it contains a record. + */ + if (last_lsn.l.offset == log->first_record) + last_lsn = log->log_close_lsn; + + /* + * Wait until all current outstanding writes have been written to the file system. + */ + while (__wt_log_cmp(&last_lsn, &lsn) > 0) { + __wt_sleep(0, WT_THOUSAND); + WT_RET(__wt_log_flush_lsn(session, &lsn, false)); + } + + __wt_verbose(session, WT_VERB_LOG, "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32, + flags, lsn.l.file, lsn.l.offset); + /* + * If the user wants write-no-sync, there is nothing more to do. If the user wants background + * sync, set the LSN and we're done. If the user wants sync, force it now. + */ + if (LF_ISSET(WT_LOG_BACKGROUND)) + __wt_log_background(session, &lsn); + else if (LF_ISSET(WT_LOG_FSYNC)) + WT_RET(__wt_log_force_sync(session, &lsn)); + return (0); } |